Dataset Rating: https://grouplens.org/datasets/movielens/32m/ <br>
[TMDB API](https://www.themoviedb.org/) Data movie: https://developer.themoviedb.org/docs/

In [1]:
import pandas as pd
import requests
import time
from IPython.display import clear_output

# Import data

In [2]:
movies = pd.read_csv('./ml-32m/movies.csv')
ratings = pd.read_csv('./ml-32m/ratings.csv')
links = pd.read_csv('./ml-32m/links.csv')

# Merge data in MovieLens (movies, ratings, links)

In [3]:
df = pd.merge(ratings, movies, on='movieId')
df = pd.merge(df, links, on='movieId')

In [4]:
# Drop unnecessary columns
df.drop(columns=['genres'], inplace=True)
df.drop(columns=['imdbId'], inplace=True)
df = df.reset_index(drop=True)
df

Unnamed: 0,userId,movieId,rating,timestamp,title,tmdbId
0,1,17,4.0,944249077,Sense and Sensibility (1995),4584.0
1,1,25,1.0,944250228,Leaving Las Vegas (1995),451.0
2,1,29,2.0,943230976,"City of Lost Children, The (Cité des enfants p...",902.0
3,1,30,5.0,944249077,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,37557.0
4,1,32,5.0,943228858,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),63.0
...,...,...,...,...,...,...
32000199,200948,79702,4.5,1294412589,Scott Pilgrim vs. the World (2010),22538.0
32000200,200948,79796,1.0,1287216292,Centurion (2010),23759.0
32000201,200948,80350,0.5,1294412671,Vampires Suck (2010),40264.0
32000202,200948,80463,3.5,1350423800,"Social Network, The (2010)",37799.0


# Filter data (year_published >= 2022)

In [5]:
df['year_published'] = df['title'].str.extract(r'\((\d{4})\)', expand=False)
df['year_published'] = df['year_published'].fillna(0).astype(int)
df.drop(columns=['title'], inplace=True)

df = df[df['year_published'] > 2022]
df

Unnamed: 0,userId,movieId,rating,timestamp,tmdbId,year_published
5369,28,285593,3.0,1695439205,447365.0,2023
5370,28,286897,5.0,1692916119,569094.0,2023
5371,28,287699,5.0,1692916114,872585.0,2023
5372,28,288513,5.0,1693533442,346698.0,2023
5373,28,291485,4.0,1696391189,1059811.0,2023
...,...,...,...,...,...,...
31976553,200771,286897,4.0,1687535911,569094.0,2023
31976753,200774,287633,2.5,1693098259,747188.0,2023
31976754,200774,288513,3.0,1693098301,346698.0,2023
31983111,200822,286905,3.0,1688557693,335977.0,2023


In [6]:
df.drop(columns=['year_published'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['year_published'], inplace=True)


# Filter data (timestamp >= 2023-09-02)

In [7]:
temp = df

In [8]:
df = temp
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')

df = df[df['timestamp'] > '2023-09-02']
df['timestamp'] = df['timestamp'].astype('int64') // 10**9
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['timestamp'] = df['timestamp'].astype('int64') // 10**9


Unnamed: 0,userId,movieId,rating,timestamp,tmdbId
5369,28,285593,3.0,1695439205,447365.0
5373,28,291485,4.0,1696391189,1059811.0
42377,265,284565,4.5,1696137939,758769.0
42393,265,288265,3.0,1695502130,575264.0
42394,265,288279,4.0,1696137867,709631.0
...,...,...,...,...,...
31915948,200384,285593,4.0,1696705707,447365.0
31915949,200384,287699,5.0,1696533806,872585.0
31915950,200384,288265,4.0,1696533909,575264.0
31923236,200443,287639,2.5,1695648354,1098110.0


In [9]:
print(f'Min time: {df['timestamp'].min()}')
print(f'Max time: {df['timestamp'].max()}')

print(f'Unique movies: {df["movieId"].nunique()}')
print(f'Unique users: {df["userId"].nunique()}')
print(f'Number of ratings: {len(df)}')

Min time: 1693613772
Max time: 1697163675
Unique movies: 602
Unique users: 1714
Number of ratings: 5350


In [10]:
df['tmdbId'] = df['tmdbId'].fillna(0).astype(int)
df['userId'] = 'U' + df['userId'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tmdbId'] = df['tmdbId'].fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['userId'] = 'U' + df['userId'].astype(str)


# Get data from TMDB API to build website recommendation </br>
`TMDB_API_KEY nằm trong file .ENV`, để có API_KEY, bạn cần đăng ký tài khoản trên trang chủ của TMDB

In [11]:
from dotenv import dotenv_values

config = dotenv_values(".env")
TMDB_API_KEY = config.get("TMDB_API_KEY")

In [12]:
def get_movie_details(movie_id):
    API_KEY = TMDB_API_KEY
    BASE_URL = 'https://api.themoviedb.org/3'
    DETAILS_ENDPOINT = f'{BASE_URL}/movie/{{movie_id}}'
    response = requests.get(DETAILS_ENDPOINT.format(movie_id=movie_id), params={'api_key': API_KEY})

    if response.status_code == 200:
        data = response.json()
        return {
            "tmdbId": data.get("id"),
            "title": data.get("title"),
            "poster": f"https://image.tmdb.org/t/p/w500{data.get('poster_path')}",
            "date_published": data.get("release_date"),
        }
    else:
        None

####################

tmdb_movie = []
tmdbid_list = df['tmdbId'].unique()
counter = 0

for tmdbid in tmdbid_list:
    movie = get_movie_details(tmdbid)
    if movie:
        tmdb_movie.append(movie)
        counter += 1  
    
    if counter % 50 == 0:
        print(f"{counter}/{len(tmdbid_list)} movies processed...")
    # Add a delay to handle API rate limits
    time.sleep(0.25) 

clear_output(wait=False)
print(f"Processed {counter}/{len(tmdbid_list)} movies.")

tmdb_movie = pd.DataFrame(tmdb_movie)

Processed 599/602 movies.


In [13]:
df = pd.merge(df, tmdb_movie, on='tmdbId')
df.drop(columns=['movieId'], inplace=True)
df = df.dropna()

df.to_csv('Dataset.csv', index=False)

df

Unnamed: 0,userId,rating,timestamp,tmdbId,title,poster,date_published
0,U28,3.0,1695439205,447365,Guardians of the Galaxy Vol. 3,https://image.tmdb.org/t/p/w500/r2J02Z2OpNTctf...,2023-05-03
1,U28,4.0,1696391189,1059811,Flora and Son,https://image.tmdb.org/t/p/w500/pq2p8ovf8PZps2...,2023-09-21
2,U265,4.5,1696137939,758769,Unwelcome,https://image.tmdb.org/t/p/w500/88bGObx8YKLQDO...,2023-01-27
3,U265,3.0,1695502130,575264,Mission: Impossible - Dead Reckoning Part One,https://image.tmdb.org/t/p/w500/NNxYkU70HPurnN...,2023-07-08
4,U265,4.0,1696137867,709631,Cobweb,https://image.tmdb.org/t/p/w500/2bHCUqQp8CoSVv...,2023-07-19
...,...,...,...,...,...,...,...
5340,U200384,4.0,1696705707,447365,Guardians of the Galaxy Vol. 3,https://image.tmdb.org/t/p/w500/r2J02Z2OpNTctf...,2023-05-03
5341,U200384,5.0,1696533806,872585,Oppenheimer,https://image.tmdb.org/t/p/w500/8Gxv8gSFCU0XGD...,2023-07-19
5342,U200384,4.0,1696533909,575264,Mission: Impossible - Dead Reckoning Part One,https://image.tmdb.org/t/p/w500/NNxYkU70HPurnN...,2023-07-08
5343,U200443,2.5,1695648354,1098110,Blood & Gold,https://image.tmdb.org/t/p/w500/oLRQP5cEjiT1Dx...,2023-04-21


In [14]:
del movies
del ratings
del links
del df
del tmdb_movie