In [21]:
import pandas as pd

In [22]:
# Load MovieLens datasets

movies = pd.read_csv("D:/movie_recommendation_system/recommend_model/data/raw/movies.csv")       
ratings = pd.read_csv("D:/movie_recommendation_system/recommend_model/data/raw/ratings.csv")      
links = pd.read_csv("D:/movie_recommendation_system/recommend_model/data/raw/links.csv")         


In [23]:
print(movies.head())
print(ratings.head())
print(links.head())


   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0


In [24]:
# Merge MovieLens movies with links to get tmdbId
movies_links = pd.merge(movies, links, on="movieId", how="left")

# Check result
print(movies_links.head())
print(movies_links.columns)
print(movies_links.info())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  imdbId   tmdbId  
0  Adventure|Animation|Children|Comedy|Fantasy  114709    862.0  
1                   Adventure|Children|Fantasy  113497   8844.0  
2                               Comedy|Romance  113228  15602.0  
3                         Comedy|Drama|Romance  114885  31357.0  
4                                       Comedy  113041  11862.0  
Index(['movieId', 'title', 'genres', 'imdbId', 'tmdbId'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   titl

In [25]:
# Load TMDB dataset
tmdb = pd.read_csv("D:/movie_recommendation_system/recommend_model/data/raw/tmdb_5000_movies.csv")  # id, title, genres, overview, release_date, vote_average, vote_count, etc.

# Merge MovieLens + Links with TMDB on tmdbId
merged_df = pd.merge(movies_links, tmdb, left_on="tmdbId", right_on="id", how="inner")

print(merged_df.head())
print(merged_df.columns)
print(merged_df.info())


   movieId                         title_x  \
0        1                Toy Story (1995)   
1       10                GoldenEye (1995)   
2       11  American President, The (1995)   
3       14                    Nixon (1995)   
4       15         Cutthroat Island (1995)   

                                      genres_x  imdbId   tmdbId    budget  \
0  Adventure|Animation|Children|Comedy|Fantasy  114709    862.0  30000000   
1                    Action|Adventure|Thriller  113189    710.0  58000000   
2                         Comedy|Drama|Romance  112346   9087.0  62000000   
3                                        Drama  113987  10858.0  44000000   
4                     Action|Adventure|Romance  112760   1408.0  98000000   

                                            genres_y  \
0  [{"id": 16, "name": "Animation"}, {"id": 35, "...   
1  [{"id": 12, "name": "Adventure"}, {"id": 28, "...   
2  [{"id": 35, "name": "Comedy"}, {"id": 18, "nam...   
3  [{"id": 36, "name": "History"}, {

In [26]:
# Rename MovieLens columns
merged_df = merged_df.rename(columns={
    "title_x": "title",
    "genres_x": "genres"
})

# Drop redundant or unused columns for now
merged_df = merged_df.drop(columns=[
    "title_y", "id", "homepage", "tagline", "keywords",
    "production_companies", "production_countries", "spoken_languages", "status"
])

print(merged_df.columns)


Index(['movieId', 'title', 'genres', 'imdbId', 'tmdbId', 'budget', 'genres_y',
       'original_language', 'original_title', 'overview', 'popularity',
       'release_date', 'revenue', 'runtime', 'vote_average', 'vote_count'],
      dtype='object')


In [27]:
# Fill missing runtime with median
merged_df["runtime"] = merged_df["runtime"].fillna(merged_df["runtime"].median())

# Fill overview missing with empty string
merged_df["overview"] = merged_df["overview"].fillna("")

# Fill genres_y (JSON-like) with empty list for missing values
import ast
merged_df["genres_y"] = merged_df["genres_y"].apply(
    lambda x: [d["name"] for d in ast.literal_eval(x)] if pd.notna(x) else []
)


In [28]:
print(merged_df.info())
print(merged_df.head(3))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3537 entries, 0 to 3536
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   movieId            3537 non-null   int64  
 1   title              3537 non-null   object 
 2   genres             3537 non-null   object 
 3   imdbId             3537 non-null   int64  
 4   tmdbId             3537 non-null   float64
 5   budget             3537 non-null   int64  
 6   genres_y           3537 non-null   object 
 7   original_language  3537 non-null   object 
 8   original_title     3537 non-null   object 
 9   overview           3537 non-null   object 
 10  popularity         3537 non-null   float64
 11  release_date       3537 non-null   object 
 12  revenue            3537 non-null   int64  
 13  runtime            3537 non-null   float64
 14  vote_average       3537 non-null   float64
 15  vote_count         3537 non-null   int64  
dtypes: float64(4), int64(5),

In [29]:
import os

# Create processed folder if not exists
os.makedirs("D:/movie_recommendation_system/rcmndn_model/data/processed", exist_ok=True)

# Save merged & cleaned dataset
processed_path = "D:/movie_recommendation_system/rcmndn_model/data/processed/movies_cleaned.csv"
merged_df.to_csv(processed_path, index=False)

print(f"Cleaned dataset saved at: {processed_path}")


Cleaned dataset saved at: D:/movie_recommendation_system/rcmndn_model/data/processed/movies_cleaned.csv


In [30]:
import pandas as pd

# Load the already cleaned dataset
movies_cleaned = pd.read_csv("D:/movie_recommendation_system/rcmndn_model/data/processed/movies_cleaned.csv")

print(movies_cleaned.info())
print(movies_cleaned.head(3))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3537 entries, 0 to 3536
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   movieId            3537 non-null   int64  
 1   title              3537 non-null   object 
 2   genres             3537 non-null   object 
 3   imdbId             3537 non-null   int64  
 4   tmdbId             3537 non-null   float64
 5   budget             3537 non-null   int64  
 6   genres_y           3537 non-null   object 
 7   original_language  3537 non-null   object 
 8   original_title     3537 non-null   object 
 9   overview           3537 non-null   object 
 10  popularity         3537 non-null   float64
 11  release_date       3537 non-null   object 
 12  revenue            3537 non-null   int64  
 13  runtime            3537 non-null   float64
 14  vote_average       3537 non-null   float64
 15  vote_count         3537 non-null   int64  
dtypes: float64(4), int64(5),

In [31]:
import pandas as pd
import os
from datetime import datetime

# Paths
raw_ratings_path = "D:/movie_recommendation_system/recommend_model/data/raw/ratings.csv"
movies_cleaned_path = "D:/movie_recommendation_system/recommend_model/data/processed/movies_cleaned.csv"
processed_path = "D:/movie_recommendation_system/recommend_model/data/processed/ratings_final.csv"

# Load raw ratings
ratings = pd.read_csv(raw_ratings_path)
print("Before cleaning:")
print(ratings.info())
print(ratings.head(3))

# --- Cleaning ---
# 1. Drop duplicates
ratings = ratings.drop_duplicates()

# 2. Drop missing values if any
ratings = ratings.dropna()

# 3. Convert timestamp → datetime
ratings['datetime'] = pd.to_datetime(ratings['timestamp'], unit='s')


# 4. Keep only needed columns (but keep datetime)
ratings = ratings[['userId', 'movieId', 'rating', 'timestamp', 'datetime']]


# --- Alignment with movies_cleaned ---
movies = pd.read_csv(movies_cleaned_path)
common_movie_ids = set(ratings['movieId']).intersection(set(movies['movieId']))
before_align = len(ratings)
ratings = ratings[ratings['movieId'].isin(common_movie_ids)]
after_align = len(ratings)
print(f" Ratings aligned: {before_align} → {after_align} entries kept ({before_align - after_align} dropped).")

# --- Remove rare users/movies ---
# Drop users with < 5 ratings
before_users = ratings['userId'].nunique()
user_counts = ratings['userId'].value_counts()
ratings = ratings[ratings['userId'].isin(user_counts[user_counts >= 5].index)]
after_users = ratings['userId'].nunique()
print(f" Users filtered: {before_users} → {after_users} unique users kept ({before_users - after_users} dropped).")

# Drop movies with < 5 ratings
before_movies = ratings['movieId'].nunique()
movie_counts = ratings['movieId'].value_counts()
ratings = ratings[ratings['movieId'].isin(movie_counts[movie_counts >= 5].index)]
after_movies = ratings['movieId'].nunique()
print(f" Movies filtered: {before_movies} → {after_movies} unique movies kept ({before_movies - after_movies} dropped).")

# --- Save final cleaned ratings ---
os.makedirs(os.path.dirname(processed_path), exist_ok=True)
ratings.to_csv(processed_path, index=False)

print(f"\n Final ratings saved at: {processed_path}")
print(ratings.info())
print(ratings.head(3))


Before cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
 Ratings aligned: 100836 → 70194 entries kept (30642 dropped).
 Users filtered: 610 → 610 unique users kept (0 dropped).
 Movies filtered: 3536 → 2187 unique movies kept (1349 dropped).

 Final ratings saved at: D:/movie_recommendation_system/recommend_model/data/processed/ratings_final.csv
<class 'pandas.core.frame.DataFrame'>
Index: 67275 entries, 0 to 100819
Data columns (total 5 columns):
 #   Column     Non-Null

In [32]:

# Convert timestamp column in ratings
ratings["timestamp"] = ratings["timestamp"].apply(
   lambda x: datetime.fromtimestamp(int(x)).strftime("%Y-%m-%d %H:%M:%S") if pd.notna(x) else None
)
# Save fixed ratings dataset
ratings.to_csv("D:/movie_recommendation_system/recommend_model/data/processed/ratings_final_fixed.csv", index=False)
print(" Ratings dataset saved with fixed timestamps")

 Ratings dataset saved with fixed timestamps


In [3]:
import os
import time
import pandas as pd
import requests
from dotenv import load_dotenv
from tqdm import tqdm

# Load TMDB API key
load_dotenv()
API_KEY = os.getenv("TMDB_API_KEY")

if not API_KEY:
    raise ValueError("TMDB_API_KEY not found. Please add it in .env file.")

BASE_URL = "https://api.themoviedb.org/3/movie/{}?api_key={}"
movies_df = pd.read_csv("D:/movie_recommendation_system/recommend_model/data/processed/movies_cleaned.csv")
poster_urls = []

for idx, row in tqdm(movies_df.iterrows(), total=len(movies_df)):
    tmdb_id = row.get("tmdbId")

    if pd.isna(tmdb_id):
        poster_urls.append("https://via.placeholder.com/500x750?text=No+Image")
        continue

    url = BASE_URL.format(int(tmdb_id), API_KEY)

    try:
        response = requests.get(url).json()
        poster_path = response.get("poster_path")

        if poster_path:
            poster_urls.append(f"https://image.tmdb.org/t/p/w500{poster_path}")

        else:
            poster_urls.append("https://via.placeholder.com/500x750?text=No+Image")

    except Exception:

        poster_urls.append("https://via.placeholder.com/500x750?text=No+Image")

    time.sleep(0.25)

movies_df["poster_path"] = poster_urls
movies_df.to_csv("D:/movie_recommendation_system/recommend_model/data/processed/movies_final.csv", index=False)

print("Posters fetched and saved in movies_final.csv")
 

100%|██████████| 3537/3537 [1:50:55<00:00,  1.88s/it]  


Posters fetched and saved in movies_final.csv


In [None]:
import pandas as pd

movies_final = pd.read_csv(r"D:\movie_recommendation_system\recommend_model\data\processed\movies_final.csv")
movies_posters = pd.read_csv(r"D:\movie_recommendation_system\recommend_model\data\processed\movies_with_posters.csv")

# Merge on movieId directly
merged = movies_posters.merge(
    movies_final[["movieId"]], 
    on="movieId", 
    how="left"
)

print("Before merge:", movies_posters.shape)
print("After merge:", merged.shape)
print("Missing movieId count:", merged['movieId'].isna().sum())

# Save new dataset
merged.to_csv(r"D:\movie_recommendation_system\recommend_model\data\processed\movies_with_posters_withId.csv", index=False)
print(" Saved movies_with_posters_withId.csv with movieId column aligned")


Before merge: (3537, 17)
After merge: (3537, 17)
Missing movieId count: 0
✅ Saved movies_with_posters_withId.csv with movieId column aligned
