# Movie Recommendation System #

## Step 1: Data Preprocessing

## Load Libraries ##

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, confusion_matrix
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import seaborn as sns
import gdown
import ast
from pandas.api.types import is_list_like

## Load Data ##

In [None]:
# Google Drive file ids of the three raw CSV files used by this notebook
metadata_drive = '1qaNjoejMQQO_OhUzAl2TnigXC5V-jUcb'
ratings_small_drive = '1eyoi919yb7foMd78kyzewW5IfjkiU85Q'
KEYWORD_ID = '1Vb0LudiP1W0oh3O624AKKENZXywv71hM'


def _load_drive_csv(file_id, filename, **read_csv_kwargs):
    """Download a CSV from Google Drive (unless already on disk) and read it.

    Args:
        file_id: Google Drive file id, inserted into the `uc?id=` download URL.
        filename: Local path the file is saved to and then read from.
        **read_csv_kwargs: Extra keyword arguments forwarded to `pd.read_csv`.

    Returns:
        The DataFrame produced by `pd.read_csv(filename, ...)`.
    """
    # Skip the network round-trip when an earlier run already fetched the file,
    # so a full re-run of the notebook stays cheap.
    if not os.path.exists(filename):
        gdown.download(f"https://drive.google.com/uc?id={file_id}", filename, quiet=False)
    return pd.read_csv(filename, **read_csv_kwargs)


#### MOVIES DATA #####
# The recorded run emitted a warning on this read (presumably a mixed-dtype
# DtypeWarning -- confirm). low_memory=False makes pandas parse each column in
# one pass instead of inferring dtypes chunk by chunk.
df_movies_metadata = _load_drive_csv(metadata_drive, "movies_metadata.csv", low_memory=False)

##### LOAD RATINGS DATA ####
df_ratings = _load_drive_csv(ratings_small_drive, "ratings_small.csv")

###### LOAD KEYWORDS DATA ######
df_keywords = _load_drive_csv(KEYWORD_ID, "keywords.csv")

Downloading...
From: https://drive.google.com/uc?id=1qaNjoejMQQO_OhUzAl2TnigXC5V-jUcb
To: /content/movies_metadata.csv
100%|██████████| 34.4M/34.4M [00:00<00:00, 96.8MB/s]
  df_movies_metadata = pd.read_csv("movies_metadata.csv")
Downloading...
From: https://drive.google.com/uc?id=1eyoi919yb7foMd78kyzewW5IfjkiU85Q
To: /content/ratings_small.csv
100%|██████████| 2.44M/2.44M [00:00<00:00, 57.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Vb0LudiP1W0oh3O624AKKENZXywv71hM
To: /content/keywords.csv
100%|██████████| 6.23M/6.23M [00:00<00:00, 38.3MB/s]


# 1. Data Preparation

In [None]:
# Columns removed up front because they are not used by the rest of the notebook
columns_to_drop = ['poster_path', 'homepage', 'video', 'imdb_id','tagline','belongs_to_collection', 'status',
                  'budget', 'title', 'revenue']

# A row is kept only if every one of these columns is present
important_columns = [
    'original_language', 'popularity', 'release_date',
    'runtime', 'spoken_languages',
    'original_title', 'vote_average', 'vote_count'
]

# Drop the unused columns, discard incomplete rows, then renumber the index
df_movie_filtered = (
    df_movies_metadata
    .drop(columns=columns_to_drop)
    .dropna(subset=important_columns)
    .reset_index(drop=True)
)

# Coerce to proper dtypes; values that cannot be parsed become NaN / NaT
for numeric_col in ('runtime', 'popularity', 'vote_average'):
    df_movie_filtered[numeric_col] = pd.to_numeric(df_movie_filtered[numeric_col], errors='coerce')

# vote_count is stored as an integer, with unparseable values counted as 0
vote_count_numeric = pd.to_numeric(df_movie_filtered['vote_count'], errors='coerce')
df_movie_filtered['vote_count'] = vote_count_numeric.fillna(0).astype(int)

df_movie_filtered['release_date'] = pd.to_datetime(df_movie_filtered['release_date'], errors='coerce')


df_movie_filtered.head()

Unnamed: 0,adult,genres,id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,runtime,spoken_languages,vote_average,vote_count
0,False,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",7.7,5415
1,False,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",6.9,2413
2,False,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",6.5,92
3,False,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",6.1,34
4,False,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",5.7,173


### Extract JSON to String ##
#### Genres, Production_Companies, Production Countries, Spoken_Languages to convert from JSON to Text
In this section, we transform several columns of the movie metadata DataFrame from a JSON-like format into plain text. Columns such as **genres**, **production companies**, **production countries**, and **spoken languages** are originally stored as JSON-like strings (lists of dictionaries). For our analysis and feature engineering, we want these values in a simple, human-readable format: a comma-separated string of names.

In [None]:
def extract_genres_as_string(entry):
    """Flatten a JSON-like list of genre dictionaries into a comma-separated string.

    Args:
        entry: Either a string holding a Python-literal list of dictionaries
            (e.g. "[{'id': 16, 'name': 'Animation'}]") or an already-parsed
            iterable of dictionaries.

    Returns:
        The `name` values joined with ', '. An empty string is returned when
        the entry cannot be parsed, is missing (None / NaN), or is otherwise
        not iterable.
    """
    try:
        items = ast.literal_eval(entry) if isinstance(entry, str) else entry
        # Elements that are not dictionaries, or lack a 'name' key, are skipped
        genres = [d['name'] for d in items if isinstance(d, dict) and 'name' in d]
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers non-iterable entries such as NaN or None, which
        # would otherwise abort the whole column-wise apply
        return ''
    return ', '.join(genres)  # Convert list to comma-separated string

# Replace the JSON-like genres entries with a comma-separated string of genre names
genre_text = df_movie_filtered['genres'].map(extract_genres_as_string)
df_movie_filtered['genres'] = genre_text

def extract_names_as_string(entry):
    """Flatten a JSON-like list of dictionaries into a comma-separated string of names.

    Args:
        entry: Either a string holding a Python-literal list of dictionaries
            (e.g. "[{'iso_639_1': 'en', 'name': 'English'}]") or an
            already-parsed iterable of dictionaries.

    Returns:
        The `name` values joined with ', '. An empty string is returned when
        the entry cannot be parsed, is missing (None / NaN), or is otherwise
        not iterable.
    """
    try:
        items = ast.literal_eval(entry) if isinstance(entry, str) else entry
        # Elements that are not dictionaries, or lack a 'name' key, are skipped
        names = [d['name'] for d in items if isinstance(d, dict) and 'name' in d]
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers non-iterable entries such as NaN or None, which
        # would otherwise abort the whole column-wise apply
        return ''
    return ', '.join(names)

# JSON-like columns whose entries are reduced to a comma-separated string of names
columns_to_flatten = ['production_countries', 'spoken_languages', 'production_companies']

for col in columns_to_flatten:
    flattened_names = df_movie_filtered[col].map(extract_names_as_string)
    df_movie_filtered[col] = flattened_names

### Converting Release Date to Release Year

We extract the **release year** from the `release_date` column and store it in a new column called `release_year`.

In [None]:
# Swap release_date for release_year: pop() removes the date column and hands it
# back, and its year component is stored as the new last column
df_movie_filtered['release_year'] = df_movie_filtered.pop('release_date').dt.year

df_movie_filtered.head()

Unnamed: 0,adult,genres,id,original_language,original_title,overview,popularity,production_companies,production_countries,runtime,spoken_languages,vote_average,vote_count,release_year
0,False,"Animation, Comedy, Family",862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,Pixar Animation Studios,United States of America,81.0,English,7.7,5415,1995
1,False,"Adventure, Fantasy, Family",8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"TriStar Pictures, Teitler Film, Interscope Com...",United States of America,104.0,"English, Français",6.9,2413,1995
2,False,"Romance, Comedy",15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,"Warner Bros., Lancaster Gate",United States of America,101.0,English,6.5,92,1995
3,False,"Comedy, Drama, Romance",31357,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,Twentieth Century Fox Film Corporation,United States of America,127.0,English,6.1,34,1995
4,False,Comedy,11862,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,"Sandollar Productions, Touchstone Pictures",United States of America,106.0,English,5.7,173,1995


## 1.2 Feature Engineering for Movies Meta Data

## 1.2.1 Binning for RunTime (in Minutes)
The **runtime** of each movie is categorized into three bins: `Short` (less than 60 minutes), `Medium` (60 to 120 minutes), and `Long` (more than 120 minutes). Movies with missing runtime values are labeled as `Unknown`. The original `runtime` column is then dropped.


In [None]:
def bin_runtime(runtime):
    """Map a runtime in minutes to a coarse length category.

    Args:
        runtime: Runtime in minutes; may be missing (NaN / None).

    Returns:
        'Unknown' for a missing runtime, 'Short' below 60 minutes,
        'Medium' from 60 up to and including 120 minutes, 'Long' otherwise.
    """
    if pd.isna(runtime):
        return 'Unknown'
    if runtime < 60:
        return 'Short'
    return 'Medium' if runtime <= 120 else 'Long'

# Replace the numeric runtime column with its category: pop() removes `runtime`
# and the binned values are stored as the new last column
df_movie_filtered['runtime_category'] = df_movie_filtered.pop('runtime').map(bin_runtime)

## 1.2.2 Normalising Continuous Variables
The continuous variables are processed as follows:
1. **Vote Count:** Applied a logarithmic transformation to reduce the effect of large values.
2. **Normalization:** Scaled `vote_average`, the log-transformed `vote_count`, and `popularity` to the range [0, 1] using `MinMaxScaler`.

In [None]:
# log1p compresses the long right tail of vote_count (and maps 0 votes to 0)
df_movie_filtered['vote_count_log'] = np.log1p(df_movie_filtered['vote_count'])

# Min-max scale the three continuous features; input and output columns are
# listed in matching order
scaler = MinMaxScaler()
raw_feature_columns = ['vote_average', 'vote_count_log', 'popularity']
scaled_feature_columns = ['vote_average_norm', 'vote_count_norm', 'popularity_norm']

scaled_values = scaler.fit_transform(df_movie_filtered[raw_feature_columns])
df_movie_filtered[scaled_feature_columns] = scaled_values

# The unscaled originals are no longer needed (vote_count_log is kept)
df_movie_filtered = df_movie_filtered.drop(columns=['vote_count','vote_average', 'popularity'])

df_movie_filtered.head()

Unnamed: 0,adult,genres,id,original_language,original_title,overview,production_companies,production_countries,spoken_languages,release_year,runtime_category,vote_count_log,vote_average_norm,vote_count_norm,popularity_norm
0,False,"Animation, Comedy, Family",862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Pixar Animation Studios,United States of America,English,1995,Medium,8.597113,0.77,0.900011,0.040087
1,False,"Adventure, Fantasy, Family",8844,en,Jumanji,When siblings Judy and Peter discover an encha...,"TriStar Pictures, Teitler Film, Interscope Com...",United States of America,"English, Français",1995,Medium,7.78904,0.69,0.815416,0.031079
2,False,"Romance, Comedy",15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,"Warner Bros., Lancaster Gate",United States of America,English,1995,Medium,4.532599,0.65,0.474507,0.021394
3,False,"Comedy, Drama, Romance",31357,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Twentieth Century Fox Film Corporation,United States of America,English,1995,Long,3.555348,0.61,0.372201,0.007049
4,False,Comedy,11862,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,"Sandollar Productions, Touchstone Pictures",United States of America,English,1995,Medium,5.159055,0.57,0.540089,0.01532


## 1.2.3 Years Since Release

In [None]:
# Year against which movie age is measured
# NOTE(review): presumably the year the dataset was collected -- confirm
REFERENCE_YEAR = 2017
df_movie_filtered['years_since_release'] = REFERENCE_YEAR - df_movie_filtered['release_year']

## 1.3 Preprocessing of Keywords Dataset ##

1. **Filter Out Duplicates:** Removed duplicate rows to ensure data integrity.
2. **Convert Keywords to List:** Converted the `keywords` feature into a list format.
3. **Extract Keyword Values:** Created a new column `keyword_values` by extracting the `name` attribute from each dictionary entry within the list.

In [None]:
### Filter out duplicates
# .copy() gives an independent frame, so the column assignments below no longer
# trigger pandas' SettingWithCopyWarning
df_unique_keywords = df_keywords.drop_duplicates().copy()

# Convert the keywords feature in keywords.csv to a list. Entries that are not
# strings starting with "[" (e.g. missing values) become an empty list.
df_unique_keywords["keywords"] = df_unique_keywords["keywords"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else []
)

# Create a new feature keyword_values holding the name attribute of each
# dictionary, i.e. the list of keywords for the corresponding movie. Elements
# that are not dictionaries or lack a "name" key are skipped instead of raising.
df_unique_keywords["keyword_values"] = df_unique_keywords["keywords"].apply(
    lambda lst: [d["name"] for d in lst if isinstance(d, dict) and "name" in d]
    if isinstance(lst, list) else []
)

df_unique_keywords.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique_keywords["keywords"] = df_unique_keywords["keywords"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else [])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique_keywords["keyword_values"] = df_unique_keywords["keywords"].apply(lambda lst: [d["name"] for d in lst] if isinstance(lst, list) else [])


Unnamed: 0,id,keywords,keyword_values
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[jealousy, toy, boy, friendship, friends, riva..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[board game, disappearance, based on children'..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[fishing, best friend, duringcreditsstinger, o..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':...","[based on novel, interracial relationship, sin..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[baby, midlife crisis, confidence, aging, daug..."


## 1.4 Merging Keywords and Movies Meta Dataset

## 1.4.1 Merge Keywords and Movies ##

### Merging Keywords and Movies

The keywords and movies datasets are merged to create a unified dataset. The process involves:

1. Performing a left join on the `id` column from both datasets.
2. **Handling Missing Keywords:**
   - If no keywords are available, the `genres` are used as a fallback.
   - Any empty keyword values are replaced with the string `"No specific themes"`.

This merged dataset now contains both movie metadata and associated keywords, making it comprehensive for further analysis.

In [None]:
# Both frames must share an integer id dtype for the join
df_movie_filtered['id'] = df_movie_filtered['id'].astype(int)
df_movies_and_keywords = df_movie_filtered.merge(df_unique_keywords, on = 'id', how = 'left')


def _keywords_or_genres(row):
    """Build the comma-separated theme text for one merged movie row.

    Uses the movie's keyword list when it has one; otherwise falls back to its
    genres. Returns 'No specific themes' when neither yields any text.
    """
    keywords = row['keyword_values']
    # Movies without a match in the keywords frame carry NaN after the left join
    names = list(keywords) if is_list_like(keywords) else []
    if not names:
        # genres is already a ', '-joined string; strip each piece so that
        # re-joining does not produce double spaces after the commas
        names = [genre.strip() for genre in row['genres'].split(',') if genre.strip()]
    return ', '.join(names) or 'No specific themes'


df_movies_and_keywords['keyword_values'] = df_movies_and_keywords.apply(_keywords_or_genres, axis = 1)

# The raw list-of-dicts column is superseded by keyword_values
df_movies_and_keywords = df_movies_and_keywords.drop(columns = ['keywords'])

display(df_movies_and_keywords)

Unnamed: 0,adult,genres,id,original_language,original_title,overview,production_companies,production_countries,spoken_languages,release_year,runtime_category,vote_count_log,vote_average_norm,vote_count_norm,popularity_norm,years_since_release,keyword_values
0,False,"Animation, Comedy, Family",862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Pixar Animation Studios,United States of America,English,1995,Medium,8.597113,0.77,0.900011,0.040087,22,"jealousy, toy, boy, friendship, friends, rival..."
1,False,"Adventure, Fantasy, Family",8844,en,Jumanji,When siblings Judy and Peter discover an encha...,"TriStar Pictures, Teitler Film, Interscope Com...",United States of America,"English, Français",1995,Medium,7.789040,0.69,0.815416,0.031079,22,"board game, disappearance, based on children's..."
2,False,"Romance, Comedy",15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,"Warner Bros., Lancaster Gate",United States of America,English,1995,Medium,4.532599,0.65,0.474507,0.021394,22,"fishing, best friend, duringcreditsstinger, ol..."
3,False,"Comedy, Drama, Romance",31357,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Twentieth Century Fox Film Corporation,United States of America,English,1995,Long,3.555348,0.61,0.372201,0.007049,22,"based on novel, interracial relationship, sing..."
4,False,Comedy,11862,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,"Sandollar Productions, Touchstone Pictures",United States of America,English,1995,Medium,5.159055,0.57,0.540089,0.015320,22,"baby, midlife crisis, confidence, aging, daugh..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45114,False,"Drama, Action, Romance",30840,en,Robin Hood,"Yet another version of the classic epic, with ...","Westdeutscher Rundfunk (WDR), Working Title Fi...","Canada, Germany, United Kingdom, United States...",English,1991,Medium,3.295837,0.57,0.345033,0.010382,26,"Drama, Action, Romance"
45115,False,Drama,111109,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,Sine Olivia,Philippines,,2011,Long,1.386294,0.90,0.145128,0.000326,6,"artist, play, pinoy"
45116,False,"Action, Drama, Thriller",67758,en,Betrayal,"When one of her hits goes wrong, a professiona...",American World Pictures,United States of America,English,2003,Medium,1.945910,0.38,0.203713,0.001649,14,"Action, Drama, Thriller"
45117,False,,227506,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",Yermoliev,Russia,,1917,Medium,0.000000,0.00,0.000000,0.000006,100,No specific themes


## 1.4.2 Create Textual Representation of all Text Variables ##

We generate a consolidated textual description for each movie by combining key text features (title, overview, countries, languages, genres, keywords). Missing values are replaced with placeholders, and the result is stored in the `textual_representation` column.

In [None]:
# Ensure all required columns exist and fill empty strings
df_movies_and_keywords['original_title'] = df_movies_and_keywords['original_title'].apply(lambda x: 'Unknown title' if x == '' else x)
df_movies_and_keywords['overview'] = df_movies_and_keywords['overview'].apply(lambda x: 'No description available.' if x == '' else x)
df_movies_and_keywords['production_countries'] = df_movies_and_keywords['production_countries'].apply(lambda x: 'Unknown country' if x == '' else x)
df_movies_and_keywords['spoken_languages'] = df_movies_and_keywords['spoken_languages'].apply(lambda x: 'Unknown language' if x == '' else x)
df_movies_and_keywords['genres'] = df_movies_and_keywords['genres'].apply(lambda x: 'Unknown genre' if x == '' else x)

# Create a combined textual column
df_movies_and_keywords['textual_representation'] = df_movies_and_keywords.apply(
    lambda x: f"This movie is titled {x['original_title']}, produced in {x['production_countries']} with {x['spoken_languages']} as the main language. It falls under the genres of {x['genres']} and explores themes of {x['keyword_values']}. Overview: {x['overview']}",
    axis=1
)

# Display a sample
display(df_movies_and_keywords.iloc[0]['textual_representation'])

"This movie is titled Toy Story, produced in United States of America with English as the main language. It falls under the genres of Animation, Comedy, Family and explores themes of jealousy, toy, boy, friendship, friends, rivalry, boy next door, new toy, toy comes to life. Overview: Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

## 1.5 Preprocessing of Ratings Dataset ##

### Filter out Invalid Ratings (no corresponding movie `id` in the movies metadata) ##

In [None]:
# Keep only ratings whose movieId appears among the ids of the processed movies
# NOTE(review): this matches ratings `movieId` directly against metadata `id`;
# confirm both columns use the same id space (no mapping table is applied here)
known_movie_ids = df_movies_and_keywords['id']
has_metadata = df_ratings['movieId'].isin(known_movie_ids)

# .copy() so later column edits act on an independent frame
df_ratings_filtered = df_ratings.loc[has_metadata].copy()
df_ratings_filtered

Unnamed: 0,userId,movieId,rating,timestamp
10,1,1371,2.5,1260759135
11,1,1405,1.0,1260759203
13,1,2105,4.0,1260759139
15,1,2193,2.0,1260759198
16,1,2294,2.0,1260759108
...,...,...,...,...
99983,671,4995,4.0,1064891537
99992,671,5816,4.0,1065111963
99993,671,5902,3.5,1064245507
99996,671,5991,4.5,1064245387


### Converting Timestamp to Datetime ###

In [None]:
# Replace the `timestamp` column (interpreted as seconds since the Unix epoch)
# with a `date_time` column: pop() removes the raw column and the converted
# values are stored as the new last column
raw_timestamps = df_ratings_filtered.pop('timestamp')
df_ratings_filtered['date_time'] = pd.to_datetime(raw_timestamps, unit='s')
display(df_ratings_filtered)

Unnamed: 0,userId,movieId,rating,date_time
10,1,1371,2.5,2009-12-14 02:52:15
11,1,1405,1.0,2009-12-14 02:53:23
13,1,2105,4.0,2009-12-14 02:52:19
15,1,2193,2.0,2009-12-14 02:53:18
16,1,2294,2.0,2009-12-14 02:51:48
...,...,...,...,...
99983,671,4995,4.0,2003-09-30 03:12:17
99992,671,5816,4.0,2003-10-02 16:26:03
99993,671,5902,3.5,2003-09-22 15:45:07
99996,671,5991,4.5,2003-09-22 15:43:07


## 1.6 Export Dataset ##

In [None]:
# Write both processed frames to CSV without the index column
for processed_frame, csv_name in (
    (df_ratings_filtered, "processed_ratings.csv"),
    (df_movies_and_keywords, "processed_movies_and_keywords.csv"),
):
    processed_frame.to_csv(csv_name, index = False)

In [None]:
# Browser download of the exported files; google.colab is only importable
# inside a Colab runtime
from google.colab import files

for exported_csv in ("processed_ratings.csv", "processed_movies_and_keywords.csv"):
    files.download(exported_csv)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>