In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the movie catalogue; later cells read its movieId, title and genres columns.
MOVIES_CSV = "/kaggle/input/movielens-20m-dataset/movie.csv"
movies = pd.read_csv(MOVIES_CSV)
movies

In [None]:
# Keep the title column on its own and preview the first few entries.
titles = movies.loc[:, "title"]
titles.head()

# 1. Content Based Filtering

# Extract release Dates

In [None]:
# Map every title to the four characters that precede its final character.
# For a title that ends in "(YYYY)" this is the release year, kept as a string.
release_dates = {title: title[-5:-1] for title in titles}

list(release_dates.values())[:5]

In [None]:
# Prefer the release year written as "(YYYY)" at the end of the title.
# Taking the first 4-digit run anywhere in the title mis-reads titles that
# contain another number, e.g. "2001: A Space Odyssey (1968)" would give 2001.
year_in_parens = movies['title'].str.extract(r'\((\d{4})\)\s*$', expand=False)

# Fall back to the first 4-digit run for titles without a trailing "(YYYY)";
# this is the pattern the cell used for every title before.
first_four_digits = movies['title'].str.extract(r'(\d{4})', expand=False)

# Float dtype so titles without any year stay NaN.
movies['year'] = year_in_parens.fillna(first_four_digits).astype(float)

# Replace Missing Values of year

In [None]:
movies[movies.isna().any(axis =1 )]

In [None]:
# Release years looked up by hand for the titles that carry no year at all.
manual_years = {
    "Babylon 5": 1993,
    "Millions Game, The (Das Millionenspiel)": 1970,
    "Bicycle, Spoon, Apple (Bicicleta, cullera, poma)": 2010,
    "Brazil: In the Shadow of the Stadiums": 2014,
    "Slaying the Badger": 2014,
    "Tatort: Im Schmerz geboren": 2014,
    "National Theatre Live: Frankenstein": 2011,
    "The Court-Martial of Jackie Robinson": 1990,
    "In Our Garden": 1981,
    "Stephen Fry In America - New World": 2008,
    "Two: The Story of Roman & Nyro": 2013,
    "Li'l Quinquin": 2014,
    "A Year Along the Abandoned Road": 1991,
    "Body/Cialo": 2015,
    "Polskie gówno": 2014,
    "The Third Reich: The Rise & Fall": 2010,
    "My Own Man": 2014,
    "Moving Alan": 2003,
    "Michael Laudrup - en Fodboldspiller": 2003
}

# Fill only the rows whose year is still missing. The lookup is an exact title
# match done with a vectorized map instead of a row-by-row apply; titles that
# are not in manual_years map to NaN and therefore leave the gap untouched.
movies['year'] = movies['year'].fillna(movies['title'].map(manual_years))

# Show any rows that still contain a missing value.
movies[movies.isna().any(axis=1)]

In [None]:
movies["year"].isna().sum()

In [None]:
movies.head()

# Extract Genres (split and multi-hot encode)

In [None]:
# Split the pipe-separated genre string into a list of genre names.
# Values that are already lists are kept as they are, so running this cell a
# second time does not wipe the genres; anything else (e.g. NaN) becomes [].
movies["genres"] = movies["genres"].apply(
    lambda x: x.split("|") if isinstance(x, str) else (x if isinstance(x, list) else [])
)
movies.head()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Multi-hot encode the per-movie genre lists: one column per distinct genre.
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies['genres'])

# Column order of genre_matrix, i.e. the genre names learned by the binarizer.
genre_labels = mlb.classes_
genre_labels

In [None]:
# Wrap the multi-hot matrix in a frame aligned row-for-row with `movies`.
genres_df = pd.DataFrame(genre_matrix, columns=genre_labels, index=movies.index)

# Merge back to movies. Genre columns left over from an earlier run of this
# cell are dropped first so that re-running does not create duplicate columns.
movies = pd.concat(
    [movies.drop(columns=list(genre_labels), errors="ignore"), genres_df],
    axis=1,
)

movies.head()

In [None]:
# Load the ratings table; later cells read its userId, movieId and rating columns.
RATINGS_CSV = "/kaggle/input/movielens-20m-dataset/rating.csv"
ratings = pd.read_csv(RATINGS_CSV)

In [None]:
ratings

In [None]:
# Size of the data at a glance: distinct raters, distinct movies in the
# catalogue, and distinct movies that received at least one rating.
unique_users = ratings["userId"].nunique()
unique_movies_rated = ratings["movieId"].nunique()
unique_movies_total = movies["movieId"].nunique()

(unique_users, unique_movies_total, unique_movies_rated)

In [None]:
ratings.isnull().sum()

**Get each movie's average rating and rating count (a popularity measure)**

In [None]:
# Toy example: the mean of 'Value' within each 'Category'.
# The result is a Series whose index is the category label.

data = {
    'Category': ['A', 'B', 'A', 'B', 'A'],
    'Value': [10, 20, 15, 25, 12],
}
df = pd.DataFrame(data)

grouped_df = df.groupby('Category')['Value'].agg('mean')
print("Grouped DataFrame (with Category as index):\n")
grouped_df

In [None]:
    # Move the 'Category' index back into an ordinary column, giving a two-column
    # frame (Category, Value) where Value is the per-category mean.
    # NOTE(review): this cell is indented one level at the top level; dedent it
    # if the notebook is ever exported to a plain .py script.
    reset_grouped_df = grouped_df.reset_index()
    print("\nGrouped DataFrame after reset_index():\n", reset_grouped_df)

In [None]:
# Same idea on the real data: for every movie compute the mean rating and the
# number of ratings, then turn movieId back into a regular column.

rating_stats = (
    ratings
    .groupby("movieId")["rating"]
    .agg(['mean', 'count'])
    .reset_index()
)

In [None]:
rating_stats.head()

In [None]:
# Rename the statistics columns and merge them back into the movies dataset.
# rename() without inplace is a no-op when the columns are already renamed, so
# this cell can be re-run safely.
rating_stats = rating_stats.rename(columns={'mean': 'avg_rating', 'count': 'rating_count'})

# Left join keeps movies that have no ratings (their stats are NaN).
# Stats columns left by an earlier run are dropped first; otherwise a second
# merge would produce avg_rating_x / avg_rating_y style duplicates.
movies = pd.merge(
    movies.drop(columns=['avg_rating', 'rating_count'], errors='ignore'),
    rating_stats,
    on='movieId',
    how='left',
)
movies.head()

# Handle Tags

In [None]:
# Load the user-supplied tags; later cells read its movieId and tag columns.
TAGS_CSV = "/kaggle/input/movielens-20m-dataset/tag.csv"
tags = pd.read_csv(TAGS_CSV)
tags.head()

In [None]:
len(tags["movieId"].unique())

In [None]:
tags.dtypes

In [None]:
# Print every distinct tag value whose type is not exactly str (e.g. NaN).
non_string_tags = [value for value in tags["tag"].unique() if type(value) is not str]
for value in non_string_tags:
    print(value)

In [None]:
# The bare name `types` is never defined in this notebook, so this cell raised
# NameError on a fresh run. Show what the surrounding cells are investigating
# instead: how many tag values there are of each Python type.
tags["tag"].map(lambda value: type(value).__name__).value_counts()

In [None]:
# Number of non-missing tags attached to each movie, as a (movieId, num_tags) frame.
num_tags_by_movie = tags.groupby("movieId")['tag'].count()
tags_per_movie = num_tags_by_movie.rename('num_tags').reset_index()

# Average number of tags
avg_tags = tags_per_movie['num_tags'].mean()
print("Average tags per movie:", avg_tags)

In [None]:
tags_per_movie

In [None]:
# Histogram of tags per movie, limited to the 0-250 range in 100 bins.
# Title and axis labels are set so the figure can be read on its own.
ax = tags_per_movie['num_tags'].plot(kind="hist", bins=100, range=(0, 250))
ax.set(
    title="Distribution of tags per movie",
    xlabel="Number of tags",
    ylabel="Number of movies",
);

In [None]:
tags.head()

In [None]:
# Build one tag string per movie.
# Missing tags become "", every value is forced to str, then lower-cased and
# trimmed, so the join below never meets a non-string value.
cleaned_tags = tags['tag'].fillna("").astype(str).str.lower().str.strip()
tags['tag'] = cleaned_tags

# Concatenate each movie's tags with single spaces; movieId becomes a column again.
movie_tags = tags.groupby('movieId')['tag'].agg(' '.join).reset_index()

In [None]:
movie_tags

In [None]:
# Cross verify 
tags[tags["movieId"]==1]

In [None]:
# Attach each movie's concatenated tag string with a left join, so movies
# without tags are kept. A 'tag' column left by an earlier run of this cell is
# dropped first; otherwise the merge would create tag_x / tag_y and the
# fillna below would fail with a KeyError.
movies = pd.merge(
    movies.drop(columns=['tag'], errors='ignore'),
    movie_tags,
    on='movieId',
    how='left',
)

# Fill any remaining NaN (movies with no tags at all)
movies['tag'] = movies['tag'].fillna('')

In [None]:
movies.head()

In [None]:
movies.isna().sum()

**Handle missing avg_rating and rating_count**

In [None]:
# Movies without any rating have NaN in both statistics columns.
# avg_rating  -> the mean of the per-movie average ratings
# rating_count -> 0, since nobody rated the movie

global_avg = movies['avg_rating'].mean()
movies = movies.fillna({'avg_rating': global_avg, 'rating_count': 0})

In [None]:
# Prepare the rating statistics for feature fusion.
#
# rating_count is heavily skewed (a few movies have thousands of ratings, most
# have very few), so it is log-scaled first; both features are then squeezed
# into [0, 1] so neither dominates the similarity computation.

from sklearn.preprocessing import MinMaxScaler

# log(1 + count) keeps a count of 0 at 0
movies['rating_count_log'] = np.log1p(movies['rating_count'])

# Column 0 of the scaled array is avg_rating, column 1 is rating_count_log.
scaler = MinMaxScaler()
scaled = scaler.fit_transform(movies[['avg_rating', 'rating_count_log']])
movies['avg_rating_norm'] = scaled[:, 0]
movies['rating_count_norm'] = scaled[:, 1]

In [None]:
movies.isna().sum()

In [None]:
movies.head()

In [None]:
movies["tag"].isna().sum()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over each movie's tag string: English stop words are removed and the
# vocabulary is capped at 5000 terms to bound the feature width.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# One row per movie, in the same row order as `movies`; the result is sparse.
tag_features = tfidf.fit_transform(movies['tag'])
tag_features

**Prepare genres multi-hot vector**

In [None]:
# Genre block as a plain array: every column from "Action" through "Western"
# (label-based slice, both ends inclusive, so it depends on column order).
genre_features = movies.loc[:, "Action":"Western"].to_numpy()
genre_features

In [None]:
# The two normalized rating columns as an array of shape (num_movies, 2).
rating_columns = ['avg_rating_norm', 'rating_count_norm']
rating_features = movies[rating_columns].to_numpy()
rating_features

# Combine all features

We can horizontally stack genre, tag, and rating features:

In [None]:
# hstack stacks sparse matrices side by side; csr_matrix wraps the dense blocks.
from scipy.sparse import csr_matrix, hstack

# tag_features is already sparse; the genre and rating blocks are dense arrays,
# so convert them to sparse before stacking.
genre_sparse = csr_matrix(genre_features)
rating_sparse = csr_matrix(rating_features)

# One combined sparse feature matrix, columns ordered: genres | tags | ratings.
feature_blocks = [genre_sparse, tag_features, rating_sparse]
movie_features = hstack(feature_blocks)

# Compute similarity

We can now compute cosine similarity between movies:


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Similarity between all pairs of movies. This is the expensive step: the
# result has one row and one column per movie, so consider computing only the
# query row (or a top-K) when memory is tight.
similarity_matrix = cosine_similarity(movie_features, dense_output=False)

query_movie = "Jumanji (1995)"

# Row position (not index label) of the first movie with this exact title, so
# the same number is valid for both similarity_matrix and movies.iloc.
# Raises IndexError when the title is not present.
movie_idx = np.flatnonzero((movies['title'] == query_movie).to_numpy())[0]

# Similarity of the query movie to every movie, as a flat dense vector.
sim_scores = similarity_matrix[movie_idx].toarray().flatten()

# Rank all movies by descending similarity, then remove the query movie itself
# explicitly. Slicing [1:11] would assume the query always ranks first, which
# is not guaranteed when another movie ties with it on score.
ranked_idx = np.argsort(sim_scores)[::-1]
top_idx = ranked_idx[ranked_idx != movie_idx][:10]  # top-10 excluding itself

# Titles of the ten most similar movies.
recommended_movies = movies.iloc[top_idx]['title'].values

print(recommended_movies)

In [None]:
sim_scores

It seems to be doing very well: the recommended movies are closely related to the query.

Let's add a transformer model to improve this further.
The title itself carries a lot of weight, since it may contain contextual text.

Hence, we use a sentence-transformer to generate title embeddings. They will be useful for querying as well.

In [None]:
from sentence_transformers import SentenceTransformer

# Small & fast sentence-embedding model, suitable for Kaggle.
model = SentenceTransformer('all-MiniLM-L6-v2')

# One embedding per movie title, in the same row order as `movies`.
title_texts = movies['title'].tolist()
title_embeddings = model.encode(title_texts, show_progress_bar=True)


In [None]:
# Extend the feature matrix with the title embeddings.

title_sparse = csr_matrix(title_embeddings)

# Ask for CSR explicitly: a single row of this matrix is sliced out later, and
# only row-indexable sparse formats support that. hstack's default output
# format is otherwise left to the library.
movie_features_with_title = hstack([movie_features, title_sparse], format="csr")

# Instead of computing the entire similarity matrix, compute only the row for
# the title that was found.
# Don't do this: feature_matrix = cosine_similarity(movie_features_with_title, dense_output=False)

In [None]:
query_movie = 'Jumanji'
# A recommender normally works on exact item ids, not free text. For testing,
# the title embeddings are reused to map an approximate title to the closest
# catalogue title, so the user does not have to type the exact name.

query_embedding = model.encode([query_movie])

# Similarity between the query text and every catalogue title.
sim_scores_query = cosine_similarity(query_embedding, title_embeddings).flatten()

# Row position of the best-matching title. argmax finds it in one pass instead
# of sorting all scores (on an exact tie it returns the first such row).
top_query_title_idx = int(np.argmax(sim_scores_query))

# Exact catalogue title behind the user's approximation.
recommended_title = movies.iloc[top_query_title_idx]['title']
print("Query Title : ",recommended_title)

# Compute only the similarity row that is needed rather than the full matrix.
# tocsr() guarantees a row-indexable format whatever hstack returned.
features_csr = movie_features_with_title.tocsr()
sim_scores = cosine_similarity(features_csr[top_query_title_idx], features_csr).flatten()

# Rank by descending similarity and remove the query movie explicitly; slicing
# [1:11] would assume it always ranks first, which can fail on score ties.
ranked_idx = np.argsort(sim_scores)[::-1]
top_idx = ranked_idx[ranked_idx != top_query_title_idx][:10]  # top-10

recommended_movies = movies.iloc[top_idx]['title'].values
print(recommended_movies)

# Done: Content-based filtering