In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the CSV files
movies_df = pd.read_csv('ml-latest-small/movies.csv')
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')
tags_df = pd.read_csv('ml-latest-small/tags.csv')
links_df = pd.read_csv('ml-latest-small/links.csv')

# Display the first few rows of each file to understand their structure
print(movies_df.head())
print(ratings_df.head())
print(tags_df.head())
print(links_df.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
2       2    60756     will ferre

In [3]:
# Merge ratings with movies to associate ratings with movie titles
ratings_movies_df = pd.merge(ratings_df, movies_df, on='movieId', how='inner')

# Merge with tags to include movie tags for content-based filtering
ratings_movies_tags_df = pd.merge(ratings_movies_df, tags_df, on=['userId', 'movieId'], how='left')

# Merge with links to associate external database IDs (if needed)
final_df = pd.merge(ratings_movies_tags_df, links_df, on='movieId', how='left')

# Inspect the final dataset
print(final_df.head())


   userId  movieId  rating  timestamp_x             title  \
0       1        1     4.0    964982703  Toy Story (1995)   
1       5        1     4.0    847434962  Toy Story (1995)   
2       7        1     4.5   1106635946  Toy Story (1995)   
3      15        1     2.5   1510577970  Toy Story (1995)   
4      17        1     4.5   1305696483  Toy Story (1995)   

                                        genres  tag  timestamp_y  imdbId  \
0  Adventure|Animation|Children|Comedy|Fantasy  NaN          NaN  114709   
1  Adventure|Animation|Children|Comedy|Fantasy  NaN          NaN  114709   
2  Adventure|Animation|Children|Comedy|Fantasy  NaN          NaN  114709   
3  Adventure|Animation|Children|Comedy|Fantasy  NaN          NaN  114709   
4  Adventure|Animation|Children|Comedy|Fantasy  NaN          NaN  114709   

   tmdbId  
0   862.0  
1   862.0  
2   862.0  
3   862.0  
4   862.0  


In [4]:
# Check for NaN values in each column
nan_summary = final_df.isna().sum()
print(nan_summary)

userId             0
movieId            0
rating             0
timestamp_x        0
title              0
genres             0
tag            99201
timestamp_y    99201
imdbId             0
tmdbId            13
dtype: int64


In [5]:
# Drop 'tag' and 'timestamp_y' columns, fill NaNs in 'tmdbId' with 0

final_df.drop(columns=['tag', 'timestamp_y'], inplace=True)

final_df['tmdbId'].fillna(0, inplace=True)

# Check for NaN values again to confirm
print(final_df.isna().sum())

userId         0
movieId        0
rating         0
timestamp_x    0
title          0
genres         0
imdbId         0
tmdbId         0
dtype: int64


In [6]:
# Create a user-item matrix
user_item_matrix = final_df.pivot_table(index='userId', columns='movieId', values='rating')

# Fill NaN values with 0 (no rating)
user_item_matrix = user_item_matrix.fillna(0)

In [7]:
# Compute the cosine similarity matrix
user_similarity = cosine_similarity(user_item_matrix)

# Convert to a DataFrame for easier access
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

In [8]:
def get_user_based_recommendations(user_id, num_recommendations=5):
    # Get similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:num_recommendations + 1].index

    # Get the movies rated by similar users
    similar_user_ratings = user_item_matrix.loc[similar_users]

    # Calculate the mean rating for each movie
    mean_ratings = similar_user_ratings.mean(axis=0)

    # Get the top movie recommendations
    recommended_movies = mean_ratings.sort_values(ascending=False).head(num_recommendations)

    # Return the recommended movie titles
    return final_df[final_df['movieId'].isin(recommended_movies.index)]['title'].values

# Example: Get recommendations for user ID 1
print(get_user_based_recommendations(1))


['Pulp Fiction (1994)' 'Pulp Fiction (1994)' 'Pulp Fiction (1994)' ...
 'Aliens (1986)' 'Aliens (1986)' 'Aliens (1986)']


In [11]:
from scipy.sparse import csr_matrix

sampled_df = final_df.sample(frac=0.1, random_state=42)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(sampled_df['genres'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def get_content_based_recommendations(title):
    idx = final_df[final_df['title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:6]
    movie_indices = [i[0] for i in sim_scores]
    return final_df['title'].iloc[movie_indices]
print(get_content_based_recommendations('Toy Story (1995)'))

36            Toy Story (1995)
237    Grumpier Old Men (1995)
256    Grumpier Old Men (1995)
304                Heat (1995)
361                Heat (1995)
Name: title, dtype: object
