In [1]:
import pandas as pd

# Read the four CSV tables, one DataFrame per file (order matches the names on the left)
movies_df, ratings_df, tags_df, links_df = map(
    pd.read_csv,
    (
        'ml-latest-small/movies.csv',
        'ml-latest-small/ratings.csv',
        'ml-latest-small/tags.csv',
        'ml-latest-small/links.csv',
    ),
)

# Preview the top rows of every table to see which columns each one carries
print(
    movies_df.head(),
    ratings_df.head(),
    tags_df.head(),
    links_df.head(),
    sep='\n',
)

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
2       2    60756     will ferre

In [2]:
# Merge ratings with movies to associate ratings with movie titles.
# validate='many_to_one' makes pandas raise if movieId is not unique in movies_df.
ratings_movies_df = pd.merge(
    ratings_df, movies_df, on='movieId', how='inner', validate='many_to_one'
)

# tags_df can hold several rows for the same (userId, movieId) pair (see its head():
# user 2 tagged movie 60756 more than once). Joining it as-is would repeat the
# rating row once per tag, so collapse the tags to a single row per pair first:
# tags are joined with '|' (same separator as 'genres') and the latest tag
# timestamp is kept.
tags_per_rating_df = (
    tags_df
    .groupby(['userId', 'movieId'], as_index=False)
    .agg({'tag': lambda tags: '|'.join(map(str, tags)), 'timestamp': 'max'})
)

# Merge with tags to include movie tags for content-based filtering.
# Both sides carry a 'timestamp' column, so pandas suffixes them:
# timestamp_x = rating time, timestamp_y = tag time.
ratings_movies_tags_df = pd.merge(
    ratings_movies_df,
    tags_per_rating_df,
    on=['userId', 'movieId'],
    how='left',
    validate='many_to_one',
)

# Merge with links to associate external database IDs (if needed)
final_df = pd.merge(
    ratings_movies_tags_df, links_df, on='movieId', how='left', validate='many_to_one'
)

# The two left joins must only add columns, never rows.
assert len(final_df) == len(ratings_movies_df), "merge changed the number of rating rows"

# Inspect the final dataset
print(final_df.head())


   userId  movieId  rating  timestamp_x             title  \
0       1        1     4.0    964982703  Toy Story (1995)   
1       5        1     4.0    847434962  Toy Story (1995)   
2       7        1     4.5   1106635946  Toy Story (1995)   
3      15        1     2.5   1510577970  Toy Story (1995)   
4      17        1     4.5   1305696483  Toy Story (1995)   

                                        genres  tag  timestamp_y  imdbId  \
0  Adventure|Animation|Children|Comedy|Fantasy  NaN          NaN  114709   
1  Adventure|Animation|Children|Comedy|Fantasy  NaN          NaN  114709   
2  Adventure|Animation|Children|Comedy|Fantasy  NaN          NaN  114709   
3  Adventure|Animation|Children|Comedy|Fantasy  NaN          NaN  114709   
4  Adventure|Animation|Children|Comedy|Fantasy  NaN          NaN  114709   

   tmdbId  
0   862.0  
1   862.0  
2   862.0  
3   862.0  
4   862.0  


In [3]:
# Persist the merged frame as CSV; the row index is left out of the file
final_df.to_csv(path_or_buf='final_merged_dataset.csv', index=False)

print("Dataset merged and saved successfully!")

Dataset merged and saved successfully!


In [4]:
# Count the missing entries column by column and show the totals
nan_summary = final_df.isnull().sum(axis=0)
print(nan_summary)

userId             0
movieId            0
rating             0
timestamp_x        0
title              0
genres             0
tag            99201
timestamp_y    99201
imdbId             0
tmdbId            13
dtype: int64


In [6]:
# Drop 'tag' and 'timestamp_y' columns, fill NaNs in 'tmdbId' with 0

# errors='ignore' keeps this cell re-runnable: final_df is reassigned here, so on a
# second run the two columns are already gone and a plain drop would raise KeyError.
final_df = final_df.drop(columns=['tag', 'timestamp_y'], errors='ignore')

# Guard against row fan-out from the tag join: if a rating was repeated once per
# tag, those rows are exact duplicates now that the tag columns are removed.
# Keep a single copy so each rating is counted once downstream.
final_df = final_df.drop_duplicates().reset_index(drop=True)

# NOTE: 0 is used as a placeholder for a missing tmdbId, not a real identifier.
final_df['tmdbId'] = final_df['tmdbId'].fillna(0)

# Check for NaN values again to confirm
print(final_df.isna().sum())

userId         0
movieId        0
rating         0
timestamp_x    0
title          0
genres         0
imdbId         0
tmdbId         0
dtype: int64


In [8]:
# NOTE: the `surprise` module is distributed on PyPI as `scikit-surprise`; it must be
# installed in the kernel's environment before this cell can run.
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Fixed seed so the train/test split and the SVD initialisation (and therefore the
# reported RMSE) are the same on every run.
RANDOM_STATE = 42

# Prepare the data for collaborative filtering.
# load_from_df expects exactly three columns in this order: user, item, rating.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(final_df[['userId', 'movieId', 'rating']], reader)

# Split the data into train and test sets (25% of the ratings held out)
trainset, testset = train_test_split(data, test_size=0.25, random_state=RANDOM_STATE)

# Use SVD for collaborative filtering
svd = SVD(random_state=RANDOM_STATE)
svd.fit(trainset)

# Test the model
predictions = svd.test(testset)

# Evaluate the model using RMSE (prints the score and returns it)
accuracy.rmse(predictions)

ModuleNotFoundError: No module named 'surprise'