In [None]:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
import random

# Load the ratings data (tab-separated, no header row)
ratings = pd.read_csv('ml-100k/u.data', sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])

# Load the movies data.
# Per the MovieLens 100K README, each u.item row has 24 pipe-separated fields:
# id, title, release date, video release date, IMDb URL, then 19 genre flags.
# Every field must be named: when `names` is shorter than the row, pandas uses
# the surplus leading columns as the index and shifts all labels, which left
# `item_id`/`title` pointing at the wrong fields (NaN titles after the merge,
# every movie reported as missing).
genre_columns = [f'genre_{i}' for i in range(19)]
movies = pd.read_csv(
    'ml-100k/u.item', sep='|', encoding='latin-1', header=None,
    names=['item_id', 'title', 'release_date', 'video_release_date', 'imdb_url'] + genre_columns,
)

# Merge the two datasets on item_id to get the movie titles alongside the ratings
merged_data = pd.merge(ratings, movies, on='item_id', how='left')

# Preview the merged data
print(merged_data[['user_id', 'item_id', 'rating', 'title']].head())


# Fill missing genres with 0 and ensure correct type
movies[genre_columns] = movies[genre_columns].fillna(0).astype(int)

# Select a random user for recommendation
test_user = random.choice(ratings['user_id'].unique())

# Prepare the dataset for collaborative filtering (using SVD)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'item_id', 'rating']], reader)

# Train-test split (80% train / 20% test; no random_state, so the split varies per run)
trainset, testset = train_test_split(data, test_size=0.2)

# Initialize and train the SVD model
svd = SVD(n_factors=50, n_epochs=20, lr_all=0.005)
svd.fit(trainset)

# Make predictions on the test set
predictions = svd.test(testset)

# Calculate RMSE (accuracy.rmse also prints the value itself)
rmse = accuracy.rmse(predictions)
print(f"RMSE on test set: {rmse}")

# Function to get top-N recommendations
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's ``n`` best estimates.

    Args:
        predictions: Iterable of 5-element records unpacked as
            ``(uid, iid, true_r, est, details)``; only ``uid``, ``iid`` and
            ``est`` are used.
        n: Maximum number of ``(iid, est)`` pairs to keep per user.

    Returns:
        A dict mapping each ``uid`` to a list of ``(iid, est)`` tuples sorted
        by ``est`` in descending order and truncated to ``n`` entries.
    """
    grouped = {}
    for uid, iid, _true_r, est, _details in predictions:
        grouped.setdefault(uid, []).append((iid, est))

    # Highest estimate first; ties keep their original relative order.
    return {
        uid: sorted(pairs, key=lambda pair: pair[1], reverse=True)[:n]
        for uid, pairs in grouped.items()
    }

# Get the top-3 predicted items for every user that appears in `predictions`
# (a dict keyed by user id); the selected user's entries are picked out of it
# when the recommendations are displayed.
top_n = get_top_n(predictions, n=3)

# Check for missing movies in recommendations and previously rated items
def check_missing_movies(item_ids, movies_df):
    """Return the item ids that have no row in ``movies_df``.

    Args:
        item_ids: Iterable of item ids to look up.
        movies_df: DataFrame with an ``item_id`` column listing known movies.

    Returns:
        A list of the ids from ``item_ids`` that do not occur in
        ``movies_df['item_id']``, in input order, duplicates included.
    """
    # Build the lookup once instead of scanning the DataFrame for every id.
    known_ids = set(movies_df['item_id'].tolist())
    return [item_id for item_id in item_ids if item_id not in known_ids]

def _lookup_movie(item_id):
    """Return ``(title, genre_labels)`` for *item_id*, or ``None`` if ``movies`` has no such row."""
    movie_details = movies[movies['item_id'] == item_id]
    if movie_details.empty:
        return None
    genres = movie_details[genre_columns].values[0]
    # Genres are reported by flag position ("Genre 0" .. "Genre 18").
    genres_list = [f"Genre {i}" for i, g in enumerate(genres) if g == 1]
    return movie_details['title'].values[0], genres_list


# All ratings given by the selected user
user_ratings = ratings[ratings['user_id'] == test_user]

# Sanity check: the recommended ids cover every user in `top_n`, the rated ids
# only the selected user.
recommended_items = [iid for user_recs in top_n.values() for iid, _ in user_recs]
previously_rated_items = user_ratings['item_id'].unique()

missing_recommendations = check_missing_movies(recommended_items, movies)
missing_ratings = check_missing_movies(previously_rated_items, movies)

print(f"Missing Movies in Recommendations: {missing_recommendations}")
print(f"Missing Movies in Previously Rated Items: {missing_ratings}")

# Display recommendations for the selected user
print(f"\nTop-N Recommendations for User {test_user}:")
# Direct lookup; the user is absent from `top_n` when none of their ratings
# were used to build it, so say so instead of printing nothing.
selected_recommendations = top_n.get(test_user, [])
if not selected_recommendations:
    print("\tNo predictions available for this user")
for iid, rating in selected_recommendations:
    movie_info = _lookup_movie(iid)
    if movie_info is not None:
        movie_title, genres_list = movie_info
        print(f"\tItem {iid}: {movie_title} with predicted rating {rating:.2f} | Genres: {', '.join(genres_list)}")
    else:
        print(f"\tItem {iid} (Movie data not available) with predicted rating {rating:.2f}")

# Display previously rated movies for the selected user
print(f"\nPreviously Rated Items by User {test_user}:")
for rated_item, given_rating in zip(user_ratings['item_id'], user_ratings['rating']):
    movie_info = _lookup_movie(rated_item)
    if movie_info is not None:
        movie_title, genres_list = movie_info
        print(f"\tItem {rated_item}: {movie_title} with rating {given_rating} | Genres: {', '.join(genres_list)}")
    else:
        print(f"\tItem {rated_item} (Movie data not available) with rating {given_rating}")

   user_id  item_id  rating title
0      196      242       3   NaN
1      186      302       3   NaN
2       22      377       1   NaN
3      244       51       2   NaN
4      166      346       1   NaN
RMSE: 0.9333
RMSE on test set: 0.9332714584015833
Missing Movies in Recommendations: [64, 661, 187, 100, 224, 268, 238, 98, 168, 174, 12, 607, 1, 237, 471, 302, 137, 1084, 64, 357, 98, 173, 659, 98, 12, 651, 313, 174, 22, 56, 69, 22, 202, 173, 657, 178, 127, 134, 272, 427, 316, 190, 515, 357, 272, 269, 173, 654, 496, 174, 79, 96, 237, 246, 166, 127, 1073, 7, 172, 22, 313, 174, 246, 346, 887, 1293, 313, 223, 22, 56, 127, 185, 513, 98, 96, 166, 170, 14, 50, 79, 204, 318, 657, 174, 333, 292, 876, 23, 603, 197, 408, 285, 475, 174, 12, 114, 474, 191, 64, 408, 89, 228, 603, 357, 196, 50, 79, 8, 169, 144, 427, 515, 181, 15, 474, 513, 100, 603, 427, 166, 519, 498, 175, 98, 515, 647, 181, 651, 196, 194, 651, 181, 603, 64, 12, 181, 147, 742, 603, 64, 12, 114, 168, 651, 183, 135, 523, 318, 98, 60