In [28]:
import pandas as pd
import random
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split

# Load the ratings data
ratings = pd.read_csv('ml-100k/u.data', sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])

# Load the movies data
movies = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1', header=None,
                     names=['item_id', 'title'] + [f'genre_{i}' for i in range(19)])

# Load the user data
users = pd.read_csv('ml-100k/u.user', sep='|', header=None, 
                    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])

genres = [
    "Unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western"
]


user_interactions = ratings.groupby('user_id').size()
user_indexes = user_interactions[user_interactions > 200].index 
ratings = ratings[ratings.user_id.isin(user_indexes)]
ratings.rating = [1 if rating > 2.5 else 0 for rating in ratings.rating]
ratings

movies = movies[movies.index.get_level_values(0).isin(ratings.item_id)]
movies = movies.dropna(axis=1)
movies.columns = genres
movies["Title"] = movies.index.get_level_values(1)
movies


# Select a random user for recommendation
test_user = random.choice(ratings.user_id.unique())

# Prepare the dataset for collaborative filtering (using SVD)
reader = Reader(rating_scale=(0, 1))  # Updated rating scale for binarized ratings
data = Dataset.load_from_df(ratings[['user_id', 'item_id', 'rating']], reader)

# Train-test split
trainset, testset = train_test_split(data, test_size=0.2)

# Initialize and train the SVD model
svd = SVD(n_factors=50, n_epochs=20, lr_all=0.005)
svd.fit(trainset)

# Make predictions on the test set
predictions = svd.test(testset)

# Calculate RMSE
rmse = accuracy.rmse(predictions)
print(f"RMSE on test set: {rmse}")

# Function to get top-N recommendations
def get_top_n(predictions, n=10):
    top_n = {}
    for uid, iid, true_r, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))
    
    # Sort the predictions for each user and retrieve the N highest ones
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    
    return top_n

# Get top-3 recommendations for the selected user
top_n = get_top_n(predictions, n=3)

# Display recommendations for the selected user
print(f"\nTop-N Recommendations for User {test_user}:")
for uid, user_ratings in top_n.items():
    if uid == test_user:
        for iid, rating in user_ratings:
            if iid in movies.index:  # Ensure item_id is used as the index
                movie_data = movies.loc[iid]  # Fetch row by item_id
                movie_title = movie_data['Title']
                
                # Fetch genre columns as a series or row
                genre_values = movie_data[genres].values.flatten()  # Flatten to ensure it's a 1D array
                genres_list = [genre for genre, g in zip(genres, genre_values) if g == 1]
                
                print(f"\tItem {iid}: {movie_title} with predicted rating {rating:.2f} | Genres: {', '.join(genres_list)}")
            else:
                print(f"\tItem {iid} (Movie data not available) with predicted rating {rating:.2f}")

# Display previously rated movies for the selected user
user_ratings = ratings[ratings['user_id'] == test_user]
print(f"\nPreviously Rated Items by User {test_user}:")
for _, row in user_ratings.iterrows():
    iid = row['item_id']
    if iid in movies.index:  # Use the index to check if the movie exists
        movie_data = movies.loc[iid]
        movie_title = movie_data['Title'].values[0]
        genre_values = movie_data.iloc[0][genres].values 
        genres_list = [genre for genre, g in zip(genres, genre_values) if g == 1]
        print(f"\tItem {iid}: {movie_title} with rating {row['rating']} | Genres: {', '.join(genres_list)}")
    else:
        print(f"\tItem {iid} (Movie data not available) with rating {row['rating']}")


RMSE: 0.3534
RMSE on test set: 0.3533617718250457

Top-N Recommendations for User 271:
	Item 428: Harold and Maude (1971)  01-Jan-1971    Harold and Maude (1971)
Name: Title, dtype: object with predicted rating 1.00 | Genres: Comedy
	Item 199: Bridge on the River Kwai, The (1957)  01-Jan-1957    Bridge on the River Kwai, The (1957)
Name: Title, dtype: object with predicted rating 1.00 | Genres: Drama, War
	Item 520: Great Escape, The (1963)  01-Jan-1963    Great Escape, The (1963)
Name: Title, dtype: object with predicted rating 0.99 | Genres: Adventure, War

Previously Rated Items by User 271:
	Item 132: Wizard of Oz, The (1939) with rating 1 | Genres: Adventure, Children's, Drama, Musical
	Item 346: Jackie Brown (1997) with rating 1 | Genres: Crime, Drama
	Item 199: Bridge on the River Kwai, The (1957) with rating 1 | Genres: Drama, War
	Item 709: Strictly Ballroom (1992) with rating 1 | Genres: Comedy, Romance
	Item 378: Miracle on 34th Street (1994) with rating 1 | Genres: Drama
	I

In [45]:
import pandas as pd
import random
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from collections import Counter

# Load the ratings data
ratings = pd.read_csv('ml-100k/u.data', sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])

# Load the movies data
movies = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1', header=None,
                     names=['item_id', 'title'] + [f'genre_{i}' for i in range(19)])

# Load the user data
users = pd.read_csv('ml-100k/u.user', sep='|', header=None, 
                    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])

genres = [
    "Unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western"
]


user_interactions = ratings.groupby('user_id').size()
user_indexes = user_interactions[user_interactions > 200].index 
ratings = ratings[ratings.user_id.isin(user_indexes)]
ratings.rating = [1 if rating > 2.5 else 0 for rating in ratings.rating]
ratings

movies = movies[movies.index.get_level_values(0).isin(ratings.item_id)]
movies = movies.dropna(axis=1)
movies.columns = genres
movies["Title"] = movies.index.get_level_values(1)
movies


# Select a random user for recommendation
test_user = random.choice(ratings.user_id.unique())

# Prepare the dataset for collaborative filtering (using SVD)
reader = Reader(rating_scale=(0, 1))  # Updated rating scale for binarized ratings
data = Dataset.load_from_df(ratings[['user_id', 'item_id', 'rating']], reader)

# Train-test split
trainset, testset = train_test_split(data, test_size=0.2)

print("Performing cross-validation...")
cross_validate(svd, data, measures=['RMSE'], cv=5, verbose=True)

# Initialize and train the SVD model
svd = SVD(n_factors=50, n_epochs=20, lr_all=0.005, reg_all=0.02)
svd.fit(trainset)

# Make predictions on the test set
predictions = svd.test(testset)

# Calculate RMSE
rmse = accuracy.rmse(predictions)
print(f"RMSE on test set: {rmse}")

# Function to get top-N recommendations
# Updated get_top_n method
def get_top_n(predictions, n=10, user_id=None, rated_items=None, movies=None, genres=None):
    top_n = {}
    for uid, iid, true_r, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))
    
    # Sort the predictions for each user and retrieve the N highest ones
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        if uid == user_id:  # Apply filter for the specific user
            user_ratings = [(iid, est) for iid, est in user_ratings if iid not in rated_items]
        top_n[uid] = user_ratings[:n]
    
    # Print recommendations for the target user
    if user_id is not None:
        print(f"\nTop-N Recommendations for User {user_id}:")
        if user_id in top_n:
            for iid, rating in top_n[user_id]:
                if movies is not None and genres is not None and iid in movies.index:
                    movie_data = movies.loc[iid]
                    movie_title = movie_data['Title']
                    genre_values = movie_data[genres].values
                    genres_list = [genre for genre, g in zip(genres, genre_values) if g == 1]
                    print(f"\tItem {iid}: {movie_title} with predicted rating {rating:.2f} | Genres: {', '.join(genres_list)}")
                else:
                    print(f"\tItem {iid} (Movie data not available) with predicted rating {rating:.2f}")
    return top_n


# Get top-3 recommendations for the selected user
top_n = get_top_n(predictions, n=3)

# Display recommendations for the selected user
print(f"\nTop-N Recommendations for User {test_user}:")
for uid, user_ratings in top_n.items():
    if uid == test_user:
        for iid, rating in user_ratings:
            if iid in movies.index:  # Ensure item_id is used as the index
                movie_data = movies.loc[iid]  # Fetch row by item_id
                movie_title = movie_data['Title']
                
                # Fetch genre columns as a series or row
                genre_values = movie_data[genres].values.flatten()  # Flatten to ensure it's a 1D array
                genres_list = [genre for genre, g in zip(genres, genre_values) if g == 1]
                
                print(f"\tItem {iid}: {movie_title} with predicted rating {rating:.2f} | Genres: {', '.join(genres_list)}")
            else:
                print(f"\tItem {iid} (Movie data not available) with predicted rating {rating:.2f}")

# Display previously rated movies for the selected user
user_ratings = ratings[ratings['user_id'] == test_user]
print(f"\nPreviously Rated Items by User {test_user}:")
rated_genres = []  # List to store all the genres of rated movies

for _, row in user_ratings.iterrows():
    iid = row['item_id']
    if iid in movies.index:  # Use the index to check if the movie exists
        movie_data = movies.loc[iid]
        movie_title = movie_data['Title'].values[0]
        genre_values = movie_data.iloc[0][genres].values  # Get genre values for this movie
        genres_list = [genre for genre, g in zip(genres, genre_values) if g == 1]
        
        # Print the movie details
        print(f"\tItem {iid}: {movie_title} with rating {row['rating']} | Genres: {', '.join(genres_list)}")
        
        # Add the genres of the rated movie to the list
        rated_genres.extend(genres_list)
    else:
        print(f"\tItem {iid} (Movie data not available) with rating {row['rating']}")

# Calculate preferred genres based on rated genres
genre_counts = Counter(rated_genres)  # Count occurrences of each genre
preferred_genres = genre_counts.most_common()  # List of genres sorted by frequency

print("\nPreferred Genres:")
for genre, count in preferred_genres[:5]: 
    print(f"\t{genre}: {count}")




Performing cross-validation...
Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.3573  0.3577  0.3551  0.3554  0.3534  0.3558  0.0016  
Fit time          0.42    0.45    0.36    0.39    0.33    0.39    0.04    
Test time         0.32    0.12    0.07    0.07    0.04    0.12    0.10    
RMSE: 0.3536
RMSE on test set: 0.3535942261917684

Top-N Recommendations for User 479:
	Item 471: Courage Under Fire (1996)  08-Mar-1996    Courage Under Fire (1996)
Name: Title, dtype: object with predicted rating 0.99 | Genres: Drama, War
	Item 180: Apocalypse Now (1979)  01-Jan-1979    Apocalypse Now (1979)
Name: Title, dtype: object with predicted rating 0.96 | Genres: Drama, War
	Item 205: Patton (1970)  01-Jan-1970    Patton (1970)
Name: Title, dtype: object with predicted rating 0.94 | Genres: Drama, War

Previously Rated Items by User 479:
	Item 680: Kull the Conqueror (1997) with rating 1 | Genres: Actio