In [1]:
import pandas as pd
from surprise import Reader, Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
import numpy as np

In [2]:
columns = ['User_ID', 'Movie_ID', 'Rating', 'Timestamp']
df = pd.read_csv('Data/ml-1m/ml-1m/ratings.dat', sep='::', header=None, names=columns, engine='python')
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
df.head()

Unnamed: 0,User_ID,Movie_ID,Rating,Timestamp
0,1,1193,5,2000-12-31 22:12:40
1,1,661,3,2000-12-31 22:35:09
2,1,914,3,2000-12-31 22:32:48
3,1,3408,4,2000-12-31 22:04:35
4,1,2355,5,2001-01-06 23:38:11


In [3]:
columns = ['User_ID', 'Gender', 'Age', 'Occupation', 'Zip']
users = pd.read_csv('Data/ml-1m/ml-1m/users.dat',  sep='::', header=None, names=columns, engine='python')

users

Unnamed: 0,User_ID,Gender,Age,Occupation,Zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [4]:
columns = ['Movie_ID', 'Title', 'Genre']
movies = pd.read_csv('Data/ml-1m/ml-1m/movies.dat', sep='::', header=None, names=columns, engine='python', encoding='latin1')

movies.tail()

Unnamed: 0,Movie_ID,Title,Genre
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama
3882,3952,"Contender, The (2000)",Drama|Thriller


In [5]:
# Use str.extract to extract the year from the 'Title' column
movies['Year'] = movies['Title'].str.extract(r'\((\d{4})\)', expand=False)

# Convert the 'Year' column to numeric 
movies['Year'] = pd.to_numeric(movies['Year'], errors='coerce')

# Use str.replace to remove the year string in parentheses from the 'Title' column
movies['Title'] = movies['Title'].str.replace(r'\(\d{4}\)', '').str.strip()

movies.head()

Unnamed: 0,Movie_ID,Title,Genre,Year
0,1,Toy Story,Animation|Children's|Comedy,1995
1,2,Jumanji,Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama,1995
4,5,Father of the Bride Part II,Comedy,1995


In [23]:
# Split the 'Genre' column and one-hot encode
genres = movies['Genre'].str.get_dummies('|')

# Concatenate the new one-hot encoded columns with the original DataFrame
movies = pd.concat([movies, genres], axis=1)

# Drop the original 'Genre' column
movies = movies.drop('Genre', axis=1)

# Reorder the columns
movies = movies[['Movie_ID', 'Title', 'Year'] + list(genres.columns)]

KeyError: 'Genre'

# SVD

In [7]:
ratings = df.drop('Timestamp', axis=1)


In [21]:
from surprise.model_selection import train_test_split as surprise_train_test_split

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['User_ID', 'Movie_ID', 'Rating']], reader)

# Train-test split for Surprise
trainset, testset = surprise_train_test_split(data, test_size=0.2)

# Initialize SVD model
model = SVD()

# Train the model
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

from surprise import accuracy

rmse1 = accuracy.rmse(predictions)

print(f"RMSE: {rmse1}")

RMSE: 0.8715
RMSE: 0.8714556458382792


In [9]:
params = {'n_factors': [20, 30, 40, 50, 60, 70, 80, 90, 100],
         'reg_all': [0.02, .03, .04, 0.05, .06, .07, .08, .09, 0.1]}
g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
g_s_svd.fit(data)

print(g_s_svd.best_score)
print(g_s_svd.best_params)

{'rmse': 0.8666144420086177, 'mae': 0.6820147398799006}
{'rmse': {'n_factors': 70, 'reg_all': 0.03}, 'mae': {'n_factors': 100, 'reg_all': 0.03}}


In [10]:
knn_basic = KNNBasic(sim_options={'name':'pearson', 'user_based':True})
cv_knn_basic = cross_validate(knn_basic, data, n_jobs=-1)

In [11]:
for i in cv_knn_basic.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_basic['test_rmse']))

('test_rmse', array([0.961297  , 0.96086059, 0.96181979, 0.96315704, 0.96283897]))
('test_mae', array([0.76513913, 0.76559245, 0.76570801, 0.76659698, 0.76608748]))
('fit_time', (362.9711437225342, 407.013614654541, 371.09008836746216, 356.2267048358917, 343.71414041519165))
('test_time', (377.50817799568176, 384.39262771606445, 391.4837284088135, 406.2984447479248, 401.2193138599396))
-----------------------
0.9619946792549076


In [12]:
# cross validating with KNNBaseline
knn_baseline = KNNBaseline(sim_options={'name':'pearson', 'user_based':True})
cv_knn_baseline = cross_validate(knn_baseline,data)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [13]:
for i in cv_knn_baseline.items():
    print(i)

np.mean(cv_knn_baseline['test_rmse'])

('test_rmse', array([0.89306662, 0.89523118, 0.89762068, 0.89657777, 0.8958211 ]))
('test_mae', array([0.70394053, 0.70640296, 0.70818346, 0.70651577, 0.70667277]))
('fit_time', (240.78212428092957, 237.14385843276978, 235.2970495223999, 245.40602278709412, 225.15195989608765))
('test_time', (350.6452376842499, 347.8793821334839, 374.47133350372314, 355.14401364326477, 313.8528971672058))


0.8956634715621867

In [16]:
svd = SVD(n_factors= 70, reg_all= .03)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x22400165a90>

In [17]:
predictions2 = svd.test(testset)

In [20]:
from surprise import accuracy

rmse = accuracy.rmse(predictions2)

print(f"RMSE: {rmse}")

RMSE: 0.8688
RMSE: 0.8687751607816806


In [24]:
movies.head()

Unnamed: 0,Movie_ID,Title,Year,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,1995,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
ratings.head()

Unnamed: 0,User_ID,Movie_ID,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [27]:
movies_ratings = pd.merge(ratings, movies, on='Movie_ID')
movies_ratings.head()

Unnamed: 0,User_ID,Movie_ID,Rating,Title,Year,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1193,5,One Flew Over the Cuckoo's Nest,1975,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1193,5,One Flew Over the Cuckoo's Nest,1975,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12,1193,4,One Flew Over the Cuckoo's Nest,1975,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,15,1193,4,One Flew Over the Cuckoo's Nest,1975,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17,1193,5,One Flew Over the Cuckoo's Nest,1975,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
users.head()

Unnamed: 0,User_ID,Gender,Age,Occupation,Zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [29]:
full_df = pd.merge(movies_ratings, users, on='User_ID')
full_df.head()

Unnamed: 0,User_ID,Movie_ID,Rating,Title,Year,Action,Adventure,Animation,Children's,Comedy,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,Gender,Age,Occupation,Zip
0,1,1193,5,One Flew Over the Cuckoo's Nest,1975,0,0,0,0,0,...,0,0,0,0,0,0,F,1,10,48067
1,1,661,3,James and the Giant Peach,1996,0,0,1,1,0,...,0,0,0,0,0,0,F,1,10,48067
2,1,914,3,My Fair Lady,1964,0,0,0,0,0,...,0,1,0,0,0,0,F,1,10,48067
3,1,3408,4,Erin Brockovich,2000,0,0,0,0,0,...,0,0,0,0,0,0,F,1,10,48067
4,1,2355,5,"Bug's Life, A",1998,0,0,1,1,1,...,0,0,0,0,0,0,F,1,10,48067


In [37]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
import pandas as pd
import matplotlib.pyplot as plt

# Assuming 'full_df' is your DataFrame containing the user data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(full_df[['User_ID', 'Movie_ID', 'Rating']], reader)

# Split the data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train the SVD model
model = SVD(n_factors=70, reg_all=0.03)
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)


rmse = accuracy.rmse(predictions)

print(f"RMSE: {rmse}")

RMSE: 0.8672
RMSE: 0.8672018635544259


In [44]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

model.qi.shape


(3679, 70)

In [47]:
def item_to_row_idx(title, df):
    """
    Get the row index of a movie based on its title.

    Parameters:
    - title (str): The title of the movie.
    - df (pd.DataFrame): The DataFrame containing movie data.

    Returns:
    - int: The row index of the movie in the DataFrame.
    """
    try:
        return df.index[df['Title'] == title].tolist()[0]
    except IndexError:
        return None  # Return None if the movie title is not found in the DataFrame

toy_story_row_idx : int = item_to_row_idx('Toy Story', full_df)

In [50]:
from sklearn.metrics.pairwise import cosine_similarity

qi_toy_story = model.qi[toy_story_row_idx]

# Assuming item_matrix is your item matrix from the model (model.qi)
item_matrix = model.qi

# Calculate cosine similarity between 'Toy Story' and all other items
similarities = cosine_similarity([qi_toy_story], item_matrix)

# Get indices of top 5 similar items
similar_item_indices = similarities.argsort()[0][-6:-1][::-1]

# Get titles of similar items
similar_items = full_df.loc[similar_item_indices, 'Title'].tolist()

print(f"Top 5 movies similar to 'Toy Story': {similar_items}")


Top 5 movies similar to 'Toy Story': ['Great Muppet Caper, The', 'It Came from Outer Space', 'Star Wars: Episode V - The Empire Strikes Back', 'Father of the Bride', 'Conspiracy Theory']


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_similar_movies(movie_title, df, model):
    # Helper function to get row index of a movie in the item matrix
    def item_to_row_idx(title):
        return df.index[df['Title'] == title].tolist()[0]

    # Get row index of the input movie
    movie_row_idx = item_to_row_idx(movie_title)

    # Get latent factors for the input movie
    movie_factors = model.qi[movie_row_idx]

    # Calculate cosine similarity between the input movie and all other movies
    similarities = cosine_similarity([movie_factors], model.qi)

    # Get indices of top 5 similar movies
    similar_movie_indices = similarities.argsort()[0][-6:-1][::-1]

    # Get titles of similar movies
    similar_movies = df.loc[similar_movie_indices, 'Title'].tolist()

    return similar_movies

# Example usage
movie_title = "Toy Story"
similar_movies = get_similar_movies(movie_title, full_df, model)

print(f"Top 5 movies similar to '{movie_title}': {similar_movies}")    

In [59]:
# Function to get top N movie recommendations for a given user and genre
def get_top_movie_recommendations(user_id, genre, top_n=10):
    # Create a DataFrame with all movies in the specified genre
    genre_movies = full_df[full_df[genre] == 1][['Movie_ID', 'Title', 'Year']].drop_duplicates()

    # Create a list of movies that the user has not rated
    user_rated_movies = full_df[full_df['User_ID'] == user_id]['Movie_ID']
    movies_to_recommend = genre_movies[~genre_movies['Movie_ID'].isin(user_rated_movies)]
    
    # Predict ratings for the movies the user has not rated
    movies_to_recommend['Predicted_Rating'] = movies_to_recommend['Movie_ID'].apply(lambda x: svd.predict(user_id, x).est)

    # Sort the movies by predicted rating and get the top N recommendations
    top_recommendations = movies_to_recommend.sort_values(by='Predicted_Rating', ascending=False).head(top_n)

    return top_recommendations[['Title', 'Year', 'Predicted_Rating']]

# Example usage:
user_id_to_recommend = 1
genre_to_recommend = 'Drama'  # Replace with the desired genre
top_recommendations = get_top_movie_recommendations(user_id_to_recommend, genre_to_recommend)

# Display the top recommendations
print(f"Top 10 Movie Recommendations for User {user_id_to_recommend} in Genre {genre_to_recommend}:\n")
print(top_recommendations)


Top 10 Movie Recommendations for User 1 in Genre Drama:

                                    Title  Year  Predicted_Rating
174             Shawshank Redemption, The  1994          4.745498
4627                         12 Angry Men  1957          4.714306
1028  Life Is Beautiful (La Vita è bella)  1997          4.672501
156                           October Sky  1999          4.595846
3659                     To Live (Huozhe)  1994          4.590959
1264                It's a Wonderful Life  1946          4.558146
2318               Man for All Seasons, A  1966          4.546780
1239                           Casablanca  1942          4.544530
135             Silence of the Lambs, The  1991          4.543295
4361                            Kagemusha  1980          4.537009


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_to_recommend['Predicted_Rating'] = movies_to_recommend['Movie_ID'].apply(lambda x: svd.predict(user_id, x).est)


In [None]:
import pandas as pd

def rate_movies(full_df, user_ratings_list):
    # Generate a new user ID
    new_user_id = full_df['User_ID'].max() + 1

    # Get the frequency of each movie in descending order
    movie_frequencies = full_df['Movie_ID'].value_counts().reset_index()
    movie_frequencies.columns = ['Movie_ID', 'Frequency']

    # Merge with the full_df to get additional movie information
    movies_to_rate = pd.merge(movie_frequencies, full_df[['Movie_ID', 'Title']], on='Movie_ID', how='left')

    # Sort movies by frequency in descending order
    movies_to_rate = movies_to_rate.sort_values(by='Frequency', ascending=False)

    # Create an empty list to store user ratings
    user_ratings = []

    # Display movies to the user for rating
    for index, movie_row in movies_to_rate.iterrows():
        movie_id = movie_row['Movie_ID']
        movie_title = movie_row['Title']

        # Check if the user has already rated the movie
        if any(rating['Title'] == movie_title for rating in user_ratings):
            continue

        # Ask the user to rate the movie on a scale of 1-5
        user_rating = input(f"Rate the movie '{movie_title}' on a scale of 1-5 (or enter 'skip' if you haven't seen it): ")

        # Handle the case where the user skips the movie
        if user_rating.lower() == 'skip':
            continue

        # Convert the user rating to an integer
        try:
            user_rating = int(user_rating)
        except ValueError:
            print("Invalid input. Please enter a number or 'skip'.")
            continue

        # Check if the user rating is within the valid range (1-5)
        if 1 <= user_rating <= 5:
            # Add the movie information to the list
            user_ratings.append({'User_ID': new_user_id, 'Movie_ID': movie_id, 'Title': movie_title, 'Rating': user_rating})
        else:
            print("Invalid rating. Please enter a number between 1 and 5.")

        # Check if the user has rated 10 movies
        if len(user_ratings) == 10:
            break

    # Append the user ratings to the provided user_ratings_list
    user_ratings_list.extend(user_ratings)

# Example usage:
user_ratings_list = []
rate_movies(full_df, user_ratings_list)
print("User Ratings List:", user_ratings_list)
