In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.decomposition import PCA

In [2]:
all_genres = [ "Action", "Adventure", "Animation", "Children", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", 
              "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western", "IMAX", "(no genres listed)"]
def convert_genre (genre_list):
    this_movie_genre = [0] * len (all_genres)
    for genre in genre_list:
        this_movie_genre [all_genres.index(genre)] = 1
        
    return this_movie_genre

In [3]:
movies = pd.read_csv("FP_small/movies.csv")
df = pd.read_csv("FP_small/ratings.csv", usecols=['userId', 'movieId', 'rating'])
movies ["genres"] = movies ["genres"].str.split("|", expand=False)
movies['genres'] = movies['genres'].apply(convert_genre)
movies[["Action", "Adventure", "Animation", "Children", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", 
    "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western", "IMAX", "NA"]] = pd.DataFrame(movies.genres.tolist(), index= movies.index)
df = pd.merge(df,movies,on='movieId')
df = df.drop (["title", "genres"], axis=1)
display (df)

Unnamed: 0,userId,movieId,rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,IMAX,NA
0,1,1,4.0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,1,4.0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7,1,4.5,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,15,1,2.5,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17,1,4.5,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,160341,2.5,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
100832,610,160527,4.5,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
100833,610,160836,3.0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
100834,610,163937,3.5,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0


In [4]:

X = df[['movieId', 'userId', "Action", "Adventure", "Animation", "Children", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", 
    "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western", "IMAX", "NA"]] 
y = df['rating']
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

display (X_train)
pca = PCA(n_components = 7)
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)
print ("Reduced Dimension: ", X_train_reduced.shape)

Unnamed: 0,movieId,userId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,IMAX,NA
83954,41571,68,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
58268,788,84,0,0,0,0,1,0,0,0,...,0,0,0,1,1,0,0,0,0,0
49535,2723,115,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99924,152071,567,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
100113,6204,599,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5790,1025,509,0,0,1,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
66419,2402,380,1,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
42780,79091,414,0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2698,356,517,0,0,0,0,1,0,0,1,...,0,0,0,1,0,0,1,0,0,0


Reduced Dimension:  (80668, 7)


In [5]:

# Define KNN model
knn = KNeighborsRegressor()

# Tuning n_neighbors hyperparameter
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11]  
}

# Grid search cross-validation using MAE
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='neg_median_absolute_error')
grid_search.fit(X_train_reduced, y_train)

# Best hyperparameters
best_params = grid_search.best_params_

# Testing the model with best hyperparameters
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test_reduced)

# Evaluation
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print("Best Hyperparameters:", best_params)
print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)

Best Hyperparameters: {'n_neighbors': 9}
Mean Absolute Error: 0.8127616906871171
Root Mean Squared Error: 1.0371791699389206


In [6]:
from sklearn.neighbors import NearestNeighbors
X_train_df, X_test_df = train_test_split(df, test_size = 0.2)

# Initialize KNN model
knn = NearestNeighbors(n_neighbors=best_params['n_neighbors'])

knn.fit(X_train_df)

# Find k-nearest neighbors
distances, Recommendations = knn.kneighbors(X_test_df)

In [7]:
print ("Recommendations for all movies in Test set found.\nShowing the recommendation for a random movie from test set:\n")
random_test = (X_test_df.sample())
distances, indices = knn.kneighbors(random_test)
movieId_sample = X_test_df["movieId"][random_test.index].item()
print ("Because you watched: ",movies.loc[movies['movieId'] == movieId_sample] ['title'].item(), "\nYou can also try: ")
recommendation = []
for i in indices [0]:
    recommendation.append (X_train_df.iloc[i]['movieId'])

recommendation = list(set(recommendation))
recommendation.remove(movieId_sample) if movieId_sample in recommendation else None 
for mov in   recommendation:
    print ("  ", movies.loc[movies['movieId'] == mov] ['title'].item())

Recommendations for all movies in Test set found.
Showing the recommendation for a random movie from test set:

Because you watched:  Apollo 13 (1995) 
You can also try: 
   Brothers McMullen, The (1995)
   Batman Forever (1995)


# Matrix Factorisation Method:

In [8]:
import pandas as pd
import numpy as np

In [9]:
movies = pd.read_csv('FP_small/movies.csv')
ratings = pd.read_csv('FP_small/ratings.csv')

In [10]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [11]:
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [12]:
print(movies['genres'].dtype)

# Convert genres column to string 
movies['genres'] = movies['genres'].astype(str)


movies['genres'] = movies['genres'].str.split('|').str.get(0)

# Mapping genres to unique indices
genres = dict([(attribute_name, idx) for idx, attribute_name in enumerate(set(movies['genres']))])
movies['genres'] = movies['genres'].map(lambda x: genres[x])

# Merging movies and ratings dataframes
movies_ratings = pd.merge(ratings, movies, on='movieId')

# Displaying outputs
display(movies_ratings.head())

display(movies.head())

print("\nIs there NaN values in the table?", movies_ratings.isnull().values.any())

object


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),14
1,5,1,4.0,847434962,Toy Story (1995),14
2,7,1,4.5,1106635946,Toy Story (1995),14
3,15,1,2.5,1510577970,Toy Story (1995),14
4,17,1,4.5,1305696483,Toy Story (1995),14


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),14
1,2,Jumanji (1995),14
2,3,Grumpier Old Men (1995),5
3,4,Waiting to Exhale (1995),5
4,5,Father of the Bride Part II (1995),5



Is there NaN values in the table? False


In [13]:
combine_rating = pd.merge(ratings, movies, on='movieId')
combine_rating.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),14
1,5,1,4.0,847434962,Toy Story (1995),14
2,7,1,4.5,1106635946,Toy Story (1995),14
3,15,1,2.5,1510577970,Toy Story (1995),14
4,17,1,4.5,1305696483,Toy Story (1995),14
5,18,1,3.5,1455209816,Toy Story (1995),14
6,19,1,4.0,965705637,Toy Story (1995),14
7,21,1,3.5,1407618878,Toy Story (1995),14
8,27,1,3.0,962685262,Toy Story (1995),14
9,31,1,5.0,850466616,Toy Story (1995),14


In [14]:
columns = ['timestamp', 'genres']
combine_rating = combine_rating.drop(columns, axis=1)
combine_rating.head(10)

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)
5,18,1,3.5,Toy Story (1995)
6,19,1,4.0,Toy Story (1995)
7,21,1,3.5,Toy Story (1995)
8,27,1,3.0,Toy Story (1995)
9,31,1,5.0,Toy Story (1995)


In [15]:
combine_rating = combine_rating.dropna(axis = 0, subset = ['title'])

movie_ratingCount = (combine_rating.
     groupby(by = ['title'])['rating'].
     count().
     reset_index().
     rename(columns = {'rating': 'totalRatingCount'})
     [['title', 'totalRatingCount']]
    )
movie_ratingCount.head(10)

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2
5,'Tis the Season for Love (2015),1
6,"'burbs, The (1989)",17
7,'night Mother (1986),1
8,(500) Days of Summer (2009),42
9,*batteries not included (1987),7


In [16]:
rating_totalRatingCount = combine_rating.merge(movie_ratingCount, left_on = 'title', right_on = 'title', how = 'left')
rating_totalRatingCount.head(10)

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215
5,18,1,3.5,Toy Story (1995),215
6,19,1,4.0,Toy Story (1995),215
7,21,1,3.5,Toy Story (1995),215
8,27,1,3.0,Toy Story (1995),215
9,31,1,5.0,Toy Story (1995),215


In [17]:
user_rating = rating_totalRatingCount.drop_duplicates(['userId','title'])
user_rating.head(10)

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215
5,18,1,3.5,Toy Story (1995),215
6,19,1,4.0,Toy Story (1995),215
7,21,1,3.5,Toy Story (1995),215
8,27,1,3.0,Toy Story (1995),215
9,31,1,5.0,Toy Story (1995),215


In [18]:
movie_rating_pivot = user_rating.pivot(index = 'userId', columns = 'title', values = 'rating').fillna(0)
movie_rating_pivot.head(10)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(movies_ratings, test_size=0.25)

In [20]:
X = movie_rating_pivot.values.T
X.shape

(9719, 610)

In [21]:
import sklearn
from sklearn.decomposition import TruncatedSVD

SVD = TruncatedSVD(n_components=12, random_state=17)
matrix = SVD.fit_transform(X)
matrix.shape

(9719, 12)

In [22]:
import warnings
warnings.filterwarnings("ignore",category =RuntimeWarning)
corr = np.corrcoef(matrix)
corr.shape

(9719, 9719)

In [28]:
def check_recommendation(movie_name, corr_matrix, movie_titles):
    try:
        movie_index = movie_titles.index(movie_name)
        corr_values = corr_matrix[movie_index]
        similar_movies = [movie_titles[i] for i, corr_value in enumerate(corr_values) if corr_value >= 0.9]
        
        if similar_movies:
            print(f"Because you have watched {movie_name}:")
            for similar_movie in similar_movies:
                print(similar_movie)
        else:
            print(f"No highly correlated movies found for {movie_name}.")
    except ValueError:
        print(f"Movie '{movie_name}' not found in the dataset.")

print ("Try copying the movie name from the pandas datafram displayed above for exact name")
movie_name = input("Enter the name of the movie you've watched: ")
check_recommendation(movie_name, corr, list(movie_rating_pivot.columns))


Try copying the movie name from the pandas datafram displayed above for exact name
Enter the name of the movie you've watched: 'Salem's Lot (2004)
Because you have watched 'Salem's Lot (2004):
'Salem's Lot (2004)
All This, and Heaven Too (1940)
Carnal Knowledge (1971)
Howling, The (1980)
Incredible Journey, The (1963)
Laura (1944)
Psycho II (1983)


In [25]:
# Number of unique users and movies based on all available data in ratings
n_users = ratings['userId'].nunique()
n_movies = ratings['movieId'].nunique()


# Adjusted matrix sizes to accommodate the maximum ID, not just the count
max_user_id = ratings['userId'].max() + 1
max_movie_id = ratings['movieId'].max() + 1

# Initialize matrices for training and testing data
train_data_matrix = np.zeros((max_user_id, max_movie_id))
test_data_matrix = np.zeros((max_user_id, max_movie_id))

# Create mappings for user and movie indices based on all ratings to ensure complete coverage
user_index = {user_id: idx for idx, user_id in enumerate(ratings['userId'].unique())}
movie_index = {movie_id: idx for idx, movie_id in enumerate(ratings['movieId'].unique())}

# Populate the training matrix
for line in train_data.itertuples():
    if line.userId in user_index and line.movieId in movie_index:  # Check if the IDs are in the index maps
        train_data_matrix[user_index[line.userId], movie_index[line.movieId]] = line.rating

# Populate the testing matrix
for line in test_data.itertuples():
    if line.userId in user_index and line.movieId in movie_index:  # Check if the IDs are in the index maps
        test_data_matrix[user_index[line.userId], movie_index[line.movieId]] = line.rating

print("Training data:")
print(train_data_matrix)
print("Test data:")
print(test_data_matrix)


Training data:
[[4. 0. 4. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [3. 0. 0. ... 0. 0. 0.]
 [5. 0. 5. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Test data:
[[0. 4. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [26]:
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

u, s, vt = svds(train_data_matrix, k = 5)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)

# Defining RMSE and MAE functions
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

def mae(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return mean_absolute_error(prediction, ground_truth)

# Calculate and print RMSE and MAE
test_rmse = rmse(X_pred, test_data_matrix)
test_mae = mae(X_pred, test_data_matrix)
print(f'SVD-based CF RMSE: {test_rmse}')
print(f'SVD-based CF MAE: {test_mae}')

SVD-based CF RMSE: 3.0098611098649846
SVD-based CF MAE: 2.785435469786915
