In [308]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import Ridge
from sklearn.ensemble import IsolationForest
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt


In [309]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')


In [310]:
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)')[0]
ratings = ratings.dropna(subset=['rating'])
ratings = ratings.merge(movies, on='movieId')

user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')

imputer = KNNImputer(n_neighbors=5)
user_movie_matrix_filled = pd.DataFrame(imputer.fit_transform(user_movie_matrix),
                                        index=user_movie_matrix.index,
                                        columns=user_movie_matrix.columns)

clf = IsolationForest(contamination=0.01)
outliers = clf.fit_predict(user_movie_matrix_filled)
user_movie_matrix_cleaned = user_movie_matrix_filled[outliers == 1]


In [311]:
movies.head()


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [312]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,1995
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,1995
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1995
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,1995


In [313]:
train_matrix, test_matrix = train_test_split(user_movie_matrix_cleaned, test_size=0.3, random_state=42)


In [314]:
R = user_movie_matrix_cleaned.values
mean_user_ratings = np.mean(R, axis=1).reshape(-1, 1)
R_demeaned = R - mean_user_ratings

U, sigma, Vt = svds(R_demeaned, k=40)
sigma = np.diag(sigma)

predicted_ratings = np.dot(np.dot(U, sigma), Vt) + mean_user_ratings
predicted_ratings_df = pd.DataFrame(predicted_ratings,
                                    index=user_movie_matrix_cleaned.index,
                                    columns=user_movie_matrix_cleaned.columns)


In [315]:
movies['genres'] = movies['genres'].fillna('')
movies['genres'] = movies['genres'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [316]:
def content_based_recommendations(movie_id, cosine_sim, movies, top_n=10):
    if movie_id >= cosine_sim.shape[0]:
        return pd.DataFrame()
    sim_scores = list(enumerate(cosine_sim[movie_id]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]
    movie_indices = [i[0] for i in sim_scores]
    return movies.iloc[movie_indices]


In [317]:
user_similarities = cosine_similarity(user_movie_matrix_cleaned)


In [318]:
def user_based_recommendations(user_id, user_similarities, user_movie_matrix, top_n=10):
    user_idx = user_movie_matrix.index.get_loc(user_id)
    user_sim_scores = list(enumerate(user_similarities[user_idx]))
    user_sim_scores = sorted(user_sim_scores, key=lambda x: x[1], reverse=True)
    most_similar_users = [i[0] for i in user_sim_scores[1:top_n+1]]
    recommendations = []
    for sim_user in most_similar_users:
        sim_user_ratings = user_movie_matrix.iloc[sim_user].dropna()
        recommendations.append(sim_user_ratings)
    recommendations = pd.concat(recommendations)
    recommendations = recommendations.groupby(recommendations.index).mean().sort_values(ascending=False)
    watched_movies = user_movie_matrix.loc[user_id].dropna().index
    recommendations = recommendations.drop(watched_movies).head(top_n)
    return recommendations.index


In [319]:
def hybrid_recommendations_v3(user_id, predicted_ratings_df, movies, user_similarities, user_movie_matrix, top_n=10):
    user_ratings = predicted_ratings_df.loc[user_id].sort_values(ascending=False)
    watched_movies = user_movie_matrix.loc[user_id].dropna().index
    recommendations = user_ratings.drop(watched_movies).head(top_n)
    recommended_movies = movies[movies['movieId'].isin(recommendations.index)]

    hybrid_recommendations = pd.DataFrame()
    for movie_id in recommended_movies['movieId']:
        content_recs = content_based_recommendations(movie_id, cosine_sim, movies, top_n=top_n)
        hybrid_recommendations = pd.concat([hybrid_recommendations, content_recs])

    hybrid_recommendations = hybrid_recommendations.drop_duplicates().head(top_n)

    return hybrid_recommendations


In [353]:
def hit_ratio(user_id, recommendations, user_movie_matrix, top_n=10):
    user_watched_movies = user_movie_matrix.loc[user_id].dropna().index
    hits = [movie for movie in recommendations if movie in user_watched_movies]
    hit_ratio = len(hits) / top_n
    return hit_ratio

user_id = np.random.choice(test_matrix.index)
hybrid_recommendations_df = hybrid_recommendations_v3(user_id, predicted_ratings_df, movies, user_similarities, user_movie_matrix, top_n=10)
hybrid_movie_ids = hybrid_recommendations_df['movieId'].tolist()
hit_ratio_value = hit_ratio(user_id, hybrid_movie_ids, user_movie_matrix, top_n=10)

print("User ID:", 307)
print("Recommended Films:")
print(hybrid_recommendations_df[['title', 'genres']])
print(f'Hit Ratio: {hit_ratio_value}')


User ID: 307
Recommended Films:
                            title           genres
619         Cable Guy, The (1996)  Comedy|Thriller
1101            Underworld (1996)  Comedy|Thriller
1176      Head Above Water (1996)  Comedy|Thriller
1341             Homegrown (1998)  Comedy|Thriller
1631           Family Plot (1976)  Comedy|Thriller
1751             Mona Lisa (1986)  Comedy|Thriller
2112  Teaching Mrs. Tingle (1999)  Comedy|Thriller
4774             Foul Play (1978)  Comedy|Thriller
4934           After Hours (1985)  Comedy|Thriller
5499          High Anxiety (1977)  Comedy|Thriller
Hit Ratio: 0.3


In [382]:
user_id = np.random.choice(test_users)
hybrid_recommendations_df = hybrid_recommendations_v3(user_id, predicted_ratings_df, movies, user_similarities, user_movie_matrix, top_n=10)
hybrid_movie_ids = hybrid_recommendations_df['movieId'].tolist()
hit_ratio_value = hit_ratio(user_id, hybrid_movie_ids, user_movie_matrix, top_n=10)


print("User ID:", 606)
print("Recommended Films:")
print(hybrid_recommendations_df[['title', 'genres']])
print(f'Hit Ratio: {hit_ratio_value}')


User ID: 606
Recommended Films:
                                               title         genres
24                          Leaving Las Vegas (1995)  Drama|Romance
27                                 Persuasion (1995)  Drama|Romance
42              How to Make an American Quilt (1995)  Drama|Romance
45                      When Night Is Falling (1995)  Drama|Romance
66                               Bed of Roses (1996)  Drama|Romance
75   Once Upon a Time... When We Were Colored (1995)  Drama|Romance
76                         Angels and Insects (1995)  Drama|Romance
93             Bridges of Madison County, The (1995)  Drama|Romance
115                     Up Close and Personal (1996)  Drama|Romance
151                                  Mad Love (1995)  Drama|Romance
Hit Ratio: 0.4


In [322]:
ridge = Ridge()
param_grid = {'alpha': np.logspace(-4, 4, 20)}
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(user_movie_matrix_cleaned, user_movie_matrix_cleaned.mean(axis=1))

best_alpha = grid_search.best_params_['alpha']
print(f'Best Alpha Value: {best_alpha}')


Best Alpha Value: 0.0001
