In [33]:
import pandas as pd
import numpy as np

### Data Processing

In [34]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [35]:
# display(movies.head(), ratings.head(), tags.head())

In [36]:
display(movies.columns, ratings.columns, tags.columns)

Index(['movieId', 'title', 'genres'], dtype='object')

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object')

In [37]:
print('Movies:', movies.shape)
print('Ratings:', ratings.shape)
print('Tags:', tags.shape)

Movies: (9742, 3)
Ratings: (100836, 4)
Tags: (3683, 4)


In [38]:
ratings.drop('timestamp', axis=1 ,inplace=True)
tags.drop('timestamp', axis=1 ,inplace=True)

In [39]:
movies['genres'] = movies['genres'].str.replace('|', ', ')

In [40]:
# avg_ratings = ratings.groupby('movieId')['rating'].mean()
# combined = pd.merge(movies, avg_ratings, on='movieId')
# combined.rename(columns={'rating': 'avg_rating'} , inplace=True)
# print(combined.shape)
# combined.head()

In [41]:
tags = tags.groupby('movieId')['tag'].apply(lambda x: ', '.join(x)).reset_index()
print(tags.shape)
tags.head()

(1572, 2)


Unnamed: 0,movieId,tag
0,1,"pixar, pixar, fun"
1,2,"fantasy, magic board game, Robin Williams, game"
2,3,"moldy, old"
3,5,"pregnancy, remake"
4,7,remake


In [42]:
combined = pd.merge(movies, tags, on='movieId', how='left')
print(combined.shape)
combined.head()

(9742, 4)


Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),"Adventure, Animation, Children, Comedy, Fantasy","pixar, pixar, fun"
1,2,Jumanji (1995),"Adventure, Children, Fantasy","fantasy, magic board game, Robin Williams, game"
2,3,Grumpier Old Men (1995),"Comedy, Romance","moldy, old"
3,4,Waiting to Exhale (1995),"Comedy, Drama, Romance",
4,5,Father of the Bride Part II (1995),Comedy,"pregnancy, remake"


In [43]:
combined['release_year'] = combined['title'].str.split(' ').str[-1].str[1:-1]
combined['tag'] = combined['tag'] + ', ' + combined['release_year']
combined['tag'] = combined['tag'].fillna(combined['release_year'])
combined.drop('release_year', axis=1, inplace=True)
combined.head(10)

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),"Adventure, Animation, Children, Comedy, Fantasy","pixar, pixar, fun, 1995"
1,2,Jumanji (1995),"Adventure, Children, Fantasy","fantasy, magic board game, Robin Williams, gam..."
2,3,Grumpier Old Men (1995),"Comedy, Romance","moldy, old, 1995"
3,4,Waiting to Exhale (1995),"Comedy, Drama, Romance",1995
4,5,Father of the Bride Part II (1995),Comedy,"pregnancy, remake, 1995"
5,6,Heat (1995),"Action, Crime, Thriller",1995
6,7,Sabrina (1995),"Comedy, Romance","remake, 1995"
7,8,Tom and Huck (1995),"Adventure, Children",1995
8,9,Sudden Death (1995),Action,1995
9,10,GoldenEye (1995),"Action, Adventure, Thriller",1995


In [44]:
titles = combined['title'].str[:-7].str.lower()
titles

0                                toy story
1                                  jumanji
2                         grumpier old men
3                        waiting to exhale
4              father of the bride part ii
                       ...                
9737    black butler: book of the atlantic
9738                 no game no life: zero
9739                                 flint
9740          bungo stray dogs: dead apple
9741          andrew dice clay: dice rules
Name: title, Length: 9742, dtype: object

In [45]:
overviews = combined['movieId'].to_frame()
overviews['overview'] = combined['genres'].str.lower() + ', ' + combined['tag']
overviews.head()

Unnamed: 0,movieId,overview
0,1,"adventure, animation, children, comedy, fantas..."
1,2,"adventure, children, fantasy, fantasy, magic b..."
2,3,"comedy, romance, moldy, old, 1995"
3,4,"comedy, drama, romance, 1995"
4,5,"comedy, pregnancy, remake, 1995"


In [46]:
overviews.to_csv('overviews.csv', index=False)

### User-user Collaborative Filtering

In [47]:
movie_ratings_count = ratings.groupby('movieId')['rating'].count()
popular_movies = movie_ratings_count[movie_ratings_count >= 25]

movie_ids = popular_movies.index.tolist()
filtered_ratings = ratings[ratings['movieId'].isin(movie_ids)]

filtered_ratings.shape

(62518, 3)

In [48]:
filtered_ratings.to_csv('filtered_ratings.csv', index=False)

In [49]:
user_item_matrix = filtered_ratings.pivot_table(index='userId', columns='movieId', values='rating')
print(user_item_matrix.shape)
user_item_matrix.head()

(610, 1050)


movieId,1,2,3,5,6,7,10,11,16,17,...,122904,122918,134130,134853,139385,148626,152081,164179,166528,168252
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,4.0,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [50]:
user_item_matrix.to_csv('user_item_matrix.csv')

In [51]:
total_cells = user_item_matrix.shape[0] * user_item_matrix.shape[1]
missing_cells = user_item_matrix.isnull().sum().sum()
print(f'Total cells: {total_cells}')
print(f'Missing cells: {missing_cells}')
print(f'Missing percenatge: {missing_cells/total_cells:.2%}')

Total cells: 640500
Missing cells: 577982
Missing percenatge: 90.24%


In [52]:
temp = user_item_matrix.iloc[:10, :10]
temp.head()

movieId,1,2,3,5,6,7,10,11,16,17
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,4.0,,4.0,,4.0,,,,,
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,
5,4.0,,,,,,,,,


In [53]:
from scipy.stats import pearsonr

def pearson_corr_lib(user_id):
    correlation_data = []
    target_user_ratings = user_item_matrix.loc[user_id]

    for index, other_user_ratings in user_item_matrix.iterrows():
        if user_id == index: continue

        common_ratings = target_user_ratings.notna() & other_user_ratings.notna()

        if not common_ratings.any() or common_ratings.sum() < 2 or \
            target_user_ratings[common_ratings].var() == 0 or other_user_ratings[common_ratings].var() == 0:
            continue

        corr_coef, _ = pearsonr(target_user_ratings[common_ratings], other_user_ratings[common_ratings])
            
        correlation_data.append({
            'user1Id': user_id,
            'user2Id': index,
            'correlation': corr_coef
        })

    return pd.DataFrame(correlation_data)


In [54]:
def pearson_corr(user_id):
    user_item = user_item_matrix.copy()
    row_means = user_item.mean(axis=1, skipna=True)
    user_item = user_item.subtract(row_means, axis=0)

    target_user_ratings = user_item.loc[user_id]
    corr_data = []

    for index, other_user_ratings in user_item.iterrows():
        if user_id == index: continue

        common_ratings = target_user_ratings.notna() & other_user_ratings.notna()

        numerator = (target_user_ratings[common_ratings] * other_user_ratings[common_ratings]).sum()
        denominator = np.sqrt((target_user_ratings[common_ratings] ** 2).sum()) * \
            np.sqrt((other_user_ratings[common_ratings] ** 2).sum())

        if np.isclose(denominator, 0): continue

        corr_data.append({
            'user1Id': user_id,
            'user2Id': index,
            'correlation': numerator / denominator
        })

    return pd.DataFrame(corr_data)

In [55]:
def recommend_movies(correlations, n):
    user_id = correlations.iloc[0, 0]
    top_neighbors = correlations.nlargest(n * 10, 'correlation')
    neighbor_ratings = pd.merge(top_neighbors, filtered_ratings, left_on='user2Id', right_on='userId')
    movie_ratings = neighbor_ratings.groupby('movieId')['rating'].mean()
    
    target_user_ratings = filtered_ratings[filtered_ratings['userId'] == user_id]
    movies_watched = target_user_ratings['movieId'].unique()
    recommended_movies = movie_ratings[~movie_ratings.index.isin(movies_watched)].to_frame()
    # display(recommended_movies)
    
    top_recommendations = pd.merge(recommended_movies.nlargest(n, 'rating'), movies, on='movieId')
    top_recommendations['avg_rating'] = round(top_recommendations['rating'], 2)

    return top_recommendations.drop('rating', axis=1)

In [64]:
user_id, n = 214, 10

correlations = pearson_corr(user_id)
    
top_recommendations = recommend_movies(correlations, n)
top_recommendations


Unnamed: 0,movieId,title,genres,avg_rating
0,1235,Harold and Maude (1971),"Comedy, Drama, Romance",5.0
1,1299,"Killing Fields, The (1984)","Drama, War",5.0
2,1347,"Nightmare on Elm Street, A (1984)","Horror, Thriller",5.0
3,1912,Out of Sight (1998),"Comedy, Crime, Drama, Romance, Thriller",5.0
4,2020,Dangerous Liaisons (1988),"Drama, Romance",5.0
5,2872,Excalibur (1981),"Adventure, Fantasy",5.0
6,1272,Patton (1970),"Drama, War",4.83
7,3424,Do the Right Thing (1989),Drama,4.83
8,2064,Roger & Me (1989),Documentary,4.75
9,94959,Moonrise Kingdom (2012),"Comedy, Drama, Romance",4.75


### Rating prediction

In [57]:
# def predict_rating(user_item_matrix, movie_id, correlations, movie_ratings):    
#     top_neighbors = correlations.nlargest(100, 'correlation')['user2Id']
#     mov_ratings = user_item_matrix.loc[top_neighbors, movie_id]

#     correlation_values = correlations['correlation']
    
#     mult_sum, corr_sum = 0, 0
#     for rating, corr_val in zip(mov_ratings, correlation_values):
#         if np.isnan(rating): continue
        
#         mult_sum += (rating * corr_val)
#         corr_sum += corr_val

#     if corr_sum > 0:
#         predicted_rating = mult_sum / corr_sum
#     else:
#         predicted_rating = movie_ratings.loc[movie_id]

#     return round(predicted_rating, 2)

# predict_ratings = []
# for movie_id in top_recommendations['movieId']:
#     predicted_rating = predict_rating(user_item_matrix, movie_id, correlations)
#     predict_ratings.append(predicted_rating)

# top_recommendations['pred_rating'] = predict_ratings

### Content-based Filtering

In [58]:
movie_db = pd.merge(movies, user_item_matrix.columns.to_frame(index=False))
print(movie_db.shape)
movie_db.head()

(1050, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"Adventure, Animation, Children, Comedy, Fantasy"
1,2,Jumanji (1995),"Adventure, Children, Fantasy"
2,3,Grumpier Old Men (1995),"Comedy, Romance"
3,5,Father of the Bride Part II (1995),Comedy
4,6,Heat (1995),"Action, Crime, Thriller"


In [59]:
movie_db = pd.merge(movie_db, overviews, how='inner')
print(movie_db.shape)
movie_db.head()

(1050, 4)


Unnamed: 0,movieId,title,genres,overview
0,1,Toy Story (1995),"Adventure, Animation, Children, Comedy, Fantasy","adventure, animation, children, comedy, fantas..."
1,2,Jumanji (1995),"Adventure, Children, Fantasy","adventure, children, fantasy, fantasy, magic b..."
2,3,Grumpier Old Men (1995),"Comedy, Romance","comedy, romance, moldy, old, 1995"
3,5,Father of the Bride Part II (1995),Comedy,"comedy, pregnancy, remake, 1995"
4,6,Heat (1995),"Action, Crime, Thriller","action, crime, thriller, 1995"


In [60]:
movie_db.to_csv('movie_db.csv', index=False)

In [61]:
def jaccard_sim(title):
    target_movie = set(str(movie_db[movie_db['title'] == title]['overview']).split(', '))

    result = []
    for _, movie in movie_db.iterrows():
        if movie['title'] == title: continue

        other_movie = set(str(movie['overview']).split(', '))
        jaccard_similarity = len(target_movie.intersection(other_movie)) / len(target_movie.union(other_movie))
        result.append({
            'target_movie': title, 
            'other_movie': movie['title'], 
            'jaccard_sim_score': jaccard_similarity
        })

    return pd.DataFrame(result)

In [62]:
def recommend_similar_movies(similarities, n):
    movies_not_watched = user_item_matrix.columns[user_item_matrix.loc[user_id].isna()]
    movies_not_watched_titles = movie_db[movie_db['movieId'].isin(movies_not_watched)]['title'].to_frame()
    
    recommended_movies = pd.merge(similarities, movies_not_watched_titles, left_on='other_movie', right_on='title').head(n)
    recommended_movies.drop(['other_movie', 'jaccard_sim_score', 'target_movie'], axis=1, inplace=True)

    recommended_movies = pd.merge(recommended_movies, movies)
    recommended_movies = recommended_movies.reindex(columns=['movieId', 'title', 'genres'])

    return recommended_movies

In [63]:
name = 'Toy Story (1995)'
similarities = jaccard_sim(name).sort_values('jaccard_sim_score', ascending=False)
similar_movies = recommend_similar_movies(similarities, n)
similar_movies

Unnamed: 0,movieId,title,genres
0,3751,Chicken Run (2000),"Animation, Children, Comedy"
1,79091,Despicable Me (2010),"Animation, Children, Comedy, Crime"
2,745,Wallace & Gromit: A Close Shave (1995),"Animation, Children, Comedy"
3,1566,Hercules (1997),"Adventure, Animation, Children, Comedy, Musical"
4,4016,"Emperor's New Groove, The (2000)","Adventure, Animation, Children, Comedy, Fantasy"
5,152081,Zootopia (2016),"Action, Adventure, Animation, Children, Comedy"
6,1148,Wallace & Gromit: The Wrong Trousers (1993),"Animation, Children, Comedy, Crime"
7,1223,"Grand Day Out with Wallace and Gromit, A (1989)","Adventure, Animation, Children, Comedy, Sci-Fi"
8,134853,Inside Out (2015),"Adventure, Animation, Children, Comedy, Drama,..."
9,38038,Wallace & Gromit in The Curse of the Were-Rabb...,"Adventure, Animation, Children, Comedy"
