In [2]:
import pandas as pd

In [3]:
#load ratings
ratings = pd.read_csv('../data/ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])

#load movie titles
movies = pd.read_csv("../data/ml-100k/u.item", sep='|', encoding='latin-1', 
                     header=None, usecols=[0, 1], names=['item_id', 'title'])

#merge
df = pd.merge(ratings, movies, on='item_id')
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [4]:
#create user-item rating matrix, 
# rows = users, columns = movie titles, values = ratings
user_movie_matrix = df.pivot_table(index='user_id', columns='title', values='rating')
user_movie_matrix

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,,...,,,,,,,,,,
940,,,,,,,,,,,...,,,,,,,,,,
941,,,,,,,,,,,...,,,,,,,,,,
942,,,,,,,,3.0,,3.0,...,,,,,,,,,,


In [5]:
#check shape and sparsity
print("Matrix shape:", user_movie_matrix.shape)
user_movie_matrix.head()

Matrix shape: (943, 1664)


title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,


In [6]:
#Compute item-to-item similarity
target_movie = "Star Wars (1977)"
target_ratings = user_movie_matrix[target_movie]
similar_movies = user_movie_matrix.corrwith(target_ratings)
similar_movies

  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


title
'Til There Was You (1997)                0.872872
1-900 (1994)                            -0.645497
101 Dalmatians (1996)                    0.211132
12 Angry Men (1957)                      0.184289
187 (1997)                               0.027398
                                           ...   
Young Guns II (1990)                     0.228615
Young Poisoner's Handbook, The (1995)   -0.007374
Zeus and Roxanne (1997)                  0.818182
unknown                                  0.723123
Á köldum klaka (Cold Fever) (1994)            NaN
Length: 1664, dtype: float64

In [7]:
#Drop NaNs and convert to dataframe
corr_df = pd.DataFrame(similar_movies, columns=['correlation'])
corr_df.dropna(inplace=True)

In [8]:
#Add rating count for quality filtering
rating_counts = df.groupby('title')['rating'].count()
corr_df['rating_count'] = rating_counts

In [9]:
reliable_recs = corr_df[corr_df['rating_count'] >= 50].sort_values('correlation', ascending=False)

In [11]:
reliable_recs.head(10)

Unnamed: 0_level_0,correlation,rating_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Star Wars (1977),1.0,583
"Empire Strikes Back, The (1980)",0.747981,367
Return of the Jedi (1983),0.672556,507
Raiders of the Lost Ark (1981),0.536117,420
Giant (1956),0.488093,51
"Life Less Ordinary, A (1997)",0.411638,53
Austin Powers: International Man of Mystery (1997),0.377433,130
"Sting, The (1973)",0.367538,241
Indiana Jones and the Last Crusade (1989),0.350107,331
Pinocchio (1940),0.347868,101


In [12]:
#General Purpose Recommendation Function
def recommend_similar_movies(title, n=5, min_ratings=50):
    if title not in user_movie_matrix.columns:
        return "Movie not found in ratings matrix."
    
    target_ratings = user_movie_matrix[title]
    similar_scores = user_movie_matrix.corrwith(target_ratings)
    
    corr_df = pd.DataFrame(similar_scores, columns=['correlation'])
    corr_df.dropna(inplace=True)
    
    rating_counts = df.groupby('title')['rating'].count()
    corr_df['rating_count'] = rating_counts
    
    filtered = corr_df[corr_df['rating_count'] >= min_ratings]
    
    top_matches = filtered.sort_values('correlation', ascending=False)
    top_matches = top_matches[top_matches.index != title]
    
    return top_matches.head(n)

In [13]:
recommend_similar_movies("Star Wars (1977)", n=5)

  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


Unnamed: 0_level_0,correlation,rating_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Empire Strikes Back, The (1980)",0.747981,367
Return of the Jedi (1983),0.672556,507
Raiders of the Lost Ark (1981),0.536117,420
Giant (1956),0.488093,51
"Life Less Ordinary, A (1997)",0.411638,53


## User-based Recommender System

In [14]:
user_ratings = user_movie_matrix.copy()

In [15]:
user_ratings = user_ratings.sub(user_ratings.mean(axis=1), axis=0)
user_ratings.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,-1.605166,1.394834,,,-0.605166,0.394834,,,...,,,,1.394834,-0.605166,,,,0.394834,
2,,,,,,,,,-2.704918,,...,,,,,,,,,,
3,,,,,-0.773585,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,-0.874286,,,,,1.125714,,,...,,,,1.125714,,,,,1.125714,


In [16]:
#Transpose so we correlate rows (users)
user_similarity = user_ratings.T.corr()
print("Shape of user similarity matrix: ", user_similarity.shape)

Shape of user similarity matrix:  (943, 943)


In [21]:
import numpy as np

def predict_ratings_for_user(user_id, n=5, min_sim=0.1):
    if user_id not in user_ratings.index:
        return "User not found."
    
    rated_movies = user_ratings.loc[user_id].dropna().index
    sim_scores = user_similarity[user_id]
    sim_scores = sim_scores.drop(user_id)
    sim_scores = sim_scores[sim_scores > min_sim]
    
    weighted_sum = pd.Series(dtype=float)
    sim_sums = pd.Series(dtype=float)
    
    for other_user, sim in sim_scores.items():
        other_ratings = user_ratings.loc[other_user]
        for movie, rating in other_ratings.dropna().items():
            if movie not in rated_movies:
                weighted_sum[movie] = weighted_sum.get(movie, 0) + rating*sim
                sim_sums[movie] = sim_sums.get(movie, 0) + abs(sim)
                
                
    predict_ratings = (weighted_sum/sim_sums).dropna()
    
    return predict_ratings.sort_index(ascending=False).head(n)

In [22]:
predict_ratings_for_user(42, n=5)

Á köldum klaka (Cold Fever) (1994)       0.095870
unknown                                  0.098316
Zeus and Roxanne (1997)                 -0.670777
Young Poisoner's Handbook, The (1995)   -0.097031
Young Guns II (1990)                    -0.770490
dtype: float64