In [1]:
import pandas as pd, numpy as np
from sklearn.metrics import pairwise

In [2]:
movies=pd.read_csv("movies.csv", encoding="iso-8859-1")
ratings=pd.read_csv("ratings.csv", encoding="iso-8859-1")

In [3]:
print(movies.shape)
movies.head()

(2500, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
print(ratings.shape)
ratings.head()

(264505, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,12882,1,4.0,1147195252
1,12882,32,3.5,1147195307
2,12882,47,5.0,1147195343
3,12882,50,5.0,1147185499
4,12882,110,4.5,1147195239


In [5]:
print("number of users: ",len(ratings.userId.unique()))

number of users:  862


Now, we want to see our dataset in a more convenient form- users, movies with their ratings.

In [6]:
uv= ratings.groupby(["userId","movieId"]).mean()["rating"].unstack()
uv.head()

movieId,1,2,3,4,5,6,7,9,10,11,...,106487,106489,106782,106920,109374,109487,111362,111759,112556,112852
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316,2.5,,,,,,2.0,,2.5,,...,,,,,,,,,,
320,,,,,,,,,,,...,,,,,,,,,,
359,5.0,,,,,5.0,,,4.0,4.0,...,,,,,,,,,,
370,4.5,4.0,,,,5.0,,,,,...,2.5,3.0,4.5,4.0,,,3.0,4.5,3.5,3.0
910,5.0,4.0,3.5,,3.5,3.5,,,,4.0,...,,,3.5,,,,,4.5,,


This is a nice display to have all information in one screen- User's rating to all movies.
NaN shows user hasn't rated/watched? that movie.

In [7]:
urdist=ratings.groupby(["movieId","userId"]).mean()["rating"].unstack().describe()
# user ranking distribution



In [8]:
print("Max of mininum-rating that has been given by a user ",urdist.loc["min"].max())
print("Min of maxinum-rating that has been given by a user ",urdist.loc["max"].min())

Max of mininum-rating that has been given by a user  3.5
Min of maxinum-rating that has been given by a user  4.0


It is better and wise to center mean of all the users. It means, a user who rate movies in range (1,3) might have given 3 star for good movie than to a user who rate movies in range (3,5), for whom 3 star is less than an average movie.

In [9]:
uv_norm= uv.apply(lambda col: col-col.mean(), axis=1)

In [10]:
'''This function takes in normalized user rating data[centered around mean] and gives top k neighbors similar to
target user, default is set to 30'''
def uu_similarity(norm_data,target_user,sim_type={"cosine", "pearson"},n_neighbors=30):     
    u_sim= {}
    # this dictionary will store similarity between target_user and all other users.
    if sim_type=="cosine":
        processed_data=norm_data.fillna(0) 
        #consider all nan values as 0
        for user in processed_data.index:
            if user!=target_user:
            #it will be redundant to calculate similarity with itself, since it will going to be 1 always.
                u_sim[user]= pairwise.cosine_similarity(processed_data.loc[user].reshape(1,-1), 
                                                        processed_data.loc[target_user].reshape(1,-1)).reshape(-1)[0]
                # cs= ui.uj/(|ui|*|uj|)
                #cosine similarity weighs the user similarity by the number of movies they have rated
            
    elif sim_type=="pearson":
        for user in norm_data:
            if user!=target_user:
                processed_data= norm_data.loc[[target_user, user],:].dropna(axis=1)
                #it contains only movies that target user has rated
                try:
                    u_sim[user]= (processed_data.loc[user]*processed_data.loc[target_user]).sum()/(
                        np.sqrt((processed_data.loc[user]**2).sum())*np.sqrt((processed_data.loc[target_user]**2).sum()))
                    #calculate similarity between users based on common movies they have rated
                except:
                    u_sim[user]=0
                    # if a user doesn't find any other who has rated same movies, it will give similarity of 0
    
    neighbors=pd.Series(u_sim, name="Similarity with_"+str(target_user)).sort_values(ascending=False)[:n_neighbors]
    # similarity values of all users with target user will be saved in pandas series with index as userId, it will give easy 
    #access to find similarity value with any other user
    return neighbors

In [11]:
def predict_item_rating(col, neighbors, min_neigh):
    if pd.notnull(col).sum()>=min_neigh:
    #it will rate an item only if there are minimum number of users that has rated the item
        numer=(col*neighbors).sum()
        denom=np.abs(neighbors[pd.notnull(col)]).sum()

        if denom==0: 
            return 0
        else:
            return numer/denom

'''This function will give recommendation of top-K movies based on user- user similar rating'''
def get_kTop_uuRecom(data,target_user,sim_type={"cosine", "pearson"}, n_neighbors=30, k=10, min_neighbors=2):
    
    data_norm=data.apply(lambda col: col-col.mean(), axis=1)
    # center user-rating first.
    
    neighbors= uu_similarity(data_norm,target_user= target_user,sim_type=sim_type,n_neighbors=n_neighbors)
    #we will call above function to get top n similar neighbors
    
    n_index= list(neighbors.index)
    items_to_pred= list(data_norm.loc[target_user][pd.isnull(data_norm.loc[target_user])].index)
    # this will store all items that a user has not rated
    rel_data= data_norm.loc[n_index, items_to_pred]
    # this dataset will contain top n similar neighbors to target user and movies that has not been rated by target user
    
    top_k_recom= (data.loc[target_user].mean()+rel_data.apply(lambda col: predict_item_rating(col, neighbors,min_neigh=min_neighbors), 
                                                              axis=0)).sort_values(ascending=False)[:k]
    #prediction for item rating is calculated by weighted sum of rating by similar users. Then sorted by ratings and get 
    #top k items 
    
    return top_k_recom

In [12]:
get_kTop_uuRecom(uv, 910, sim_type="cosine", n_neighbors=50, min_neighbors=5)

movieId
1203    4.888394
2019    4.882517
3307    4.882077
3265    4.868147
3871    4.802287
7502    4.800042
2966    4.797656
2010    4.791219
4406    4.789837
3462    4.781053
dtype: float64

In [13]:
get_kTop_uuRecom(uv, 910, sim_type="pearson", n_neighbors=50, min_neighbors=5)

movieId
47       4.818823
1200     4.680151
1625     4.640162
4226     4.592490
1089     4.556940
1246     4.472049
32587    4.460115
1127     4.348337
2329     4.313072
5349     4.289654
dtype: float64