In [2]:
import numpy as np
import pandas as pd
ratings_csv=pd.read_csv("ratings.csv")
print(ratings_csv.head())
movies=pd.read_csv("movies.csv")
print(movies.head())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [3]:
num_of_movies=len(movies['movieId'])
print(num_of_movies)

9742


In [4]:
#rating_dic is dictionary of userid to rated movies
userIds=ratings_csv['userId']
movieIds=ratings_csv['movieId']
ratings=ratings_csv['rating']
rating_dic={}
for i in range(len(movieIds)):
    idx=userIds[i]
    if idx not in rating_dic:
        rating_dic[idx]={}
    rating_dic[idx][movieIds[i]]=ratings[i]

num_of_users=len(rating_dic)
print(num_of_users)
        
    

610


In [5]:
from math import sqrt
def sim_pearson(prefs,p1,p2):

    si={}
    for item in prefs[p1]:
        if item in prefs[p2]: 
            si[item]=1

    n=len(si)
    # if they are no ratings in common, return 0
    if n==0: return 0

    sum1=sum([prefs[p1][it] for it in si])
    sum2=sum([prefs[p2][it] for it in si])

    sum1Sq=sum([pow(prefs[p1][it],2) for it in si])
    sum2Sq=sum([pow(prefs[p2][it],2) for it in si])
    
    pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])
    # Calculate Pearson score
    num=pSum-(sum1*sum2/n)
    den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
    if den==0: return 0
    r=num/den
    return r
def sim_cosine(prefs,p1,p2):
    si={}
    for item in prefs[p1]:
        if item in prefs[p2]: 
            si[item]=1

    n=len(si)
    # if they are no ratings in common, return 0
    if n==0: return 0
    v=[[prefs[p1][it],prefs[p2][it]] for it in si]
    v1=[v[i][0] for i in range(len(v))]
    v2=[v[i][1] for i in range(len(v))]
    dot=sum([v1[i]*v2[i] for i in range(len(v))])
    norm_prod=sqrt(sum([v1[i]**2 for i in range(len(v))])*sum([v2[i]**2 for i in range(len(v))]))
    return dot/norm_prod
print(sim_pearson(rating_dic,1,9))
print(sim_cosine(rating_dic,1,9))

0.9185586535436888
0.9957385837170336


In [6]:
def recommend(prefs,p,movies,sim):
    if sim=='pearson':
        sim_vector=[[i,sim_pearson(prefs,p,i)] for i in range(1,num_of_users) if i!=p]
    elif sim=='cosine':
        sim_vector=[[i,sim_cosine(prefs,p,i)] for i in range(1,num_of_users) if i!=p]
    ans=[]
    for movie in range(num_of_movies):
        v1=[]
        v2=[]
        for elem in sim_vector:
            person=elem[0]
            if movie in prefs[person] and elem[1]>=0:
                v1+=[elem[1]]
                v2+=[prefs[person][movie]]
        if sum(v1)==0 or len(v1)==1 or len(v1)==2 or movie in prefs[p]:      #removing the outliers
            pred=0
        else:
        
            pred=sum([v1[i]*v2[i] for i in range(len(v1))])/sum(v1)
        ans+=[[pred,movies['title'][movie]]]
    return np.array(sorted(ans)[::-1][:10])  
        
print('movies recommended for user 1 by pearson score are:\n',recommend(rating_dic,1,movies,'pearson'))
print("\n\n")
print('movies recommended for user 1 by cosine score are:\n',recommend(rating_dic,1,movies,'cosine'))


movies recommended for user 1 by pearson score are:
 [['4.964155252569405' 'Breaking Away (1979)']
 ['4.924833341228152' 'How the Grinch Stole Christmas! (1966)']
 ['4.910022746571027' 'Shooting Dogs (a.k.a. Beyond the Gates) (2005)']
 ['4.909769656306572' "We're No Angels (1989)"]
 ['4.861959798883314' 'Secretary (2002)']
 ['4.8433562735823426' 'Blow (2001)']
 ['4.7889449088176725' 'White Ribbon, The (Das weiße Band) (2009)']
 ['4.76236771827461' 'Mirror Has Two Faces, The (1996)']
 ['4.755912013559737' 'Glitter (2001)']
 ['4.739976156088092' 'Vampire in Brooklyn (1995)']]



movies recommended for user 1 by cosine score are:
 [['4.900434532065509' 'How the Grinch Stole Christmas! (1966)']
 ['4.748453796931103' 'White Ribbon, The (Das weiße Band) (2009)']
 ['4.744437050535413' 'Unlawful Entry (1992)']
 ['4.669271058712822' 'Shooting Dogs (a.k.a. Beyond the Gates) (2005)']
 ['4.66886320100839' "Dante's Peak (1997)"]
 ['4.668602449814999' 'Breaking Away (1979)']
 ['4.665137929475486' 'B

In [7]:
ans=[movies['title'][movie] for movie in rating_dic[1]]

In [9]:
'How the Grinch Stole Christmas! (1966)' in ans

False