In [1]:
import pandas as pd
import numpy as np

In [2]:
ratings = pd.read_csv("./ml-latest-small/ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
movies = pd.read_csv('./ml-latest/movies.csv', encoding='latin-1')
print (movies.shape)
movies.head()

(58098, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
num_users = ratings.userId.unique().shape[0]
num_items = ratings.movieId.unique().shape[0]
num_users, num_items

(610, 9724)

In [5]:
user_min, user_max, movie_min, movie_max = (ratings.userId.min(), ratings.userId.max(), 
                                          ratings.movieId.min(), ratings.movieId.max())
user_min, user_max, movie_min, movie_max

(1, 610, 1, 193609)

In [6]:
from sklearn import cross_validation as cv
train_set, test_set = cv.train_test_split(ratings, test_size=0.2)
len(train_set), len(test_set)



(80668, 20168)

In [7]:
UM_matrix_ds = ratings.pivot(index='userId', columns='movieId', values='rating')
print("UM Matrix value size", UM_matrix_ds.values.size)
print("ratings value size", ratings.values.size)
UM_matrix_ds.head()

UM Matrix value size 5931640
ratings value size 403344


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [8]:
# similarity
import math
from operator import itemgetter
from scipy.spatial import distance

def distance_cosine(a,b):
    return 1-distance.cosine(a,b)

def disance_corr(a,b):
    return 1-distance.correlation(a,b)

def distance_euclidean(a,b):
    return 1/(distance.euclidean(a,b)+1)

In [9]:
def nearest_neighbor_user(user, topN, simFunc) :  
    u1 = UM_matrix_ds.loc[user].dropna()
    ratedIndex = u1.index
    nn = {}
    
    # Brute-Force Compute
    for uid, row in UM_matrix_ds.iterrows():
        interSectionU1 = []
        interSectionU2 = []
        if uid==user:
            continue
            
        for i in ratedIndex:
            if False==math.isnan(row[i]):
                interSectionU1.append(u1[i])
                interSectionU2.append(row[i])
        interSectionLen = len(interSectionU1)
        
        # at least 3 intersection items
        if interSectionLen < 3 :
            continue
            
        # similarity functon
        sim = simFunc(interSectionU1, interSectionU2)
        
        if  math.isnan(sim) == False:
            nn[uid] = sim
            
    # top N returned         
    return sorted(nn.items(),key=itemgetter(1),reverse=True)[:(topN+1)]
    #return sorted(nn.items(),key=itemgetter(1))[:-(topN+1):-1]

In [10]:
import time
st=time.time()
print(nearest_neighbor_user(8, 50, distance_euclidean))
print(time.time()-st, 'sec')

[(278, 1.0), (601, 1.0), (256, 0.6666666666666666), (471, 0.6666666666666666), (338, 0.585786437626905), (88, 0.5), (295, 0.5), (343, 0.5), (355, 0.5), (378, 0.5), (421, 0.5), (504, 0.5), (526, 0.5), (70, 0.4721359549995794), (123, 0.4721359549995794), (196, 0.4494897427831781), (246, 0.4494897427831781), (260, 0.4494897427831781), (110, 0.43050087404306037), (204, 0.43050087404306037), (215, 0.43050087404306037), (79, 0.4142135623730951), (118, 0.4142135623730951), (165, 0.4142135623730951), (300, 0.4142135623730951), (384, 0.4142135623730951), (412, 0.4142135623730951), (543, 0.4142135623730951), (554, 0.4142135623730951), (20, 0.4), (80, 0.4), (131, 0.4), (189, 0.4), (420, 0.4), (511, 0.4), (581, 0.4), (50, 0.38742588672279304), (172, 0.38742588672279304), (209, 0.38742588672279304), (309, 0.38742588672279304), (460, 0.38742588672279304), (67, 0.3761785115301142), (69, 0.36602540378443865), (348, 0.36602540378443865), (417, 0.36602540378443865), (49, 0.3567891723253309), (161, 0.356

In [11]:
def predictRating(userid, nn=50, simFunc=distance_euclidean) :
   
    ## neighboorhood 
    neighbor = nearest_neighbor_user(userid, nn, simFunc)
    # userid: dictionary of similarity
    
    neighbor_id = [id for id, sim in neighbor]
    # List of similar similarities
    
    ## Remove if four or more are NaN
    neighbor_movie = UM_matrix_ds.loc[neighbor_id].dropna(1, how='all', thresh = 4 )
    # User id x DataFrame of movie id. However, if the NaN of the column is 4 or more, it is deleted.
    
    neighbor_dic = (dict(neighbor))
    ret = [] #['movieId', 'predictedRate']
    
    # iterate through each column key: userid, column: movieid, value: rating
    for movieId, row in neighbor_movie.iteritems():
        jsum, wsum = 0, 0
        for v in row.dropna().iteritems():
            sim = neighbor_dic.get(v[0],0)
            jsum += sim
            wsum += (v[1]*sim)
        ret.append([movieId, wsum/jsum])
       
    return ret

In [12]:
predictRating(10, 50)

[[1, 3.950267484592048],
 [2, 3.979811986075693],
 [3, 3.7627161241490796],
 [5, 3.383238977624035],
 [6, 3.623842151924668],
 [7, 3.544298112964249],
 [10, 3.278438634105372],
 [11, 4.218009787879696],
 [15, 3.1087378374683117],
 [16, 3.2752866066870943],
 [17, 3.715749533510972],
 [19, 2.805939205324938],
 [21, 2.6872527010478597],
 [22, 3.726903174436542],
 [24, 3.4761302011691093],
 [31, 3.191573551445447],
 [32, 3.5042768181876873],
 [34, 4.057807531359828],
 [36, 4.0],
 [39, 3.352792594329325],
 [41, 3.6871664255542154],
 [45, 3.0],
 [47, 3.185412191529095],
 [48, 3.5505204425115138],
 [50, 3.7719146614612225],
 [58, 3.917991694340093],
 [60, 3.559410982343283],
 [62, 3.620110565859888],
 [95, 3.510765119909787],
 [104, 3.509426240337509],
 [105, 3.1956963283276743],
 [110, 3.94579194387426],
 [135, 2.8763324697165125],
 [141, 3.5446362510880145],
 [145, 3.057471965004246],
 [150, 3.7739215772435037],
 [151, 3.2149511335990546],
 [153, 3.05714129247507],
 [158, 3.472019676563249]

In [13]:
## view rated movies and movies with high ratings
def ratingMovies(user_id):
    ds = pd.merge(ratings[ratings.userId==user_id], movies, on=['movieId'])
    return ds.sort_values(by=['rating'], ascending=False)[['rating','title','genres','movieId']].head(100)
ratingMovies(1).head(20)

Unnamed: 0,rating,title,genres,movieId
231,5.0,M*A*S*H (a.k.a. MASH) (1970),Comedy|Drama|War,5060
185,5.0,Excalibur (1981),Adventure|Fantasy,2872
89,5.0,Indiana Jones and the Last Crusade (1989),Action|Adventure,1291
90,5.0,Pink Floyd: The Wall (1982),Drama|Musical,1298
190,5.0,From Russia with Love (1963),Action|Adventure|Thriller,2948
189,5.0,Goldfinger (1964),Action|Adventure|Thriller,2947
188,5.0,"Dirty Dozen, The (1967)",Action|Drama|War,2944
186,5.0,Gulliver's Travels (1939),Adventure|Animation|Children,2899
184,5.0,American Beauty (1999),Drama|Romance,2858
179,5.0,"South Park: Bigger, Longer and Uncut (1999)",Animation|Comedy|Musical,2700


In [14]:
def join_movie_info(predicted_result):
    predicted_ratings = pd.DataFrame(predicted_result, columns=['movieId', 'predicted_rating'])
    result_ds = pd.merge(movies[movies.movieId > 0], predicted_ratings, on=['movieId'])
    return result_ds.sort_values(by=['predicted_rating'], ascending=False)

result = predictRating(1);
join_movie_info(result)

Unnamed: 0,movieId,title,genres,predicted_rating
24,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,5.000000
15,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,5.000000
39,2858,American Beauty (1999),Drama|Romance,5.000000
49,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure|Children|Fantasy,4.886699
25,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,4.866430
41,3147,"Green Mile, The (1999)",Crime|Drama,4.846440
42,3578,Gladiator (2000),Action|Adventure|Drama,4.809555
55,5618,Spirited Away (Sen to Chihiro no kamikakushi) ...,Adventure|Animation|Fantasy,4.775069
2,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,4.771741
98,112552,Whiplash (2014),Drama,4.764554


In [15]:
# 5th user's star rating
userid=1
pd.merge(ratingMovies(userid), join_movie_info(predictRating(userid)), 
         on=['movieId'], how='right').sort_values(by=['predicted_rating'], ascending =False)

Unnamed: 0,rating,title_x,genres_x,movieId,title_y,genres_y,predicted_rating
3,5.0,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,5.000000
12,,,,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,5.000000
0,5.0,American Beauty (1999),Drama|Romance,2858,American Beauty (1999),Drama|Romance,5.000000
13,,,,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure|Children|Fantasy,4.886699
6,5.0,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,4.866430
14,,,,3147,"Green Mile, The (1999)",Crime|Drama,4.846440
8,5.0,Gladiator (2000),Action|Adventure|Drama,3578,Gladiator (2000),Action|Adventure|Drama,4.809555
15,,,,5618,Spirited Away (Sen to Chihiro no kamikakushi) ...,Adventure|Animation|Fantasy,4.775069
11,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,4.771741
16,,,,112552,Whiplash (2014),Drama,4.764554


In [16]:
eval_ratings = ratings
# ratings['userId'].drop_duplicates().values[:]
def eval_prediction(predict_users, n_users=50):
    ## evaluation
    ds = pd.merge(eval_ratings, ratings[['movieId','rating']].groupby(['movieId']).mean().reset_index(), 
                  on='movieId', how='left')

    ds = ds.rename(columns= {'rating_x':'rating', 'rating_y':'mean_rating'})

    st = time.time()
    ## udpate to predict_rating 
    distance_functions = [('euclidean',distance_euclidean), ('cosine', distance_cosine)]
    for name, func in distance_functions:
        ds[name] = 0
        for user_id in predict_users:
            # key: movie id, value : predicted_rating
            for x in predictRating(user_id, n_users, func):
                ds.loc[(ds.userId==user_id) & (ds.movieId==x[0]),name]=x[1]
    print('runtime', round(time.time()-st,2), 'sec')
    #Only the tuple corresponding to the user performing the predict in the entire dataFrame is returned
    return ds[ds.euclidean+ds.cosine>0]

In [17]:
## all userId list 
users = UM_matrix_ds.index.tolist()
## 10-person star rating
predicted = eval_prediction(users[:2], 100 )
predicted

runtime 25.91 sec


Unnamed: 0,userId,movieId,rating,timestamp,mean_rating,euclidean,cosine
0,1,1,4.0,964982703,3.920930,4.302955,4.067384
2,1,6,4.0,964982224,3.946078,4.040658,4.500137
3,1,47,5.0,964983815,3.975369,4.663217,4.480351
4,1,50,5.0,964982931,4.237745,4.618319,4.520947
7,1,110,4.0,964982176,4.031646,4.271191,4.210624
10,1,163,5.0,964983650,3.560606,0.000000,3.998374
12,1,223,3.0,964980985,3.855769,0.000000,3.401123
13,1,231,5.0,964981179,3.060150,4.116392,4.416107
15,1,260,5.0,964981680,4.231076,4.644290,4.462436
16,1,296,3.0,964982967,4.197068,3.862645,4.273903


In [18]:
predicted = predicted[ (predicted['cosine'] > 0) & (predicted['euclidean'] > 0) ]
def RMSE(X, left_col, right_col):
    return(np.sqrt(np.mean((X[left_col] - X[right_col])**2)))

def MAE(X, left_col, right_col):
    return(np.mean(np.absolute(X[left_col] - X[right_col])) )
for name in ['mean_rating', 'cosine', 'euclidean']:
    print ("MAE of {0} is {1} ".format(name, MAE(predicted, 'rating', name)))

for name in ['mean_rating', 'cosine', 'euclidean']:
    print ("RMSE of {0} is {1} ".format(name, RMSE(predicted, 'rating', name)))

MAE of mean_rating is 0.6221854624109043 
MAE of cosine is 0.5192950331141716 
MAE of euclidean is 0.4496316838189158 
RMSE of mean_rating is 0.7533385768629883 
RMSE of cosine is 0.6120419252944451 
RMSE of euclidean is 0.5397726258746797 
