In [1]:
import pandas as pd
import numpy as np
import time

In [46]:
ratings = pd.read_csv("./ml-latest-small/ratings.csv")
ratings[ratings.userId==1].head(20)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [3]:
movies = pd.read_csv('./ml-latest/movies.csv', encoding='latin-1')
print (movies.shape)
movies.head()

(58098, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
num_users = ratings.userId.unique().shape[0]
num_items = ratings.movieId.unique().shape[0]
num_users, num_items

(610, 9724)

In [5]:
from sklearn import model_selection as ms
train_set, test_set = ms.train_test_split(ratings, test_size=0.25)
len(train_set), len(test_set)

(75627, 25209)

In [6]:
UM_matrix_ds = ratings.pivot(index='userId', columns='movieId', values='rating')
print("UM Matrix value size", UM_matrix_ds.values.size)
print("ratings value size", ratings.values.size)
print("Sparisity: {:4.2f}%".format(ratings.rating.values.size/UM_matrix_ds.values.size*100))
UM_matrix_ds.head(10)

UM Matrix value size 5931640
ratings value size 403344
Sparisity: 1.70%


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
6,,4.0,5.0,3.0,5.0,4.0,4.0,3.0,,3.0,...,,,,,,,,,,
7,4.5,,,,,,,,,,...,,,,,,,,,,
8,,4.0,,,,,,,,2.0,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# similarity
import math
from operator import itemgetter
from scipy.spatial import distance

def distance_cosine(a,b):
    return 1-distance.cosine(a,b)

def disance_corr(a,b):
    return 1-distance.correlation(a,b)

def distance_euclidean(a,b):
    return 1/(distance.euclidean(a,b)+1)

In [8]:
def nearest_neighbor_user(user, topN, simFunc) :  
    u1 = UM_matrix_ds.loc[user].dropna()
    ratedIndex = u1.index
    nn = {}
    
    # Brute-Force Compute
    for uid, row in UM_matrix_ds.iterrows():
        interSectionU1 = []
        interSectionU2 = []
        if uid==user:
            continue
            
        for i in ratedIndex:
            if False==math.isnan(row[i]):
                interSectionU1.append(u1[i])
                interSectionU2.append(row[i])
        interSectionLen = len(interSectionU1)
        
        # at least 3 intersection items
        if interSectionLen < 3 :
            continue
            
        # similarity functon
        sim = simFunc(interSectionU1, interSectionU2)
        
        if  math.isnan(sim) == False:
            nn[uid] = sim
            
    # top N returned         
    return sorted(nn.items(),key=itemgetter(1),reverse=True)[:(topN+1)]
    #return sorted(nn.items(),key=itemgetter(1))[:-(topN+1):-1]

In [9]:
def nearest_neighbor_item(movie, topN, simFunc):
    m1 = UM_matrix_ds[movie].dropna()
    user_index = m1.index
    nn = {}
    
    for movieId, col in UM_matrix_ds.iteritems():
        interSectionU1 = []
        interSectionU2 = []
        if movie == movieId:
            continue
        
        for i in user_index:
            if False==math.isnan(col[i]):
                interSectionU1.append(m1[i])
                interSectionU2.append(col[i])
        interSectionLen = len(interSectionU1)
        # at least 3 intersection items
        if interSectionLen < 3 :
            continue
            
        # similarity functon
        sim = simFunc(interSectionU1, interSectionU2)
        
        if  math.isnan(sim) == False:
            nn[movieId] = sim
            
    # top N returned         
    return sorted(nn.items(),key=itemgetter(1),reverse=True)[:(topN+1)]
        
        

In [10]:
print(pd.DataFrame(nearest_neighbor_user(278, 100, distance_euclidean), columns=['userId', 'sim']))
print(pd.DataFrame(nearest_neighbor_item(1, 100, distance_euclidean), columns=['movieId', 'sim']))

     userId       sim
0         8  1.000000
1        15  1.000000
2       348  1.000000
3       352  1.000000
4       373  1.000000
5       382  1.000000
6       512  1.000000
7       566  1.000000
8       601  1.000000
9       602  1.000000
10        1  0.666667
11      296  0.666667
12      338  0.666667
13      484  0.666667
14      515  0.666667
15      572  0.666667
16       17  0.585786
17       59  0.585786
18       72  0.585786
19      178  0.585786
20      232  0.585786
21      254  0.585786
22      280  0.585786
23      282  0.585786
24      330  0.585786
25      343  0.585786
26      415  0.585786
27      418  0.585786
28      489  0.585786
29      587  0.585786
..      ...       ...
71      200  0.387426
72      202  0.387426
73      219  0.387426
74      239  0.387426
75      249  0.387426
76      152  0.376179
77      420  0.376179
78      590  0.376179
79      356  0.366025
80      105  0.356789
81      318  0.356789
82      434  0.356789
83      452  0.356789
84      55

In [48]:
def predictRating(id_num, nn=50, simFunc=distance_euclidean, kind = 'user') :
   
    if (kind == "user"):
        ## neighboorhood 
        neighbor = nearest_neighbor_user(id_num, nn, simFunc)
        # userid: dictionary of similarity

        neighbor_id = [id for id, sim in neighbor]
        # List of similar similarities

        ## Remove if four or more are NaN
        neighbor_movie = UM_matrix_ds.loc[neighbor_id].dropna(1, how = 'all', thresh = 2)
        # User id x DataFrame of movie id. However, if the NaN of the column is 4 or more, it is deleted.

        neighbor_dic = (dict(neighbor))
        ret = [] #['movieId', 'predictedRate']

        # iterate through each column key: userid, column: movieid, value: rating
        for movieId, row in neighbor_movie.iteritems():
            jsum, wsum = 0, 0
            for v in row.dropna().iteritems():
                sim = neighbor_dic.get(v[0],0)
                jsum += sim
                wsum += (v[1]*sim)
            ret.append([movieId, wsum/jsum])

        return ret
    
    elif (kind == 'item'):
        neighbor = nearest_neighbor_item(id_num, nn, simFunc)
        
        neighbor_id = [id for id, sim in neighbor]
        
        neighbor_ratedUser = UM_matrix_ds[neighbor_id].dropna(0, how = 'all', thresh = 2)
        
        neighbor_dic = dict(neighbor)
        ret = []
        
        for userId, row in neighbor_ratedUser.iterrows():
            jsum, wsum = 0, 0
            for v in row.dropna().iteritems():
                sim = neighbor_dic.get(v[0], 0)
                jsum += sim
                wsum += (v[1]*sim)
            ret.append([userId, wsum/jsum])
        
        return ret
        

In [50]:
predictRating(1, 100, kind = 'user')
# predictRating(1, 50, kind = 'item')

[[1, 4.302955380538642],
 [2, 4.467070716638078],
 [3, 3.383658116235853],
 [5, 3.1664133012212035],
 [6, 4.0406582909149],
 [7, 3.4752158892419294],
 [9, 2.4514162296451363],
 [10, 3.6270107921232473],
 [11, 3.690714434034328],
 [14, 4.534794580418831],
 [16, 4.795897329635457],
 [17, 4.72226947368303],
 [22, 3.72711108011249],
 [25, 4.751654080789128],
 [29, 3.236757367716795],
 [31, 2.677328397364532],
 [32, 4.224383231106025],
 [34, 3.3629851377349236],
 [36, 4.427349397806007],
 [39, 3.6070154020714225],
 [45, 3.4667482613061607],
 [47, 4.663217073450262],
 [48, 3.036020187788924],
 [50, 4.618318870084806],
 [52, 4.329073569699228],
 [58, 4.286000707756233],
 [60, 3.182930443827066],
 [62, 4.261559672313344],
 [73, 4.0],
 [79, 3.3124843273902287],
 [81, 2.9999999999999996],
 [95, 3.0],
 [104, 3.0909096570513785],
 [110, 4.271190996443581],
 [111, 4.332316423407407],
 [112, 3.4846605995717166],
 [135, 3.055322029735686],
 [140, 3.498489825461306],
 [141, 3.9334607463258737],
 [145,

In [13]:
## view rated movies and movies with high ratings
def ratingMovies(user_id):
    ds = pd.merge(ratings[ratings.userId==user_id], movies, on=['movieId'])
    return ds.sort_values(by=['rating'], ascending=False)[['rating','title','genres','movieId']].head(100)
ratingMovies(1).head()

Unnamed: 0,rating,title,genres,movieId
231,5.0,M*A*S*H (a.k.a. MASH) (1970),Comedy|Drama|War,5060
185,5.0,Excalibur (1981),Adventure|Fantasy,2872
89,5.0,Indiana Jones and the Last Crusade (1989),Action|Adventure,1291
90,5.0,Pink Floyd: The Wall (1982),Drama|Musical,1298
190,5.0,From Russia with Love (1963),Action|Adventure|Thriller,2948


In [15]:
def join_movie_info(predicted_result):
    predicted_ratings = pd.DataFrame(predicted_result, columns=['movieId', 'predicted_rating'])
    result_ds = pd.merge(movies[movies.movieId > 0], predicted_ratings, on=['movieId'])
    return result_ds.sort_values(by=['predicted_rating'], ascending=False)

result_user = predictRating(50, kind = 'user')
result_item = predictRating(1, kind = 'item')
join_movie_info(result_user).head(10)

Unnamed: 0,movieId,title,genres,predicted_rating
57,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,4.389825
140,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,4.378229
142,1356,Star Trek: First Contact (1996),Action|Adventure|Sci-Fi|Thriller,4.30683
77,357,Four Weddings and a Funeral (1994),Comedy|Romance,4.238537
118,608,Fargo (1996),Comedy|Crime|Drama|Thriller,4.238281
141,1214,Alien (1979),Horror|Sci-Fi,4.166206
69,318,"Shawshank Redemption, The (1994)",Crime|Drama,4.154359
7,11,"American President, The (1995)",Comedy|Drama|Romance,4.147632
9,16,Casino (1995),Crime|Drama,4.122993
16,36,Dead Man Walking (1995),Crime|Drama,4.086745


In [21]:
join_movie_info(result_item).head(10)

Unnamed: 0,movieId,title,genres,predicted_rating
5,380,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller,5.0
2,249,Immortal Beloved (1994),Drama|Romance,4.4
1,182,Moonlight and Valentino (1995),Drama|Romance,4.096215
3,290,Once Were Warriors (1994),Crime|Drama,4.0
9,603,"Bye Bye, Love (1995)",Comedy,4.0
6,414,"Air Up There, The (1994)",Comedy,3.980591
7,474,In the Line of Fire (1993),Action|Thriller,3.789569
4,318,"Shawshank Redemption, The (1994)",Crime|Drama,3.125
0,140,Up Close and Personal (1996),Drama|Romance,3.116928
8,599,"Wild Bunch, The (1969)",Adventure|Western,2.897704


In [16]:
# 5th user's star rating
userId=1
pd.merge(ratingMovies(userId), join_movie_info(predictRating(userId)), 
         on=['movieId'], how='right').sort_values(by=['predicted_rating'], ascending =False)

Unnamed: 0,rating,title_x,genres_x,movieId,title_y,genres_y,predicted_rating
3,5.0,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,5.000000
12,,,,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,5.000000
0,5.0,American Beauty (1999),Drama|Romance,2858,American Beauty (1999),Drama|Romance,5.000000
13,,,,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure|Children|Fantasy,4.886699
6,5.0,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,4.866430
14,,,,3147,"Green Mile, The (1999)",Crime|Drama,4.846440
8,5.0,Gladiator (2000),Action|Adventure|Drama,3578,Gladiator (2000),Action|Adventure|Drama,4.809555
15,,,,5618,Spirited Away (Sen to Chihiro no kamikakushi) ...,Adventure|Animation|Fantasy,4.775069
11,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,4.771741
16,,,,112552,Whiplash (2014),Drama,4.764554


In [53]:
eval_ratings = ratings

def eval_prediction(predict_users, n_users=50, kind = 'user'):
    ## evaluation
    ds = pd.merge(eval_ratings, ratings[['movieId','rating']].groupby(['movieId']).mean().reset_index(), 
                  on='movieId', how='left')

    ds = ds.rename(columns= {'rating_x':'rating', 'rating_y':'mean_rating'})
    st = time.time()
    ## udpate to predict_rating 
    user_distance_functions = [('user_euclidean',distance_euclidean), ('user_cosine', distance_cosine)]
    for name, func in user_distance_functions:
        ds[name] = 0
        for user_id in predict_users:
            # key: movie id, value : predicted_rating
            for x in predictRating(user_id, n_users, func):
                ds.loc[(ds.userId==user_id) & (ds.movieId==x[0]),name]=x[1]
    item_distance_functions = [('item_euclidean', distance_euclidean), ('item_cosine', distance_cosine)]
    
    print('runtime', round(time.time()-st,2), 'sec')
    #Only the tuple corresponding to the user performing the predict in the entire dataFrame is returned
    return ds[ds.user_euclidean+ds.user_cosine >0]


In [None]:
## all userId list 
users = UM_matrix_ds.index.tolist()
## 10-person star rating
predicted_user = eval_prediction(users[:2], 100, kind = 'user')
predicted_user

In [None]:
movies = UM_matrix_ds.columns.tolist()
predicted_item = eval_prediction()

In [19]:
predicted = predicted[ (predicted['cosine'] > 0) & (predicted['euclidean'] > 0) ]
def RMSE(X, left_col, right_col):
    return(np.sqrt(np.mean((X[left_col] - X[right_col])**2)))

def MAE(X, left_col, right_col):
    return(np.mean(np.absolute(X[left_col] - X[right_col])) )

for name in ['mean_rating', 'cosine', 'euclidean']:
    print ("MAE of {0} is {1} ".format(name, MAE(predicted, 'rating', name)))

for name in ['mean_rating', 'cosine', 'euclidean']:
    print ("RMSE of {0} is {1} ".format(name, RMSE(predicted, 'rating', name)))

MAE of mean_rating is 0.6732807276979298 
MAE of cosine is 0.5531792589753918 
MAE of euclidean is 0.40421532736958293 
RMSE of mean_rating is 0.8576843201841777 
RMSE of cosine is 0.7377415273102896 
RMSE of euclidean is 0.5665102964641624 
