In [48]:
# coding: utf-8

import pandas as pd
import numpy as np
from matplotlib import rcParams
import matplotlib.pyplot as plt
from collections import defaultdict
from datetime import datetime
import matplotlib.patches as mpatches
import matplotlib
import time
from __future__ import print_function
%matplotlib inline  

# NanumGothic is a Korean typeface; presumably selected so Hangul text in plots renders - confirm the font is installed.
rcParams['font.family'] = 'NanumGothic'
rcParams.update({'font.size': 12})
# Use matplotlib's bundled 'ggplot' style sheet for every figure.
matplotlib.style.use('ggplot')
# Limit how many rows pandas prints when a DataFrame is displayed.
pd.options.display.max_rows=14


## 무비 렌즈 데이터로 별점을 예측해 보자 
* User Based 별점 예측 
* Item(Movie) Based 별점 예측

Movie Lens 데이터 로드 
http://grouplens.org/datasets/movielens/

In [49]:
def movieLensDataLoad(type, base_dir="/Users/goodvc/Documents/data-analytics/movie-recommendation"):
    """Load the ratings, movies and tags tables of one MovieLens dataset.

    Parameters
    ----------
    type : str
        Name of the dataset sub-directory under ``base_dir``, e.g.
        'ml-latest-small' or 'ml-20m'. The parameter shadows the builtin
        ``type``; the name is kept so existing callers keep working.
    base_dir : str, optional
        Directory that contains the dataset sub-directories. The default is
        the location the notebook was originally written against; pass your
        own directory to run it on another machine.

    Returns
    -------
    tuple
        ``(ratings, movies, tags)`` DataFrames read from ``ratings.csv``,
        ``movies.csv`` and ``tags.csv`` respectively.
    """
    # Local import so this cell does not rely on an import made elsewhere.
    import os

    dataset_dir = os.path.join(base_dir, type)

    # Ratings that users gave to movies.
    ratings = pd.read_csv(os.path.join(dataset_dir, "ratings.csv"))

    # Movie metadata (title, genres).
    movies = pd.read_csv(os.path.join(dataset_dir, "movies.csv"))

    # Tags that users attached to movies.
    tags = pd.read_csv(os.path.join(dataset_dir, "tags.csv"))

    return (ratings, movies, tags)

# Load the 'ml-latest-small' dataset; the commented-out call below loads 'ml-20m' instead.
#ratings, movies, tags = movieLensDataLoad('ml-20m')
ratings, movies, tags = movieLensDataLoad('ml-latest-small')

In [50]:
#ratings = pd.read_csv("movieLens/ml-latest-small/ratings.csv")
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,6,2,980730861
1,1,22,3,980731380


In [51]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
...,...,...,...
8563,122756,None But the Brave (1965),Drama|War
8564,123109,P.U.N.K.S (1999),Children|Comedy|Sci-Fi


In [52]:
tags.head(2)

Unnamed: 0,userId,movieId,tag,timestamp
0,40,1,animation,1306926135
1,40,1,fantasy,1306926130


### User Based 별점 예측 

U(User) 
M(Movie)

1. U X M vector Matrix를 만든다. 
 key가 userid, value가 { 'movieId':rating } 
2. 나와 비슷한 유저(최근접 이웃)를 찾는다. 
3. 비슷한 유저들의 별점을 유사도로 가중 평균하여 별점을 예측한다. 

In [59]:
## 1. Build the U x M (user x movie) matrix: one row per userId, one column per movieId, cell value = rating.
UM_matrix_ds = ratings.pivot(index='userId', columns='movieId', values='rating')

# Print the number of cells in the pivoted matrix next to the number of values in the long ratings table.
print( "UM Matrix value size", UM_matrix_ds.values.size)
print( "ratings value size", ratings.values.size)


UM Matrix value size 6037712
ratings value size 400092


In [60]:
UM_matrix_ds.head(2)

movieId,1,2,3,4,5,6,7,8,9,10,...,121271,122284,122495,122756,123109,124857,125916,126407,129454,129651
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,2.0,,,,,...,,,,,,,,,,
2,,4.0,,,,,,,,,...,,,,,,,,,,


In [61]:
## 그럼 이제 최근접 이웃을 찾아 보자
## 
import math
from operator import itemgetter
from scipy.spatial import distance

def distance_cosine(a,b):
    """Return the cosine similarity of ``a`` and ``b``.

    SciPy's ``distance.cosine`` is a distance, so it is subtracted from 1 to
    get a similarity (larger means more alike).
    """
    cosine_dist = distance.cosine(a, b)
    return 1 - cosine_dist

def distance_corr(a,b):
    """Return the correlation similarity of ``a`` and ``b``.

    Computed as 1 minus SciPy's correlation distance, so larger means more
    alike.
    """
    return 1-distance.correlation(a,b)

# Backward-compatible alias: the function was originally published under this
# misspelled name, so keep it importable for existing callers.
disance_corr = distance_corr

def distance_euclidean(a,b):
    """Turn the Euclidean distance between ``a`` and ``b`` into a similarity.

    The result is ``1 / (1 + d)``: identical vectors score 1 and the score
    shrinks towards 0 as the distance grows.
    """
    gap = distance.euclidean(a, b)
    return 1 / (gap + 1)

In [62]:
def nearest_neighbor_user( user, topN, simFunc, um_matrix=None, min_common=3 ) :
    """Find the users most similar to ``user`` by brute force.

    Every other row of the user x movie matrix is compared with ``user`` on
    the movies both have rated.

    Parameters
    ----------
    user : label
        Row label (userId) of the target user in the matrix.
    topN : int
        Maximum number of neighbours to return. Values <= 0 return ``[]``.
    simFunc : callable
        ``simFunc(a, b) -> float`` similarity of two equal-length rating
        lists; larger means more similar. NaN results are discarded.
    um_matrix : pandas.DataFrame, optional
        User x movie rating matrix with NaN for "not rated". Defaults to the
        notebook-level ``UM_matrix_ds``.
    min_common : int, optional
        Minimum number of movies both users must have rated for the pair to
        be scored (default 3).

    Returns
    -------
    list of (userId, similarity)
        At most ``topN`` pairs ordered by descending similarity.
    """
    if um_matrix is None:
        um_matrix = UM_matrix_ds
    if topN <= 0:
        return []

    u1 = um_matrix.loc[user].dropna()
    ratedIndex = u1.index
    nn = {}

    ## Brute-force comparison against every other user
    for uid, row in um_matrix.iterrows():
        if uid == user:
            continue

        # Ratings of the other user restricted to the movies the target user rated.
        other = row.loc[ratedIndex].dropna()
        if len(other) < min_common:
            continue

        sim = simFunc(u1.loc[other.index].tolist(), other.tolist())
        if not math.isnan(sim):
            nn[uid] = sim

    # Ascending sort, then take the last topN entries in reverse (= descending).
    return sorted(nn.items(), key=itemgetter(1))[:-(topN+1):-1]

In [63]:
# Time one neighbour search: up to 50 nearest users to user 8 under distance_euclidean.
st=time.time()
print(nearest_neighbor_user(8, 50, distance_euclidean))
print(time.time()-st, 'sec')

[(466, 1.0), (421, 1.0), (131, 1.0), (287, 0.6666666666666666), (573, 0.585786437626905), (651, 0.5), (586, 0.5), (412, 0.5), (307, 0.5), (253, 0.5), (53, 0.5), (530, 0.4721359549995794), (317, 0.4721359549995794), (82, 0.4721359549995794), (291, 0.4494897427831781), (290, 0.4494897427831781), (282, 0.4494897427831781), (243, 0.4494897427831781), (13, 0.43050087404306037), (669, 0.4142135623730951), (667, 0.4142135623730951), (495, 0.4142135623730951), (451, 0.4142135623730951), (217, 0.4142135623730951), (171, 0.4142135623730951), (12, 0.4142135623730951), (514, 0.4), (539, 0.3761785115301142), (348, 0.3761785115301142), (695, 0.36602540378443865), (685, 0.36602540378443865), (653, 0.36602540378443865), (629, 0.36602540378443865), (589, 0.36602540378443865), (486, 0.36602540378443865), (465, 0.36602540378443865), (388, 0.36602540378443865), (306, 0.36602540378443865), (300, 0.36602540378443865), (293, 0.36602540378443865), (275, 0.36602540378443865), (261, 0.36602540378443865), (211, 

In [64]:
def predictRating(userid, nn=50, simFunc=distance_euclidean) :
    """Predict ratings for ``userid`` from the ratings of similar users.

    Parameters
    ----------
    userid : label
        userId to predict for (a row label of ``UM_matrix_ds``).
    nn : int, optional
        Number of nearest neighbours to use (default 50).
    simFunc : callable, optional
        Similarity function handed to ``nearest_neighbor_user``.

    Returns
    -------
    list of [movieId, predicted_rating]
        One entry per movie rated by at least 4 of the neighbours. The
        prediction is the similarity-weighted mean of the neighbours'
        ratings. Movies whose similarity weights sum to zero are skipped,
        because the weighted mean is undefined for them.
    """
    ## neighbourhood: (userId, similarity) pairs
    neighbor = nearest_neighbor_user(userid, nn, simFunc)
    neighbor_dic = dict(neighbor)
    neighbor_id = [uid for uid, _ in neighbor]

    ## neighbourhood's movies: keep columns with at least 4 ratings.
    ## Only `thresh` is passed; current pandas rejects `how` together with `thresh`.
    neighbor_movie = UM_matrix_ds.loc[neighbor_id].dropna(axis=1, thresh=4)

    ret = []  # [movieId, predictedRate]

    ## rating prediction weighted by the neighbours' similarities
    for movieId in neighbor_movie.columns:
        rated = neighbor_movie[movieId].dropna()
        jsum, wsum = 0, 0
        for uid, rating in zip(rated.index, rated.values):
            sim = neighbor_dic.get(uid, 0)
            jsum += sim
            wsum += rating * sim
        if jsum == 0:
            # No usable weight for this movie; avoid dividing by zero.
            continue
        ret.append([movieId, wsum / jsum])

    return ret

In [65]:
(predictRating(10, 50))

[[1, 3.7902482995791584],
 [2, 3.7788302655646651],
 [3, 2.9489890166409505],
 [5, 3.7419625820076359],
 [10, 2.7497764923259411],
 [25, 3.2271363730166889],
 [34, 3.9862394442199554],
 [50, 4.6015889505373977],
 [58, 3.5928363128687852],
 [79, 3.0439711535207956],
 [95, 3.5215472352746335],
 [104, 3.2469174320820824],
 [110, 4.0797268894568646],
 [150, 3.6458125556256626],
 [153, 3.4100995855217082],
 [161, 3.4362834417719217],
 [165, 3.3387052277789402],
 [185, 3.2294969673387031],
 [208, 3.5955380683725973],
 [231, 2.4225833112062376],
 [253, 3.1701606146530308],
 [260, 3.2468268935346627],
 [292, 3.5211776650989304],
 [296, 3.865404683450874],
 [316, 3.5572746710810552],
 [318, 4.1192292882856059],
 [329, 3.5081785005444828],
 [339, 3.1342804525965087],
 [342, 3.2524518239753317],
 [344, 2.5742620529244866],
 [349, 3.5635992003540005],
 [356, 3.993261716634708],
 [364, 3.8054580608590767],
 [376, 3.4952385544350855],
 [377, 3.482120718002649],
 [380, 3.1731088524165108],
 [434, 2.8

In [66]:
## user의 별점 매긴 영화와 영화 정보 높은 별점순으로 보기 
def ratingMovies(userid):
    """Return the movies a user rated, best-rated first.

    Joins the user's rows of ``ratings`` with ``movies`` on ``movieId`` and
    returns at most 100 rows with the columns rating, title, genres, movieId,
    sorted by rating in descending order.
    """
    user_ratings = ratings[ratings.userId == userid]
    ds = pd.merge(user_ratings, movies, on=['movieId'])
    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    ranked = ds.sort_values(['rating'], ascending=False)
    return ranked[['rating','title','genres','movieId']].head(100)
ratingMovies(1).head(20)

Unnamed: 0,rating,title,genres,movieId
226,5,M*A*S*H (a.k.a. MASH) (1970),Comedy|Drama|War,5060
45,5,E.T. the Extra-Terrestrial (1982),Children|Drama|Sci-Fi,1097
30,5,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller,648
33,5,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller,780
34,5,"Godfather, The (1972)",Crime|Drama,858
121,5,Poltergeist (1982),Horror|Thriller,1994
202,5,Starman (1984),Adventure|Drama|Romance|Sci-Fi,3699
...,...,...,...,...
28,5,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,593
47,5,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller,1129


In [67]:
def join_movie_info( predicted_result ):
    """Attach movie metadata to predicted ratings.

    Parameters
    ----------
    predicted_result : list of [movieId, predicted_rating]
        Output of ``predictRating``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` (movieId > 0) inner-joined with the predictions on
        ``movieId``, sorted by ``predicted_rating`` in descending order.
    """
    predicted_ratings = pd.DataFrame(predicted_result, columns=['movieId', 'predicted_rating'])
    # An empty prediction list yields object-dtype columns; align the key dtype
    # with `movies` so the merge below does not fail on mismatched key types.
    predicted_ratings['movieId'] = predicted_ratings['movieId'].astype(movies['movieId'].dtype)
    result_ds = pd.merge( movies[movies.movieId > 0], predicted_ratings, on=['movieId'])
    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    return result_ds.sort_values(['predicted_rating'], ascending=False)

result = predictRating(1);
join_movie_info(result)

Unnamed: 0,movieId,title,genres,predicted_rating
58,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy,4.793385
74,3578,Gladiator (2000),Action|Adventure|Drama,4.699613
8,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,4.672419
64,2324,Life Is Beautiful (La Vita è bella) (1997),Comedy|Drama|Romance|War,4.660577
71,2858,American Beauty (1999),Comedy|Drama,4.635780
48,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,4.632719
60,1221,"Godfather: Part II, The (1974)",Crime|Drama,4.509081
...,...,...,...,...
3,10,GoldenEye (1995),Action|Adventure|Thriller,2.910663
20,208,Waterworld (1995),Action|Adventure|Sci-Fi,2.856161


In [68]:
## Compare the actual and predicted ratings of the user selected below.
## The right join keeps every predicted movie, so movies the user has not
## rated show up with an empty `rating`.
userid=1
pd.merge(ratingMovies(userid), join_movie_info(predictRating(userid)), on=['movieId'], how='right')\
    .sort_values(['predicted_rating'], ascending=False)

Unnamed: 0,rating,title_x,genres_x,movieId,title_y,genres_y,predicted_rating
15,4,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy,4.793385
2,5,Gladiator (2000),Action|Adventure|Drama,3578,Gladiator (2000),Action|Adventure|Drama,4.699613
6,5,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,4.672419
16,,,,2324,Life Is Beautiful (La Vita è bella) (1997),Comedy|Drama|Romance|War,4.660577
9,4,American Beauty (1999),Comedy|Drama,2858,American Beauty (1999),Comedy|Drama,4.635780
3,5,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,4.632719
4,5,"Godfather: Part II, The (1974)",Crime|Drama,1221,"Godfather: Part II, The (1974)",Crime|Drama,4.509081
...,...,...,...,...,...,...,...
72,,,,10,GoldenEye (1995),Action|Adventure|Thriller,2.910663
73,,,,208,Waterworld (1995),Action|Adventure|Sci-Fi,2.856161


In [69]:
# Evaluation set: this binds a second name to the same `ratings` DataFrame (no copy is made).
eval_ratings = ratings

In [70]:
# ratings['userId'].drop_duplicates().values[:]
def eval_prediction( predict_users,  n_users=50 ):
    """Predict ratings for some users and line them up with their real ratings.

    Parameters
    ----------
    predict_users : iterable
        userIds to run ``predictRating`` for.
    n_users : int, optional
        Forwarded to ``predictRating`` as its neighbour count ``nn``
        (default 50).

    Returns
    -------
    pandas.DataFrame
        Rows of ``eval_ratings`` extended with ``mean_rating`` (mean rating of
        the movie over ``ratings``) and one prediction column per similarity
        function (``euclidean``, ``cosine``). Only rows where
        ``euclidean + cosine > 0`` are returned.

    Prints the elapsed prediction time as a side effect.
    """
    ## evaluation frame: each rating next to the mean rating of its movie
    mean_by_movie = ratings[['movieId','rating']].groupby(['movieId']).mean().reset_index()
    ds = pd.merge(eval_ratings, mean_by_movie, on='movieId', how='left')
    ds = ds.rename(columns= {'rating_x':'rating', 'rating_y':'mean_rating'})

    st = time.time()
    ## fill in one prediction column per similarity function
    distance_functions = [ ('euclidean',distance_euclidean), ('cosine', distance_cosine) ]
    for name, func in distance_functions:
        # Float from the start: the column receives float predictions below.
        ds[name] = 0.0
        for userId in predict_users:
            predictions = dict((movieId, rate) for movieId, rate in predictRating(userId, n_users, func))
            # Select this user's rows once instead of rebuilding the full-frame
            # mask for every single predicted movie.
            user_movies = ds.loc[ds.userId == userId, 'movieId']
            predicted_rows = user_movies[user_movies.isin(list(predictions))]
            if len(predicted_rows) > 0:
                ds.loc[predicted_rows.index, name] = predicted_rows.map(predictions).values
    print('elapsed', round(time.time()-st,2), 'sec')
    return ds[ds.euclidean+ds.cosine>0]

In [71]:
## 전체 userId list 
users = UM_matrix_ds.index.tolist()

In [72]:
users[:2]

[1, 2]

In [73]:
## Predict ratings for the first 2 users (users[:2]), passing 100 as n_users
predicted = eval_prediction(users[:2], 100 )

elapsed 7.5 sec


In [80]:
# Keep only the rows where both the cosine and the euclidean prediction are positive.
predicted = predicted[ (predicted['cosine'] > 0) & (predicted['euclidean'] > 0) ]

In [81]:
def RMSE(X, left_col, right_col):
    """Root-mean-square error between columns ``left_col`` and ``right_col`` of ``X``."""
    residual = X[left_col] - X[right_col]
    squared = residual ** 2
    return np.sqrt(np.mean(squared))

def MAE(X, left_col, right_col):
    """Mean absolute error between columns ``left_col`` and ``right_col`` of ``X``."""
    residual = X[left_col] - X[right_col]
    magnitude = np.absolute(residual)
    return np.mean(magnitude)

In [87]:
MAE( predicted, 'rating', 'cosine')

0.50124020088345789

In [88]:
MAE( predicted, 'rating', 'euclidean')

0.43223181067073418

In [89]:
MAE( predicted, 'rating', 'mean_rating')

0.71598663772940452

In [91]:
# Report every error metric for the baseline (mean_rating) and both predictors.
# All MAE lines are printed first, then all RMSE lines.
metric_reports = (
    ("MAE of {0} is {1} ", MAE),
    ("RMSE of {0} is {1} ", RMSE),
)
for template, metric in metric_reports:
    for name in ['mean_rating', 'cosine', 'euclidean']:
        print (template.format(name, metric( predicted, 'rating', name )))

MAE of mean_rating is 0.715986637729 
MAE of cosine is 0.501240200883 
MAE of euclidean is 0.432231810671 
RMSE of mean_rating is 0.861539963449 
RMSE of cosine is 0.593503147367 
RMSE of euclidean is 0.522094960976 


In [33]:
predicted = eval_prediction(users[:2], 20 )

elapsed 3.33 sec
