In [137]:
# coding: utf-8

import pandas as pd
import numpy as np
from matplotlib import rcParams
import matplotlib.pyplot as plt
from collections import defaultdict
from datetime import datetime
import matplotlib.patches as mpatches
import matplotlib
import time
%matplotlib inline  

# Font family used for all plot text.
# NOTE(review): NanumGothic is presumably chosen so Korean labels render — confirm it is installed.
rcParams['font.family'] = 'NanumGothic'
# Default font size for plot text.
rcParams.update({'font.size': 12})
# Apply the 'ggplot' style sheet to figures.
matplotlib.style.use('ggplot')
# Cap the number of rows pandas shows when displaying a frame.
pd.options.display.max_rows=14


## 무비 렌즈 데이터로 별점을 예측해 보자 
* User Based 별점 예측 
* Item(Movie) Based 별점 예측

Movie Lens 데이터 로드 
http://grouplens.org/datasets/movielens/

In [4]:
def movieLensDataLoad(type, base_dir="/Users/goodvc/Documents/data-analytics/movie-recommendation/"):
    """Load the ratings, movies and tags tables of one MovieLens dataset.

    Parameters
    ----------
    type : str
        Name of the dataset sub-directory under ``base_dir``
        (e.g. ``'ml-latest-small'`` or ``'ml-20m'``). The name shadows the
        builtin ``type`` but is kept so existing keyword callers still work.
    base_dir : str, optional
        Directory that contains the dataset sub-directories. Defaults to the
        location the notebook was originally written against; pass your own
        path to run the notebook on another machine.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(ratings, movies, tags)`` read from ``ratings.csv``, ``movies.csv``
        and ``tags.csv`` respectively.
    """
    # Local import so this cell does not depend on the notebook's import cell.
    import os

    dataset_dir = os.path.join(base_dir, type)

    ## user movie ratings data
    ratings = pd.read_csv(os.path.join(dataset_dir, "ratings.csv"))

    ## movie metadata (title, genres)
    movies = pd.read_csv(os.path.join(dataset_dir, "movies.csv"))

    ## tags that users attached to movies
    tags = pd.read_csv(os.path.join(dataset_dir, "tags.csv"))
    return ( ratings, movies, tags )

# Alternative (disabled): load the larger 'ml-20m' dataset instead.
#ratings, movies, tags = movieLensDataLoad('ml-20m')
# Load the 'ml-latest-small' dataset into the notebook-level frames used by the cells below.
ratings, movies, tags = movieLensDataLoad('ml-latest-small')

In [5]:
# Disabled alternative: read the ratings file from a relative path instead of via movieLensDataLoad.
#ratings = pd.read_csv("movieLens/ml-latest-small/ratings.csv")
# Peek at the first two rating rows.
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,6,2,980730861
1,1,22,3,980731380


In [6]:
# Peek at the first two rows of the movie metadata.
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [7]:
# Peek at the first two rows of the user-supplied tags.
tags.head(2)

Unnamed: 0,userId,movieId,tag,timestamp
0,40,1,animation,1306926135
1,40,1,fantasy,1306926130


### User Based 별점 예측 

U(User) 
M(Movie)

1. U X M vector Matrix를 만든다. 
 key가 userid, value가 { 'movieId':rating } 
2. 나와 비슷한 유저를 찾는다. 

In [138]:
## 1. Build the U x M (user x movie) rating matrix.
# Rows are userIds, columns are movieIds and each cell holds that user's rating for that movie.
UM_matrix_ds = ratings.pivot(index='userId', columns='movieId', values='rating')
# .size is the total number of cells in the matrix (rows x columns).
print( "value size", UM_matrix_ds.size)

value size 6037712


In [139]:
# Peek at the first two users' rows of the user x movie matrix.
UM_matrix_ds.head(2)

movieId,1,2,3,4,5,6,7,8,9,10,...,121271,122284,122495,122756,123109,124857,125916,126407,129454,129651
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,2.0,,,,,...,,,,,,,,,,
2,,4.0,,,,,,,,,...,,,,,,,,,,


In [140]:
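## Now let's find the nearest neighbours.
## The helpers below turn SciPy distances into similarity scores (larger = more similar).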
import math
from operator import itemgetter
from scipy.spatial import distance

def distance_cosine(a,b):
    """Return the cosine similarity of ``a`` and ``b`` (1 minus SciPy's cosine distance)."""
    cosine_dist = distance.cosine(a, b)
    return 1 - cosine_dist

def disance_corr(a,b):
    """Return the correlation similarity of ``a`` and ``b``.

    Computed as 1 minus SciPy's correlation distance, so identical trends give
    a value near 1 and opposite trends a value near -1.

    The function name is misspelled ("disance"); it is kept unchanged for
    backward compatibility and ``distance_corr`` below is the preferred name.
    """
    return 1-distance.correlation (a,b)

# Correctly spelled alias, consistent with distance_cosine / distance_euclidean.
distance_corr = disance_corr

def distance_euclidean(a,b):
    """Return a similarity in (0, 1] derived from the Euclidean distance: 1 / (distance + 1)."""
    gap = distance.euclidean(a, b)
    return 1 / (gap + 1)

In [141]:
def nearest_neighbor_user( user, topN, simFunc, min_common=3 ) :
    """Find the users most similar to ``user`` by brute force.

    Every other row of the notebook-level ``UM_matrix_ds`` (user x movie
    rating matrix) is compared with ``user`` on the movies both have rated.

    Parameters
    ----------
    user : hashable
        Row label (userId) of the target user in ``UM_matrix_ds``.
    topN : int
        Maximum number of neighbours to return.
    simFunc : callable
        ``simFunc(ratings_a, ratings_b) -> float`` where a larger value means
        more similar. Both arguments are equal-length lists of ratings.
    min_common : int, optional
        Minimum number of co-rated movies a candidate needs to be considered
        (default 3, the value that used to be hard-coded).

    Returns
    -------
    list of (userId, similarity)
        At most ``topN`` pairs, most similar first. Candidates whose
        similarity is NaN are left out.
    """
    target = UM_matrix_ds.loc[user].dropna()
    rated_cols = target.index
    nn = {}

    ## Brute-force comparison, restricted once to the movies the target user rated
    for uid, row in UM_matrix_ds.loc[:, rated_cols].iterrows():
        if uid == user:
            continue

        # Ratings the candidate gave to movies the target user also rated
        shared = row.dropna()

        ## Require enough co-rated movies for the similarity to be meaningful
        if len(shared) < min_common:
            continue

        ## similarity function
        sim = simFunc(target.loc[shared.index].tolist(), shared.tolist())

        if not math.isnan(sim):
            nn[uid] = sim

    ## Sort ascending and walk backwards so the topN highest similarities come first
    return sorted(nn.items(),key=itemgetter(1))[:-(topN+1):-1]

In [153]:
# Time one brute-force neighbour search (user 8, top 50, Euclidean similarity).
st=time.time()
# The returned neighbour list is discarded; only the elapsed time is reported.
(nearest_neighbor_user(8,50,distance_euclidean))
print(time.time()-st, 'sec')

0.976809024810791 sec


In [143]:
def predictRating(userid, nn=100, simFunc=distance_euclidean, min_ratings=4) :
    """Predict ratings for ``userid`` from the ratings of similar users.

    For each movie rated by at least ``min_ratings`` of the user's nearest
    neighbours, the prediction is the similarity-weighted mean of those
    neighbours' ratings. Movies the user has already rated are not excluded.

    Parameters
    ----------
    userid : hashable
        Row label of the target user in the notebook-level ``UM_matrix_ds``.
    nn : int, optional
        Number of nearest neighbours to use (default 100).
    simFunc : callable, optional
        Similarity function handed to ``nearest_neighbor_user``.
    min_ratings : int, optional
        Minimum number of neighbours that must have rated a movie for it to
        get a prediction (default 4, the value that used to be hard-coded).

    Returns
    -------
    list of [movieId, predicted_rating]
        One entry per predictable movie. Movies whose neighbour similarities
        sum to zero are skipped because the weighted mean is undefined.
    """
    ## neighbourhood: (userId, similarity) pairs
    neighbor = nearest_neighbor_user(userid,nn,simFunc)
    neighbor_dic = dict(neighbor)
    neighbor_id = [uid for uid, _ in neighbor]

    ## neighbourhood's movies: keep columns with at least `min_ratings` ratings.
    ## axis is passed by keyword and `how` is omitted: current pandas rejects
    ## a positional axis and the how+thresh combination.
    neighbor_movie = UM_matrix_ds.loc[neighbor_id].dropna(axis=1, thresh=min_ratings)

    ret = [] # ['movieId', 'predictedRate']

    ## similarity-weighted mean of the neighbours' ratings, per movie
    for movieId, column in neighbor_movie.items():
        jsum, wsum = 0, 0
        for uid, rating in column.dropna().items():
            sim = neighbor_dic.get(uid, 0)
            jsum += sim
            wsum += rating * sim
        if jsum == 0:
            # No usable weight (e.g. similarities cancel out): no prediction.
            continue
        ret.append([movieId, wsum/jsum])

    return ret

In [144]:
# Predict ratings for user 10 from their 20 nearest neighbours (default similarity function).
predictRating(10,20)

[[296, 3.9328660147739645],
 [318, 4.0854895741569583],
 [356, 3.6794503844361568],
 [589, 3.4657907930148593],
 [593, 3.6891805603309069],
 [780, 3.8192732584744782],
 [2762, 4.5837679362886856]]

In [145]:
## Show the movies a user has rated together with the movie info, highest rating first
def ratingMovies(userid, limit=100):
    """Return the movies ``userid`` rated, joined with their title and genres.

    Reads the notebook-level ``ratings`` and ``movies`` frames.

    Parameters
    ----------
    userid : int
        Value of ``ratings.userId`` to filter on.
    limit : int, optional
        Maximum number of rows returned (default 100, the previous fixed cap).

    Returns
    -------
    pandas.DataFrame
        Columns ``rating``, ``title``, ``genres``, ``movieId``, sorted by
        ``rating`` in descending order.
    """
    ds = pd.merge(ratings[ratings.userId==userid], movies, on=['movieId'])
    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    ordered = ds.sort_values(['rating'], ascending=False)
    return ordered[['rating','title','genres','movieId']].head(limit)
# Show up to 20 of user 1's rated movies, highest ratings first.
ratingMovies(1).head(20)

Unnamed: 0,rating,title,genres,movieId
226,5,M*A*S*H (a.k.a. MASH) (1970),Comedy|Drama|War,5060
45,5,E.T. the Extra-Terrestrial (1982),Children|Drama|Sci-Fi,1097
30,5,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller,648
33,5,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller,780
34,5,"Godfather, The (1972)",Crime|Drama,858
121,5,Poltergeist (1982),Horror|Thriller,1994
202,5,Starman (1984),Adventure|Drama|Romance|Sci-Fi,3699
...,...,...,...,...
28,5,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,593
47,5,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller,1129


In [146]:
def join_movie_info( predicted_result ):
    """Attach title and genres to a list of predicted ratings.

    Reads the notebook-level ``movies`` frame.

    Parameters
    ----------
    predicted_result : list of [movieId, predicted_rating]
        Predictions in the shape produced by ``predictRating``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``movies`` plus ``predicted_rating``, sorted by
        ``predicted_rating`` in descending order. Predictions for movieIds
        that are not in ``movies`` are dropped by the inner merge.
    """
    predicted_ratings = pd.DataFrame(predicted_result, columns=['movieId', 'predicted_rating'])
    # An empty prediction list yields object-typed columns; align the dtypes
    # so the merge key matches the key in `movies`.
    predicted_ratings = predicted_ratings.astype(
        {'movieId': movies['movieId'].dtype, 'predicted_rating': 'float64'})
    result_ds = pd.merge( movies[movies.movieId > 0], predicted_ratings, on=['movieId'])
    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    return result_ds.sort_values(['predicted_rating'], ascending=False)

# Predict ratings for user 1 with the default neighbourhood size and similarity function.
result = predictRating(1);
# Display the predictions with movie titles and genres, best predicted rating first.
join_movie_info(result)

Unnamed: 0,movieId,title,genres,predicted_rating
106,1080,Monty Python's Life of Brian (1979),Comedy,4.915104
131,2019,Seven Samurai (Shichinin no samurai) (1954),Action|Adventure|Drama,4.875395
134,2324,Life Is Beautiful (La Vita è bella) (1997),Comedy|Drama|Romance|War,4.660577
16,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,4.629133
73,497,Much Ado About Nothing (1993),Comedy|Romance,4.611773
174,3578,Gladiator (2000),Action|Adventure|Drama,4.611430
193,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy,4.567527
...,...,...,...,...
55,344,Ace Ventura: Pet Detective (1994),Comedy,2.442684
160,2805,Mickey Blue Eyes (1999),Comedy|Romance,2.357886


In [147]:
## Predicted ratings for one user, shown next to the ratings that user actually gave.
## how='right' keeps every predicted movie; `rating` is NaN where the user has not rated it.
userid=1
pd.merge(ratingMovies(userid), join_movie_info(predictRating(userid)), on=['movieId'], how='right')\
    .sort_values(['predicted_rating'], ascending =False)

Unnamed: 0,rating,title_x,genres_x,movieId,title_y,genres_y,predicted_rating
24,,,,1080,Monty Python's Life of Brian (1979),Comedy,4.915104
25,,,,2019,Seven Samurai (Shichinin no samurai) (1954),Action|Adventure|Drama,4.875395
26,,,,2324,Life Is Beautiful (La Vita è bella) (1997),Comedy|Drama|Romance|War,4.660577
10,5,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,4.629133
27,,,,497,Much Ado About Nothing (1993),Comedy|Romance,4.611773
4,5,Gladiator (2000),Action|Adventure|Drama,3578,Gladiator (2000),Action|Adventure|Drama,4.611430
28,,,,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy,4.567527
...,...,...,...,...,...,...,...
200,,,,344,Ace Ventura: Pet Detective (1994),Comedy,2.442684
201,,,,2805,Mickey Blue Eyes (1999),Comedy|Romance,2.357886


In [148]:
# Evaluate against the full ratings table. This binds a second name to the same DataFrame, not a copy.
eval_ratings = ratings

In [149]:
## evaluation: attach each movie's mean rating to every rating row
movie_mean_rating = ratings[['movieId','rating']].groupby(['movieId']).mean().reset_index()

# Both frames carry a `rating` column, so the merge suffixes them _x (per-user) and _y (per-movie mean).
eval_ds = pd.merge(eval_ratings, movie_mean_rating, on='movieId', how='left')

eval_ds = eval_ds.rename(columns={'rating_x': 'rating', 'rating_y': 'mean_rating'})

In [150]:
# ratings['userId'].drop_duplicates().values[:]
def eval_prediction( predict_users, eval_ds, n_users=50 ):
    """Add predicted-rating columns to ``eval_ds`` for the given users.

    For each similarity function (Euclidean, cosine) a column of that name is
    written into ``eval_ds`` **in place**: rows whose (userId, movieId) has a
    prediction get that prediction, every other row gets 0.0. Later cells rely
    on this in-place update.

    Parameters
    ----------
    predict_users : iterable
        userIds to run ``predictRating`` for.
    eval_ds : pandas.DataFrame
        Frame with at least ``userId`` and ``movieId`` columns; it is mutated.
    n_users : int, optional
        Neighbourhood size passed to ``predictRating`` (default 50).

    Returns
    -------
    pandas.DataFrame
        The rows of ``eval_ds`` where ``euclidean + cosine > 0``.
    """
    st = time.time()
    ## update the predicted ratings
    distance_functions = [ ('euclidean',distance_euclidean), ('cosine', distance_cosine) ]

    # (userId, movieId) of every row, computed once so each prediction is a
    # dictionary lookup instead of a boolean-mask scan over the whole frame.
    row_keys = list(zip(eval_ds['userId'], eval_ds['movieId']))

    for name, func in distance_functions:
        predicted = {}
        for userId in predict_users:
            for movieId, rating in predictRating(userId, n_users, func):
                predicted[(userId, movieId)] = rating
        # Float default keeps the column float-typed from the start.
        eval_ds[name] = [predicted.get(key, 0.0) for key in row_keys]
    print('elapsed', round(time.time()-st,2), 'sec')
    return eval_ds[eval_ds.euclidean+eval_ds.cosine>0]



In [151]:
## List of all userIds (the row labels of the user x movie matrix)
users = UM_matrix_ds.index.tolist()

In [152]:
## Predict ratings for the first 10 users, using 50 neighbours each.
## eval_prediction also writes the 'euclidean' and 'cosine' columns into eval_ds itself.
predicted = eval_prediction(users[:10], eval_ds, 50 )

elapsed 42.11 sec


In [135]:
# Rows that received a cosine-based prediction.
# The 'cosine' column exists only after eval_prediction has been run on eval_ds.
eval_ds[eval_ds.cosine > 0]

Unnamed: 0,userId,movieId,rating,timestamp,mean_rating,euclidean,cosine
3,1,50,5.0,980732037,4.368201,4.672419,4.832862
4,1,110,4.0,980730408,4.037671,4.006801,4.142039
7,1,260,5.0,980730769,4.196078,4.399413,4.357207
8,1,296,4.0,980731208,4.137615,4.172790,4.332957
10,1,318,3.0,980731417,4.442073,4.138711,4.362506
17,1,457,4.0,980730816,3.998000,4.366333,4.374436
19,1,480,4.0,980731903,3.601852,4.034469,4.000755
...,...,...,...,...,...,...,...
238,2,736,3.0,1091931476,3.299419,3.373706,3.299044
240,2,858,4.0,1091931453,4.334135,4.487687,4.317464


In [134]:
def RMSE(X, left_col, right_col):
    """Root-mean-square error between columns ``left_col`` and ``right_col`` of ``X``."""
    residual = X[left_col] - X[right_col]
    mean_squared = np.mean(residual ** 2)
    return np.sqrt(mean_squared)

def MAE(X, left_col, right_col):
    """Mean absolute error between columns ``left_col`` and ``right_col`` of ``X``."""
    residual = X[left_col] - X[right_col]
    abs_residual = np.absolute(residual)
    return np.mean(abs_residual)


# Predictor columns to score against the true `rating` column.
predictor_columns = ['mean_rating', 'cosine', 'euclidean']

# Mean absolute error, using only rows where the predictor produced a value (> 0).
for name in predictor_columns:
    scored_rows = eval_ds[eval_ds[name] > 0]
    mae_value = MAE( scored_rows, 'rating', name )
    print ("MAE of {0} is {1} ".format(name, mae_value))

# Root-mean-square error over the same row selection.
for name in predictor_columns:
    scored_rows = eval_ds[eval_ds[name] > 0]
    rmse_value = RMSE( scored_rows, 'rating', name )
    print ("RMSE of {0} is {1} ".format(name, rmse_value))


MAE of mean_rating is 0.7055168840093711 
MAE of cosine is 0.4787288685897648 
MAE of euclidean is 0.4201123805010063 
RMSE of mean_rating is 0.920840633867146 
RMSE of cosine is 0.56832980127156 
RMSE of euclidean is 0.5015339342428957 
