In [1]:
# coding: utf-8
from __future__ import print_function
import pandas as pd
import numpy as np
from matplotlib import rcParams
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
import networkx as nx
from datetime import datetime
import gensim, logging
import matplotlib.patches as mpatches
import matplotlib
import time

%matplotlib inline  

# Global plotting configuration for the notebook.
# NOTE(review): NanumGothic is presumably chosen so Hangul labels render; the
# font must be installed locally -- confirm on the target machine.
rcParams['font.family'] = 'NanumGothic'
rcParams.update({'font.size': 12})
# The ggplot matplotlib style is applied first, then seaborn's "whitegrid" style.
matplotlib.style.use('ggplot')
sns.set_style("whitegrid")

## 무비 렌즈 데이터로 별점을 예측해 보자 
* User Based 별점 예측 
* Item(Movie) Based 별점 예측

Movie Lens 데이터 로드 
http://grouplens.org/datasets/movielens/

In [2]:
def movieLensDataLoad(type, base_dir="/Users/goodvc/Documents/data-analytics/movie-recommendation/"):
    """Load the three MovieLens CSV files of one dataset variant.

    Parameters
    ----------
    type : str
        Name of the dataset sub-directory under ``base_dir``
        (the notebook uses 'ml-latest-small' and 'ml-20m').
        The name shadows the ``type`` builtin; it is kept so existing
        positional/keyword callers keep working.
    base_dir : str, optional
        Directory that contains the dataset sub-directories. Defaults to the
        path the notebook was originally written against, so existing calls
        behave exactly as before; pass another directory to run elsewhere.

    Returns
    -------
    tuple
        ``(ratings, movies, tags)`` DataFrames read from ``ratings.csv``,
        ``movies.csv`` and ``tags.csv`` respectively.
    """
    # Local import keeps this cell self-contained (the notebook's import cell
    # does not import os).
    import os

    data_dir = os.path.join(base_dir, type)

    ## per-user movie ratings
    ratings = pd.read_csv(os.path.join(data_dir, "ratings.csv"))

    ## movie metadata (title, genres)
    movies = pd.read_csv(os.path.join(data_dir, "movies.csv"))

    ## tags that users attached to movies
    tags = pd.read_csv(os.path.join(data_dir, "tags.csv"))

    return ( ratings, movies, tags )

# Load the 'ml-latest-small' dataset variant into module-level DataFrames.
# The commented-out line below loads the 'ml-20m' variant instead.
#ratings, movies, tags = movieLensDataLoad('ml-20m')
ratings, movies, tags = movieLensDataLoad('ml-latest-small')

In [3]:
#ratings = pd.read_csv("movieLens/ml-latest-small/ratings.csv")
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,6,2,980730861
1,1,22,3,980731380
2,1,32,2,980731926
3,1,50,5,980732037
4,1,110,4,980730408
5,1,164,3,980731766
6,1,198,3,980731282
7,1,260,5,980730769
8,1,296,4,980731208
9,1,303,3,980732235


In [4]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [5]:
tags.head(10)

Unnamed: 0,userId,movieId,tag,timestamp
0,40,1,animation,1306926135
1,40,1,fantasy,1306926130
2,40,1,Pixar,1306926133
3,40,47,dark,1306930201
4,40,47,disturbing,1306930291
5,40,47,thriller,1306930308
6,40,101,off-beat comedy,1307009664
7,40,101,Wes Anderson,1307009667
8,40,208,post-apocalyptic,1306930611
9,40,208,sci-fi,1306930614


In [6]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100023.0,100023.0,100023.0,100023.0
mean,341.760765,8613.123442,3.491362,1091520000.0
std,193.849755,19736.006106,1.067942,163357700.0
min,1.0,1.0,0.5,828504900.0
25%,180.0,783.0,3.0,960670600.0
50%,343.0,2081.0,3.5,1097974000.0
75%,511.0,4367.0,4.0,1205150000.0
max,706.0,129651.0,5.0,1427755000.0


### User Based 별점 예측 

U(User) 
M(Movie)

1. U X M vector Matrix를 만든다. 
 key가 userid, value가 { 'movieId':rating } 
2. 나와 비슷한 유저를 찾는다. 

In [7]:
## 1. Build the U x M (user x movie) rating matrix.
# Rows are indexed by userId, columns by movieId, cells hold the rating.
# pivot() raises if a (userId, movieId) pair occurs more than once.
UM_matrix_ds = ratings.pivot(index='userId', columns='movieId', values='rating')

In [8]:
UM_matrix_ds.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,121271,122284,122495,122756,123109,124857,125916,126407,129454,129651
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,2.0,,,,,...,,,,,,,,,,
2,,4.0,,,,,,,,,...,,,,,,,,,,
3,,,4.0,,4.0,,4.0,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [9]:
## 그럼 이제 최근접 이웃을 찾아 보자
## 
import math
from operator import itemgetter
from scipy.spatial import distance

def distance_cosine(a,b):
    """Return the cosine similarity of two equal-length vectors.

    scipy's ``distance.cosine`` is a distance, so it is subtracted from 1 to
    turn it into a similarity.
    """
    cosine_dist = distance.cosine(a, b)
    return 1 - cosine_dist

def distance_corr(a,b):
    """Return the Pearson correlation of two equal-length vectors.

    scipy's ``distance.correlation`` is a distance (1 - correlation), so it is
    subtracted from 1 to recover the correlation itself.
    """
    return 1-distance.correlation(a,b)

# Backward-compatible alias: the function was originally published under the
# misspelled name `disance_corr`, which the evaluation cells still reference.
disance_corr = distance_corr

def distance_euclidean(a,b):
    """Return a similarity derived from the Euclidean distance of two vectors.

    The distance d is mapped to 1 / (d + 1): identical vectors score 1 and the
    score decreases towards 0 as the vectors move apart.
    """
    gap = distance.euclidean(a, b)
    return 1 / (gap + 1)

In [10]:
def nearest_neighbor_user( user, topN, simFunc, minCommon=3 ) :
    """Return the users most similar to ``user`` by brute-force comparison.

    Reads the module-level ``UM_matrix_ds`` (user x movie rating matrix with
    NaN for unrated movies).

    Parameters
    ----------
    user : label
        Row label (userId) of the target user in ``UM_matrix_ds``.
    topN : int
        Maximum number of neighbours to return.
    simFunc : callable
        ``simFunc(a, b)`` -> similarity, called with two equal-length lists of
        ratings over the movies both users rated (larger means more similar).
    minCommon : int, optional
        Minimum number of co-rated movies required to compare two users
        (default 3, the previously hard-coded value).

    Returns
    -------
    list of (userId, similarity)
        At most ``topN`` pairs, ordered from most to least similar. Users for
        which ``simFunc`` returns NaN are omitted.
    """
    u1 = UM_matrix_ds.loc[user].dropna()
    ratedIndex = u1.index
    nn = {}

    # Only the movies the target user rated can be part of an intersection, so
    # restrict every candidate row to those columns once, up front.
    candidates = UM_matrix_ds.loc[:, ratedIndex]

    ## Brute-force comparison against every other user
    for uid, row in candidates.iterrows():
        if uid == user:
            continue

        # Movies rated by both users; column order follows ratedIndex.
        shared = row.dropna()

        ## Require a minimum number of co-rated movies
        if len(shared) < minCommon:
            continue

        ## similarity function
        sim = simFunc(u1.loc[shared.index].tolist(), shared.tolist())

        if not math.isnan(sim):
            nn[uid] = sim

    ## Sort ascending, then take the last topN entries in reverse order.
    return sorted(nn.items(),key=itemgetter(1))[:-(topN+1):-1]

In [11]:
# Time one brute-force neighbour search (user 8, top 50, Euclidean similarity).
# The search result itself is discarded; only the elapsed time is printed.
st=time.time()
(nearest_neighbor_user(8,50,distance_euclidean))
print(time.time()-st, 'sec')

0.956270933151 sec


In [12]:
## 클러스터링 된 결과와 그냥 했을때의 성능 비교
## 결과 비교 

In [13]:
from sklearn.cluster import KMeans

class UserKmeansCluster:
    """Group the users (rows) of a user x movie rating matrix with k-means.

    Call ``clustering()`` once, then use ``user2cluster``, ``get_members`` and
    ``user2members`` to query the assignment. Before ``clustering()`` has run
    the lookups report "unknown" (-1 / empty list) instead of raising.
    """

    def __init__(self, matrix_ds, random_state=None):
        """Store the matrix to cluster.

        matrix_ds    -- DataFrame whose index holds the user ids; NaN cells are
                        treated as 0 when clustering.
        random_state -- optional seed forwarded to KMeans so that cluster
                        assignments are reproducible (default: unseeded).
        """
        self.result = 0
        self.matrix_ds = matrix_ds
        self.user_size = matrix_ds.index.size
        self.random_state = random_state
        # Initialised here so the lookup methods are safe before clustering().
        self.cluster_members = defaultdict(list)
        self.user2cid = defaultdict(int)

    def clustering(self):
        """Fit k-means on the stored matrix and record cluster membership."""
        ## Choose k with the sqrt(n / 2) rule of thumb, based on the matrix this
        ## instance was given (not on the notebook-global matrix). 2.0 forces
        ## true division on Python 2 as well.
        self.K = max(1, int(math.ceil(math.sqrt(self.user_size / 2.0))))

        ## clustering by kmeans; unrated cells are filled with 0
        X = self.matrix_ds.fillna(0).values
        self.cluster = KMeans(n_clusters=self.K, random_state=self.random_state)
        self.cluster.fit(X)

        ## cluster result: cluster id -> member users, and user -> cluster id
        self.cluster_members = defaultdict(list)
        self.user2cid = defaultdict(int)
        for user, cid in zip(self.matrix_ds.index, self.cluster.labels_):
            self.cluster_members[cid].append(user)
            self.user2cid[user] = cid

    def user2cluster(self, user):
        """Return the cluster id of ``user``, or -1 if the user is unknown."""
        return self.user2cid.get(user,-1)

    def get_members(self, cid):
        """Return the list of users in cluster ``cid`` ([] if none)."""
        return self.cluster_members.get(cid,[])

    def user2members(self, user):
        """Return all users that share a cluster with ``user`` ([] if unknown)."""
        cid = self.user2cluster(user)
        return self.get_members(cid)

        
# Cluster at most the first 1000 rows (users) of the user x movie matrix.
usercluster = UserKmeansCluster(UM_matrix_ds[:1000])
usercluster.clustering()

In [14]:
## graph cut 

In [15]:
def graph_cut( ds, threshold=3 ):
    """Link users who rated many of the same movies.

    ds        -- DataFrame with 'movieId' and 'userId' columns.
    threshold -- a pair of users is linked only when the number of movies both
                 rated is strictly greater than this value.

    Prints the number of links kept and returns a defaultdict(list) mapping
    each linked user id (as a string) to the list of its neighbours' ids
    (also strings).
    """
    # Count, per unordered user pair, how many movies both users rated.
    # Pairs are keyed as "smaller:larger".
    pair_counts = defaultdict(int)
    for _, movie_rows in ds.groupby('movieId'):
        raters = movie_rows['userId'].values
        for pos, first in enumerate(raters):
            for second in raters[pos + 1:]:
                template = "{0}:{1}" if first < second else "{1}:{0}"
                pair_counts[template.format(first, second)] += 1

    # Keep only the pairs above the threshold and record them in both directions.
    neighbors = defaultdict(list)
    link_total = 0
    for pair_key, shared in pair_counts.items():
        if not shared > threshold:
            continue
        ends = pair_key.split(':')
        if len(ends) != 2:
            continue
        left, right = ends
        neighbors[left].append(right)
        neighbors[right].append(left)
        link_total += 1

    print("user link수 : ", link_total)
    return neighbors

# Build the co-rating neighbour map over all ratings with the default threshold.
neighbors = graph_cut(ratings)

user link수 :  152655


In [16]:
## Average number of neighbours per linked user
cnt = len(neighbors)                                # users with at least one link
total = sum(len(v) for v in neighbors.values())     # sum of neighbour-list lengths
# The original cell accumulated `total` and `cnt` but never produced the
# average; compute it (guarding against an empty map) and display it.
avg_neighbors = float(total) / cnt if cnt else 0.0
avg_neighbors


In [17]:
def predictRating(userid, nn=100, simFunc=distance_cosine) :
    """Predict movie ratings for ``userid`` from its nearest neighbours.

    Reads the module-level ``UM_matrix_ds`` and calls ``nearest_neighbor_user``.

    Parameters
    ----------
    userid : label
        Target user (row label of ``UM_matrix_ds``).
    nn : int
        Number of nearest neighbours to use.
    simFunc : callable
        Similarity function forwarded to ``nearest_neighbor_user``.

    Returns
    -------
    list of [movieId, predicted_rating]
        One entry per movie rated by at least 4 of the neighbours; the
        prediction is the similarity-weighted mean of the neighbours' ratings.
        Movies whose similarity weights sum to 0 are skipped because the
        weighted mean is undefined for them.
    """
    ## neighbourhood: list of (userId, similarity)
    neighbor = nearest_neighbor_user(userid,nn,simFunc)
    if not neighbor:
        return []
    neighbor_dic = dict(neighbor)
    neighbor_id = [uid for uid, _ in neighbor]

    ## neighbourhood's movies: keep columns with at least 4 ratings.
    ## (`thresh` alone is used: older pandas ignored `how` when `thresh` was
    ## given, and newer pandas rejects passing both.)
    neighbor_movie = UM_matrix_ds.loc[neighbor_id].dropna(axis=1, thresh=4)

    ret = [] # ['movieId', 'predictedRate']

    ## rating prediction weighted by each neighbour's similarity
    for movieId in neighbor_movie.columns:
        rated = neighbor_movie[movieId].dropna()
        jsum, wsum = 0, 0
        for uid, rating in zip(rated.index, rated.values):
            sim = neighbor_dic.get(uid,0)
            jsum += sim
            wsum += (rating*sim)
        if jsum == 0:
            # All weights cancel out or are zero: no meaningful weighted mean.
            continue
        ret.append([movieId, wsum/jsum])

    return ret

In [18]:
def ratingMovies(userid):
    """Return the rows of ``ratings`` for ``userid`` joined with ``movies`` on movieId."""
    is_user = ratings.userId == userid
    user_ratings = ratings[is_user]
    return pd.merge(user_ratings, movies, on=['movieId'])

In [19]:
import time
# Timestamp of the most recent elapse() call (or of this cell's execution).
gtime = time.time()

def elapse(string=""):
    """Reset the stopwatch and optionally report the time since the last reset.

    With a non-empty ``string`` label, prints "<label> : <seconds> sec" where
    seconds is the time since the previous call. With the default empty label
    the stopwatch is only reset and nothing is printed.
    """
    global gtime
    previous, gtime = gtime, time.time()
    if string == "":
        return
    print ("{1} : {0} sec".format(gtime - previous, string))

In [20]:
# Show user 12's rated movies, highest rating first.
userid=12
pd.options.display.max_rows=14
# DataFrame.sort was removed from pandas; sort_values is its replacement.
ratingMovies(userid).sort_values(['rating'],ascending=False)[['rating','title','genres']].head(100)

Unnamed: 0,rating,title,genres
2,5,Vertigo (1958),Drama|Mystery|Romance|Thriller
1,5,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
13,4,"Importance of Being Earnest, The (2002)",Comedy|Drama|Romance
15,4,"Bourne Identity, The (2002)",Action|Mystery|Thriller
3,4,One Flew Over the Cuckoo's Nest (1975),Drama
24,4,"Last Seduction, The (1994)",Crime|Drama|Thriller
20,4,Austin Powers in Goldmember (2002),Comedy
...,...,...,...
11,2,Unfaithful (2002),Drama|Thriller
18,2,Lovely & Amazing (2001),Comedy|Drama|Romance


In [21]:
# Predict ratings for user 1 and list them with movie metadata, best first.
userid=1
# DataFrame.sort was removed from pandas; sort_values is its replacement.
predicted_ratings = pd.DataFrame(predictRating(userid),columns=['movieId', 'predicted_rating'])\
    .sort_values(['predicted_rating'], ascending=False)
# The merge result is sorted again before display.
pd.merge( movies[movies.movieId > 0], predicted_ratings, on=['movieId'])\
    [['predicted_rating', 'title', 'genres']]\
    .sort_values(['predicted_rating'], ascending=False)

Unnamed: 0,predicted_rating,title,genres
36,4.800232,Rob Roy (1995),Action|Drama|Romance|War
154,4.750635,Rear Window (1954),Mystery|Thriller
192,4.697819,Groundhog Day (1993),Comedy|Fantasy|Romance
61,4.659233,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
22,4.658856,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
198,4.626185,Jaws (1975),Action|Horror
190,4.625130,"Bridge on the River Kwai, The (1957)",Adventure|Drama|War
...,...,...,...
232,2.500746,"General's Daughter, The (1999)",Crime|Drama|Mystery|Thriller
236,2.497320,"Haunting, The (1999)",Horror|Thriller


In [22]:
a = ratingMovies(userid)

In [23]:
# Predictions for the current `userid` as a DataFrame.
# (The original cell also made a bare predictRating(userid) call first whose
# result was neither stored nor displayed; that redundant call is removed.)
pd.DataFrame(predictRating(userid),columns=['movieId', 'predicted_rating'])

Unnamed: 0,movieId,predicted_rating
0,1,4.024628
1,2,3.750915
2,3,3.218328
3,5,3.573849
4,6,3.424746
5,7,3.498340
6,10,3.685055
...,...,...
307,64957,2.501615
308,72998,3.747347


In [24]:
## Rating prediction for user 5: actual ratings next to predicted ratings
userid=5

# Outer-join actual and predicted ratings on movieId, then dropna() keeps only
# rows where no column is missing. DataFrame.sort was removed from pandas;
# sort_values is its replacement.
pd.merge( ratingMovies(userid), 
         pd.DataFrame(predictRating(userid),columns=['movieId', 'predicted_rating']),on=['movieId'], how='outer')\
        .sort_values(['predicted_rating'], ascending =False).dropna()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,predicted_rating
1,5,318,3.5,1230545057,"Shawshank Redemption, The (1994)",Crime|Drama,4.625853
0,5,50,4.5,1230545117,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,4.500493
4,5,858,4.5,1230545060,"Godfather, The (1972)",Crime|Drama,4.492323
13,5,2959,4.0,1230550119,Fight Club (1999),Action|Crime|Drama|Thriller,4.423648
31,5,60069,4.0,1230544505,WALL·E (2008),Adventure|Animation|Children|Romance|Sci-Fi,4.284680
2,5,613,4.0,1230543957,Jane Eyre (1996),Drama|Romance,4.005063
26,5,31878,3.0,1230682787,Kung Fu Hustle (Gong fu) (2004),Action|Comedy,4.001054
...,...,...,...,...,...,...,...
29,5,51662,3.0,1230544933,300 (2007),Action|Fantasy|War|IMAX,3.739620
12,5,2858,1.5,1230550100,American Beauty (1999),Comedy|Drama,3.675548


## 성능평가 : RMSE 값으로 확인 
* 비 개인화의 RMSE 값 : 평균 별점을 예측
* 개인화된 RMSE값 비교 Item Based
* 데이터가 늘어남에 따라 성능이 어떻게 좋아지는가?



In [25]:
# Evaluate on the full ratings table. This is an alias, not a copy:
# eval_ratings and ratings refer to the same DataFrame object.
eval_ratings = ratings

In [26]:
## evaluation

# Attach each movie's mean rating (computed over all of `ratings`) to every
# evaluation row. Both inputs have a 'rating' column, so the merge produces
# 'rating_x' (the user's rating) and 'rating_y' (the per-movie mean).
eval_ds = pd.merge(eval_ratings, 
                   ratings[['movieId','rating']].groupby(['movieId']).mean().reset_index(), 
                   on='movieId', how='left')

# Restore readable names: the user's own rating and the per-movie mean baseline.
eval_ds = eval_ds.rename(columns= {'rating_x':'rating', 'rating_y':'mean_rating'})
# Placeholder column initialised to 0.
eval_ds['predict_rating'] = 0

In [27]:
eval_ds['predict_rating'] = 0
## Fill one prediction column per similarity function.
distance_functions = [ ('euclidean',distance_euclidean), ('cosine', distance_cosine), ('pearson', disance_corr) ]

for name, func in distance_functions:
    # Float sentinel 0.0 means "no prediction"; initialising with a float keeps
    # the column dtype stable when float predictions are written into it.
    eval_ds[name] = 0.0
    # Only the first 10 distinct users are evaluated to keep the cell fast.
    for userId in ratings['userId'].drop_duplicates().values[:10]:
        # The per-user row mask does not depend on the movie, so build it once.
        user_rows = eval_ds.userId == userId
        for movieId, predicted in predictRating(userId,20,func):
            eval_ds.loc[user_rows & (eval_ds.movieId==movieId), name] = predicted

In [28]:
eval_ds[eval_ds.cosine > 0]

Unnamed: 0,userId,movieId,rating,timestamp,mean_rating,predict_rating,euclidean,cosine,pearson
415,2,50,4.0,1091931562,4.368201,0,0.000000,4.499704,4.501111
416,5,50,4.5,1230545117,4.368201,0,4.407479,4.633699,4.808385
654,3,110,5.0,901232920,4.037671,0,4.622014,4.499307,4.598562
655,7,110,5.0,834624283,4.037671,0,4.835609,5.000000,4.674455
658,10,110,3.5,1113488399,4.037671,0,0.000000,4.250712,0.000000
1026,8,260,5.0,964735903,4.196078,0,4.886730,4.714679,4.741254
1027,10,260,2.5,1113487625,4.196078,0,0.000000,3.249533,3.289084
...,...,...,...,...,...,...,...,...,...
30020,6,2762,3.5,1422029308,3.924020,0,3.864773,4.249940,3.792831
30022,10,2762,4.0,1113487751,3.924020,0,4.583768,4.583767,0.000000


In [29]:
def RMSE(X, left_col, right_col):
    """Root-mean-square error between two columns of ``X``.

    X         -- table supporting ``X[col]`` lookups that yield numeric columns.
    left_col  -- name of the first column.
    right_col -- name of the second column.
    """
    residual = X[left_col] - X[right_col]
    mean_squared = np.mean(residual ** 2)
    return np.sqrt(mean_squared)

def MAE(X, left_col, right_col):
    """Mean absolute error between two columns of ``X``.

    X         -- table supporting ``X[col]`` lookups that yield numeric columns.
    left_col  -- name of the first column.
    right_col -- name of the second column.
    """
    residual = X[left_col] - X[right_col]
    return np.mean(np.abs(residual))

# (column name, similarity function) pairs; only the names are used below.
distance_functions = [ ('euclidean',distance_euclidean), ('cosine', distance_cosine), ('pearson', disance_corr) ]

# MAE of the per-movie mean baseline, then of each similarity-based predictor.
# Rows are kept only where the compared column is > 0.
print ("MAE of {0} is {1}".format('mean rating', MAE( eval_ds[eval_ds['mean_rating'] > 0], 'rating', 'mean_rating' )))
for (name,func) in distance_functions:
    print ("MAE of {0} is {1} ".format(name, MAE( eval_ds[eval_ds[name] > 0], 'rating', name )))

# Same comparison using RMSE.
# NOTE(review): each predictor is scored on its own subset of rows (those it
# produced a value > 0 for), so the figures are not over identical row sets.
print ("RMSE of {0} is {1}".format('mean rating', RMSE( eval_ds[eval_ds['mean_rating'] > 0], 'rating', 'mean_rating' )))
for (name,func) in distance_functions:
    print ("RMSE of {0} is {1} ".format(name, RMSE( eval_ds[eval_ds[name] > 0], 'rating', name )))


MAE of mean rating is 0.705516884009
MAE of euclidean is 0.202162582787 
MAE of cosine is 0.445364633853 
MAE of pearson is 0.487945810423 
RMSE of mean rating is 0.920840633867
RMSE of euclidean is 0.280518928251 
RMSE of cosine is 0.524926376986 
RMSE of pearson is 0.623947327793 


In [30]:
eval_ds.count()

userId            100023
movieId           100023
rating            100023
timestamp         100023
mean_rating       100023
predict_rating    100023
euclidean         100023
cosine            100023
pearson           100023
dtype: int64