In [1]:
# coding: utf-8

import pandas as pd
import numpy as np
from matplotlib import rcParams
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
import networkx as nx
from datetime import datetime
import gensim, logging
import matplotlib.patches as mpatches
import matplotlib
%matplotlib inline  

rcParams['font.family'] = 'NanumGothic'
rcParams.update({'font.size': 12})
matplotlib.style.use('ggplot')
sns.set_style("whitegrid")

## 무비 렌즈 데이터로 별점을 예측해 보자 
* User Based 별점 예측 
* Item(Movie) Based 별점 예측

Movie Lens 데이터 로드 
http://grouplens.org/datasets/movielens/

In [2]:
def movieLensDataLoad(type):
    ## user 영화 별점 data 
    ratings = pd.read_csv("/Users/goodvc/Documents/data-analytics/movie-recommendation/"+type+"/ratings.csv")

    ## movie meta(타이트,장르) data 
    movies = pd.read_csv("/Users/goodvc/Documents/data-analytics/movie-recommendation/"+type+"/movies.csv")

    ## user가 영화에 tag를 기입한 data
    tags = pd.read_csv("/Users/goodvc/Documents/data-analytics/movie-recommendation/"+type+"/tags.csv")
    # tags = pd.read_csv("/Users/goodvc/Documents/data-analytics/movie-recommendation/ml-20m/tags.csv")
    return ( ratings, movies, tags )

#ratings, movies, tags = movieLensDataLoad('ml-20m')
ratings, movies, tags = movieLensDataLoad('ml-latest-small')

In [3]:
#ratings = pd.read_csv("movieLens/ml-latest-small/ratings.csv")
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,6,2,980730861
1,1,22,3,980731380
2,1,32,2,980731926
3,1,50,5,980732037
4,1,110,4,980730408
5,1,164,3,980731766
6,1,198,3,980731282
7,1,260,5,980730769
8,1,296,4,980731208
9,1,303,3,980732235


In [11]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [14]:
tags.head(10)

Unnamed: 0,userId,movieId,tag,timestamp
0,40,1,animation,1306926135
1,40,1,fantasy,1306926130
2,40,1,Pixar,1306926133
3,40,47,dark,1306930201
4,40,47,disturbing,1306930291
5,40,47,thriller,1306930308
6,40,101,off-beat comedy,1307009664
7,40,101,Wes Anderson,1307009667
8,40,208,post-apocalyptic,1306930611
9,40,208,sci-fi,1306930614


In [15]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100023.0,100023.0,100023.0,100023.0
mean,341.760765,8613.123442,3.491362,1091520000.0
std,193.849755,19736.006106,1.067942,163357700.0
min,1.0,1.0,0.5,828504900.0
25%,180.0,783.0,3.0,960670600.0
50%,343.0,2081.0,3.5,1097974000.0
75%,511.0,4367.0,4.0,1205150000.0
max,706.0,129651.0,5.0,1427755000.0


### User Based 별점 예측 

U(User) 
M(Movie)

1. U X M vector Matrix를 만든다. 
 key가 userid, value가 { 'movieId':rating } 
2. 나와 비슷한 유저를 찾는다. 

In [4]:
## 1. U X M vector Matrix를 만든다. 
UM_matrix_ds = ratings.pivot(index='userId', columns='movieId', values='rating')

In [57]:
UM_matrix_ds.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,121271,122284,122495,122756,123109,124857,125916,126407,129454,129651
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,2.0,,,,,...,,,,,,,,,,
2,,4.0,,,,,,,,,...,,,,,,,,,,
3,,,4.0,,4.0,,4.0,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [114]:
## 그럼 이제 최근접 이웃을 찾차 보자
## 
import math
from operator import itemgetter
from scipy.spatial import distance

def distance_cosine(a,b):
    return 1-distance.cosine(a,b)

def disance_corr(a,b):
    return 1-distance.correlation (a,b)

def distance_euclidean(a,b):
    return 1/(distance.euclidean(a,b)+1)

In [48]:
def nearest_neighbor_user( user, topN, simFunc ) :  
    u1 = UM_matrix_ds.loc[user].dropna()
    ratedIndex = u1.index
    nn = {}
    
    ## Brote Force Compute
    for uid, row in UM_matrix_ds.iterrows():
        interSectionU1 = []
        interSectionU2 = []
        if uid==user:
            continue
        for i in ratedIndex:
            if False==math.isnan(row[i]):
                interSectionU1.append(u1[i])
                interSectionU2.append(row[i])
        interSectionLen = len(interSectionU1)
        
       
    ## At least 3 intersection items
        if interSectionLen < 3 :
            continue
            
        ## similarity functon
        sim = simFunc(interSectionU1,interSectionU2)
        
        if  math.isnan(sim) == False:
            nn[uid] = sim
            
    ## top N returned         
    return sorted(nn.items(),key=itemgetter(1))[:-(topN+1):-1]

In [193]:
st=time.time()
(nearest_neighbor_user(8,50,distance_euclidean))
print(time.time()-st, 'sec')

1.0844941139221191 sec


In [84]:
## 클러스터링 된 결과와 그냥 했을때의 성능 비교
## 결과 비교 

In [181]:
from sklearn.cluster import KMeans

class UserKmeansCluster:
    def __init__(self, matrix_ds):
        self.result = 0
        self.matrix_ds = matrix_ds
        self.user_size = matrix_ds.index.size
        
    def clustering(self):
        ## cluster k 결정 
        self.K = math.ceil(math.sqrt( UM_matrix_ds.index.size / 2))
        
        ## clustering by kmeans
        X = self.matrix_ds.fillna(0).values
        self.cluster = KMeans(n_clusters=self.K)
        self.cluster.fit(X)
        
        ## cluster result 
        self.cluster_members = defaultdict(list)
        self.user2cid = defaultdict(int)
        for user, cid in zip(self.matrix_ds.index, self.cluster.labels_):
            self.cluster_members[cid].append(user)
            self.user2cid[user] = cid
        
    def user2cluster(self, user):
        return self.user2cid.get(user,-1)

    def get_members(self, cid):
        return self.cluster_members.get(cid,[])

    def user2members(self, user):
        cid = self.user2cluster(user)
        return self.get_members(cid)

        
usercluster = UserKmeansCluster(UM_matrix_ds[:1000])
usercluster.clustering()

In [None]:
## graph cut 

In [188]:
def graph_cut( ds, threshold=3 ):
    grouped = ds.groupby('movieId')
    userlink = defaultdict(int)
    for name, g in grouped:
        users = g['userId'].values
        cnt = users.size
        if cnt<2:
            continue

        for n in range(0,cnt):
            for m in range(n+1,cnt):
                key = "{0}:{1}" if users[n]<users[m] else "{1}:{0}"
                key = key.format(users[n], users[m])
                userlink[key] = userlink[key] + 1
                
    print("user link수 : ",len(userlink))
    
    neighbors = defaultdict(list)
    for k, v in userlink.items():
        if (v) > threshold:
            nm = k.split(':')
            if len(nm) == 2:
                neighbors[nm[0]].append(nm[1])
                neighbors[nm[1]].append(nm[0])
    return neighbors

neighbors = graph_cut(ratings)
    #print(name, users)

user link수 :  218559


In [190]:
## 평균 neightbor
total = 0
cnt = 0
for k,v in neighbors.items():
    cnt = cnt + 1
    total = total + len(v)


In [18]:
def predictRating(userid, nn=100, simFunc=distance_cosine) :
   
    ## neighboorhood 
    neighbor = nearest_neighbor_user(userid,nn,simFunc)
    neighbor_id = [id for id,sim in neighbor]
    
    ## neighboorhood's movie : al least 4 ratings
    neighbor_movie = UM_matrix_ds.loc[neighbor_id]\
                    .dropna(1, how='all', thresh = 4 )
    neighbor_dic = (dict(neighbor))
    ret = [] # ['movieId', 'predictedRate']
    
    ## rating predict by my similarities 
    for movieId, row in neighbor_movie.iteritems():
        jsum, wsum = 0, 0
        for v in row.dropna().iteritems():
            sim = neighbor_dic.get(v[0],0)
            jsum += sim
            wsum += (v[1]*sim)
        ret.append([movieId, wsum/jsum])
       
    return ret

In [19]:
def ratingMovies(userid):
    return pd.merge(ratings[ratings.userId==userid], movies, on=['movieId'])

In [21]:
import time
gtime=time.time()
def elapse(string=""):
    global gtime
    now = time.time()
    if string!="":
        print ("{1} : {0} sec".format(now - gtime, string))
    gtime = now

In [197]:
userid=12
pd.options.display.max_rows=14
ratingMovies(userid).sort(['rating'],ascending=False)[['rating','title','genres']].head(100)

Unnamed: 0,rating,title,genres
2,5,Vertigo (1958),Drama|Mystery|Romance|Thriller
1,5,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
13,4,"Importance of Being Earnest, The (2002)",Comedy|Drama|Romance
15,4,"Bourne Identity, The (2002)",Action|Mystery|Thriller
3,4,One Flew Over the Cuckoo's Nest (1975),Drama
24,4,"Last Seduction, The (1994)",Crime|Drama|Thriller
20,4,Austin Powers in Goldmember (2002),Comedy
...,...,...,...
11,2,Unfaithful (2002),Drama|Thriller
18,2,Lovely & Amazing (2001),Comedy|Drama|Romance


In [198]:
userid=1
predicted_ratings = pd.DataFrame(predictRating(userid),columns=['movieId', 'predicted_rating'])\
    .sort(['predicted_rating'], ascending=False)
pd.merge( movies[movies.movieId > 0], predicted_ratings, on=['movieId'])\
    [['predicted_rating', 'title', 'genres']]\
    .sort(['predicted_rating'], ascending=False)

Unnamed: 0,predicted_rating,title,genres
36,4.800232,Rob Roy (1995),Action|Drama|Romance|War
154,4.750635,Rear Window (1954),Mystery|Thriller
192,4.697819,Groundhog Day (1993),Comedy|Fantasy|Romance
61,4.659233,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
22,4.658856,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
198,4.626185,Jaws (1975),Action|Horror
190,4.625130,"Bridge on the River Kwai, The (1957)",Adventure|Drama|War
...,...,...,...
232,2.500746,"General's Daughter, The (1999)",Crime|Drama|Mystery|Thriller
236,2.497320,"Haunting, The (1999)",Horror|Thriller


In [199]:
## 5번 유저의 별점 예측 
elapse()
userid=5

pd.merge(ratingMovies(userid),predictRating(userid),on=['movieId'], how='outer').sort(['predictedRate'], ascending =False)
#print (predictRating(userid,20,distance_euclidean))
elapse("end")

TypeError: list indices must be integers, not str

In [200]:
print (predictRating(userid,20,distance_euclidean))


[[1, 3.2247194110570208], [3, 2.9999999999999996], [6, 4.0970605549331562], [17, 3.5408838538294267], [19, 2.7187638431597523], [25, 3.9255822102693072], [29, 3.5113718877199043], [32, 3.9995634839760172], [34, 3.371165753571542], [36, 4.0], [39, 3.3674238341453266], [41, 3.7500000000000004], [43, 3.607311265295952], [47, 3.8514368894422524], [50, 4.4074787578950767], [95, 3.1843729383129085], [110, 4.192130921702562], [150, 3.8736751083781527], [151, 3.0000000000000004], [161, 4.1894494688778821], [162, 3.9772562245601919], [165, 3.1944028307437913], [168, 3.3979879511384099], [170, 2.517749202644787], [198, 3.267057831579856], [231, 2.6932485551908054], [253, 3.088560332950459], [260, 3.880976547208951], [265, 3.8133900323978978], [293, 3.7931536198802185], [296, 4.2330700904745777], [316, 3.2531680728000629], [318, 4.2555561499772638], [329, 2.3823015073603422], [344, 3.0258523677212783], [345, 3.7882677709539339], [349, 3.3846716882615424], [356, 4.1336654513595414], [357, 3.920759

## 성능평가 : RMSE 값으로 확인 
* 비 개인화의 RMSE 값 : 평균 별점을 예측
* 개인화된 RMSE값 비교 Item Based
* 데이터가 늘어남에 따라 어떻게 성능이 좋아지는?



In [217]:
eval_ratings = ratings

In [218]:
## evaluation

eval_ds = pd.merge(eval_ratings, 
                   ratings[['movieId','rating']].groupby(['movieId']).mean().reset_index(), 
                   on='movieId', how='left')

eval_ds = eval_ds.rename(columns= {'rating_x':'rating', 'rating_y':'mean_rating'})
eval_ds['predict_rating'] = 0

In [219]:
eval_ds['predict_rating'] = 0
## udpate to predict_rating 
distance_functions = [ ('euclidean',distance_euclidean), ('cosine', distance_cosine) ]

for name, func in distance_functions:
    eval_ds[name] = 0
    for userId in ratings['userId'].drop_duplicates().values[:10]:
        for x in predictRating(userId,20,func):
            eval_ds.loc[(eval_ds.userId==userId) & (eval_ds.movieId==x[0]),name]=x[1]

In [226]:
eval_ds[eval_ds.cosine > 0]

Unnamed: 0,userId,movieId,rating,timestamp,mean_rating,predict_rating,euclidean,cosine
8,1,296,4.0,980731208,4.137615,0,0.000000,4.249598
28,1,593,5.0,980731433,4.235905,0,5.000000,4.500049
33,1,780,5.0,980731988,3.469008,0,0.000000,4.000633
150,1,2571,4.0,980730794,4.207547,0,0.000000,4.624907
230,2,50,4.0,1091931562,4.368201,0,0.000000,4.499704
234,2,364,5.0,1091931487,3.742021,0,4.505555,4.167445
240,2,858,4.0,1091931453,4.334135,0,4.396250,4.083094
...,...,...,...,...,...,...,...,...
1062,10,260,2.5,1113487625,4.196078,0,0.000000,3.249533
1066,10,296,4.0,1113487658,4.137615,0,3.932866,4.399956


In [825]:
def RMSE(X, left_col, right_col):
    return(np.sqrt(np.mean((X[left_col] - X[right_col])**2)))

def MAE(X, left_col, right_col):
    return(np.mean(np.absolute(X[left_col] - X[right_col])) )

distance_functions = [ ('euclidean',distance_euclidean), ('cosine', distance_cosine), ('pearson', disance_corr) ]

print ("MAE of {0} is {1}".format('mean rating', MAE( eval_ds[eval_ds['mean_rating'] > 0], 'rating', 'mean_rating' )))
for (name,func) in distance_functions:
    print ("MAE of {0} is {1} ".format(name, MAE( eval_ds[eval_ds[name] > 0], 'rating', name )))

print ("RMSE of {0} is {1}".format('mean rating', RMSE( eval_ds[eval_ds['mean_rating'] > 0], 'rating', 'mean_rating' )))
for (name,func) in distance_functions:
    print ("RMSE of {0} is {1} ".format(name, RMSE( eval_ds[eval_ds[name] > 0], 'rating', name )))


MAE of mean rating is 0.7055168840093711
MAE of euclidean is 0.2629332215487124 
MAE of cosine is 0.4650109666699926 
MAE of pearson is 0.5312093797417275 
RMSE of mean rating is 0.920840633867146
RMSE of euclidean is 0.35397089006040633 
RMSE of cosine is 0.5992381164489905 
RMSE of pearson is 0.7043043690726212 


In [835]:
eval_ds.count()

userId            20000263
movieId           20000263
rating            20000263
timestamp         20000263
mean_rating       20000263
predict_rating    20000263
dtype: int64

In [644]:

0.689750383274
0.534087559871
0.869298083996
0.674865577278

Unnamed: 0,movieId,predictedRate,title
1,1,3.604548,Toy Story (1995)
7,7,3.95618,Sabrina (1995)
95,95,3.715045,Broken Arrow (1996)
780,780,4.243071,Independence Day (a.k.a. ID4) (1996)
2683,2683,2.906671,Austin Powers: The Spy Who Shagged Me (1999)
2688,2688,3.736359,"General's Daughter, The (1999)"
2701,2701,1.75,Wild Wild West (1999)
2706,2706,2.425205,American Pie (1999)
2713,2713,2.434502,Lake Placid (1999)
2716,2716,4.250102,Ghostbusters (a.k.a. Ghost Busters) (1984)


In [488]:
for id,row in nn_rated.iteritems():
    jsum=0
    wsum=0
    for v in row.dropna().items():
        jsum += similarities.loc[v[0]]['sim']
        wsum += (v[1]*similarities.loc[v[0]]['sim'])
    print ("{0} : {1}".format(id, wsum/jsum))
      

47 : 4.15571886580929
50 : 4.3706745093591035
110 : 4.258894070248855
296 : 4.187300965397262
318 : 4.669159331214638
344 : 3.714332189378028
356 : 4.416330561732522
480 : 3.1280491671238684
500 : 3.607917014282175
527 : 3.929389456979297
589 : 3.810787600196478
590 : 4.165356050627398
593 : 4.503391389613891
597 : 3.5210085244963127
608 : 3.694731812714421
1198 : 3.8263957177194405
1210 : 3.4039614117563164
1704 : 4.514166975374401
2028 : 4.156865802670894
2571 : 3.7959702197908363
2762 : 3.6806326896708073
2858 : 3.827340922074615
3949 : 2.9636188998406983
3996 : 2.70412827918192
4226 : 3.718070248316179
5445 : 3.913310521753981
5952 : 4.251638938695222
5989 : 3.9112102464163123
6016 : 4.2046905593468376
7153 : 4.119763377036935


movieId,47,50,110,296,318,344,356,480,500,527,...,2762,2858,3949,3996,4226,5445,5952,5989,6016,7153
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
498,,,4.0,4.0,5.0,,4.0,2.0,,,...,,4.0,,,,,,,,
336,,,,,4.0,,,,,5.0,...,,4.0,,,,,,,,
321,5.0,4.0,5.0,5.0,5.0,5.0,5.0,4.0,,4.0,...,,,,,,,,,,
60,,4.5,,4.5,4.0,,,,,,...,,4.0,2.0,,3.5,,4.5,,4.5,4.5
24,,5.0,5.0,,5.0,3.0,,,,,...,,,,,,,,,,
449,4.5,,,4.5,,,4.0,,,,...,4.0,4.5,,2.5,4.5,4.5,,3.5,4.0,
409,,,4.0,4.0,5.0,,,,2.5,,...,2.0,3.0,,,,,,,,
308,,,,,,,4.0,,,3.5,...,,,,,,,,,,
26,3.0,5.0,5.0,3.0,,3.0,5.0,5.0,5.0,4.0,...,,,,,,,,,,
105,,,3.0,4.0,5.0,,5.0,1.0,,,...,4.0,4.0,4.0,,,,,5.0,,
