In [2]:
# coding: utf-8

import pandas as pd
import numpy as np
from matplotlib import rcParams
import matplotlib.pyplot as plt
from collections import defaultdict
from datetime import datetime
import matplotlib.patches as mpatches
import matplotlib
import time
%matplotlib inline  

# Plot styling: NanumGothic is presumably chosen so Hangul labels render — confirm the font is installed.
rcParams['font.family'] = 'NanumGothic'
rcParams.update({'font.size': 12})
matplotlib.style.use('ggplot')
# Cap DataFrame displays at 14 rows so cell outputs stay short.
pd.options.display.max_rows=14


## 무비 렌즈 데이터로 별점을 예측해 보자 
* User Based 별점 예측 
* Item(Movie) Based 별점 예측

Movie Lens 데이터 로드 
http://grouplens.org/datasets/movielens/

In [3]:
def movieLensDataLoad(type, base_dir="/Users/goodvc/Documents/data-analytics/movie-recommendation/"):
    """Load the ratings, movies and tags CSV files of one MovieLens dataset.

    Parameters
    ----------
    type : str
        Name of the dataset sub-directory under ``base_dir``
        (e.g. ``'ml-latest-small'`` or ``'ml-20m'``).
    base_dir : str, optional
        Directory that contains the dataset sub-directories. Defaults to the
        location the notebook was originally written against; pass your own
        path to run the notebook on another machine.

    Returns
    -------
    tuple
        ``(ratings, movies, tags)`` DataFrames read from ``ratings.csv``,
        ``movies.csv`` and ``tags.csv`` respectively.
    """
    # Local import: the notebook's import cell does not bring in `os`.
    import os

    dataset_dir = os.path.join(base_dir, type)

    ## per-user movie ratings
    ratings = pd.read_csv(os.path.join(dataset_dir, "ratings.csv"))

    ## movie metadata (title, genres)
    movies = pd.read_csv(os.path.join(dataset_dir, "movies.csv"))

    ## tags that users attached to movies
    tags = pd.read_csv(os.path.join(dataset_dir, "tags.csv"))

    return ( ratings, movies, tags )

# Load the small MovieLens dataset; the commented call below switches to the 20M dataset.
#ratings, movies, tags = movieLensDataLoad('ml-20m')
ratings, movies, tags = movieLensDataLoad('ml-latest-small')

In [4]:
# Preview the first two rows of the ratings table.
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,6,2,980730861
1,1,22,3,980731380


In [5]:
# Preview the first two rows of the movie metadata table.
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [6]:
# Preview the first two rows of the user tag table.
tags.head(2)

Unnamed: 0,userId,movieId,tag,timestamp
0,40,1,animation,1306926135
1,40,1,fantasy,1306926130


### User Based 별점 예측 

U(User) 
M(Movie)

1. U X M vector Matrix를 만든다. 
 key가 userid, value가 { 'movieId':rating } 
2. 나와 비슷한 유저를 찾는다. 

In [7]:
## 1. Build the U x M (user x movie) rating matrix from all ratings.
UM_matrix_ds = ratings.pivot(index='userId', columns='movieId', values='rating')
print('before filtering ', UM_matrix_ds.values.size)

('before filtering ', 6037712)


In [8]:
## Filter movies: drop every movie that has fewer than 5 ratings.
## NOTE(review): the original comment said "5 or fewer", but the `<5` test keeps movies with exactly 5 — confirm which was intended.
movie_rating_count = ratings.groupby(['movieId'])['rating'].count()
except_list = movie_rating_count[movie_rating_count<5].index
filtered_ratings = ratings[(~ratings['movieId'].isin(except_list))]

In [9]:
## 1. Rebuild the U x M (user x movie) rating matrix from the filtered ratings.
## This overwrites the unfiltered UM_matrix_ds created earlier in the notebook.
UM_matrix_ds = filtered_ratings.pivot(index='userId', columns='movieId', values='rating')
print('after filtering ', UM_matrix_ds.values.size)

('after filtering ', 2338978)


In [10]:
# Total number of cells (users x movies) in the filtered matrix.
UM_matrix_ds.values.size

2338978

In [11]:
## 그럼 이제 최근접 이웃을 찾차 보자
## 
import math
from operator import itemgetter
from scipy.spatial import distance

def distance_cosine(a,b):
    """Return the cosine similarity of `a` and `b` (one minus SciPy's cosine distance)."""
    cosine_dist = distance.cosine(a, b)
    return 1 - cosine_dist

def distance_corr(a,b):
    """Return the correlation similarity of `a` and `b`.

    Computed as one minus SciPy's correlation distance, so perfectly
    correlated vectors give 1 and perfectly anti-correlated vectors give -1.
    """
    return 1-distance.correlation(a,b)

# Backward-compatible alias: the function was originally published under this
# misspelled name, so keep it available for existing callers.
disance_corr = distance_corr

def distance_euclidean(a,b):
    """Return a similarity in (0, 1] derived from the Euclidean distance: 1 / (distance + 1)."""
    gap = distance.euclidean(a, b)
    return 1 / (gap + 1)

## Non-Clustered

In [12]:
def nearest_neighbor_user( user, topN, simFunc, matrix=None ) :
    """Find the users most similar to `user` by brute force over every row.

    Parameters
    ----------
    user : row label of the target user in the rating matrix.
    topN : int
        Maximum number of neighbors to return.
    simFunc : callable
        ``simFunc(a, b) -> float``; called with two equal-length lists holding
        the ratings both users gave to the same movies. Larger means more similar.
    matrix : pandas.DataFrame, optional
        User x movie rating matrix (NaN = not rated). Defaults to the
        notebook-level ``UM_matrix_ds``.

    Returns
    -------
    list of (uid, similarity) tuples, highest similarity first, at most `topN` long.
    Users sharing fewer than 3 rated movies with `user`, and users whose
    similarity is NaN, are left out.
    """
    if matrix is None:
        matrix = UM_matrix_ds

    u1 = matrix.loc[user].dropna()
    ratedIndex = u1.index
    nn = {}

    ## Brute-force compute against every other user
    for uid, row in matrix.iterrows():
        if uid == user:
            continue

        # Restrict the candidate to the movies the target user rated, then keep
        # only those the candidate rated too (vectorised instead of a per-movie loop).
        candidate = row.loc[ratedIndex]
        shared = candidate.notnull()

        ## At least 3 intersection items
        if int(shared.sum()) < 3:
            continue

        ## similarity function (lists, exactly as the element-wise version passed)
        sim = simFunc(u1[shared].tolist(), candidate[shared].tolist())

        if not math.isnan(sim):
            nn[uid] = sim

    ## top N returned: ascending sort, then walk backwards from the end
    return sorted(nn.items(),key=itemgetter(1))[:-(topN+1):-1]

### Pre-Clustered

In [13]:
def graph_cut( ds, threshold=3 ):
    """Link users who rated enough of the same movies.

    Parameters
    ----------
    ds : pandas.DataFrame
        Rating rows with at least ``userId`` and ``movieId`` columns.
    threshold : int, optional
        Two users become neighbors only when they share strictly more than
        this many movies.

    Returns
    -------
    collections.defaultdict(list)
        Maps each linked userId (int) to the list of its neighbor userIds (int).
        Users without any qualifying link do not appear as keys.
    """
    # Count, for every unordered user pair, how many movies both rated.
    # Pairs are stored as (smaller id, larger id) tuples so each pair has one key.
    userlink = defaultdict(int)
    for _, group in ds.groupby('movieId'):
        users = group['userId'].tolist()
        for pos, first in enumerate(users):
            for second in users[pos+1:]:
                pair = (first, second) if first < second else (second, first)
                userlink[pair] += 1

    # Keep only the pairs above the threshold and record the link both ways.
    neighbors = defaultdict(list)
    for (low, high), shared_count in userlink.items():
        if shared_count > threshold:
            neighbors[int(low)].append(int(high))
            neighbors[int(high)].append(int(low))

    return neighbors

# Pre-compute the co-rating neighbor map once; clustered_nearest_neighbor_user reads it.
# NOTE(review): this is built from the unfiltered `ratings`, while UM_matrix_ds is
# built from `filtered_ratings` — confirm the mismatch is intended.
neighbors = graph_cut(ratings)


def clustered_nearest_neighbor_user( user, topN, simFunc, matrix=None, neighbor_map=None ) :
    """Find the users most similar to `user`, searching only pre-linked neighbors.

    Parameters
    ----------
    user : row label of the target user in the rating matrix.
    topN : int
        Maximum number of neighbors to return.
    simFunc : callable
        ``simFunc(a, b) -> float``; called with two equal-length lists holding
        the ratings both users gave to the same movies. Larger means more similar.
    matrix : pandas.DataFrame, optional
        User x movie rating matrix (NaN = not rated). Defaults to the
        notebook-level ``UM_matrix_ds``.
    neighbor_map : mapping, optional
        userId -> list of candidate neighbor userIds. Defaults to the
        notebook-level ``neighbors``.

    Returns
    -------
    list of (uid, similarity) tuples, highest similarity first, at most `topN` long.
    Candidates sharing fewer than 3 rated movies with `user`, candidates missing
    from `matrix`, and candidates whose similarity is NaN are left out.
    """
    if matrix is None:
        matrix = UM_matrix_ds
    if neighbor_map is None:
        neighbor_map = neighbors

    u1 = matrix.loc[user].dropna()
    ratedIndex = u1.index
    nn = {}

    # .get() rather than [] so a lookup for an unlinked user does not insert
    # an empty entry into the shared defaultdict.
    members = neighbor_map.get(user, [])

    ## pre-clustered compute: only the linked candidates are compared
    for uid in members:
        # A candidate may be absent from the matrix when the neighbor map was
        # built from a different (e.g. unfiltered) rating set; skip it.
        if uid == user or uid not in matrix.index:
            continue

        candidate = matrix.loc[uid].loc[ratedIndex]
        shared = candidate.notnull()

        ## At least 3 intersection items
        if int(shared.sum()) < 3:
            continue

        ## similarity function
        sim = simFunc(u1[shared].tolist(), candidate[shared].tolist())

        if not math.isnan(sim):
            nn[uid] = sim

    ## top N returned: ascending sort, then walk backwards from the end
    return sorted(nn.items(),key=itemgetter(1))[:-(topN+1):-1]

In [14]:
# Inspect one user's row of the matrix (NaN marks movies the user has not rated).
UM_matrix_ds.loc[10]

movieId
1           4
2         NaN
3         NaN
4         NaN
5         NaN
...
111759    NaN
112556    NaN
112623    NaN
112852    NaN
115617    NaN
Name: 10, Length: 3313, dtype: float64

In [15]:
# Time the brute-force neighbor search for user 2 (top 50, cosine similarity).
st=time.time()
print(nearest_neighbor_user(2,50,distance_cosine))
print(time.time()-st, 'sec')

# Time the pre-clustered search with the same arguments for comparison.
st=time.time()
print(clustered_nearest_neighbor_user(2,50,distance_cosine))
print(time.time()-st, 'sec')

[(321, 1.0000000000000002), (336, 0.99999999999999989), (498, 0.99999999999999978), (105, 0.99963038254143022), (308, 0.99902486568714022), (183, 0.9989706289089969), (675, 0.99864916181932428), (630, 0.99863230431688021), (138, 0.99841721431714503), (290, 0.99840382978858955), (18, 0.99840382978858955), (244, 0.99822164389532697), (449, 0.99781579644559826), (26, 0.99627250721874983), (6, 0.99591000331047874), (574, 0.99540661581406098), (60, 0.99530644559096504), (260, 0.99524079127088427), (306, 0.99522750237829727), (515, 0.99502588831258199), (325, 0.99498743710661985), (685, 0.9949366763261821), (195, 0.9949366763261821), (127, 0.99493667632618188), (13, 0.99487339879682413), (430, 0.99484975116710983), (472, 0.99453584235718751), (57, 0.99444002464459091), (681, 0.99413484677243424), (217, 0.99413484677243413), (450, 0.99406708268692501), (35, 0.99399908854796626), (577, 0.99388373467361879), (56, 0.99385869319577624), (186, 0.99380892501375828), (690, 0.99349392130816161), (348

In [16]:
def predictRating(userid, nn=100, simFunc=distance_euclidean, nnFunc=nearest_neighbor_user) :
    """Predict movie ratings for `userid` as a similarity-weighted mean of neighbor ratings.

    Parameters
    ----------
    userid : row label of the target user in ``UM_matrix_ds``.
    nn : int, optional
        Number of nearest neighbors to request from `nnFunc`.
    simFunc : callable, optional
        Similarity function handed through to `nnFunc`.
    nnFunc : callable, optional
        ``nnFunc(userid, nn, simFunc)`` returning a list of
        ``(neighbor_id, similarity)`` tuples.

    Returns
    -------
    list of ``[movieId, predicted_rating]`` pairs, one per movie that at least
    4 of the neighbors rated. Movies whose neighbor similarities sum to zero
    are skipped because no weighted mean can be formed for them.
    """
    ## neighborhood
    neighbor = nnFunc(userid,nn,simFunc)
    neighbor_dic = dict(neighbor)
    neighbor_id = [uid for uid, _ in neighbor]

    ## neighborhood's movies: keep columns with at least 4 ratings.
    ## Only `thresh` is passed: current pandas rejects `how` together with
    ## `thresh`, and the positional axis argument is no longer accepted.
    neighbor_movie = UM_matrix_ds.loc[neighbor_id].dropna(axis=1, thresh=4)

    ret = [] # ['movieId', 'predictedRate']

    ## rating predicted from the neighbors' similarities
    for movieId, column in neighbor_movie.items():
        jsum, wsum = 0, 0
        for uid, rating in column.dropna().items():
            sim = neighbor_dic.get(uid, 0)
            jsum += sim
            wsum += (rating*sim)
        if jsum == 0:
            # All weights cancel out or are zero: avoid dividing by zero.
            continue
        ret.append([movieId, wsum/jsum])

    return ret

In [17]:

# Time a prediction for user 3 (20 neighbors, euclidean similarity) with the brute-force search.
st=time.time()
(predictRating(3,20,distance_euclidean, nearest_neighbor_user))
print('none-clustered', time.time()-st)

# Same prediction using the pre-clustered neighbor search.
st=time.time()
(predictRating(3,20,distance_euclidean, clustered_nearest_neighbor_user))
print('pre-clustered ',time.time()-st)

('none-clustered', 0.7082309722900391)
('pre-clustered ', 0.5813510417938232)


In [18]:
## Show the movies a user rated, with movie info, highest rating first
def ratingMovies(userid):
    """Return up to 100 of `userid`'s rated movies, sorted by rating descending.

    Reads the notebook-level ``ratings`` and ``movies`` frames and joins them
    on ``movieId``. The result has the columns
    ``rating``, ``title``, ``genres``, ``movieId``.
    """
    user_ratings = ratings[ratings.userId==userid]
    ds = pd.merge(user_ratings, movies, on=['movieId'])
    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    ranked = ds.sort_values(['rating'], ascending=False)
    return ranked[['rating','title','genres','movieId']].head(100)

def join_movie_info( predicted_result ):
    """Attach movie metadata to predicted ratings, best prediction first.

    Parameters
    ----------
    predicted_result : list of ``[movieId, predicted_rating]`` pairs.

    Returns
    -------
    pandas.DataFrame
        Rows of the notebook-level ``movies`` frame (movieId > 0) that have a
        prediction, with an added ``predicted_rating`` column, sorted by that
        column in descending order.
    """
    predicted_ratings = pd.DataFrame(predicted_result, columns=['movieId', 'predicted_rating'])
    result_ds = pd.merge( movies[movies.movieId > 0], predicted_ratings, on=['movieId'])
    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    return result_ds.sort_values(['predicted_rating'], ascending=False)

# Predict ratings for user 1 with the default settings and show them with movie info.
result = predictRating(1);
join_movie_info(result)

Unnamed: 0,movieId,title,genres,predicted_rating
106,1080,Monty Python's Life of Brian (1979),Comedy,4.915104
131,2019,Seven Samurai (Shichinin no samurai) (1954),Action|Adventure|Drama,4.875395
134,2324,Life Is Beautiful (La Vita è bella) (1997),Comedy|Drama|Romance|War,4.660577
16,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,4.629133
73,497,Much Ado About Nothing (1993),Comedy|Romance,4.611773
174,3578,Gladiator (2000),Action|Adventure|Drama,4.611430
193,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy,4.567527
...,...,...,...,...
55,344,Ace Ventura: Pet Detective (1994),Comedy,2.442684
160,2805,Mickey Blue Eyes (1999),Comedy|Romance,2.357886


In [19]:
# Evaluation frame: every rating row joined with the mean rating of its movie.
# NOTE: eval_ratings is another name for the same `ratings` object, not a copy.
eval_ratings = ratings
## evaluation
eval_ds = pd.merge(eval_ratings, 
                   ratings[['movieId','rating']].groupby(['movieId']).mean().reset_index(), 
                   on='movieId', how='left')

# After the merge, rating_x is the user's own rating and rating_y is the per-movie mean.
eval_ds = eval_ds.rename(columns= {'rating_x':'rating', 'rating_y':'mean_rating'})

In [20]:
# NOTE: a later cell defines eval_prediction again with the same logic; after
# "Run All" that later definition is the one in effect.
def eval_prediction( predict_users, eval_ds, n_users=50 ):
    """Fill `eval_ds` with predicted ratings and return the rows that got one.

    Parameters
    ----------
    predict_users : iterable of userIds to predict for.
    eval_ds : pandas.DataFrame
        Needs ``userId`` and ``movieId`` columns. It is modified in place:
        ``euclidean`` and ``cosine`` columns are (re)written, holding the
        predicted rating or 0.0 where no prediction exists.
    n_users : int, optional
        Neighbor count handed to ``predictRating``.

    Returns
    -------
    pandas.DataFrame
        The rows of `eval_ds` where ``euclidean + cosine > 0``.
    """
    st = time.time()
    ## update to predict_rating
    distance_functions = [ ('euclidean',distance_euclidean), ('cosine', distance_cosine) ]
    # One (userId, movieId) key per row, computed once and reused for both metrics.
    row_keys = list(zip(eval_ds.userId, eval_ds.movieId))
    for name, func in distance_functions:
        # Collect the predictions in a dict, then fill the column in one pass
        # instead of scanning the whole frame once per predicted movie.
        predictions = {}
        for userId in predict_users:
            for x in predictRating(userId, n_users, func):
                predictions[(userId, x[0])] = x[1]
        # Float default keeps the column dtype float from the start.
        eval_ds[name] = [predictions.get(key, 0.0) for key in row_keys]
    print('elapsed', round(time.time()-st,2), 'sec')
    return eval_ds[eval_ds.euclidean+eval_ds.cosine>0]



In [21]:
def eval_prediction( predict_users, ds,  n_users=50 ):
    """Fill `ds` with predicted ratings and return the rows that got one.

    Parameters
    ----------
    predict_users : iterable of userIds to predict for.
    ds : pandas.DataFrame
        Needs ``userId`` and ``movieId`` columns. It is modified in place:
        ``euclidean`` and ``cosine`` columns are (re)written, holding the
        predicted rating or 0.0 where no prediction exists.
    n_users : int, optional
        Neighbor count handed to ``predictRating``.

    Returns
    -------
    pandas.DataFrame
        The rows of `ds` where ``euclidean + cosine > 0``.
    """
    st = time.time()
    ## update to predict_rating
    distance_functions = [ ('euclidean',distance_euclidean), ('cosine', distance_cosine) ]
    # One (userId, movieId) key per row, computed once and reused for both metrics.
    row_keys = list(zip(ds.userId, ds.movieId))
    for name, func in distance_functions:
        # Collect the predictions in a dict, then fill the column in one pass
        # instead of scanning the whole frame once per predicted movie.
        predictions = {}
        for userId in predict_users:
            for x in predictRating(userId, n_users, func):
                predictions[(userId, x[0])] = x[1]
        # Float default keeps the column dtype float from the start.
        ds[name] = [predictions.get(key, 0.0) for key in row_keys]
    print('elapsed', round(time.time()-st,2), 'sec')
    return ds[ds.euclidean+ds.cosine>0]

In [22]:
## Full list of userIds, taken from the row index of the user x movie matrix
users = UM_matrix_ds.index.tolist()

In [23]:
## Predict ratings for the first 2 users (`users[:2]`), with 100 neighbors each.
## NOTE(review): the original comment said 10 users; the slice takes only 2.
predicted = eval_prediction( users[:2], eval_ds, 100 )

('elapsed', 7.36, 'sec')


In [26]:
def RMSE(X, left_col, right_col):
    """Root-mean-square error between the `left_col` and `right_col` columns of `X`."""
    residual = X[left_col] - X[right_col]
    mean_squared = np.mean(residual ** 2)
    return np.sqrt(mean_squared)

def MAE(X, left_col, right_col):
    """Mean absolute error between the `left_col` and `right_col` columns of `X`."""
    residual = X[left_col] - X[right_col]
    absolute_error = np.absolute(residual)
    return np.mean(absolute_error)


# Report MAE per predictor, using only the rows where that predictor has a value above 0.
for name in ['mean_rating', 'cosine', 'euclidean']:
    print ("MAE of {0} is {1} ".format(name, MAE( predicted[predicted[name] > 0], 'rating', name )))

# Same comparison using RMSE.
for name in ['mean_rating', 'cosine', 'euclidean']:
    print ("RMSE of {0} is {1} ".format(name, RMSE( predicted[predicted[name] > 0], 'rating', name )))


MAE of mean_rating is 0.747331394447 
MAE of cosine is 0.532438445171 
MAE of euclidean is 0.477710512547 
RMSE of mean_rating is 0.887309417815 
RMSE of cosine is 0.638476358644 
RMSE of euclidean is 0.581608714321 
