In [2]:
# coding: utf-8

import pandas as pd
import numpy as np
from matplotlib import rcParams
import matplotlib.pyplot as plt
from collections import defaultdict
from datetime import datetime
import matplotlib.patches as mpatches
import matplotlib
import time
import math
from operator import itemgetter
from scipy.spatial import distance
%matplotlib inline  

rcParams['font.family'] = 'NanumGothic'
rcParams.update({'font.size': 12})
matplotlib.style.use('ggplot')
pd.options.display.max_rows=14


## 무비 렌즈 데이터로 별점을 예측해 보자 
* User Based 별점 예측 
* Item(Movie) Based 별점 예측

Movie Lens 데이터 로드 
http://grouplens.org/datasets/movielens/

In [3]:
class ub_ressys:
    home_dir = '/Users/goodvc/Documents/data-analytics/movie-recommendation/'
    nn_func = None
    sim_func = None
    
    
    def __init__(self, dataname='ml-latest-small'):
        self.ds_type = dataname
        self.nn_func = self.nearest_neighbor_user
        self.sim_func = self.distance_euclidean
        self.movieLensDataLoad()
        self.UM_matrix_ds = self.ratings.pivot(index='userId', columns='movieId', values='rating')
    
    def movieLensDataLoad(self):
        ## user 영화 별점 data 
        self.ratings = pd.read_csv('{home}/{type}/{name}'.format( home=self.home_dir, type=self.ds_type, name='ratings.csv'))

        ## movie meta(타이트,장르) data 
        self.movies = pd.read_csv('{home}/{type}/{name}'.format( home=self.home_dir, type=self.ds_type, name='movies.csv'))

        ## user가 영화에 tag를 기입한 data
        self.tags = pd.read_csv('{home}/{type}/{name}'.format( home=self.home_dir, type=self.ds_type, name='tags.csv'))


    def compute(self):
        ## 1. U X M vector Matrix를 만든다. 
        self.UM_matrix_ds = self.ratings.pivot(index='userId', columns='movieId', values='rating')
        
        ## pre-clustering
        #self.graph_cut()

    ## filtering movies by rating frequency
    def cut_movie(self, threshold=5):
        movie_rating_count = self.ratings.groupby(['movieId'])['rating'].count()
        except_list = movie_rating_count[movie_rating_count<threshold].index
        self.ratings = self.ratings[(~self.ratings['movieId'].isin(except_list))]
        return (self.ratings.count()[0])
        
    ## distance functions
    def distance_cosine(self, a, b):        return 1-distance.cosine(a,b)
    def disance_corr(self, a, b):           return 1-distance.correlation (a,b)
    def distance_euclidean(self, a, b):     return 1/(distance.euclidean(a,b)+1)
    
    ## none-cluster nn functions
    def nearest_neighbor_user(self, user, topN, simFunc ) :  
        u1 = self.UM_matrix_ds.loc[user].dropna()
        ratedIndex = u1.index
        nn = {}

        ## Brote Force Compute
        for uid, row in self.UM_matrix_ds.iterrows():
            interSectionU1 = []
            interSectionU2 = []
            if uid==user:
                continue
            for i in ratedIndex:
                if False==math.isnan(row[i]):
                    interSectionU1.append(u1[i])
                    interSectionU2.append(row[i])
            interSectionLen = len(interSectionU1)

            ## At least 3 intersection items
            if interSectionLen < 3 :
                continue

            ## similarity functon
            sim = simFunc(interSectionU1,interSectionU2)

            if  math.isnan(sim) == False:
                nn[uid] = sim
        
        ## top N returned         
        return sorted(nn.items(),key=itemgetter(1))[:-(topN+1):-1]

    ## rating prediction by user
    def predictRating(self, userid, nn=100) :
        ## neighboorhood 
        neighbor = self.nn_func(userid, nn, self.sim_func)
        neighbor_id = [id for id,sim in neighbor]

        ## neighboorhood's movie : al least 4 ratings
        neighbor_movie = self.UM_matrix_ds.loc[neighbor_id]\
                        .dropna(1, how='all', thresh = 4 )
        neighbor_dic = (dict(neighbor))
        ret = [] # ['movieId', 'predictedRate']

        ## rating predict by my similarities 
        for movieId, row in neighbor_movie.iteritems():
            jsum, wsum = 0, 0
            for v in row.dropna().iteritems():
                sim = neighbor_dic.get(v[0],0)
                jsum += sim
                wsum += (v[1]*sim)
            ret.append([movieId, wsum/jsum])

        return ret

    
    ## evaluation 
    def evaluation(self, user_list, acc_func, nn=50):
        eval_ds = pd.merge(self.ratings, 
                           self.ratings[['movieId','rating']].groupby(['movieId']).mean().reset_index(), 
                           on='movieId', how='left')
        eval_ds = eval_ds.rename(columns= {'rating_x':'rating', 'rating_y':'mean_rating'})
        st = time.time()
        name = self.sim_func.__name__
        eval_ds[name] = 0
        for userId in user_list:
            for x in self.predictRating(userId, nn):
                eval_ds.loc[(eval_ds.userId==userId) & (eval_ds.movieId==x[0]),name]=x[1]
        print('elapsed', round(time.time()-st,2), 'sec')
        eval_ds = eval_ds[eval_ds[name]>0]
        
        pzn_acc = acc_func(eval_ds, 'rating', name)
        nopzn_acc = acc_func(eval_ds, 'rating', 'mean_rating')
        return (pzn_acc, nopzn_acc, eval_ds)
    
    def graph_cut(self, threshold=3 ):
        st = time.time()
        grouped = self.ratings.groupby('movieId')
        userlink = defaultdict(int)
        for name, g in grouped:
            users = g['userId'].values
            cnt = users.size
            if cnt<2:
                continue

            for n in range(0,cnt):
                for m in range(n+1,cnt):
                    key = "{0}:{1}" if users[n]<users[m] else "{1}:{0}"
                    key = key.format(users[n], users[m])
                    userlink[key] = userlink[key] + 1

        self.neighbors = defaultdict(list)
        n=0
        for k, v in userlink.items():
            if (v) > threshold:
                nm = k.split(':')
                if len(nm) == 2:
                    self.neighbors[int(nm[0])].append(int(nm[1]))
                    self.neighbors[int(nm[1])].append(int(nm[0]))
                    n = n+1
        cnt = len(self.UM_matrix_ds.index)
        return (time.time()-st, n/(cnt**2-cnt))

    def clustered_nearest_neighbor_user(self, user, topN, simFunc ) :  
        u1 = self.UM_matrix_ds.loc[user].dropna()
        ratedIndex = u1.index
        nn = {}

        members = self.neighbors[user]
        ## pre-clustered Compute
        for uid in members:
            interSectionU1 = []
            interSectionU2 = []
            if uid==user:
                continue
            row = self.UM_matrix_ds.loc[uid]
            for i in ratedIndex:
                if False==math.isnan(row[i]):
                    interSectionU1.append(u1[i])
                    interSectionU2.append(row[i])
            interSectionLen = len(interSectionU1)


        ## At least 3 intersection items
            if interSectionLen < 3 :
                continue

            ## similarity functon
            sim = simFunc(interSectionU1,interSectionU2)

            if  math.isnan(sim) == False:
                nn[uid] = sim

        ## top N returned         
        return sorted(nn.items(),key=itemgetter(1))[:-(topN+1):-1]

    ## accuracy measure functions
    def RMSE(self, X, left_col, right_col):
        return(np.sqrt(np.mean((X[left_col] - X[right_col])**2)))
    def MAE(self, X, left_col, right_col):
        return(np.mean(np.absolute(X[left_col] - X[right_col])) )
ub = ub_ressys()

* 별점 예측 객체 생성
 * euclidean distance
 * none-clustered nearest neighborhood


In [4]:
ub = ub_ressys()

In [5]:
ub.nearest_neighbor_user(1,20,ub.distance_euclidean)

[(651, 0.5),
 (591, 0.5),
 (424, 0.5),
 (513, 0.4721359549995794),
 (473, 0.4721359549995794),
 (436, 0.4494897427831781),
 (97, 0.4494897427831781),
 (81, 0.4494897427831781),
 (57, 0.4494897427831781),
 (510, 0.4142135623730951),
 (312, 0.4142135623730951),
 (300, 0.4142135623730951),
 (240, 0.4142135623730951),
 (220, 0.4142135623730951),
 (93, 0.4142135623730951),
 (67, 0.4142135623730951),
 (16, 0.4142135623730951),
 (12, 0.4142135623730951),
 (696, 0.4),
 (288, 0.4)]

In [6]:
## 1번 유저 별점 예측
ub.predictRating(1, nn=50)

[[1, 3.4037056775011036],
 [5, 3.7880346301990366],
 [7, 3.7225688611727579],
 [10, 2.910662569547644],
 [17, 4.2536377944507064],
 [21, 3.876925293640443],
 [34, 3.0091583344113118],
 [47, 4.0919373849714997],
 [50, 4.6724189719748637],
 [58, 3.7184236837779485],
 [62, 3.9431457628786357],
 [95, 3.6917382246426178],
 [110, 4.0068005970832674],
 [111, 4.442582878871959],
 [150, 3.86209117418655],
 [153, 3.3047768682234424],
 [161, 4.0113319573175863],
 [165, 3.6152577223455311],
 [168, 2.9604683881384299],
 [185, 3.2473510611766581],
 [208, 2.8561611500084645],
 [231, 2.0124919938884158],
 [253, 3.5058491490893102],
 [260, 4.399412931301053],
 [292, 3.2736639455223071],
 [296, 4.1727900109849676],
 [300, 3.7809955239502377],
 [316, 3.1872875454393865],
 [318, 4.1387105910850757],
 [329, 3.2918203784146387],
 [339, 3.0885877480656339],
 [342, 3.3831281861639639],
 [344, 2.3879777703823182],
 [349, 3.4906326740258118],
 [356, 3.8901807269769675],
 [364, 4.0067903364233546],
 [380, 3.6956

In [7]:
## 1번 유저 별점 예측
len(ub.predictRating(1, nn=100))

207

In [9]:
ub.UM_matrix_ds.columns.size

8552

In [10]:
print('cut ratings', ub.cut_movie(5))
ub.compute()

('cut ratings', 90637)


In [11]:
ub.UM_matrix_ds.columns.size

3313

In [12]:
## nn 개수별 별점 예측 개수
st = time.time()
for n in range(10,100,10):
    print(n, len(ub.predictRating(3,nn=n)))
time.time()-st

(10, 2)
(20, 36)
(30, 80)
(40, 128)
(50, 184)
(60, 246)
(70, 311)
(80, 359)
(90, 443)


4.826746940612793

In [13]:
## pre-clustered nearest_neighbor_user
ub.nn_func = ub.clustered_nearest_neighbor_user
ub.graph_cut(6)

(9.373823165893555, 0)

In [14]:
## nn 개수별 별점 예측 개수
ub.nn_func = ub.clustered_nearest_neighbor_user
st = time.time()
for u in range(1,2):
    st2=time.time()
    print(u, (ub.predictRating(u,nn=50)), time.time()-st2)
print('taken', time.time()-st)



(1, [[1, 3.6659365413392324], [2, 3.7369560639416313], [10, 3.3343617246061665], [11, 3.9778737904215093], [19, 2.3621866627634303], [21, 3.9179601446218735], [32, 3.4763016012919894], [34, 3.3340621819964396], [39, 3.7777611063068943], [47, 4.1710628138788461], [50, 4.7582776848172736], [95, 3.7115984699670603], [110, 4.1397831513784551], [150, 3.8185540299857821], [151, 4.5143725434245949], [153, 3.1611595952776819], [161, 4.0962776189452308], [165, 3.4588307195759791], [185, 3.2512322007955143], [208, 2.8591748732125688], [231, 2.8082156327059846], [253, 3.2682412344852949], [260, 4.5240059486457742], [266, 3.5752360382741895], [288, 3.6736983374400136], [292, 3.9055887417863406], [293, 4.4759040790385223], [296, 4.1414279728151051], [300, 3.5851854362955442], [316, 3.4812607288900952], [317, 3.2328643492905784], [318, 3.9467671592998643], [329, 3.4043843393431286], [337, 3.7911061307195917], [339, 3.1124983787432505], [344, 2.5871970856390649], [349, 3.7743821230124137], [350, 4.0]

In [15]:
## nn 개수별 별점 예측 개수
ub.nn_func = ub.nearest_neighbor_user
st = time.time()
for u in range(1,2):
    st2=time.time()
    print(u, (ub.predictRating(u,nn=50)), time.time()-st2)
print('taken', time.time()-st)

(1, [[1, 3.4037056775011036], [5, 3.7880346301990366], [7, 3.7225688611727579], [10, 2.910662569547644], [17, 4.2536377944507064], [21, 3.876925293640443], [34, 3.0091583344113118], [47, 4.0919373849714997], [50, 4.6724189719748637], [58, 3.7184236837779485], [62, 3.9431457628786357], [95, 3.6917382246426178], [110, 4.0068005970832674], [111, 4.442582878871959], [150, 3.86209117418655], [153, 3.3047768682234424], [161, 4.0113319573175863], [165, 3.6152577223455311], [168, 2.9604683881384299], [185, 3.2473510611766581], [208, 2.8561611500084645], [231, 2.0124919938884158], [253, 3.5058491490893102], [260, 4.399412931301053], [292, 3.2736639455223071], [296, 4.1727900109849676], [300, 3.7809955239502377], [316, 3.1872875454393865], [318, 4.1387105910850757], [329, 3.2918203784146387], [339, 3.0885877480656339], [342, 3.3831281861639639], [344, 2.3879777703823182], [349, 3.4906326740258118], [356, 3.8901807269769675], [364, 4.0067903364233546], [380, 3.695661087251719], [434, 2.8520608474

In [16]:
ub.UM_matrix_ds

movieId,1,2,3,4,5,6,7,8,9,10,...,109487,109673,110102,111360,111362,111759,112556,112623,112852,115617
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,2,,,,,...,,,,,,,,,,
2,,4,,,,,,,,,...,,,,,,,,,,
3,,,4,,4,,4,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,3.5,,2.5,4,3.5,3.5,3.5,3.5,4.5
7,5,4,,,,,,,,4,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700,5,,,,,3,,,,,...,,,,,,,,,,
701,5,,,,,,,,,,...,,,,,,,,,,


In [18]:
#ub.nearest_neighbor_user(10,20, ub.distance_euclidean)
result = ub.evaluation(ub.UM_matrix_ds.index.values.tolist()[:10],ub.RMSE, 50)

('elapsed', 11.58, 'sec')


In [19]:
ub.RMSE(result[2],'rating', 'distance_euclidean')

0.4018427374962063

In [20]:
pd.options.display.max_columns=7
ub.UM_matrix_ds

movieId,1,2,3,...,112623,112852,115617
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,,,,...,,,
2,,4,,...,,,
3,,,4,...,,,
4,,,,...,,,
5,,,,...,,,
6,,,,...,3.5,3.5,4.5
7,5,4,,...,,,
...,...,...,...,...,...,...,...
700,5,,,...,,,
701,5,,,...,,,


In [21]:
ub.UM_matrix_ds = ub.UM_matrix_ds.T

In [22]:
ub.nearest_neighbor_user(54259,50,ub.distance_cosine)

[(6947, 1.0),
 (2145, 0.99995783704014163),
 (2144, 0.99995783704014163),
 (106100, 0.99989620993243),
 (2085, 0.99985131216927714),
 (5254, 0.99972310673704268),
 (1103, 0.99911540417554445),
 (3273, 0.99893660984868549),
 (40629, 0.99889001597131388),
 (2080, 0.99881305596152137),
 (48043, 0.99878588622519093),
 (81229, 0.99876543588546995),
 (920, 0.99872404752904342),
 (55267, 0.99868711846396474),
 (97304, 0.99868711846396474),
 (5791, 0.99864998121114379),
 (89804, 0.99863190418510939),
 (80549, 0.99859959726201575),
 (38038, 0.99858912779193576),
 (916, 0.99855642843062054),
 (87869, 0.99855404816674154),
 (3448, 0.99852398443533918),
 (55290, 0.99850675271343292),
 (53129, 0.9984319503336242),
 (3510, 0.9984303527440922),
 (1673, 0.99842372904805865),
 (56788, 0.99842372904805865),
 (4881, 0.99835585799055382),
 (84392, 0.99835323666490849),
 (431, 0.9982549042119192),
 (78039, 0.99820936589592835),
 (45431, 0.99820936589592835),
 (1213, 0.99819686764214333),
 (4161, 0.99813025