## word2vec으로 추천 시스템 구현하기 

In [1]:
from gensim.models import Word2Vec
import pandas as pd
from collections import defaultdict
import numpy as np
import mykmeans as kmeans

In [13]:
## string to int with ignore none disit
def toint(s,default=0):
    converted = default
    try :
        converted = int(s)
    except:
        #print('string convert exception "{0}" (default={1})"'.format(s , default))
        pass
    return converted

class m2v_recsys:
    model = None
    home_dir = '/Users/goodvc/Documents/data-analytics/movie-recommendation/'
    w2v_env = { 'min_count':5, 'size':100, 'window':5 }
    nn_func = None

    ## initialize
    def __init__(self, ds_type='ml-latest-small'):
        self.ds_type = ds_type
        self.movieLensDataLoad()

    ## dataset load
    def movieLensDataLoad(self, ds_type=None):
        if ds_type != None:
            self.ds_type = ds_type
        ## user 영화 별점 data 
        self.ratings = pd.read_csv('{home}/{type}/{name}'.format( home=self.home_dir, type=self.ds_type, name='ratings.csv'))

        ## movie meta(타이트,장르) data 
        self.movies = pd.read_csv('{home}/{type}/{name}'.format( home=self.home_dir, type=self.ds_type, name='movies.csv'))
        ## split title and release year . ex) Nixon (1995) => Nixon , 1995
        self.movies['year'] = self.movies['title'].apply(lambda x: toint(x.strip()[-5:-1], 1950))
        self.movies['title'] = self.movies['title'].apply(lambda x: x[:-7].strip())
        
        ## user가 영화에 tag를 기입한 data
        self.tags = pd.read_csv('{home}/{type}/{name}'.format( home=self.home_dir, type=self.ds_type, name='tags.csv'))

        ## join ratings and movies by movieId 
        self.ratings_movie = pd.merge( self.ratings, self.movies, on='movieId' )
    
    def load_model(self, model_path):
        self.model = Word2Vec.load(model_path)
        print('model loaded', model_path)
        
    def save_model(self, model_path):
        self.model.save(model_path)
 

    def training(self):
        if self.model == None :
            self.scentences = self.make_sentences()
            self.model = Word2Vec(self.scentences, 
                                  min_count = self.w2v_env['min_count'],
                                  size = self.w2v_env['size'], 
                                  window = self.w2v_env['window'])
            print('model trained')
            
    def nearest_neighbors_by_m2v(self, target, topn=10):
        if str == type(target):
            target=[target]
        result = []
        try:
            result = self.model.most_similar(positive=target, topn=topn)
        except:
            pass
        return result

    def user_interests(self, userId):
        return self.ratings_movie[self.ratings_movie.userId==userId][['title','rating']].values.tolist()
    
    def cal_score(self, title, sim, rating, pre_score):
        score = np.linalg.norm([sim, rating/5])
        score = score+pre_score
        return score
    
    def recommend_movies(self, userId, topn=10):
        movies = defaultdict(float)
        interests = self.user_interests(userId)
        seenMovies = set( m for m,s in interests )
        for title, rating in interests:
            neighbors = self.nearest_neighbors_by_m2v(title)
            for (title2, sim) in neighbors:
                if title2 in seenMovies:
                    continue
                pre = movies.get(title2, 0.0)
                movies[title2] = self.cal_score( title2, sim, rating, pre ) 
        return {'recommended' : sorted(movies.items(), key=lambda x: x[1], reverse=True)[:topn]
                ,'ratings':interests}

    ####################################
    ## clustering methods
    def clustering(self, k=100, delta=0.00000001, maxiter=100):
        self.movie_vec = self.model.syn0
        centres, index2cid, dist = kmeans.kmeanssample(self.movie_vec, k, 
                                                       metric = 'cosine', 
                                                       delta = delta, 
                                                       nsample = 0, maxiter = maxiter,)
        self.clustered_ds = pd.DataFrame( [ (a, b, c) for a, b, c in zip(self.model.index2word, index2cid, dist )],
                     columns=['title', 'cid', 'dist'] ).sort(['cid','dist'], ascending=True)
        
        self.movie2cid = { movie:cid for movie,cid in zip(self.model.index2word, index2cid) }
        
        return (centres, index2cid, dist)
    
    def clusterMembersByCId(self, cid):
        return self.clustered_ds[self.clustered_ds.cid==cid]
    
    def clusterMembersByTitle(self, title):
        cid = self.clustered_ds[self.clustered_ds.title==title]['cid'].values[0]
        return self.clusterMembersByCId(cid) 
    
    def clusterTags(self, cid):
        ds = pd.merge(self.tags
                 , self.clusterMembersByCId(cid)
                 , on='title')
        ds['dist'] = 1 - ds['dist']
        cid_tags = ds.groupby(['tag'])['dist'].sum().sort(inplace=False,ascending=False)[:5]
        return cid_tags
    ######################################
    
    def user_mcate_interests(self, uid, v=False):
        interests = self.user_interests(uid)
        cid_interests = defaultdict(int)
        total = 0
        for title, rating in interests:
            cid_interests[self.movie2cid.get(title, -1)] += rating/5
            total += (rating/5)
        interests_ds = pd.DataFrame. from_dict({'score':cid_interests},orient='columns')
        interests_ds['score'] = round(interests_ds['score'] / total,5)
        interests_ds.sort(['score'], ascending=False, inplace=True)
        
        if True==v:
            for cid, score in interests_ds.iterrows():
                print(cid,score.values[0],cid,self.clusterTags(cid).index.values[:3])
        return interests_ds

In [14]:
## 20m dataset
m2v_rs_ml20 = m2v_recsys('ml-20m')
m2v_rs_ml20.load_model('./resource/m2v_rs.ml20.model')

model loaded ./resource/m2v_rs.ml20.model


In [12]:
m2v_rs_ml20.load_model('./resource/m2v_rs.ml20.model')

AttributeError: 'm2v_recsys' object has no attribute 'load_model'

In [15]:
(centres, index2cid, dist ) = m2v_rs_ml20.clustering(k=500, maxiter=200, delta=1e-20)

kmeans: X (5000, 100)  centres (500, 100)  delta=1e-20  maxiter=200  metric=cosine
kmeans: 20 iterations  cluster sizes: [ 0  5 12 16 19 21 10 12  4  0  4  1 19 17  3  1 20 14 10 11  5 11 14  1 14
  3  2  8  2  1  4  6 11  2 17  4 11 28 23  2 27  6 28 21 14  5  8  2  1  3
 13  8  7  0  0 22  4 14  8  0  1 12  1 18  5  3  3  3  9 29 11  0 20  7 11
  6  2 25 14  1  2  7 10  9 23  0 35  1  6 23  1  1 17 17  9 16 11  5 14 20
 10 10  7  2  5  1  6 13 14  5  9  0 39 39 16  5 19 11  5  3  1 33 16 10  2
  6  7 18 18  3  9  6 17 12 12  3  4  8 13  9  1  7 11 23  8  5 10 10  5 18
  3  1  2 13 12  1  4  1  3 10 13 14  3 15  1  4 15  0 15 10 10  9  0  5 15
  3  2  5  9 16  2  4  0  9  4  1 10  1 11  5 29 28  3  6  5  8  0 17  4  1
  4 12  8  8 11 24 15  4 15  0  9 11  2 13  8  2  8  4 10  2  4 10 12 22 13
  3 27 13 13 11  1 11 13  9  3  8  6 20  4  0 16 28  8  5  8 21  4  7  8  1
  0 12 28 12 25  9 10  1  8 13  7 14  4  8 25 13  5  7 15 18 18 19 31 17  8
 35 16 10 14 17  4  1  7  2 10  5  2  5  1 

In [17]:
m2v_rs_ml20.clusterMembersByCId(100)

Unnamed: 0,title,cid,dist
10737,"Other Side of the Bed, The (Otro lado de la ca...",100,0.037987
7028,"Pinochet Case, The (Cas Pinochet, Le)",100,0.038202
2668,Yossi & Jagger,100,0.04007
2937,Together (Han ni Zai Yiki),100,0.041565
12551,Chain of Fools,100,0.047418
5919,"Mother, The",100,0.057602
9916,One Week,100,0.058088
1697,"Wild Geese, The",100,0.059159
11660,I'm Going Home (Je rentre à la maison),100,0.061242
9172,Maborosi (Maboroshi no hikari),100,0.066985


In [19]:
m2v_rs_ml20.clusterMembersByTitle('Pretty Woman')

Unnamed: 0,title,cid,dist
1876,"Firm, The",183,0.129432
4569,While You Were Sleeping,183,0.156543
1060,Outbreak,183,0.181285
3404,Crimson Tide,183,0.18847
8326,Interview with the Vampire: The Vampire Chroni...,183,0.191397
10166,Ghost,183,0.200918
6503,Four Weddings and a Funeral,183,0.215734
11935,Sleepless in Seattle,183,0.216425
7893,GoldenEye,183,0.224881
2799,Clueless,183,0.231373


In [20]:
m2v_rs_ml20.clusterMembersByTitle('Nixon')

Unnamed: 0,title,cid,dist
11792,Georgia,168,0.028789
3912,Moll Flanders,168,0.034122
8415,Before and After,168,0.051605
10120,Once Upon a Time... When We Were Colored,168,0.056969
391,"Cry, the Beloved Country",168,0.069256
10800,I Shot Andy Warhol,168,0.072733
6401,Stealing Beauty,168,0.072948
10451,"Chamber, The",168,0.076375
8303,Last Dance,168,0.078322
9375,"Crossing Guard, The",168,0.07857


In [22]:
for cid in range(100,130):
    print('cid :',cid,m2v_rs_ml20.clusterTags(cid).index.values[:3])

KeyError: 'title'

In [605]:
m2v_rs_ml20.clusterMembersByCId(90)

Unnamed: 0,title,cid,dist
11983,Pirates Who Don't Do Anything: A VeggieTales M...,90,0.023139
1873,Full of It,90,0.056454
11161,Billabong Odyssey,90,0.076041
13089,Baggage Claim,90,0.076166
1791,Hell Comes to Frogtown,90,0.076448
10915,MacGyver: Lost Treasure of Atlantis,90,0.087512
1031,"Pebble and the Penguin, The",90,0.090192
4074,Killer Bean 2: The Party,90,0.092154
3504,Super Sucker,90,0.092339
12435,"Blackout, The",90,0.094893


In [606]:
m2v_rs_ml20.clusterMembersByTitle('Matrix, The')

Unnamed: 0,title,cid,dist
3000,"Sixth Sense, The",342,0.145427
8826,"Matrix, The",342,0.146262
3664,Saving Private Ryan,342,0.166291
8681,"Princess Bride, The",342,0.20989
13095,L.A. Confidential,342,0.230251
11742,American Beauty,342,0.233694
12119,Good Will Hunting,342,0.247404
10507,Monty Python and the Holy Grail,342,0.250881
11952,Fight Club,342,0.265294
6461,Raiders of the Lost Ark (Indiana Jones and the...,342,0.280892


In [497]:
m2v_rs.clustering()

kmeans: X (1000, 100)  centres (100, 100)  delta=1e-08  maxiter=100  metric=cosine
kmeans: 11 iterations  cluster sizes: [ 4  8  4  9  5 10  5 24 11 86  3  5  4  3  6  4 10  2  5  9  5 51  3  9 25
  4 16  2 11  6  6  5 15  4  7 10  4  8  7 15  6  5  4  5  3  9  6  7 22 14
  8 10  6  5  7  9 64 12  3  9 12  3  4  3  8 11 32  4  4  7  2  9  6 10 11
  3  6 31  9  6 10  8 16  7  8  6 11  6 16  6 14  7  9  7  8  5  7  9  8  7]
kmeans: X (1099, 100)  centres (100, 100)  delta=1e-08  maxiter=100  metric=cosine
kmeans: 6 iterations  cluster sizes: [  5   8   5  10   6  12   5  23  12 101   3   8   4   5   7   5  10   5
   5  10   5  57   4  11  25   5  17   2  11   7   7   6  16   4   7  11
   5   8   7  16   8   6   6   5   3  10   6   7  23  15   8  10   6   5
  10   9  66  12   3  10  13   4   4   3   8  11  33   4   4   8   2  10
   6  14  11   5   6  34   9   6  13   9  18   8   9   6  12   6  17   6
  14   8  11   8   8   5   7  13   9   9]


(array([ 9,  9,  5, ..., 10, 56, 88]),
 array([[ -2.05094717e-03,   9.53356270e-04,   9.24346023e-05, ...,
          -1.03916787e-03,   1.68464263e-03,   1.30630215e-03],
        [  2.97280902e-04,  -1.09126291e-03,   1.41588598e-03, ...,
          -1.52803003e-03,  -1.86778745e-03,   8.85844813e-04],
        [  3.26073426e-03,  -1.84293871e-03,   1.99179118e-03, ...,
           8.54265410e-04,   2.14851781e-04,  -3.81570775e-04],
        ..., 
        [  1.34107075e-03,   1.44365162e-03,   4.71084175e-04, ...,
           2.47493805e-03,   9.32482071e-04,  -3.76198877e-04],
        [ -1.88021036e-03,  -2.52586673e-04,  -3.20180971e-03, ...,
          -2.15061867e-04,  -1.76366209e-03,   1.82774977e-03],
        [  2.44270195e-03,   1.25406054e-03,   2.93661375e-03, ...,
           2.71253451e-03,   2.64538336e-03,  -4.71767766e-04]], dtype=float32))

In [629]:
m2v_rs_ml20.clusterTags(81)

tag
stupid              8.137575
high school         7.826824
Robert Rodriguez    5.926682
Crappy Remake       4.857144
based on a book     4.464471
Name: dist, dtype: float64

In [666]:
interests = m2v_rs_ml20.user_interests(10)
cid_interests = defaultdict(int)
total = 0
for title, rating in interests:
    cid_interests[m2v_rs_ml20.movie2cid.get(title, -1)] += rating/5
    total += (rating/5)
interests_ds = pd.DataFrame. from_dict({'score':cid_interests},orient='columns')
interests_ds['score'] = (interests_ds['score'] / total)
interests_ds.sort(['score'], ascending=False, inplace=True)

#for cid in cid_interests.keys():
#    cid_interests[cid] = round(cid_interests[cid]/total,5)

Unnamed: 0,score
487,0.222973
480,0.189189
299,0.101351
125,0.087838
441,0.074324
322,0.054054
452,0.047297
105,0.027027
265,0.027027
277,0.027027


In [670]:
m2v_rs_ml20.user_mcate_interests(1)

Unnamed: 0,score
15,0.25
363,0.162162
275,0.128378
226,0.094595
5,0.047297
142,0.047297
235,0.047297
143,0.027027
154,0.027027
212,0.027027


In [690]:
interests = m2v_rs_ml20.user_mcate_interests(100, v=True)

TypeError: type Series doesn't define __round__ method

In [691]:
for cid, score in interests.iterrows():
    print(cid,score.values[0],cid,m2v_rs_ml20.clusterTags(cid).index.values[:3])

280 0.128378378378 280 ['horror' 'Betamax' 'slasher']
128 0.121621621622 128 ['nostalgic' 'espionage' 'karate']
296 0.108108108108 296 ['Betamax' 'nudity (topless)' 'BD-R']
251 0.0743243243243 251 []
52 0.0608108108108 52 ['BD-R' 'less than 300 ratings' 'blaxploitation']
79 0.0608108108108 79 ['stranded' 'wartime' 'soldier']
5 0.0540540540541 5 ['less than 300 ratings' 'lawyer as protagonist?' 'Legal']
101 0.0540540540541 101 ['coming of age' 'quirky' 'stylized']
481 0.0472972972973 481 ['Hugh Dancy' 'Rose Byrne' 'asperger syndrome']
413 0.0405405405405 413 ['Hulk Hogan' 'franchise' 'Thanatos!']
126 0.027027027027 126 ['Buster Keaton' 'DVD-R' 'Eddie Izzard']
6 0.027027027027 6 ['Howard Hughes' 'perrot library' 'motorcycle']
110 0.027027027027 110 ['drugs' 'Johnny Depp' 'poker']
278 0.027027027027 278 ['funny' 'beautiful' 'cliche']
292 0.027027027027 292 ['treasure' 'family' 'supernatural']
376 0.027027027027 376 ['superhero' 'Johnny Depp' 'Will Smith']
437 0.027027027027 437 []
86 0.02

In [674]:
interests

Unnamed: 0,score
15,0.25
363,0.162162
275,0.128378
226,0.094595
5,0.047297
142,0.047297
235,0.047297
143,0.027027
154,0.027027
212,0.027027
