## word2vec으로 추천 시스템 구현하기 

In [1]:
from __future__ import print_function

from collections import defaultdict

from gensim.models import Word2Vec
import numpy as np
import pandas as pd

import mykmeans as kmeans

In [88]:
## Convert a string to int, falling back to a default for non-numeric input.
def toint(s, default=0):
    """Return ``int(s)``, or ``default`` when ``s`` cannot be converted.

    Args:
        s: Value to convert (typically a string such as ``"1995"``).
        default: Value returned when ``int(s)`` fails.

    Returns:
        The converted integer, or ``default``.
    """
    try:
        return int(s)
    except (ValueError, TypeError, OverflowError):
        # Only conversion failures are treated as "not a number"; anything
        # else (e.g. KeyboardInterrupt) is allowed to propagate.
        return default

class m2v_recsys:
    """Movie recommender that treats movie titles as Word2Vec "words".

    The ratings of each user, ordered by timestamp, form one sentence of
    movie titles. A Word2Vec model over those sentences supplies
    title-to-title similarity, which is used both for recommendations and
    for k-means clustering of the title vectors.
    """

    # Word2Vec model; stays None until load_model() or training() sets it.
    model = None
    # Directory that contains one sub-directory per dataset type.
    home_dir = '/Users/goodvc/Documents/data-analytics/movie-recommendation/'
    # Word2Vec hyper-parameters used by training().
    w2v_env = { 'min_count':5, 'size':100, 'window':5 }
    nn_func = None

    ## initialize
    def __init__(self, ds_type='ml-latest-small'):
        """Remember the dataset type and load its CSV files.

        Args:
            ds_type: Name of the dataset sub-directory under ``home_dir``.
        """
        self.ds_type = ds_type
        self.movieLensDataLoad()

    def _dataset_path(self, name):
        """Return the path of file ``name`` inside the current dataset directory."""
        return '{home}/{type}/{name}'.format(home=self.home_dir, type=self.ds_type, name=name)

    ## dataset load
    def movieLensDataLoad(self, ds_type=None):
        """Read ratings, movies and tags CSV files into DataFrames.

        Sets ``self.ratings``, ``self.movies``, ``self.tags`` and
        ``self.ratings_movie`` (ratings joined with movies on ``movieId``).

        Args:
            ds_type: Optional dataset type that replaces ``self.ds_type``.
        """
        if ds_type is not None:
            self.ds_type = ds_type
        ## per-user movie rating data
        self.ratings = pd.read_csv(self._dataset_path('ratings.csv'))

        ## movie meta (title, genre) data
        self.movies = pd.read_csv(self._dataset_path('movies.csv'))
        ## split title and release year . ex) Nixon (1995) => Nixon , 1995
        # NOTE(review): the title slice below always drops the last seven
        # characters, so a title without a trailing "(YYYY)" is truncated
        # (its year falls back to 1950). Left unchanged because the title
        # strings are the lookup keys of the Word2Vec model.
        self.movies['year'] = self.movies['title'].apply(lambda x: toint(x.strip()[-5:-1], 1950))
        self.movies['title'] = self.movies['title'].apply(lambda x: x[:-7].strip())

        ## tags that users attached to movies
        self.tags = pd.read_csv(self._dataset_path('tags.csv'))
        self.tags = pd.merge(self.tags, self.movies, on=['movieId'])[['userId', 'movieId', 'tag','title', 'timestamp']]

        ## join ratings and movies by movieId
        self.ratings_movie = pd.merge(self.ratings, self.movies, on='movieId')

    def make_sentences(self, threshold=3):
        """Build the training corpus: one list of titles per user.

        Args:
            threshold: Minimum rating a movie needs to be included.

        Returns:
            A list with one entry per user (in ``userId`` group order); each
            entry is that user's qualifying titles sorted by ``timestamp``.
        """
        liked = self.ratings_movie[self.ratings_movie.rating >= threshold]
        return [
            user.sort_values('timestamp')['title'].tolist()
            for _, user in liked.groupby('userId')
        ]

    def load_model(self, model_path):
        """Load a saved Word2Vec model from ``model_path`` into ``self.model``."""
        self.model = Word2Vec.load(model_path)
        print('model loaded', model_path)

    def save_model(self, model_path):
        """Save ``self.model`` to ``model_path``."""
        self.model.save(model_path)

    def training(self):
        """Train a Word2Vec model on the rating sentences if none is set.

        Does nothing when ``self.model`` is already populated.
        """
        if self.model is None:
            # Attribute name (including its spelling) is kept for callers.
            self.scentences = self.make_sentences()
            params = dict(min_count=self.w2v_env['min_count'],
                          window=self.w2v_env['window'])
            try:
                # Newer gensim releases name the dimensionality `vector_size`.
                self.model = Word2Vec(self.scentences,
                                      vector_size=self.w2v_env['size'],
                                      **params)
            except TypeError:
                # Older gensim releases only know the `size` keyword.
                self.model = Word2Vec(self.scentences,
                                      size=self.w2v_env['size'],
                                      **params)
            print('model trained')

    def _keyed_vectors(self):
        """Return the object holding the word vectors (``model.wv`` if present)."""
        return getattr(self.model, 'wv', self.model)

    @staticmethod
    def _first_attr(obj, *names):
        """Return the first attribute of ``obj`` among ``names`` that exists."""
        for name in names:
            value = getattr(obj, name, None)
            if value is not None:
                return value
        raise AttributeError('none of the attributes {0} found'.format(names))

    def nearest_neighbors_by_m2v(self, target, topn=10):
        """Return the ``topn`` titles most similar to ``target``.

        Args:
            target: A title or a list of titles used as positive examples.
            topn: Maximum number of neighbours to return.

        Returns:
            A list of ``(title, similarity)`` pairs; an empty list when no
            model is set or the similarity lookup fails for the given titles.
        """
        if isinstance(target, str):
            target = [target]
        if self.model is None:
            return []
        try:
            return self._keyed_vectors().most_similar(positive=target, topn=topn)
        except (KeyError, ValueError):
            # Best effort: a title the model cannot look up yields no neighbours.
            return []

    def user_interests(self, userId):
        """Return ``[title, rating]`` pairs for every movie ``userId`` rated."""
        return self.ratings_movie[self.ratings_movie.userId==userId][['title','rating']].values.tolist()

    def cal_score(self, title, sim, rating, pre_score):
        """Add the contribution of one (similarity, rating) pair to a score.

        The contribution is the Euclidean norm of ``[sim, rating / 5]``.

        Args:
            title: Candidate title (not used in the computation).
            sim: Similarity between a rated movie and the candidate.
            rating: The user's rating of the rated movie.
            pre_score: Score accumulated so far for the candidate.

        Returns:
            ``pre_score`` plus the new contribution.
        """
        return np.linalg.norm([sim, rating / 5.0]) + pre_score

    def recommend_movies(self, userId, topn=10):
        """Recommend unseen movies for ``userId``.

        Every rated movie votes for its nearest neighbours; votes for the
        same candidate are summed via ``cal_score``.

        Returns:
            A dict with ``'recommended'`` (up to ``topn`` ``(title, score)``
            pairs, highest score first) and ``'ratings'`` (the user's
            ``[title, rating]`` pairs).
        """
        scores = defaultdict(float)
        interests = self.user_interests(userId)
        seen = set(title for title, _ in interests)
        for title, rating in interests:
            for title2, sim in self.nearest_neighbors_by_m2v(title):
                if title2 in seen:
                    continue
                scores[title2] = self.cal_score(title2, sim, rating, scores[title2])
        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return {'recommended': ranked[:topn], 'ratings': interests}

    ####################################
    ## clustering methods
    def clustering(self, k=100, delta=0.00000001, maxiter=100):
        """Cluster the title vectors with ``kmeans.kmeanssample`` (cosine metric).

        Sets ``self.movie_vec``, ``self.clustered_ds`` (columns ``title``,
        ``cid``, ``dist``; sorted by ``cid`` then ``dist``) and
        ``self.movie2cid`` (title -> cluster id).

        Args:
            k: Number of clusters passed to ``kmeanssample``.
            delta: Passed through to ``kmeanssample`` as ``delta``.
            maxiter: Passed through to ``kmeanssample`` as ``maxiter``.

        Returns:
            The ``(centres, index2cid, dist)`` tuple from ``kmeanssample``.
        """
        vectors = self._keyed_vectors()
        # Attribute names differ between gensim releases; try the newer first.
        self.movie_vec = self._first_attr(vectors, 'vectors', 'syn0')
        titles = self._first_attr(vectors, 'index_to_key', 'index2word')
        centres, index2cid, dist = kmeans.kmeanssample(self.movie_vec, k,
                                                       metric = 'cosine',
                                                       delta = delta,
                                                       nsample = 0, maxiter = maxiter,)
        self.clustered_ds = pd.DataFrame(
            list(zip(titles, index2cid, dist)),
            columns=['title', 'cid', 'dist'],
        ).sort_values(['cid', 'dist'], ascending=True)

        self.movie2cid = dict(zip(titles, index2cid))

        return (centres, index2cid, dist)

    def clusterMembersByCId(self, cid):
        """Return the rows of ``self.clustered_ds`` that belong to cluster ``cid``."""
        return self.clustered_ds[self.clustered_ds.cid==cid]

    def clusterMembersByTitle(self, title):
        """Return the members of the cluster that contains ``title``.

        Raises:
            IndexError: If ``title`` is not present in ``self.clustered_ds``.
        """
        cid = self.clustered_ds[self.clustered_ds.title==title]['cid'].values[0]
        return self.clusterMembersByCId(cid)

    def clusterTags(self, cid):
        """Return the five highest-weighted tags of cluster ``cid``.

        Each tag occurrence on a cluster member is weighted by
        ``1 - dist`` of that member, and weights are summed per tag.

        Returns:
            A Series indexed by tag, sorted by summed weight descending.
        """
        ds = pd.merge(self.tags, self.clusterMembersByCId(cid), on='title')
        ds['dist'] = 1 - ds['dist']
        return ds.groupby('tag')['dist'].sum().sort_values(ascending=False)[:5]
    ######################################

    def user_mcate_interests(self, uid, v=False):
        """Return how a user's ratings are distributed over clusters.

        Each rated movie contributes ``rating / 5`` to its cluster (``-1``
        when the title has no entry in ``self.movie2cid``); the totals are
        normalised by the sum of all contributions.

        Args:
            uid: User id to analyse.
            v: When true, also print each cluster with its top three tags.

        Returns:
            A DataFrame indexed by cluster id with a ``score`` column,
            sorted by score descending.
        """
        cid_interests = defaultdict(int)
        total = 0
        for title, rating in self.user_interests(uid):
            weight = rating / 5.0
            cid_interests[self.movie2cid.get(title, -1)] += weight
            total += weight
        interests_ds = pd.DataFrame.from_dict({'score': dict(cid_interests)}, orient='columns')
        interests_ds['score'] = interests_ds['score'] / total
        interests_ds = interests_ds.sort_values('score', ascending=False)

        if v:
            for cid, score in interests_ds.iterrows():
                print(cid,score.values[0],cid,self.clusterTags(cid).index.values[:3])
        return interests_ds

In [89]:
## 20m dataset: load the 'ml-20m' CSV files, then a previously saved
## Word2Vec model instead of training a new one.
m2v_rs_ml20 = m2v_recsys('ml-20m')
m2v_rs_ml20.load_model('./resource/m2v_20m_rs.model')

model loaded ./resource/m2v_20m_rs.model


In [90]:
# Cluster the title vectors into k=500 clusters; this also fills
# m2v_rs_ml20.clustered_ds and m2v_rs_ml20.movie2cid used by the cells below.
(centres, index2cid, dist ) = m2v_rs_ml20.clustering(k=500, maxiter=200, delta=1e-20)

kmeans: X (5000, 100)  centres (500, 100)  delta=1e-20  maxiter=200  metric=cosine
('kmeans: 24 iterations  cluster sizes:', array([ 8,  0, 12,  3,  0, 10,  0, 17,  7,  1, 12,  0,  6, 35,  4,  7, 13,
       13, 15, 17,  5,  7,  1,  9, 12, 14,  4, 16,  8,  2,  3,  8,  8,  1,
        3,  4,  2, 12, 10,  9, 16, 11,  3,  7,  4,  7,  7, 24,  6,  2,  8,
       20,  7, 19,  3,  2,  2, 16,  0,  2,  8,  3,  4,  0,  0,  6, 27, 19,
        4, 40,  4, 15,  3, 26,  9, 18, 38,  5,  5,  6, 18,  9,  0, 11,  7,
        0, 90,  8,  0,  0, 11,  1, 18, 14,  3,  0, 27,  5,  7, 31, 26,  2,
        4,  7,  6, 13,  8, 10, 34, 13,  2,  4, 13,  0,  0,  2,  5, 10,  6,
        0,  3,  4, 29,  6, 11, 38, 12,  6,  1, 35,  3,  2,  7,  6,  2, 16,
        1, 14,  0,  2,  2,  0, 10,  2, 16,  0, 17,  3, 14,  2, 10,  4, 24,
        7,  8, 10,  1,  5,  8, 12, 18, 10, 24, 73,  2,  2, 13,  6,  4,  9,
       20, 16,  0,  7, 40, 13,  2,  7,  8, 26,  1,  2, 18, 59, 14, 50,  7,
        4,  9, 14,  4, 17, 30,  0,  0, 15,  9,  3,

In [91]:
# Show the members of cluster 1.
m2v_rs_ml20.clusterMembersByCId(1)

Unnamed: 0,title,cid,dist
15110,"Man Next Door, The (El hombre de al lado)",1,0.034561
15948,"Die Is Cast, The (La suerte está echada)",1,0.036737


In [92]:
# Show the cluster that contains 'Ju-on: The Grudge'.
m2v_rs_ml20.clusterMembersByTitle('Ju-on: The Grudge')

Unnamed: 0,title,cid,dist
3881,Dark Water (Honogurai mizu no soko kara),399,0.074214
3549,Ju-on: The Grudge,399,0.090691
3024,Ichi the Killer (Koroshiya 1),399,0.094546
3280,Ginger Snaps,399,0.105488
3070,Dog Soldiers,399,0.110228
4216,High Tension (Haute tension) (Switchblade Roma...,399,0.110287
2567,Day of the Dead,399,0.111094
5259,Ringu 2 (Ring 2),399,0.11557
4950,Cannibal Holocaust,399,0.121644
6597,Undead,399,0.127312


In [93]:
# Show the cluster that contains 'Pretty Woman'.
m2v_rs_ml20.clusterMembersByTitle('Pretty Woman')

Unnamed: 0,title,cid,dist
118,There's Something About Mary,90,0.197005
44,Groundhog Day,90,0.254911
72,Four Weddings and a Funeral,90,0.26465
97,Clueless,90,0.28585
179,"American President, The",90,0.309694
69,Titanic,90,0.318048
103,Shakespeare in Love,90,0.322183
157,Austin Powers: International Man of Mystery,90,0.332648
161,American Pie,90,0.33477
73,Sleepless in Seattle,90,0.348089


In [94]:
# Show the cluster that contains 'Star Wars: Episode IV - A New Hope'.
m2v_rs_ml20.clusterMembersByTitle('Star Wars: Episode IV - A New Hope')

Unnamed: 0,title,cid,dist
33,Saving Private Ryan,199,0.23029
66,L.A. Confidential,199,0.25846
3,"Silence of the Lambs, The",199,0.2765
50,One Flew Over the Cuckoo's Nest,199,0.288146
26,Fight Club,199,0.311616
9,Schindler's List,199,0.314797
29,"Sixth Sense, The",199,0.316103
17,American Beauty,199,0.321594
18,Raiders of the Lost Ark (Indiana Jones and the...,199,0.327116
21,Fargo,199,0.333407


In [95]:
# Print up to three of the top tags for each of clusters 100..149.
for cid in range(100, 150):
    top_tags = m2v_rs_ml20.clusterTags(cid).index.tolist()[:3]
    print('cid :', cid, top_tags)

cid : 100 ['Betamax', 'CLV', 'nudity (topless)']
cid : 101 ['Nazis', 'Oh My Goood!', 'Bakshi']
cid : 102 ['zombies', 'anime', 'Hayao Miyazaki']
cid : 103 ['Criterion', 'dreamlike', 'satirical']
cid : 104 ['dystopia', 'mars', 'violence']
cid : 105 ['relationships', 'melancholic', 'reflective']
cid : 106 ['library', 'based on a book', 'trains']
cid : 107 ['comedy', 'inspirational', 'family']
cid : 108 ['christianity', 'gay', 'based on a book']
cid : 109 ['gothic', 'Scary Movies To See on Halloween', 'Vincent Price']
cid : 110 ['Quentin Tarantino', 'twist ending', 'stand-up comedy']
cid : 111 ['Jennifer Aniston', 'Vince Vaughn', 'horrible finish']
cid : 112 ['family', 'stupid', 'dogs']
cid : 113 ['based on a book', 'sci-fi channel', 'bad adaptation']
cid : 114 ['bud spencer', 'slapstick', 'comedy']
cid : 115 ['zombies', 'movie business']
cid : 116 ['christian', 'computers', 'Cold War']
cid : 117 ['father-son relationship', 'Michael Caine', 'assassin']
cid : 118 ['CLV', 'thriller', 'Betama

In [96]:
# Preview the first 10 rows of the tags table (already joined with titles).
m2v_rs_ml20.tags.head(10)

Unnamed: 0,userId,movieId,tag,title,timestamp
0,18,4141,Mark Waters,Head Over Heels,1240597180
1,66193,4141,naive,Head Over Heels,1294446130
2,70201,4141,Mark Waters,Head Over Heels,1308333224
3,89274,4141,Freddie Prinze Jr.,Head Over Heels,1375164095
4,65,208,dark hero,Waterworld,1368150078
5,619,208,Kevin Costner,Waterworld,1187162734
6,2062,208,adventure,Waterworld,1286238272
7,2062,208,dystopia,Waterworld,1286238251
8,2062,208,dystopic future,Waterworld,1286238259
9,2062,208,post-apocalyptic,Waterworld,1286238224


In [97]:
# Show the members of cluster 136.
m2v_rs_ml20.clusterMembersByCId(136)

Unnamed: 0,title,cid,dist
8330,Mike's New Car,136,0.045502
6755,Day & Night,136,0.050482
5030,"Secret of Kells, The",136,0.055877
5817,Summer Wars (Samâ wôzu),136,0.056854
4801,"Illusionist, The (L'illusionniste)",136,0.06439
9494,Balance,136,0.067177
5420,Vincent,136,0.068869
7468,"Town Called Panic, A (Panique au village)",136,0.073623
9379,My Neighbors the Yamadas (Hôhokekyo tonari no ...,136,0.074655
4389,BURN-E,136,0.078547


In [98]:
# Show the members of cluster 90.
m2v_rs_ml20.clusterMembersByCId(90)

Unnamed: 0,title,cid,dist
118,There's Something About Mary,90,0.197005
44,Groundhog Day,90,0.254911
72,Four Weddings and a Funeral,90,0.26465
97,Clueless,90,0.28585
179,"American President, The",90,0.309694
69,Titanic,90,0.318048
103,Shakespeare in Love,90,0.322183
157,Austin Powers: International Man of Mystery,90,0.332648
161,American Pie,90,0.33477
73,Sleepless in Seattle,90,0.348089


In [99]:
# Show the cluster that contains 'Matrix, The'.
m2v_rs_ml20.clusterMembersByTitle('Matrix, The')

Unnamed: 0,title,cid,dist
33,Saving Private Ryan,199,0.23029
66,L.A. Confidential,199,0.25846
3,"Silence of the Lambs, The",199,0.2765
50,One Flew Over the Cuckoo's Nest,199,0.288146
26,Fight Club,199,0.311616
9,Schindler's List,199,0.314797
29,"Sixth Sense, The",199,0.316103
17,American Beauty,199,0.321594
18,Raiders of the Lost Ark (Indiana Jones and the...,199,0.327116
21,Fargo,199,0.333407


In [100]:
# Top five tags of cluster 81, weighted by the summed (1 - dist) of tagged members.
m2v_rs_ml20.clusterTags(81)

tag
multiple storylines    33.120013
suspenseful            20.704029
Al Pacino              20.539514
drugs                  19.864357
horror                 19.000814
Name: dist, dtype: float64

In [101]:
# Distribution of user 10's ratings over clusters: every rated movie adds
# rating / 5 to its cluster (-1 when the title has no cluster), and the
# totals are normalised so the scores sum to 1.
interests = m2v_rs_ml20.user_interests(10)
cid_interests = defaultdict(int)
total = 0
for title, rating in interests:
    weight = rating / 5.0
    cid_interests[m2v_rs_ml20.movie2cid.get(title, -1)] += weight
    total += weight
interests_ds = pd.DataFrame.from_dict({'score': cid_interests}, orient='columns')
interests_ds['score'] = (interests_ds['score'] / total)
# DataFrame.sort was removed from pandas; sort_values is its replacement.
interests_ds.sort_values(['score'], ascending=False, inplace=True)

In [102]:
# Display the per-cluster interest scores computed in the previous cell.
interests_ds

Unnamed: 0,score
277,0.222973
199,0.195946
297,0.114865
14,0.074324
90,0.054054
220,0.054054
74,0.047297
286,0.047297
217,0.040541
51,0.027027


In [105]:
# Per-cluster interest scores for user 20.
m2v_rs_ml20.user_mcate_interests(20)

Unnamed: 0,score
297,0.140625
277,0.135417
98,0.083333
358,0.078125
220,0.072917
90,0.067708
245,0.057292
307,0.052083
14,0.041667
199,0.041667


In [106]:
# Per-cluster interest scores for user 100; v=True also prints each cluster
# with up to three of its top tags.
interests = m2v_rs_ml20.user_mcate_interests(100, v=True)

199 0.159340659341 199 ['twist ending' 'dark comedy' 'psychology']
286 0.126373626374 286 ['ensemble cast' 'father-son relationship' 'Shakespeare']
453 0.0934065934066 453 ['action' 'Jim Carrey' 'sci-fi']
90 0.0879120879121 90 ['comedy' 'surreal' 'sci-fi']
51 0.0714285714286 51 ['quirky' 'Woody Allen' 'politics']
358 0.0604395604396 358 ['dark comedy' 'dystopia' 'quirky']
494 0.0494505494505 494 ['surreal' 'thought-provoking' 'psychology']
366 0.0494505494505 366 ['time travel' 'twist ending' 'Brad Pitt']
381 0.043956043956 381 ['women' 'CLV' 'dramatic']
154 0.0384615384615 154 ['thriller' 'inspirational' 'detective']
9 0.0274725274725 9 ['atmospheric' 'dystopia' 'dreamlike']
220 0.021978021978 220 ['Pixar' 'animation' 'superhero']
337 0.021978021978 337 ['fantasy' 'dragon' 'sci-fi']
125 0.021978021978 125 ['nudity (topless)' 'CLV' 'Nudity (Topless)']
98 0.021978021978 98 ['time travel' 'Jim Carrey' 'mars']
489 0.021978021978 489 ['Star Trek' 'family' 'teen movie']
214 0.0164835164835 

In [107]:
# For each cluster in `interests`, print its id, score and up to three top tags.
for cid, row in interests.iterrows():
    top_tags = m2v_rs_ml20.clusterTags(cid).index.values[:3]
    print(cid, row.values[0], cid, top_tags)

199 0.159340659341 199 ['twist ending' 'dark comedy' 'psychology']
286 0.126373626374 286 ['ensemble cast' 'father-son relationship' 'Shakespeare']
453 0.0934065934066 453 ['action' 'Jim Carrey' 'sci-fi']
90 0.0879120879121 90 ['comedy' 'surreal' 'sci-fi']
51 0.0714285714286 51 ['quirky' 'Woody Allen' 'politics']
358 0.0604395604396 358 ['dark comedy' 'dystopia' 'quirky']
494 0.0494505494505 494 ['surreal' 'thought-provoking' 'psychology']
366 0.0494505494505 366 ['time travel' 'twist ending' 'Brad Pitt']
381 0.043956043956 381 ['women' 'CLV' 'dramatic']
154 0.0384615384615 154 ['thriller' 'inspirational' 'detective']
9 0.0274725274725 9 ['atmospheric' 'dystopia' 'dreamlike']
220 0.021978021978 220 ['Pixar' 'animation' 'superhero']
337 0.021978021978 337 ['fantasy' 'dragon' 'sci-fi']
125 0.021978021978 125 ['nudity (topless)' 'CLV' 'Nudity (Topless)']
98 0.021978021978 98 ['time travel' 'Jim Carrey' 'mars']
489 0.021978021978 489 ['Star Trek' 'family' 'teen movie']
214 0.0164835164835 