## word2vec으로 추천 시스템 구현하기 

In [22]:
from gensim.models import Word2Vec
import pandas as pd
from collections import defaultdict
import numpy as np
from __future__ import print_function

In [2]:
## Convert a string to an int, falling back to a default for non-numeric input.
def toint(s, default=0):
    """Return ``int(s)``, or ``default`` when ``s`` cannot be converted.

    Args:
        s: Value to convert; typically a string such as ``'1995'``.
        default: Value returned unchanged when the conversion fails.

    Returns:
        The converted integer, or ``default``.
    """
    try:
        return int(s)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are expected here. Anything else
        # (KeyboardInterrupt, SystemExit, ...) must propagate instead of
        # being silently turned into the default.
        return default

class m2v_recsys:
    """Movie recommender built on Word2Vec ("movie2vec").

    Each user's rated movies, ordered by rating timestamp, form one
    "sentence" whose "words" are movie titles. A Word2Vec model trained on
    those sentences is then queried for titles similar to the ones a user
    has already rated.
    """

    # Trained or loaded Word2Vec model; None until training()/load_model().
    model = None
    # Root directory holding one sub-directory per MovieLens dataset.
    home_dir = '/Users/goodvc/Documents/data-analytics/movie-recommendation/'
    # Word2Vec hyper-parameters used by training().
    w2v_env = { 'min_count':5, 'size':100, 'window':5 }
    # Not used by any method of this class.
    nn_func = None

    def __init__(self, ds_type='ml-latest-small', home_dir=None):
        """Load the dataset named ``ds_type``.

        Args:
            ds_type: Name of the dataset sub-directory under ``home_dir``.
            home_dir: Optional override for the class-level ``home_dir``.
        """
        if home_dir is not None:
            self.home_dir = home_dir
        self.ds_type = ds_type
        self.movieLensDataLoad()

    def _read_csv(self, name):
        """Read ``<home_dir>/<ds_type>/<name>`` into a DataFrame."""
        return pd.read_csv('{home}/{type}/{name}'.format(
            home=self.home_dir, type=self.ds_type, name=name))

    @staticmethod
    def _split_title_year(raw, default_year=1950):
        """Split ``'Nixon (1995)'`` into ``('Nixon', 1995)``.

        Titles that do not end in a parenthesised number are returned
        whole (stripped) together with ``default_year``, instead of having
        their tail cut off.
        """
        text = raw.strip()
        if len(text) >= 6 and text[-6] == '(' and text[-1] == ')':
            try:
                return text[:-6].strip(), int(text[-5:-1])
            except ValueError:
                # Parenthesised suffix is not a number; keep the full title.
                pass
        return text, default_year

    def movieLensDataLoad(self, ds_type=None):
        """Load ratings, movies and tags, and join ratings with movie metadata.

        Args:
            ds_type: Optional dataset name; when given it replaces
                ``self.ds_type`` before loading.

        Sets ``self.ratings``, ``self.movies`` (with ``title`` stripped of
        its year and a new ``year`` column), ``self.tags`` and
        ``self.ratings_movie``.
        """
        if ds_type is not None:
            self.ds_type = ds_type

        # Per-user movie ratings.
        self.ratings = self._read_csv('ratings.csv')

        # Movie metadata (title, genres); split the title from its year.
        self.movies = self._read_csv('movies.csv')
        parsed = [self._split_title_year(t) for t in self.movies['title']]
        self.movies['year'] = [year for _, year in parsed]
        self.movies['title'] = [title for title, _ in parsed]

        # Tags that users attached to movies.
        self.tags = self._read_csv('tags.csv')

        # Join ratings and movies by movieId.
        self.ratings_movie = pd.merge(self.ratings, self.movies, on='movieId')

    def make_sentences(self, threshold=3):
        """Build the training corpus.

        Args:
            threshold: Minimum rating for a movie to be included.

        Returns:
            One list of titles per user, ordered by rating timestamp.
        """
        liked = self.ratings_movie[self.ratings_movie.rating >= threshold]
        # DataFrame.sort() no longer exists in pandas; sort_values() is its
        # replacement.
        return [
            group.sort_values('timestamp')['title'].tolist()
            for _, group in liked.groupby('userId')
        ]

    def load_model(self, model_path):
        """Load a previously saved Word2Vec model from ``model_path``."""
        self.model = Word2Vec.load(model_path)
        print('model loaded', model_path)

    def save_model(self, model_path):
        """Save the current model to ``model_path``."""
        self.model.save(model_path)

    def training(self):
        """Train a Word2Vec model on the corpus; no-op if a model already exists."""
        if self.model is not None:
            return
        # Attribute name (including its spelling) is kept for compatibility.
        self.scentences = self.make_sentences()
        params = {
            'min_count': self.w2v_env['min_count'],
            'window': self.w2v_env['window'],
        }
        try:
            # gensim >= 4.0 names the embedding dimensionality `vector_size`.
            self.model = Word2Vec(self.scentences,
                                  vector_size=self.w2v_env['size'],
                                  **params)
        except TypeError:
            # Older gensim releases only accept `size`.
            self.model = Word2Vec(self.scentences,
                                  size=self.w2v_env['size'],
                                  **params)
        print('model trained')

    def nearest_neighbors_by_m2v(self, target, topn=10):
        """Return the ``topn`` titles most similar to ``target``.

        Args:
            target: A single title or a list of titles.
            topn: Number of neighbours to return.

        Returns:
            List of ``(title, similarity)`` pairs; empty when no model is
            available or a title is not in the model vocabulary.
        """
        if self.model is None:
            return []
        if isinstance(target, (str, type(u''))):
            target = [target]
        # Newer gensim exposes similarity queries on `model.wv`; fall back
        # to the model itself for releases that predate that attribute.
        vectors = getattr(self.model, 'wv', self.model)
        try:
            return vectors.most_similar(positive=target, topn=topn)
        except (KeyError, ValueError):
            # Unknown title (KeyError) or empty query (ValueError):
            # best-effort, no neighbours.
            return []

    def user_interests(self, userId):
        """Return ``[title, rating]`` pairs for every movie ``userId`` rated."""
        rated = self.ratings_movie[self.ratings_movie.userId == userId]
        return rated[['title', 'rating']].values.tolist()

    def cal_score(self, title, sim, rating, pre_score):
        """Add this neighbour's contribution to a candidate's running score.

        The contribution is the Euclidean norm of ``(sim, rating / 5)``.
        ``title`` is accepted but not used in the computation.
        """
        # 5.0 keeps this a true division even for integer ratings.
        return np.linalg.norm([sim, rating / 5.0]) + pre_score

    def recommend_movies(self, userId, topn=10):
        """Recommend unseen movies for ``userId``.

        Every movie the user rated contributes its nearest neighbours;
        candidates the user has already rated are skipped and scores are
        accumulated per candidate.

        Returns:
            Dict with ``'recommended'`` (up to ``topn`` ``(title, score)``
            pairs, highest score first) and ``'ratings'`` (the user's
            ``[title, rating]`` pairs).
        """
        scores = defaultdict(float)
        interests = self.user_interests(userId)
        seen = {title for title, _ in interests}
        for title, rating in interests:
            for candidate, sim in self.nearest_neighbors_by_m2v(title):
                if candidate in seen:
                    continue
                scores[candidate] = self.cal_score(
                    candidate, sim, rating, scores[candidate])

        ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
        return {'recommended': ranked[:topn], 'ratings': interests}

In [26]:
## Create the recommender on the 'ml-latest-small' dataset
## (the 'ml-20m' variant is left commented out).
#m2v_rs = m2v_recsys('ml-20m')
m2v_rs = m2v_recsys('ml-latest-small')

In [27]:
## Train the Word2Vec model on the loaded dataset
## (training() does nothing if a model is already set).
m2v_rs.training()

model trained


In [28]:
## Save the current model to disk.
m2v_rs.save_model('./resource/m2v_small_rs.model')

In [29]:
## Load the model back from the file saved above.
m2v_rs.load_model('./resource/m2v_small_rs.model')

('model loaded', './resource/m2v_small_rs.model')


In [30]:
## The 20 movies most similar to 'Pretty Woman'.
m2v_rs.nearest_neighbors_by_m2v(['Pretty Woman'],topn=20)

[('Ghost', 0.9997854232788086),
 ('Disclosure', 0.9996465444564819),
 ('Get Shorty', 0.9988297820091248),
 ('Sleepless in Seattle', 0.998803973197937),
 ('Clueless', 0.9983817934989929),
 ('Four Weddings and a Funeral', 0.9983331561088562),
 ('Babe', 0.9982548952102661),
 ('Santa Clause, The', 0.9975646138191223),
 ('Legends of the Fall', 0.9975560903549194),
 ('Addams Family Values', 0.9973928332328796),
 ('Mask, The', 0.9971521496772766),
 ('Mrs. Doubtfire', 0.997077465057373),
 ('Saving Private Ryan', 0.9968827962875366),
 ('Ace Ventura: When Nature Calls', 0.9968212842941284),
 ('Birdcage, The', 0.9964831471443176),
 ('Beverly Hills Cop III', 0.9964747428894043),
 ('E.T. the Extra-Terrestrial', 0.9964015483856201),
 ('Usual Suspects, The', 0.9961400628089905),
 ('Broken Arrow', 0.9959720373153687),
 ('Men in Black (a.k.a. MIB)', 0.9957752227783203)]

In [31]:
## Recommend 10 movies for user 10, then print the recommendations
## followed by the movies that user has rated.
ret = m2v_rs.recommend_movies(10,topn=10)
print('------------------------------')
print('* 추천영화\n', ret['recommended'])
print('------------------------------')
print('* 별점 매긴영화\n', ret['ratings'])

------------------------------
* 추천영화
 [('Hamlet', 50.895365502762331), ("William Shakespeare's Romeo + Juliet", 42.190276807646711), ('Thomas Crown Affair, The', 39.869563068839646), ('King Kong', 39.088502755655455), ('Truman Show, The', 33.825384453613552), ('Children of Men', 28.040616273577253), ('Jungle Book, The', 24.47952503543377), ('28 Days Later', 23.896016834401067), ('Little Miss Sunshine', 23.503357292616609), ('Who Framed Roger Rabbit?', 22.670114185988822)]
------------------------------
* 별점 매긴영화
 [['Usual Suspects, The', 5.0], ['Braveheart', 3.5], ['Star Wars: Episode IV - A New Hope', 2.5], ['Pulp Fiction', 4.0], ['Shawshank Redemption, The', 3.5], ["Schindler's List", 4.0], ['Blade Runner', 4.0], ['Tombstone', 4.0], ['Terminator 2: Judgment Day', 3.5], ['Batman', 4.0], ['Silence of the Lambs, The', 3.5], ['Fargo', 3.0], ['Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb', 4.5], ['Godfather, The', 4.5], ['Maltese Falcon, The', 3.5], ['Reservoir Do

## 2000만건 데이터셋으로 학습하기 

In [32]:
## Training on the 'ml-20m' dataset, disabled by wrapping the code in a
## string literal so that it is not executed.
"""
m2v_rs = m2v_recsys('ml-20m')
m2v_rs.training()
m2v_rs.save_model('./resource/m2v_20m_rs.model')
"""

"\nm2v_rs = m2v_recsys('ml-20m')\nm2v_rs.training()\nm2v_rs.save_model('./resource/m2v_20m_rs.model')\n"

In [37]:
## Create a recommender on the 'ml-20m' dataset and load its saved model
## from disk instead of retraining.
m2v_rs_ml20 = m2v_recsys('ml-20m')
m2v_rs_ml20.load_model('./resource/m2v_20m_rs.model')

('model loaded', './resource/m2v_20m_rs.model')


In [38]:
## The 10 movies most similar to 'Pretty Woman' according to the ml-20m model.
m2v_rs_ml20.nearest_neighbors_by_m2v(['Pretty Woman'], topn=10)

[('Mrs. Doubtfire', 0.6727120876312256),
 ('Four Weddings and a Funeral', 0.6044284701347351),
 ('Sleepless in Seattle', 0.5985074043273926),
 ('Ghost', 0.5417937636375427),
 ('Clueless', 0.5165965557098389),
 ('Titanic', 0.5129855871200562),
 ('Mask, The', 0.5024983882904053),
 ('Dave', 0.4821922779083252),
 ('American President, The', 0.4737176299095154),
 ('Dumb & Dumber (Dumb and Dumber)', 0.47182410955429077)]

In [35]:
## Number of distinct userId values in the ml-20m ratings table.
print('unique userid cnt',len(m2v_rs_ml20.ratings['userId'].unique()))

unique userid cnt 138493


In [36]:
## Recommend movies for user 100 with the ml-20m model (default topn=10),
## then print the recommendations followed by the movies that user has rated.
ret = m2v_rs_ml20.recommend_movies(100)
print('추천영화\n', ret['recommended'])
print('**********************************')
print('별점 매긴영화\n', ret['ratings'])

추천영화
 [('Seven (a.k.a. Se7en)', 7.2260491114418031), ('Saving Private Ryan', 7.200722952068018), ('Taxi Driver', 6.0107027938286741), ('Goodfellas', 5.1691446122487887), ('Flirting With Disaster', 4.1941259035653848), ('Waterworld', 4.092605050917566), ('Outbreak', 4.0780340695081598), ('Piano, The', 3.8825274359053941), ('Much Ado About Nothing', 3.805156619895774), ("Nobody's Fool", 3.5656547719958649)]
**********************************
별점 매긴영화
 [['Twelve Monkeys (a.k.a. 12 Monkeys)', 3.0], ['Usual Suspects, The', 5.0], ['Clerks', 4.0], ['Star Wars: Episode IV - A New Hope', 4.0], ['L\xc3\xa9on: The Professional (a.k.a. The Professional) (L\xc3\xa9on)', 5.0], ['Pulp Fiction', 4.0], ['Shawshank Redemption, The', 3.0], ["What's Eating Gilbert Grape", 3.0], ['Terminator 2: Judgment Day', 3.0], ['Silence of the Lambs, The', 4.0], ['From Dusk Till Dawn', 3.0], ['Star Wars: Episode VI - Return of the Jedi', 4.0], ['Star Trek: Generations', 3.0], ['Heavy Metal', 4.0], ['Independence Day (a