## word2vec으로 추천 시스템 구현하기 

In [311]:
from gensim.models import Word2Vec
import pandas as pd
from collections import defaultdict
import numpy as np

In [351]:
## Convert a value to int, falling back to a default for non-numeric input
def toint(s, default=0):
    """Return ``int(s)``, or ``default`` when ``s`` cannot be converted.

    Args:
        s: Value to convert (typically a string such as ``'1995'``).
        default: Value returned when the conversion fails.

    Returns:
        The converted integer, or ``default`` unchanged.
    """
    try:
        return int(s)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are absorbed; anything else (for example
        # KeyboardInterrupt) must still propagate to the caller.
        return default

class m2v_recsys:
    """Movie recommender built on a word2vec model of MovieLens rating histories.

    Each user's rated movie titles, ordered by rating timestamp, form one
    "sentence"; a gensim ``Word2Vec`` model trained on those sentences is then
    queried for titles similar to the ones a user has rated.
    """

    # Trained or loaded Word2Vec model; stays None until training()/load_model().
    model = None
    # Directory holding one sub-directory per dataset (e.g. 'ml-latest-small').
    home_dir = '/Users/goodvc/Documents/data-analytics/movie-recommendation/'
    # Hyper-parameters forwarded to the Word2Vec constructor in training().
    w2v_env = { 'min_count':5, 'size':100, 'window':5 }
    # Not referenced by any method of this class.
    nn_func = None

    ## initialize
    def __init__(self, ds_type='ml-latest-small', home_dir=None):
        """Remember the dataset name and load its CSV files.

        Args:
            ds_type: Name of the dataset sub-directory under ``home_dir``.
            home_dir: Optional override for the class-level ``home_dir``.
        """
        if home_dir is not None:
            self.home_dir = home_dir
        self.ds_type = ds_type
        self.movieLensDataLoad()

    def _csv_path(self, name):
        """Return the path of the file ``name`` inside the current dataset directory."""
        return '{home}/{type}/{name}'.format(home=self.home_dir, type=self.ds_type, name=name)

    @staticmethod
    def _split_title_year(title, default_year=1950):
        """Split ``'Nixon (1995)'`` into ``('Nixon', 1995)``.

        A title that does not end in a parenthesised run of four digits is
        returned whole (stripped) together with ``default_year``, instead of
        having its last characters cut off.
        """
        stripped = title.strip()
        year_text = stripped[-5:-1]
        has_year = (
            len(stripped) >= 6
            and stripped[-6] == '('
            and stripped[-1] == ')'
            and year_text.isdecimal()
        )
        if not has_year:
            return stripped, default_year
        return stripped[:-6].strip(), int(year_text)

    ## dataset load
    def movieLensDataLoad(self, ds_type=None):
        """Read ratings.csv, movies.csv and tags.csv and join ratings with movies.

        Args:
            ds_type: Optional dataset name that replaces ``self.ds_type``.

        Side effects:
            Sets ``self.ratings``, ``self.movies`` (with an added ``year``
            column and the year removed from ``title``), ``self.tags`` and
            ``self.ratings_movie`` (ratings merged with movies on ``movieId``).
        """
        if ds_type is not None:
            self.ds_type = ds_type
        ## per-user movie rating data
        self.ratings = pd.read_csv(self._csv_path('ratings.csv'))

        ## movie metadata (title, genres)
        self.movies = pd.read_csv(self._csv_path('movies.csv'))
        ## split title and release year . ex) Nixon (1995) => Nixon , 1995
        parsed = [self._split_title_year(raw) for raw in self.movies['title']]
        self.movies['year'] = [year for _, year in parsed]
        self.movies['title'] = [name for name, _ in parsed]

        ## tags that users attached to movies
        self.tags = pd.read_csv(self._csv_path('tags.csv'))

        ## join ratings and movies by movieId
        self.ratings_movie = pd.merge( self.ratings, self.movies, on='movieId' )

    def make_sentences(self, threshold=3):
        """Build the training corpus: one title sequence per user.

        Args:
            threshold: Only ratings strictly below this value are kept.

        Returns:
            A list with one array of titles per user, ordered by timestamp.
        """
        # NOTE(review): the filter keeps ratings *below* the threshold, so the
        # corpus consists of low-rated movies. Left unchanged because flipping
        # it changes the trained model -- confirm whether `>=` was intended.
        selected = self.ratings_movie[self.ratings_movie.rating < threshold]
        # DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
        return [
            user.sort_values('timestamp')['title'].values
            for _, user in selected.groupby('userId')
        ]

    def load_model(self, model_path):
        """Load a previously saved Word2Vec model from ``model_path``."""
        self.model = Word2Vec.load(model_path)
        print('model loaded', model_path)

    def save_model(self, model_path):
        """Save the current model to ``model_path``."""
        self.model.save(model_path)

    def training(self):
        """Train a Word2Vec model on the corpus unless a model already exists."""
        if self.model is not None:
            return
        # Attribute name (including its spelling) is kept for existing callers.
        self.scentences = self.make_sentences()
        env = self.w2v_env
        try:
            # gensim >= 4.0 names the dimensionality argument `vector_size`.
            self.model = Word2Vec(self.scentences,
                                  min_count = env['min_count'],
                                  vector_size = env['size'],
                                  window = env['window'])
        except TypeError:
            # Older gensim releases only know the argument as `size`.
            self.model = Word2Vec(self.scentences,
                                  min_count = env['min_count'],
                                  size = env['size'],
                                  window = env['window'])
        print('model trained')

    def nearest_neighbors_by_m2v(self, target, topn=10):
        """Return the ``topn`` titles most similar to ``target``.

        Args:
            target: A single title or a list of titles.
            topn: Maximum number of neighbours to return.

        Returns:
            A list of ``(title, similarity)`` pairs; empty when no model is
            available or a title is unknown to the model.
        """
        if isinstance(target, str):
            target = [target]
        if self.model is None:
            return []
        # Similarity queries live on `model.wv` in current gensim; very old
        # releases expose most_similar() on the model object itself.
        vectors = getattr(self.model, 'wv', self.model)
        try:
            return vectors.most_similar(positive=target, topn=topn)
        except (KeyError, ValueError):
            # Unknown title (KeyError) or empty query (ValueError): best-effort
            # lookup, so report "no neighbours" rather than failing.
            return []

    def user_interests(self, userId):
        """Return ``[[title, rating], ...]`` for every movie ``userId`` rated."""
        rated = self.ratings_movie[self.ratings_movie.userId == userId]
        return rated[['title', 'rating']].values.tolist()

    def cal_score(self, title, sim, rating, pre_score):
        """Add the score of one (similarity, rating) pair to ``pre_score``.

        The pair's score is the Euclidean norm of ``[sim, rating/5]``.
        ``title`` is accepted but not used in the computation.
        """
        return np.linalg.norm([sim, rating/5]) + pre_score

    def recommend_movies(self, userId, topn=10):
        """Recommend unseen movies for ``userId``.

        For every movie the user rated, the model's nearest neighbours are
        looked up and their scores accumulated with ``cal_score``; movies the
        user already rated are skipped.

        Returns:
            ``{'recommended': [(title, score), ...], 'ratings': [[title, rating], ...]}``
            where ``recommended`` holds at most ``topn`` entries sorted by
            descending score.
        """
        scores = defaultdict(float)
        interests = self.user_interests(userId)
        seen_titles = {title for title, _ in interests}
        for title, rating in interests:
            for candidate, sim in self.nearest_neighbors_by_m2v(title):
                if candidate in seen_titles:
                    continue
                scores[candidate] = self.cal_score(candidate, sim, rating, scores[candidate])

        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:topn]
        return {'recommended' : ranked
                ,'ratings':interests}

In [366]:
## Create the recommender object for the 'ml-latest-small' dataset
m2v_rs = m2v_recsys('ml-latest-small')

In [367]:
## Train word2vec on the loaded dataset
m2v_rs.training()

model trained


In [368]:
## Save the current model to disk
m2v_rs.save_model('./resource/m2v_rs.model')

In [369]:
## Load the model back from the file saved above
m2v_rs.load_model('./resource/m2v_rs.model')

model loaded ./resource/m2v_rs.model


In [370]:
## Movies most similar to 'Pretty Woman' (top 20)
m2v_rs.nearest_neighbors_by_m2v(['Pretty Woman'],topn=20)

[('Ace Ventura: Pet Detective', 0.8308584094047546),
 ('Batman Forever', 0.8251057863235474),
 ('Nutty Professor, The', 0.8134173154830933),
 ('Mask, The', 0.7993110418319702),
 ('Independence Day (a.k.a. ID4)', 0.7835349440574646),
 ('Waterworld', 0.7380474805831909),
 ('Cliffhanger', 0.732891321182251),
 ('Lost in Space', 0.7325952053070068),
 ('Addams Family Values', 0.7318918704986572),
 ('Home Alone', 0.7281185388565063),
 ('Armageddon', 0.7230995893478394),
 ('Lara Croft: Tomb Raider', 0.721030592918396),
 ('Star Wars: Episode I - The Phantom Menace', 0.7173625826835632),
 ('Mars Attacks!', 0.7150463461875916),
 ('Babe', 0.7143656015396118),
 ('Ace Ventura: When Nature Calls', 0.7105658054351807),
 ("Charlie's Angels", 0.7103386521339417),
 ('Legally Blonde', 0.7098460793495178),
 ('Die Hard: With a Vengeance', 0.7090703248977661),
 ('Beauty and the Beast', 0.7073931097984314)]

In [377]:
## Recommend up to 10 movies for user 100, then print the recommendations
## followed by the movies that user has rated
ret = m2v_rs.recommend_movies(100,topn=10)
print('------------------------------')
print('* 추천영화\n', ret['recommended'])
print('------------------------------')
print('* 별점 매긴영화\n', ret['ratings'])

------------------------------
* 추천영화
 [('Mask, The', 40.084993085057235), ('Star Wars: Episode I - The Phantom Menace', 25.55198952102576), ('American Pie', 23.479421816895698), ('Armageddon', 21.941650937977695), ('Pretty Woman', 21.02547820191867), ('Home Alone', 17.389201084881986), ('Addams Family Values', 14.500399027860951), ('Cliffhanger', 11.966764784107635), ('Babe', 11.081211202166184), ('Lost in Space', 11.056676336017237)]
------------------------------
* 별점 매긴영화
 [['Heat', 4.0], ['Twelve Monkeys (a.k.a. 12 Monkeys)', 5.0], ['Usual Suspects, The', 4.0], ['Braveheart', 3.0], ['Strange Days', 3.0], ['Star Wars: Episode IV - A New Hope', 4.0], ['Pulp Fiction', 5.0], ['Shawshank Redemption, The', 4.0], ["Carlito's Way", 3.0], ["Schindler's List", 5.0], ['Dances with Wolves', 4.0], ['Batman', 3.0], ['Fargo', 5.0], ['Mission: Impossible', 4.0], ['Independence Day (a.k.a. ID4)', 3.0], ['Last Man Standing', 4.0], ['2 Days in the Valley', 4.0], ['Star Trek: First Contact', 3.0], ['

## 2000만건 데이터셋으로 학습하기 

In [None]:
"""
m2v_rs = m2v_recsys('ml-20m')
m2v_rs.training()
m2v_rs.save_model('./resource/m2v_rs.ml20.model')
"""

In [352]:
## Load the 'ml-20m' dataset and a model file saved earlier instead of retraining
m2v_rs_ml20 = m2v_recsys('ml-20m')
m2v_rs_ml20.load_model('./resource/m2v_rs.ml20.model')

model loaded ./resource/m2v_rs.ml20.model


In [353]:
## Movies most similar to 'Pretty Woman', using the ml-20m model (top 10)
m2v_rs_ml20.nearest_neighbors_by_m2v(['Pretty Woman'], topn=10)

[('Mrs. Doubtfire', 0.7438002824783325),
 ('Mask, The', 0.7124621272087097),
 ('Sleepless in Seattle', 0.7078831195831299),
 ('Ghost', 0.6981393694877625),
 ('Babe', 0.6647126078605652),
 ('Four Weddings and a Funeral', 0.6088311672210693),
 ('Lion King, The', 0.5941227674484253),
 ('Titanic', 0.5748208165168762),
 ('Firm, The', 0.5606018304824829),
 ('Outbreak', 0.5603909492492676)]

In [354]:
## Count the distinct userId values in the ml-20m ratings table
print('unique userid cnt',len(m2v_rs_ml20.ratings['userId'].unique()))

unique userid cnt 138493


In [361]:
## Recommend movies for user 100 with the ml-20m model (default topn), then
## print the recommendations followed by the movies that user has rated
ret = m2v_rs_ml20.recommend_movies(100)
print('추천영화\n', ret['recommended'])
print('**********************************')
print('별점 매긴영화\n', ret['ratings'])

추천영화
 [('Braveheart', 9.81596048724656), ('Piano, The', 7.3276034583878431), ('Firm, The', 6.7822411937512292), ('Outbreak', 5.9857618788520757), ('Apollo 13', 5.7976622045011235), ('Farewell My Concubine (Ba wang bie ji)', 5.5339653096944454), ('Remains of the Day, The', 5.4431525602738331), ('Taxi Driver', 5.4017879480583648), ('Bullets Over Broadway', 5.2129741615347776), ('Matrix, The', 5.0703639046479783)]
**********************************
별점 매긴영화
 [['Twelve Monkeys (a.k.a. 12 Monkeys)', 3.0], ['Usual Suspects, The', 5.0], ['Clerks', 4.0], ['Star Wars: Episode IV - A New Hope', 4.0], ['Léon: The Professional (a.k.a. The Professional) (Léon)', 5.0], ['Pulp Fiction', 4.0], ['Shawshank Redemption, The', 3.0], ["What's Eating Gilbert Grape", 3.0], ['Terminator 2: Judgment Day', 3.0], ['Silence of the Lambs, The', 4.0], ['From Dusk Till Dawn', 3.0], ['Star Wars: Episode VI - Return of the Jedi', 4.0], ['Star Trek: Generations', 3.0], ['Heavy Metal', 4.0], ['Independence Day (a.k.a. ID