In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import ast

In [2]:
class movie_recommendation:
    def __init__(self, **kargs):
        self.topn = kargs.get('topn', 10)
        self.vote_thres = kargs.get('vote_thres', 100)
        self.df = kargs.get('data', pd.read_pickle('./dataset/movies04293.pickle'))
        self.a, self.b, self.c = kargs.get('a',0.8), kargs.get('b',0.1), kargs.get('c',0.1)
        self.verbose = kargs.get('verbose', 1)
        
        self.cvec = CountVectorizer(min_df=0, ngram_range=(1,2))
        self.model = Word2Vec.load('./model/w2v_movie_plot.model')
        self.scaler = MinMaxScaler()
        
        if self.verbose == 1:
            print('-'*35)
            print('# Parameters')
            print('      a, b, c        : {0}, {1}, {2}'.format(self.a, self.b, self.c))
            print('vote count threshold :', self.vote_thres)
            print('weighted_sum = keywords*{0}(a) + genre*{1}(b) + weighted vote*{2}(c)'.format(self.a, self.b, self.c))
            print('-'*35)
        
    def search_title(self, title_name):
        return self.df[self.df['title'].str.contains(title_name)].title   
    
    def genre_sim_sorted(self, title_idx):
        genre_literal = self.df['genre'].apply(lambda x: x.replace('|',' '))
        title_genre = self.df.loc[title_idx, 'genre'].replace('|', ' ')
        
        genre = self.cvec.fit_transform(genre_literal)
        title_vec = self.cvec.transform([title_genre])
        
        genre_sim = cosine_similarity(genre,title_vec)
        
        return np.array([(idx,sim) for idx,sim in enumerate(genre_sim)])
           
    def cos_sim(self, corp1, corp2):
        vec1, vec2 = [], []
        for word1, word2 in zip(corp1,corp2):
            vec1.append(self.model.wv[word1])
            vec2.append(self.model.wv[word2])

        vec1, vec2 = np.array(vec1).mean(axis=0), np.array(vec2).mean(axis=0)
        return np.inner(vec1,vec2) / (np.linalg.norm(vec1)*np.linalg.norm(vec2))

    def similar_keywords_movies(self, title_idx):
        self.df['keywords_literal'] = self.df['keywords'].apply(lambda x: ' '.join(x))
        keywords_src = self.df.loc[title_idx,'keywords']
        keywords_sims = []

        for row in self.df.itertuples():
            keywords_tgt = row.keywords
            keywords_sims.append(self.cos_sim(keywords_src, keywords_tgt))

        df_with_ksim = self.df.copy()
        df_with_ksim['keywords_sim'] = keywords_sims
        df_with_ksim = df_with_ksim[df_with_ksim['vote_count'] > self.vote_thres]
        
        return df_with_ksim.sort_values('keywords_sim',ascending=False)[1:]

    def result_by_weights(self, dataf):
        dataf['weighted_sum'] = dataf['keywords_sim_scaled']*self.a + dataf['genre_scaled']*self.b + dataf['wvote_scaled']*self.c
        
        return dataf.sort_values('weighted_sum', ascending=False)
    
    def getMovies(self, title):
        # no title result
        try: title_idx = self.df[self.df['title']== title].index.values[0]
        except:
            raise ValueError('There is no such title name. Search with "search_title" function')
        
        # get movies
        result = self.similar_keywords_movies(title_idx)

        # IMDB's weighted_vote
        def weighted_vote_average(record):
            v, r = record['vote_count'], record['rating']
            return (v/(v+m))*r + (m/(m+v))*c
        c = result['rating'].mean()
        m = result['vote_count'].quantile(.6)
        result['weighted_vote'] = result.apply(weighted_vote_average,axis=1)
        
        # merge with genre
        genre_sim = self.genre_sim_sorted(title_idx)
        result_with_genre = pd.merge(result, pd.Series(genre_sim[:,1], name='genre_sim'), left_on=result.index, right_on=genre_sim[:,0],)
             
        # minmax scale
        result_with_genre['keywords_sim_scaled'] = MinMaxScaler().fit_transform(result_with_genre['keywords_sim'].values.reshape(-1,1))
        result_with_genre['wvote_scaled'] = MinMaxScaler().fit_transform(result_with_genre['weighted_vote'].values.reshape(-1,1))
        result_with_genre['genre_scaled'] = MinMaxScaler().fit_transform(result_with_genre['genre_sim'].values.reshape(-1,1))
        
        # (optional)remove data genre score is 0
        no_genre_score_idx = result_with_genre[result_with_genre['genre_sim'] == 0].index
        result_with_genre.drop(no_genre_score_idx, inplace=True)
        
        result_with_genre = self.result_by_weights(result_with_genre)
        return result_with_genre.head(self.topn)
    

In [3]:
recom = movie_recommendation()

-----------------------------------
# Parameters
      a, b, c        : 0.8, 0.1, 0.1
vote count threshold : 100
weighted_sum = keywords*0.8(a) + genre*0.1(b) + weighted vote*0.1(c)
-----------------------------------


In [31]:
result = recom.getMovies(title='인비저블맨')

In [32]:
result[['weighted_sum','title', 'keywords_literal', 'keywords_sim_scaled', 'genre_scaled', 'wvote_scaled']]

Unnamed: 0,weighted_sum,title,keywords_literal,keywords_sim_scaled,genre_scaled,wvote_scaled
4,0.929361,노 원 리브스,실종 상속 녀 눈빛 갱단 이유 불안 여자 매너 남자,0.94818,1.0,0.70817
0,0.901247,오리지널 씬,루이스 거액 도피 거짓 사실 살인 줄리아 사랑 한순간 아내,1.0,0.258199,0.754269
5,0.887021,살인재능,살인 결혼 홧김 멸시 실업자 남자 친구 밤낮 상황 대리,0.947381,0.57735,0.713805
3,0.870298,그 남자는 거기 없었다,에드 도리스 사기 결과 직장 전해 투자 레이첼 딸 친구 상사,0.950583,0.333333,0.764981
2,0.867419,폴리스 스토리 2014,충격 딸 클럽 남자친구 이자 반장 주인 도중 인질 협상,0.960017,0.218218,0.775832
11,0.866877,더 로드,인생 아기 여인 친척 집으로 로드 사인 프랭크 선택 죽음,0.923125,0.516398,0.767369
30,0.846231,온다,생활 전화 의문 현실 아내 딸 표적 행복 결혼 미스터리,0.894454,0.57735,0.729335
147,0.845226,오픈 더 도어,실종 시작 아들 이고 또래 르와 충격 사랑 아내 폴리,0.843831,1.0,0.701612
43,0.843607,검은 집,자살 아버지 보험금 살인 엄마 죽음 정원 황정민 인가 살인자 대결 집,0.881646,0.774597,0.608299
52,0.835656,굿 라이어,의지 로이 베 제안 온라인 본인 공동 계좌 대로 재산,0.878901,0.57735,0.748006
