In [133]:
%matplotlib inline
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy import stats
from ast import literal_eval # 문자열 모형의 딕트를 스근하게 딕트로 바꾸어 준다. 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter('ignore')

In [135]:
md = pd.read_csv('movieRecommaned/input/movies_metadata.csv')

In [134]:
md.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [136]:
md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
adult                    45466 non-null object
belongs_to_collection    4494 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
id                       45466 non-null object
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45460 non-null object
status                   45379 non-null objec

In [137]:
md['genres'] = md['genres'].apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [138]:
md['year'] = md['release_date'].apply(lambda x:str(x).split('-')[0] if x != np.nan else np.nan)

In [139]:
vote_count = md['vote_count']
vote_average = md['vote_average']
m = vote_count.quantile(0.95)
c = vote_average.mean()
c

5.618207215134185

In [140]:
def get_wr(x):
    v = x['vote_count']
    r = x['vote_average']
    return (v / (v + m) * r) + (m/(v+m)*c)


In [141]:

qualified = md[(md['vote_count'].notnull()) & (md['vote_average'].notnull()) & (md['vote_count'] >= m)][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype(int)
qualified['vote_average'] = qualified['vote_average'].astype(int)

In [142]:
qualified.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2274 entries, 0 to 45014
Data columns (total 6 columns):
title           2274 non-null object
year            2274 non-null object
vote_count      2274 non-null int32
vote_average    2274 non-null int32
popularity      2274 non-null object
genres          2274 non-null object
dtypes: int32(2), object(4)
memory usage: 106.6+ KB


In [143]:
qualified['wr'] = qualified.apply(get_wr, axis=1)

In [144]:
qualified = qualified.sort_values('wr', ascending=False)

In [145]:
s_gen = md.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level=1, drop=True)

In [146]:
s_gen.name = 'genres' 
s_gen = md.drop('genres', axis=1).join(s_gen)

In [147]:
def bar_chart(genres, percent=0.85):
    df = s_gen[s_gen['genres'] == genres]
    vote_count = df[df['vote_count'].notnull()]['vote_count']
    vote_average = df[df['vote_average'].notnull()]['vote_average']
    m = vote_count.quantile(percent)
    c = vote_average.mean()
    qualified = df[(df['vote_count'].notnull()) & (df['vote_count'] >= m) & df['vote_average'].notnull()][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype(int)
    qualified['vote_average'] = qualified['vote_average'].astype(int)
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * c), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    return qualified

In [148]:
bar_chart('Adventure').head(10)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
15480,Inception,2010,14075,8,29.1081,7.920697
22879,Interstellar,2014,11187,8,32.2135,7.901099
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,7.87693
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,7.867537
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,7.858018
256,Star Wars,1977,6778,8,42.1497,7.841181
1225,Back to the Future,1985,6239,8,25.7785,7.828477
1154,The Empire Strikes Back,1980,5998,8,19.471,7.822115
5481,Spirited Away,2001,3968,8,41.0489,7.741285
9698,Howl's Moving Castle,2004,2049,8,16.136,7.546475


In [149]:
bar_chart('Romance').head(3)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457,8.613461
351,Forrest Gump,1994,8147,8,48.3072,7.975754
876,Vertigo,1958,1162,8,18.2082,7.840579


In [150]:
bar_chart('Thriller').head(3)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
15480,Inception,2010,14075,8,29.1081,7.962878
12481,The Dark Knight,2008,12269,8,123.167,7.957511
292,Pulp Fiction,1994,8670,8,140.95,7.940315


In [151]:
link_small = pd.read_csv('movieRecommaned/input/links_small.csv')
link_small = link_small[link_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [152]:
md = md.drop([19730, 29503, 35587])

In [153]:
md['id'] = md['id'].astype('int')

In [154]:
smd = md[md['id'].isin(link_small)]

In [155]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [156]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [157]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [158]:
smd = smd.reset_index()

In [159]:
titles = smd['title']
indces = pd.Series(smd.index, index=titles)

In [160]:
def getrecommandations(title):
    index = indces[title]
    sim_scores = list(enumerate(cosine_sim[index]))
    sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores] 
    return titles.iloc[movie_indices]

In [161]:
getrecommandations('The Dark Knight')

7931                      The Dark Knight Rises
132                              Batman Forever
1113                             Batman Returns
8227    Batman: The Dark Knight Returns, Part 2
7565                 Batman: Under the Red Hood
524                                      Batman
7901                           Batman: Year One
2579               Batman: Mask of the Phantasm
2696                                        JFK
8165    Batman: The Dark Knight Returns, Part 1
6144                              Batman Begins
7933         Sherlock Holmes: A Game of Shadows
5511                            To End All Wars
4489                                      Q & A
7344                        Law Abiding Citizen
7242                  The File on Thelma Jordon
3537                               Criminal Law
2893                              Flying Tigers
1135                   Night Falls on Manhattan
8680                          The Young Savages
8917         Batman v Superman: Dawn of 

In [162]:
credits = pd.read_csv('movieRecommaned/input/credits.csv')
keywords = pd.read_csv('movieRecommaned/input/keywords.csv')

In [163]:
keywords['id'] = keywords['id'].astype(int)
credits['id'] = credits['id'].astype(int)
md['id'] = md['id'].astype(int)

In [164]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [165]:
smd = md[md['id'].isin(link_small)]

In [166]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x :len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [167]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [168]:
smd['director'] = smd['crew'].apply(get_director)

In [169]:
smd['cast'] = smd['cast'].apply(lambda x : [i['name'] for i in x] if isinstance(x, list) else [])

In [170]:
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) > 3 else x)

In [171]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [172]:
smd['cast'] = smd['cast'].apply(lambda x : [str.lower(i.replace(" ", "")) for i in x])
smd['director'] = smd['director'].astype(str).apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x, x, x])

In [173]:
s = smd.apply(lambda x: pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)

In [174]:
s = s.value_counts()
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
dtype: int64

In [175]:
s = s[s > 1]


In [176]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [177]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [178]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)

In [179]:
smd['keywords'] = smd['keywords'].apply(lambda x :[stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [180]:
smd['soup'] = smd['keywords']+smd['cast']+smd['director']+smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [181]:
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [182]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [183]:
smd = smd.reset_index()
titles = smd['title']
indces = pd.Series(smd.index, index=smd['title'])

In [184]:
indces

title
Toy Story                                                0
Jumanji                                                  1
Grumpier Old Men                                         2
Waiting to Exhale                                        3
Father of the Bride Part II                              4
Heat                                                     5
Sabrina                                                  6
Tom and Huck                                             7
Sudden Death                                             8
GoldenEye                                                9
The American President                                  10
Dracula: Dead and Loving It                             11
Balto                                                   12
Nixon                                                   13
Cutthroat Island                                        14
Casino                                                  15
Sense and Sensibility                             

In [185]:
getrecommandations('The Dark Knight')

8031                 The Dark Knight Rises
6218                         Batman Begins
6623                          The Prestige
2085                             Following
7648                             Inception
4145                              Insomnia
3381                               Memento
8613                          Interstellar
7659            Batman: Under the Red Hood
1134                        Batman Returns
8927               Kidnapping Mr. Heineken
5943                              Thursday
1260                        Batman & Robin
9024    Batman v Superman: Dawn of Justice
4021                  The Long Good Friday
5809                           Point Blank
7362       Gangster's Paradise: Jerusalema
7561                           Harry Brown
7582                              Defendor
8001                      Batman: Year One
2754                          Death Wish 3
132                         Batman Forever
2131                              Superman
2448       

In [186]:
def improved_recommendations(title):
    index = indces[title]
    sim_scores = list(enumerate(cosine_sim[index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indecs = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indecs][['title', 'vote_count', 'vote_average', 'year']]
    vote_count = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_average = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    c = vote_average.mean()
    m = vote_count.quantile(0.6)
    qualified = movies[(movies['vote_count'].notnull()) & (movies['vote_average'].notnull()) & (movies['vote_count']>m)]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(get_wr, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [187]:
reader = Reader()

In [188]:
raitings = pd.read_csv('movieRecommaned/input/ratings_small.csv')
raitings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [189]:
data = Dataset.load_from_df(raitings[['userId', 'movieId', 'rating']], reader=reader)
data.split(n_folds=5)

In [190]:
svd = SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1


RMSE: 0.8971
MAE:  0.6897
------------
Fold 2


RMSE: 0.8988
MAE:  0.6929
------------
Fold 3


RMSE: 0.8916
MAE:  0.6860
------------
Fold 4


RMSE: 0.9028
MAE:  0.6993
------------
Fold 5


RMSE: 0.8954
MAE:  0.6864
------------
------------
Mean RMSE: 0.8971
Mean MAE : 0.6909
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.8970519585613387,
                             0.8988150172342702,
                             0.891615252774923,
                             0.9027996776472405,
                             0.8953991009253148],
                            'mae': [0.6897055888313983,
                             0.69291446597807,
                             0.6859795759190649,
                             0.6992638811528139,
                             0.686421115802861]})

In [191]:
trainset = data.build_full_trainset()
svd.train(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2839b05f978>

In [192]:
raitings[raitings['userId']==1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [193]:
svd.predict(1, 302)

Prediction(uid=1, iid=302, r_ui=None, est=2.6777398570856326, details={'was_impossible': False})

In [194]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [195]:
id_map = pd.read_csv('movieRecommaned/input/links_small.csv')[['movieId','tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']

In [196]:
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')

In [197]:
indices_map = id_map.set_index('id')

In [198]:
def hybrid(userId, title):
    indces[title]

In [199]:
indces

title
Toy Story                                                0
Jumanji                                                  1
Grumpier Old Men                                         2
Waiting to Exhale                                        3
Father of the Bride Part II                              4
Heat                                                     5
Sabrina                                                  6
Tom and Huck                                             7
Sudden Death                                             8
GoldenEye                                                9
The American President                                  10
Dracula: Dead and Loving It                             11
Balto                                                   12
Nixon                                                   13
Cutthroat Island                                        14
Casino                                                  15
Sense and Sensibility                             