In [1]:
%matplotlib inline
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy import stats
from ast import literal_eval # 문자열 모형의 딕트를 스근하게 딕트로 바꾸어 준다. 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter('ignore')

In [2]:
md = pd.read_csv('input/movies_metadata.csv')

In [3]:
md.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [4]:
md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
adult                    45466 non-null object
belongs_to_collection    4494 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
id                       45466 non-null object
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45460 non-null object
status                   45379 non-null objec

In [4]:
md['genres'] = md['genres'].apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [5]:
md['year'] = md['release_date'].apply(lambda x:str(x).split('-')[0] if x != np.nan else np.nan)

In [6]:
vote_count = md['vote_count']
vote_average = md['vote_average']
m = vote_count.quantile(0.95)
c = vote_average.mean()
c

5.618207215134185

In [7]:
def get_wr(x):
    v = x['vote_count']
    r = x['vote_average']
    return (v / (v + m) * r) + (m/(v+m)*c)


In [8]:

qualified = md[(md['vote_count'].notnull()) & (md['vote_average'].notnull()) & (md['vote_count'] >= m)][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype(int)
qualified['vote_average'] = qualified['vote_average'].astype(int)

In [9]:
qualified.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2274 entries, 0 to 45014
Data columns (total 6 columns):
title           2274 non-null object
year            2274 non-null object
vote_count      2274 non-null int32
vote_average    2274 non-null int32
popularity      2274 non-null object
genres          2274 non-null object
dtypes: int32(2), object(4)
memory usage: 106.6+ KB


In [10]:
qualified['wr'] = qualified.apply(get_wr, axis=1)

In [11]:
qualified = qualified.sort_values('wr', ascending=False)

In [12]:
s_gen = md.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level=1, drop=True)

In [13]:
s_gen.name = 'genres' 
s_gen = md.drop('genres', axis=1).join(s_gen)

In [14]:
def bar_chart(genres, percent=0.85):
    df = s_gen[s_gen['genres'] == genres]
    vote_count = df[df['vote_count'].notnull()]['vote_count']
    vote_average = df[df['vote_average'].notnull()]['vote_average']
    m = vote_count.quantile(percent)
    c = vote_average.mean()
    qualified = df[(df['vote_count'].notnull()) & (df['vote_count'] >= m) & df['vote_average'].notnull()][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype(int)
    qualified['vote_average'] = qualified['vote_average'].astype(int)
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * c), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    return qualified

In [15]:
bar_chart('Adventure').head(10)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
15480,Inception,2010,14075,8,29.1081,7.920697
22879,Interstellar,2014,11187,8,32.2135,7.901099
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,7.87693
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,7.867537
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,7.858018
256,Star Wars,1977,6778,8,42.1497,7.841181
1225,Back to the Future,1985,6239,8,25.7785,7.828477
1154,The Empire Strikes Back,1980,5998,8,19.471,7.822115
5481,Spirited Away,2001,3968,8,41.0489,7.741285
9698,Howl's Moving Castle,2004,2049,8,16.136,7.546475


In [16]:
bar_chart('Romance').head(3)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457,8.613461
351,Forrest Gump,1994,8147,8,48.3072,7.975754
876,Vertigo,1958,1162,8,18.2082,7.840579


In [17]:
bar_chart('Thriller').head(3)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
15480,Inception,2010,14075,8,29.1081,7.962878
12481,The Dark Knight,2008,12269,8,123.167,7.957511
292,Pulp Fiction,1994,8670,8,140.95,7.940315


In [18]:
link_small = pd.read_csv('input/links_small.csv')
link_small = link_small[link_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [19]:
md = md.drop([19730, 29503, 35587])

In [20]:
md['id'] = md['id'].astype('int')

In [21]:
smd = md[md['id'].isin(link_small)]

In [27]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [28]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [29]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [32]:
smd = smd.reset_index()

In [36]:
titles = smd['title']
indces = pd.Series(smd.index, index=titles)

In [38]:
def getrecommandations(title):
    index = indces[title]
    sim_scores = list(enumerate(cosine_sim[index]))
    sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores] 
    return titles.iloc[movie_indices]

In [39]:
getrecommandations('The Dark Knight')

7931                      The Dark Knight Rises
132                              Batman Forever
1113                             Batman Returns
8227    Batman: The Dark Knight Returns, Part 2
7565                 Batman: Under the Red Hood
524                                      Batman
7901                           Batman: Year One
2579               Batman: Mask of the Phantasm
2696                                        JFK
8165    Batman: The Dark Knight Returns, Part 1
6144                              Batman Begins
7933         Sherlock Holmes: A Game of Shadows
5511                            To End All Wars
4489                                      Q & A
7344                        Law Abiding Citizen
7242                  The File on Thelma Jordon
3537                               Criminal Law
2893                              Flying Tigers
1135                   Night Falls on Manhattan
8680                          The Young Savages
8917         Batman v Superman: Dawn of 

In [41]:
print(list(enumerate(cosine_sim[0])))

[(0, 1.0000000000000018), (1, 0.0068047556717484225), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0), (9, 0.0), (10, 0.0), (11, 0.0), (12, 0.0), (13, 0.0), (14, 0.0), (15, 0.0), (16, 0.006787806467693803), (17, 0.012752661542485267), (18, 0.0), (19, 0.0), (20, 0.004436513622462533), (21, 0.0), (22, 0.0035114754610620874), (23, 0.0), (24, 0.0), (25, 0.0), (26, 0.0), (27, 0.0), (28, 0.014778802381970275), (29, 0.0), (30, 0.0), (31, 0.0), (32, 0.008376177570083292), (33, 0.0), (34, 0.0), (35, 0.0), (36, 0.0), (37, 0.0), (38, 0.0), (39, 0.0), (40, 0.0), (41, 0.0035157893614046697), (42, 0.0), (43, 0.0), (44, 0.00417096144994564), (45, 0.0), (46, 0.0), (47, 0.0), (48, 0.006490627286836497), (49, 0.004844838982980995), (50, 0.0), (51, 0.0), (52, 0.00810046370192087), (53, 0.008466183599812206), (54, 0.00835779370046548), (55, 0.0), (56, 0.01406635985802028), (57, 0.0), (58, 0.0), (59, 0.003226395273549717), (60, 0.0), (61, 0.003124215660565687), (62, 0.0), (63, 0.0), (6