In [1]:
%matplotlib inline
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy import stats
from ast import literal_eval # 문자열 모형의 딕트를 스근하게 딕트로 바꾸어 준다. 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter('ignore')

In [2]:
md = pd.read_csv('input/movies_metadata.csv')

In [39]:
md.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
year                         0
dtype: int64

In [10]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [11]:
md['genres']

0                        [Animation, Comedy, Family]
1                       [Adventure, Fantasy, Family]
2                                  [Romance, Comedy]
3                           [Comedy, Drama, Romance]
4                                           [Comedy]
5                   [Action, Crime, Drama, Thriller]
6                                  [Comedy, Romance]
7                 [Action, Adventure, Drama, Family]
8                      [Action, Adventure, Thriller]
9                      [Adventure, Action, Thriller]
10                          [Comedy, Drama, Romance]
11                                  [Comedy, Horror]
12                    [Family, Animation, Adventure]
13                                  [History, Drama]
14                               [Action, Adventure]
15                                    [Drama, Crime]
16                                  [Drama, Romance]
17                                   [Crime, Comedy]
18                        [Crime, Comedy, Adve

In [16]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.244896612406511

In [42]:
m = vote_counts.quantile(0.95)
m

434.0

In [22]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [23]:
md['year']

0        1995
1        1995
2        1995
3        1995
4        1995
5        1995
6        1995
7        1995
8        1995
9        1995
10       1995
11       1995
12       1995
13       1995
14       1995
15       1995
16       1995
17       1995
18       1995
19       1995
20       1995
21       1995
22       1995
23       1995
24       1995
25       1995
26       1995
27       1995
28       1995
29       1995
         ... 
45436    2010
45437    2017
45438    2015
45439    1944
45440    2007
45441    2002
45442    1905
45443    1901
45444    1905
45445    1906
45446    1909
45447    1904
45448    1904
45449    2005
45450    1900
45451    1900
45452    1981
45453    2017
45454    2015
45455    1972
45456    1946
45457    2000
45458    2000
45459    1995
45460    1991
45461     NaT
45462    2011
45463    2003
45464    1917
45465    2017
Name: year, Length: 45466, dtype: object

In [24]:
qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][
    ['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2274, 6)

In [25]:
qualified.head()

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres
0,Toy Story,1995,5415,7,21.9469,"[Animation, Comedy, Family]"
1,Jumanji,1995,2413,6,17.0155,"[Adventure, Fantasy, Family]"
5,Heat,1995,1886,7,17.9249,"[Action, Crime, Drama, Thriller]"
9,GoldenEye,1995,1194,6,14.686,"[Adventure, Action, Thriller]"
15,Casino,1995,1343,7,10.1374,"[Drama, Crime]"


In [37]:
def weighted_rating(x):
    print(x)
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)


In [40]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

title                                                   Inception
year                                                         2010
vote_count                                                  14075
vote_average                                                    8
popularity                                                29.1081
genres          [Action, Thriller, Science Fiction, Mystery, A...
wr                                                        7.91759
Name: 15480, dtype: object
title                            The Dark Knight
year                                        2008
vote_count                                 12269
vote_average                                   8
popularity                               123.167
genres          [Drama, Action, Crime, Thriller]
wr                                       7.90587
Name: 12481, dtype: object
title                                  Interstellar
year                                           2014
vote_count                           


title                               Juno
year                                2007
vote_count                          2313
vote_average                           7
popularity                       11.7916
genres          [Comedy, Drama, Romance]
wr                               6.72271
Name: 12230, dtype: object
title                      Pitch Perfect
year                                2012
vote_count                          2310
vote_average                           7
popularity                       11.0359
genres          [Comedy, Music, Romance]
wr                               6.72241
Name: 19384, dtype: object
title                      Rush
year                       2013
vote_count                 2310
vote_average                  7
popularity              10.6355
genres          [Drama, Action]
wr                      6.72241
Name: 21611, dtype: object
title               The Man from U.N.C.L.E.
year                                   2015
vote_count                      


title               The Darjeeling Limited
year                                  2007
vote_count                             876
vote_average                             7
popularity                         6.49347
genres          [Adventure, Drama, Comedy]
wr                                 6.41854
Name: 12091, dtype: object
title                          Spirit: Stallion of the Cimarron
year                                                       2002
vote_count                                                  870
vote_average                                                  7
popularity                                              16.7248
genres          [Western, Animation, Adventure, Comedy, Family]
wr                                                      6.41586
Name: 5255, dtype: object
title                         Lawrence of Arabia
year                                        1962
vote_count                                   870
vote_average                                   7
p


title           X-Men: Apocalypse
year                         2016
vote_count                   4831
vote_average                    6
popularity                28.7125
genres          [Science Fiction]
wr                        5.93776
Name: 26569, dtype: object
title                         Ted
year                         2012
vote_count                   4811
vote_average                    6
popularity                19.6386
genres          [Comedy, Fantasy]
wr                        5.93752
Name: 19109, dtype: object
title                                      Divergent
year                                            2014
vote_count                                      4784
vote_average                                       6
popularity                                   21.5726
genres          [Adventure, Action, Science Fiction]
wr                                            5.9372
Name: 22525, dtype: object
title                                        Looper
year               


title                              Hitch
year                                2005
vote_count                          1721
vote_average                           6
popularity                       13.3508
genres          [Comedy, Drama, Romance]
wr                               5.84793
Name: 9704, dtype: object
title                    The Ring
year                         2002
vote_count                   1720
vote_average                    6
popularity                 13.741
genres          [Horror, Mystery]
wr                        5.84786
Name: 5542, dtype: object
title             Tropic Thunder
year                        2008
vote_count                  1707
vote_average                   6
popularity               8.98094
genres          [Action, Comedy]
wr                       5.84693
Name: 12883, dtype: object
title              Batman Returns
year                         1992
vote_count                   1706
vote_average                    6
popularity                15


title                            Goosebumps
year                                   2015
vote_count                             1022
vote_average                              6
popularity                        12.365292
genres          [Adventure, Horror, Comedy]
wr                                  5.77492
Name: 33833, dtype: object
title             What Women Want
year                         2000
vote_count                   1021
vote_average                    6
popularity                9.96308
genres          [Comedy, Romance]
wr                        5.77477
Name: 3893, dtype: object
title           That Awkward Moment
year                           2014
vote_count                     1020
vote_average                      6
popularity                   8.0604
genres            [Comedy, Romance]
wr                          5.77461
Name: 22877, dtype: object
title                             Demolition Man
year                                        1993
vote_count             


title           Before I Go to Sleep
year                            2014
vote_count                       672
vote_average                       6
popularity                   14.9213
genres           [Mystery, Thriller]
wr                           5.70369
Name: 24075, dtype: object
title                              Peter Pan
year                                    2003
vote_count                               671
vote_average                               6
popularity                           8.98628
genres          [Adventure, Fantasy, Family]
wr                                   5.70343
Name: 7011, dtype: object
title                                       Primer
year                                          2004
vote_count                                     671
vote_average                                     6
popularity                                 7.15307
genres          [Science Fiction, Drama, Thriller]
wr                                         5.70343
Name: 8187, dty


title           Alvin and the Chipmunks: The Road Chip
year                                              2015
vote_count                                         438
vote_average                                         5
popularity                                   14.307672
genres          [Adventure, Animation, Comedy, Family]
wr                                             5.12189
Name: 38589, dtype: object
title                                                Sahara
year                                                   2005
vote_count                                              440
vote_average                                              5
popularity                                          8.94624
genres          [Action, Adventure, Comedy, Drama, Mystery]
wr                                                  5.12161
Name: 9892, dtype: object
title           The Boss
year                2016
vote_count           441
vote_average           5
popularity      7.533146
genres          


title           National Lampoon’s Van Wilder
year                                     2002
vote_count                                867
vote_average                                5
popularity                            7.66858
genres                      [Comedy, Romance]
wr                                    5.08169
Name: 5149, dtype: object
title                        Deep Impact
year                                1998
vote_count                           870
vote_average                           5
popularity                       11.6446
genres          [Action, Drama, Romance]
wr                               5.08151
Name: 1767, dtype: object
title           Daddy's Home
year                    2015
vote_count               870
vote_average               5
popularity          8.009226
genres              [Comedy]
wr                   5.08151
Name: 35362, dtype: object
title           The Bounty Hunter
year                         2010
vote_count                    878
vote_a

In [34]:
qualified = qualified.sort_values('wr', ascending=False)

In [43]:
qualified.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.1081,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.2135,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.8696,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,"[Adventure, Fantasy, Action]",7.871787
292,Pulp Fiction,1994,8670,8,140.95,"[Thriller, Crime]",7.86866
314,The Shawshank Redemption,1994,8358,8,51.6454,"[Drama, Crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,"[Adventure, Fantasy, Action]",7.861927
351,Forrest Gump,1994,8147,8,48.3072,"[Comedy, Drama, Romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,"[Adventure, Fantasy, Action]",7.851924


In [50]:
md.apply(lambda x: pd.Series(x['genres']), axis=1).stack()

0      0          Animation
       1             Comedy
       2             Family
1      0          Adventure
       1            Fantasy
       2             Family
2      0            Romance
       1             Comedy
3      0             Comedy
       1              Drama
       2            Romance
4      0             Comedy
5      0             Action
       1              Crime
       2              Drama
       3           Thriller
6      0             Comedy
       1            Romance
7      0             Action
       1          Adventure
       2              Drama
       3             Family
8      0             Action
       1          Adventure
       2           Thriller
9      0          Adventure
       1             Action
       2           Thriller
10     0             Comedy
       1              Drama
                 ...       
45446  1             Comedy
45449  0          Animation
       1             Family
45450  0            Fantasy
       1            

In [53]:
s = md.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level = 1 ,drop=True)
s.name = 'genre'

In [55]:
gen_md = md.drop('genres', axis=1).join(s)

In [60]:
print(gen_md.head(10))

   adult                              belongs_to_collection    budget  \
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1  False                                                NaN  65000000   
1  False                                                NaN  65000000   
1  False                                                NaN  65000000   
2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
3  False                                                NaN  16000000   
3  False                                                NaN  16000000   

                               homepage     id    imdb_id original_language  \
0  http://toystory.disney.com/toy-story    862  tt0114709                en   
0  http://toystory.disney.com/toy-stor

In [61]:
def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

In [63]:
build_chart('Romance').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457,8.565285
351,Forrest Gump,1994,8147,8,48.3072,7.971357
876,Vertigo,1958,1162,8,18.2082,7.811667
40251,Your Name.,2016,1030,8,34.461252,7.789489
883,Some Like It Hot,1959,835,8,11.8451,7.745154
1132,Cinema Paradiso,1988,834,8,14.177,7.744878
19901,Paperman,2012,734,8,7.19863,7.713951
37863,Sing Street,2016,669,8,10.672862,7.689483
882,The Apartment,1960,498,8,11.9943,7.599317
38718,The Handmaiden,2016,453,8,16.727405,7.566166


In [64]:
links_small = pd.read_csv('input/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [66]:
md = md.drop([19730, 29503, 35587])

In [71]:
md['id'] = md['id'].astype('int')

In [77]:
smd = md[md['id'].isin(links_small)]
smd.isnull().sum()

adult                       0
belongs_to_collection    7425
budget                      0
genres                      0
homepage                 7125
id                          0
imdb_id                     0
original_language           0
original_title              0
overview                   12
popularity                  0
poster_path                 3
production_companies        0
production_countries        0
release_date                0
revenue                     0
runtime                     0
spoken_languages            0
status                      2
tagline                  2066
title                       0
video                       0
vote_average                0
vote_count                  0
year                        0
dtype: int64

In [83]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['tagline'] + smd['overview']
smd['description'] = smd['description'].fillna('')

In [82]:
print(np.nan + 12)

nan


In [85]:
smd['description']

0        Led by Woody, Andy's toys live happily in his ...
1        Roll the dice and unleash the excitement!When ...
2        Still Yelling. Still Fighting. Still Ready for...
3        Friends are the people who let you be yourself...
4        Just When His World Is Back To Normal... He's ...
5        A Los Angeles Crime SagaObsessive master thief...
6        You are cordially invited to the most surprisi...
7        The Original Bad Boys.A mischievous young boy,...
8        Terror goes into overtime.International action...
9        No limits. No fears. No substitutes.James Bond...
10       Why can't the most powerful man in the world h...
11       When a lawyer shows up at the vampire's doorst...
12       Part Dog. Part Wolf. All Hero.An outcast half-...
13       Triumphant in Victory, Bitter in Defeat. He Ch...
14       The Course Has Been Set. There Is No Turning B...
15       No one stays at the top forever.The life of th...
16       Lose your heart and come to your senses.Rich M.

In [108]:
#문서 집합에서 단어 토큰을 생성하고 각 단어의 수를 세어 BOW 인코딩한 벡터를 만든다. + 가중치 
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])
print(tfidf_matrix)

  (0, 136840)	0.07718759464705006
  (0, 264113)	0.30387952009598945
  (0, 10530)	0.2930799401609535
  (0, 243441)	0.11055554348807796
  (0, 141316)	0.06263979034622161
  (0, 106654)	0.09279805284220312
  (0, 203921)	0.08404731309794934
  (0, 24126)	0.08975329559061745
  (0, 29322)	0.07597295822242066
  (0, 32250)	0.3402899285020227
  (0, 140005)	0.1291666389476973
  (0, 207924)	0.08325132345046594
  (0, 6226)	0.09481888070772156
  (0, 144369)	0.08076935717793579
  (0, 181154)	0.06719562459211752
  (0, 108551)	0.06824260207674711
  (0, 182934)	0.0976933133869845
  (0, 40994)	0.09397604186799674
  (0, 212158)	0.0982339106666728
  (0, 174041)	0.07485331602988303
  (0, 70879)	0.09136444459528242
  (0, 78142)	0.07411858432790196
  (0, 136355)	0.07193977307988235
  (0, 14334)	0.10518838664786415
  (0, 63871)	0.09666972605738713
  :	:
  (9097, 244431)	0.07716709594495275
  (9097, 5443)	0.07716709594495275
  (9097, 14256)	0.07716709594495275
  (9097, 231044)	0.07716709594495275
  (9097, 199061

In [98]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1.        , 0.00680204, 0.        , ..., 0.        , 0.00344826,
        0.        ],
       [0.00680204, 1.        , 0.01537897, ..., 0.00356808, 0.00762316,
        0.        ],
       [0.        , 0.01537897, 1.        , ..., 0.        , 0.00288257,
        0.00473726],
       ...,
       [0.        , 0.00356808, 0.        , ..., 1.        , 0.07824314,
        0.        ],
       [0.00344826, 0.00762316, 0.00288257, ..., 0.07824314, 1.        ,
        0.        ],
       [0.        , 0.        , 0.00473726, ..., 0.        , 0.        ,
        1.        ]])

In [94]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [122]:
def recommanedSystem(name):
    idx = indices[name]
    sims_scores = list(enumerate(cosine_sim[idx]))
    sims_scores = sorted(sims_scores, key=lambda x:x[1], reverse=True)
    sims_scores = sims_scores[1:31]
    movie_indices = [i[0] for i in sims_scores]
    return titles.iloc[movie_indices]

In [None]:
recommanedSystem('Toy Story')

0                                               Toy Story
1                                                 Jumanji
2                                        Grumpier Old Men
3                                       Waiting to Exhale
4                             Father of the Bride Part II
5                                                    Heat
6                                                 Sabrina
7                                            Tom and Huck
8                                            Sudden Death
9                                               GoldenEye
10                                 The American President
11                            Dracula: Dead and Loving It
12                                                  Balto
13                                                  Nixon
14                                       Cutthroat Island
15                                                 Casino
16                                  Sense and Sensibility
17            