In [1]:
%matplotlib inline
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy import stats
from ast import literal_eval # 문자열 모형의 딕트를 스근하게 딕트로 바꾸어 준다. 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter('ignore')

In [2]:
md = pd.read_csv('input/movies_metadata.csv')

In [3]:
md.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [4]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [5]:
md['genres']

0                        [Animation, Comedy, Family]
1                       [Adventure, Fantasy, Family]
2                                  [Romance, Comedy]
3                           [Comedy, Drama, Romance]
4                                           [Comedy]
5                   [Action, Crime, Drama, Thriller]
6                                  [Comedy, Romance]
7                 [Action, Adventure, Drama, Family]
8                      [Action, Adventure, Thriller]
9                      [Adventure, Action, Thriller]
10                          [Comedy, Drama, Romance]
11                                  [Comedy, Horror]
12                    [Family, Animation, Adventure]
13                                  [History, Drama]
14                               [Action, Adventure]
15                                    [Drama, Crime]
16                                  [Drama, Romance]
17                                   [Crime, Comedy]
18                        [Crime, Comedy, Adve

In [8]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C


5.244896612406511

In [9]:
m = vote_counts.quantile(0.95)
m

434.0

In [10]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [11]:
md['year']

0        1995
1        1995
2        1995
3        1995
4        1995
5        1995
6        1995
7        1995
8        1995
9        1995
10       1995
11       1995
12       1995
13       1995
14       1995
15       1995
16       1995
17       1995
18       1995
19       1995
20       1995
21       1995
22       1995
23       1995
24       1995
25       1995
26       1995
27       1995
28       1995
29       1995
         ... 
45436    2010
45437    2017
45438    2015
45439    1944
45440    2007
45441    2002
45442    1905
45443    1901
45444    1905
45445    1906
45446    1909
45447    1904
45448    1904
45449    2005
45450    1900
45451    1900
45452    1981
45453    2017
45454    2015
45455    1972
45456    1946
45457    2000
45458    2000
45459    1995
45460    1991
45461     NaT
45462    2011
45463    2003
45464    1917
45465    2017
Name: year, Length: 45466, dtype: object

In [12]:
qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][
    ['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2274, 6)

In [None]:
qualified = qualified.sort_values(by='year', ascending=False)
qualified

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres
0,Toy Story,1995,5415,7,21.9469,"[Animation, Comedy, Family]"
1,Jumanji,1995,2413,6,17.0155,"[Adventure, Fantasy, Family]"
5,Heat,1995,1886,7,17.9249,"[Action, Crime, Drama, Thriller]"
9,GoldenEye,1995,1194,6,14.686,"[Adventure, Action, Thriller]"
15,Casino,1995,1343,7,10.1374,"[Drama, Crime]"
17,Four Rooms,1995,539,6,9.02659,"[Crime, Comedy]"
18,Ace Ventura: When Nature Calls,1995,1128,6,8.20545,"[Crime, Comedy, Adventure]"
31,Twelve Monkeys,1995,2470,7,12.2973,"[Science Fiction, Thriller, Mystery]"
33,Babe,1995,756,6,14.4048,"[Fantasy, Drama, Comedy, Family]"
38,Clueless,1995,828,6,9.88238,"[Comedy, Drama, Romance]"


In [17]:
def weighted_rating(x):
    print(x)
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)


In [18]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

title                                              The Dark Tower
year                                                         2017
vote_count                                                    688
vote_average                                                    5
popularity                                              50.903593
genres          [Action, Western, Science Fiction, Fantasy, Ho...
Name: 45014, dtype: object
title           xXx: Return of Xander Cage
year                                  2017
vote_count                            1497
vote_average                             5
popularity                       17.918269
genres          [Action, Adventure, Crime]
Name: 41972, dtype: object
title             The Fate of the Furious
year                                 2017
vote_count                           3803
vote_average                            6
popularity                      48.573287
genres          [Action, Crime, Thriller]
Name: 43255, dtype: object
title        


title           The Invitation
year                      2015
vote_count                 486
vote_average                 6
popularity            9.248388
genres              [Thriller]
Name: 33474, dtype: object
title                    No Escape
year                          2015
vote_count                     798
vote_average                     6
popularity                 14.1326
genres          [Action, Thriller]
Name: 32371, dtype: object
title           The Brand New Testament
year                               2015
vote_count                          435
vote_average                          6
popularity                     6.488451
genres                [Comedy, Fantasy]
Name: 33026, dtype: object
title                            Solace
year                               2015
vote_count                          740
vote_average                          6
popularity                    12.642675
genres          [Crime, Drama, Mystery]
Name: 34516, dtype: object
title          


title                             The Guest
year                                   2014
vote_count                              661
vote_average                              6
popularity                          6.44196
genres          [Mystery, Thriller, Action]
Name: 24038, dtype: object
title           As Above, So Below
year                          2014
vote_count                     788
vote_average                     6
popularity                 10.8466
genres          [Horror, Thriller]
Name: 24015, dtype: object
title           If I Stay
year                 2014
vote_count           1415
vote_average            7
popularity        9.65924
genres            [Drama]
Name: 23962, dtype: object
title           The Hundred-Foot Journey
year                                2014
vote_count                           516
vote_average                           7
popularity                       14.4668
genres                           [Drama]
Name: 23877, dtype: object
title          


title                                    After Earth
year                                            2013
vote_count                                      2579
vote_average                                       5
popularity                                   10.7476
genres          [Science Fiction, Action, Adventure]
Name: 21018, dtype: object
title           Dallas Buyers Club
year                          2013
vote_count                    2973
vote_average                     7
popularity                 14.7857
genres            [Drama, History]
Name: 21948, dtype: object
title                      Snitch
year                         2013
vote_count                   1155
vote_average                    5
popularity                13.5427
genres          [Thriller, Drama]
Name: 20498, dtype: object
title                             The Best Offer
year                                        2013
vote_count                                   719
vote_average                           


title           A Separation
year                    2011
vote_count               474
vote_average               7
popularity           7.93963
genres               [Drama]
Name: 17821, dtype: object
title                       Rise of the Planet of the Apes
year                                                  2011
vote_count                                            4452
vote_average                                             7
popularity                                         41.6138
genres          [Thriller, Action, Drama, Science Fiction]
Name: 17588, dtype: object
title                                                      Priest
year                                                         2011
vote_count                                                    710
vote_average                                                    5
popularity                                                8.25995
genres          [Action, Science Fiction, Fantasy, Thriller, H...
Name: 17108, dtype: o

title                           Hot Tub Time Machine
year                                            2010
vote_count                                       911
vote_average                                       5
popularity                                   11.9677
genres          [Science Fiction, Comedy, Adventure]
Name: 14977, dtype: object
title                                             Repo Men
year                                                  2010
vote_count                                             627
vote_average                                             6
popularity                                         7.50297
genres          [Action, Science Fiction, Thriller, Crime]
Name: 14963, dtype: object
title           The Bounty Hunter
year                         2010
vote_count                    878
vote_average                    5
popularity                6.71526
genres                   [Action]
Name: 14943, dtype: object
title               Frozen
year            


title                      Doubt
year                        2008
vote_count                   445
vote_average                   7
popularity               9.27826
genres          [Drama, Mystery]
Name: 13184, dtype: object
title                  The Hurt Locker
year                              2008
vote_count                        1881
vote_average                         7
popularity                     9.40372
genres          [Drama, Thriller, War]
Name: 13817, dtype: object
title                   Asterix at the Olympic Games
year                                            2008
vote_count                                       486
vote_average                                       5
popularity                                   9.67194
genres          [Fantasy, Adventure, Comedy, Family]
Name: 14710, dtype: object
title                               Inkheart
year                                    2008
vote_count                               610
vote_average                     


title           The Lives of Others
year                           2006
vote_count                      977
vote_average                      7
popularity                  9.02255
genres            [Drama, Thriller]
Name: 10863, dtype: object
title           The Hills Have Eyes
year                           2006
vote_count                      726
vote_average                      6
popularity                  18.9605
genres           [Horror, Thriller]
Name: 10857, dtype: object
title                         Inside Man
year                                2006
vote_count                          1671
vote_average                           7
popularity                       13.3935
genres          [Crime, Drama, Thriller]
Name: 10843, dtype: object
title                                  Monster House
year                                            2006
vote_count                                       912
vote_average                                       6
popularity                  


title                              Big Fish
year                                   2003
vote_count                             2064
vote_average                              7
popularity                            14.74
genres          [Adventure, Fantasy, Drama]
Name: 6994, dtype: object
title                      Love Actually
year                                2003
vote_count                          1917
vote_average                           7
popularity                       10.5868
genres          [Comedy, Romance, Drama]
Name: 6791, dtype: object
title                        Bad Santa
year                              2003
vote_count                         666
vote_average                         6
popularity                     10.9301
genres          [Drama, Comedy, Crime]
Name: 6805, dtype: object
title                      Gothika
year                          2003
vote_count                     643
vote_average                     5
popularity                 10.9391
ge


title                    Scream 3
year                         2000
vote_count                    749
vote_average                    5
popularity                9.73837
genres          [Horror, Mystery]
Name: 3155, dtype: object
title                                    Hollow Man
year                                           2000
vote_count                                      645
vote_average                                      5
popularity                                   7.9121
genres          [Action, Science Fiction, Thriller]
Name: 3703, dtype: object
title                                      The Cell
year                                           2000
vote_count                                      442
vote_average                                      6
popularity                                  13.2349
genres          [Horror, Science Fiction, Thriller]
Name: 3739, dtype: object
title           Bring It On
year                   2000
vote_count              574
vote_aver


title                                       Eraser
year                                          1996
vote_count                                     553
vote_average                                     5
popularity                                 6.91931
genres          [Action, Drama, Mystery, Thriller]
Name: 766, dtype: object
title                                   The Nutty Professor
year                                                   1996
vote_count                                              717
vote_average                                              5
popularity                                          9.65158
genres          [Fantasy, Comedy, Romance, Science Fiction]
Name: 768, dtype: object
title                           Sleepers
year                                1996
vote_count                           729
vote_average                           7
popularity                       9.19828
genres          [Crime, Drama, Thriller]
Name: 1031, dtype: object
title      


title           A Fish Called Wanda
year                           1988
vote_count                      576
vote_average                      7
popularity                  11.4004
genres              [Comedy, Crime]
Name: 1047, dtype: object
title                                  Akira
year                                    1988
vote_count                               792
vote_average                               7
popularity                           10.8889
genres          [Science Fiction, Animation]
Name: 1229, dtype: object
title              Beverly Hills Cop II
year                               1987
vote_count                          627
vote_average                          6
popularity                      9.20034
genres          [Action, Comedy, Crime]
Name: 3958, dtype: object
title              Wall Street
year                      1987
vote_count                 557
vote_average                 7
popularity             11.6727
genres          [Crime, Drama]
Name: 388

In [20]:
qualified = qualified.sort_values('wr', ascending=False)

In [21]:
qualified.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.1081,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.2135,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.8696,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,"[Adventure, Fantasy, Action]",7.871787
292,Pulp Fiction,1994,8670,8,140.95,"[Thriller, Crime]",7.86866
314,The Shawshank Redemption,1994,8358,8,51.6454,"[Drama, Crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.3244,"[Adventure, Fantasy, Action]",7.861927
351,Forrest Gump,1994,8147,8,48.3072,"[Comedy, Drama, Romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.4235,"[Adventure, Fantasy, Action]",7.851924


In [22]:
md.apply(lambda x: pd.Series(x['genres']), axis=1).stack()

0      0          Animation
       1             Comedy
       2             Family
1      0          Adventure
       1            Fantasy
       2             Family
2      0            Romance
       1             Comedy
3      0             Comedy
       1              Drama
       2            Romance
4      0             Comedy
5      0             Action
       1              Crime
       2              Drama
       3           Thriller
6      0             Comedy
       1            Romance
7      0             Action
       1          Adventure
       2              Drama
       3             Family
8      0             Action
       1          Adventure
       2           Thriller
9      0          Adventure
       1             Action
       2           Thriller
10     0             Comedy
       1              Drama
                 ...       
45446  1             Comedy
45449  0          Animation
       1             Family
45450  0            Fantasy
       1            

In [23]:
s = md.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level = 1 ,drop=True)
s

0              Animation
0                 Comedy
0                 Family
1              Adventure
1                Fantasy
1                 Family
2                Romance
2                 Comedy
3                 Comedy
3                  Drama
3                Romance
4                 Comedy
5                 Action
5                  Crime
5                  Drama
5               Thriller
6                 Comedy
6                Romance
7                 Action
7              Adventure
7                  Drama
7                 Family
8                 Action
8              Adventure
8               Thriller
9              Adventure
9                 Action
9               Thriller
10                Comedy
10                 Drama
              ...       
45446             Comedy
45449          Animation
45449             Family
45450            Fantasy
45450             Action
45450           Thriller
45451             Comedy
45451            Fantasy
45452        Documentary


In [24]:
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)

In [27]:

def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

In [28]:
build_chart('Romance').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457,8.565285
351,Forrest Gump,1994,8147,8,48.3072,7.971357
876,Vertigo,1958,1162,8,18.2082,7.811667
40251,Your Name.,2016,1030,8,34.461252,7.789489
883,Some Like It Hot,1959,835,8,11.8451,7.745154
1132,Cinema Paradiso,1988,834,8,14.177,7.744878
19901,Paperman,2012,734,8,7.19863,7.713951
37863,Sing Street,2016,669,8,10.672862,7.689483
882,The Apartment,1960,498,8,11.9943,7.599317
38718,The Handmaiden,2016,453,8,16.727405,7.566166


In [29]:
links_small = pd.read_csv('input/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [30]:
md = md.drop([19730, 29503, 35587])

In [32]:
md['id'] = md['id'].astype('int')

In [33]:
smd = md[md['id'].isin(links_small)]
smd.isnull().sum()

adult                       0
belongs_to_collection    7425
budget                      0
genres                      0
homepage                 7125
id                          0
imdb_id                     0
original_language           0
original_title              0
overview                   12
popularity                  0
poster_path                 3
production_companies        0
production_countries        0
release_date                0
revenue                     0
runtime                     0
spoken_languages            0
status                      2
tagline                  2066
title                       0
video                       0
vote_average                0
vote_count                  0
year                        0
dtype: int64

In [34]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['tagline'] + smd['overview']
smd['description'] = smd['description'].fillna('')

In [35]:
print(np.nan + 12)

nan


In [36]:
smd['description']

0        Led by Woody, Andy's toys live happily in his ...
1        Roll the dice and unleash the excitement!When ...
2        Still Yelling. Still Fighting. Still Ready for...
3        Friends are the people who let you be yourself...
4        Just When His World Is Back To Normal... He's ...
5        A Los Angeles Crime SagaObsessive master thief...
6        You are cordially invited to the most surprisi...
7        The Original Bad Boys.A mischievous young boy,...
8        Terror goes into overtime.International action...
9        No limits. No fears. No substitutes.James Bond...
10       Why can't the most powerful man in the world h...
11       When a lawyer shows up at the vampire's doorst...
12       Part Dog. Part Wolf. All Hero.An outcast half-...
13       Triumphant in Victory, Bitter in Defeat. He Ch...
14       The Course Has Been Set. There Is No Turning B...
15       No one stays at the top forever.The life of th...
16       Lose your heart and come to your senses.Rich M.

In [37]:
#문서 집합에서 단어 토큰을 생성하고 각 단어의 수를 세어 BOW 인코딩한 벡터를 만든다. + 가중치 
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])
print(tfidf_matrix)

  (0, 136840)	0.07718759464705006
  (0, 264113)	0.30387952009598945
  (0, 10530)	0.2930799401609535
  (0, 243441)	0.11055554348807796
  (0, 141316)	0.06263979034622161
  (0, 106654)	0.09279805284220312
  (0, 203921)	0.08404731309794934
  (0, 24126)	0.08975329559061745
  (0, 29322)	0.07597295822242066
  (0, 32250)	0.3402899285020227
  (0, 140005)	0.1291666389476973
  (0, 207924)	0.08325132345046594
  (0, 6226)	0.09481888070772156
  (0, 144369)	0.08076935717793579
  (0, 181154)	0.06719562459211752
  (0, 108551)	0.06824260207674711
  (0, 182934)	0.0976933133869845
  (0, 40994)	0.09397604186799674
  (0, 212158)	0.0982339106666728
  (0, 174041)	0.07485331602988303
  (0, 70879)	0.09136444459528242
  (0, 78142)	0.07411858432790196
  (0, 136355)	0.07193977307988235
  (0, 14334)	0.10518838664786415
  (0, 63871)	0.09666972605738713
  :	:
  (9097, 244431)	0.07716709594495275
  (9097, 5443)	0.07716709594495275
  (9097, 14256)	0.07716709594495275
  (9097, 231044)	0.07716709594495275
  (9097, 199061

In [38]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1.        , 0.00680204, 0.        , ..., 0.        , 0.00344826,
        0.        ],
       [0.00680204, 1.        , 0.01537897, ..., 0.00356808, 0.00762316,
        0.        ],
       [0.        , 0.01537897, 1.        , ..., 0.        , 0.00288257,
        0.00473726],
       ...,
       [0.        , 0.00356808, 0.        , ..., 1.        , 0.07824314,
        0.        ],
       [0.00344826, 0.00762316, 0.00288257, ..., 0.07824314, 1.        ,
        0.        ],
       [0.        , 0.        , 0.00473726, ..., 0.        , 0.        ,
        1.        ]])

In [39]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9099 entries, 0 to 9098
Data columns (total 27 columns):
index                    9099 non-null int64
adult                    9099 non-null object
belongs_to_collection    1674 non-null object
budget                   9099 non-null object
genres                   9099 non-null object
homepage                 1974 non-null object
id                       9099 non-null int32
imdb_id                  9099 non-null object
original_language        9099 non-null object
original_title           9099 non-null object
overview                 9087 non-null object
popularity               9099 non-null object
poster_path              9096 non-null object
production_companies     9099 non-null object
production_countries     9099 non-null object
release_date             9099 non-null object
revenue                  9099 non-null float64
runtime                  9099 non-null float64
spoken_languages         9099 non-null object
status             

In [42]:
def recommanedSystem(name):
    idx = indices[name]
    sims_scores = list(enumerate(cosine_sim[idx]))
    sims_scores = sorted(sims_scores, key=lambda x:x[1], reverse=True)
    sims_scores = sims_scores[1:31]
    movie_indices = [i[0] for i in sims_scores]
    return titles.iloc[movie_indices]

In [43]:
recommanedSystem('The Dark Knight')

7931                      The Dark Knight Rises
132                              Batman Forever
1113                             Batman Returns
8227    Batman: The Dark Knight Returns, Part 2
7565                 Batman: Under the Red Hood
524                                      Batman
7901                           Batman: Year One
2579               Batman: Mask of the Phantasm
2696                                        JFK
8165    Batman: The Dark Knight Returns, Part 1
6144                              Batman Begins
7933         Sherlock Holmes: A Game of Shadows
5511                            To End All Wars
4489                                      Q & A
7344                        Law Abiding Citizen
7242                  The File on Thelma Jordon
3537                               Criminal Law
2893                              Flying Tigers
1135                   Night Falls on Manhattan
8680                          The Young Savages
8917         Batman v Superman: Dawn of 

In [44]:
credits = pd.read_csv('input/credits.csv')
keywords = pd.read_csv('input/keywords.csv')

In [48]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [51]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [56]:
smd = md[md['id'].isin(links_small)]

In [57]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))


In [58]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [59]:
smd['director'] = smd['crew'].apply(get_director)

In [60]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >= 3 else x)

In [63]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])


In [64]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [68]:
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x, x, x])


In [70]:
s = smd.apply(lambda x: pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [71]:
s = s.value_counts()
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [72]:
s = s[s > 1] #언급 정도가 1개 이상만 사용되도록한다. 

In [73]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [75]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [76]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x] )
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ","")) for i in x])

In [78]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [82]:
for x, i in enumerate(smd['soup']):
    if x == 10:
        break
    print(i)

jealousi toy boy friendship friend rivalri boynextdoor newtoy toycomestolif tomhanks timallen donrickles ['johnlasseter','johnlasseter','johnlasseter'] ['johnlasseter','johnlasseter','johnlasseter'] ['johnlasseter','johnlasseter','johnlasseter'] Animation Comedy Family
boardgam disappear basedonchildren'sbook newhom reclus giantinsect robinwilliams jonathanhyde kirstendunst ['joejohnston','joejohnston','joejohnston'] ['joejohnston','joejohnston','joejohnston'] ['joejohnston','joejohnston','joejohnston'] Adventure Fantasy Family
fish bestfriend duringcreditssting waltermatthau jacklemmon ann-margret ['howarddeutch','howarddeutch','howarddeutch'] ['howarddeutch','howarddeutch','howarddeutch'] ['howarddeutch','howarddeutch','howarddeutch'] Romance Comedy
basedonnovel interracialrelationship singlemoth divorc chickflick whitneyhouston angelabassett lorettadevine ['forestwhitaker','forestwhitaker','forestwhitaker'] ['forestwhitaker','forestwhitaker','forestwhitaker'] ['forestwhitaker','fore

In [84]:
count = CountVectorizer(analyzer='word', stop_words='english', ngram_range=(1, 2), min_df=0)
count_matrix = count.fit_transform(smd['soup'])
count_matrix

<9219x107377 sparse matrix of type '<class 'numpy.int64'>'
	with 240050 stored elements in Compressed Sparse Row format>

In [86]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [87]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [90]:
recommanedSystem('The Dark Knight').head(10)

8031         The Dark Knight Rises
6218                 Batman Begins
6623                  The Prestige
2085                     Following
7648                     Inception
4145                      Insomnia
3381                       Memento
8613                  Interstellar
7659    Batman: Under the Red Hood
1134                Batman Returns
Name: title, dtype: object

In [98]:
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:15]
    movie_indices = [i[0] for i in sim_scores]
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified


In [100]:
improved_recommendations('The Dark Knight')

title           The Dark Knight Rises
vote_count                       9263
vote_average                        7
year                             2012
Name: 8031, dtype: object
title           Batman Begins
vote_count               7511
vote_average                7
year                     2005
Name: 6218, dtype: object
title           The Prestige
vote_count              4510
vote_average               8
year                    2006
Name: 6623, dtype: object
title           Inception
vote_count          14075
vote_average            8
year                 2010
Name: 7648, dtype: object
title           Memento
vote_count         4168
vote_average          8
year               2000
Name: 3381, dtype: object
title           Interstellar
vote_count             11187
vote_average               8
year                    2014
Name: 8613, dtype: object


Unnamed: 0,title,vote_count,vote_average,year,wr
7648,Inception,14075,8,2010,7.917588
8613,Interstellar,11187,8,2014,7.897107
6623,The Prestige,4510,8,2006,7.758148
3381,Memento,4168,8,2000,7.740175
8031,The Dark Knight Rises,9263,7,2012,6.921448
6218,Batman Begins,7511,7,2005,6.904127


In [101]:
reader = Reader()

In [102]:
ratings = pd.read_csv('input/ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [104]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
data.split(n_folds=5)

In [106]:
svd = SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SVD.



------------
Fold 1


RMSE: 0.8999
MAE:  0.6931
------------
Fold 2


RMSE: 0.8941
MAE:  0.6875
------------
Fold 3


RMSE: 0.8997
MAE:  0.6925
------------
Fold 4


RMSE: 0.8954
MAE:  0.6906


------------
Fold 5


RMSE: 0.8939
MAE:  0.6901
------------
------------
Mean RMSE: 0.8966
Mean MAE : 0.6907
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.8998947958731333,
                             0.8941405724668805,
                             0.8997046532096684,
                             0.8954219159821057,
                             0.8939219515359959],
                            'mae': [0.6930500310270069,
                             0.6875069635567772,
                             0.6924632750697532,
                             0.6905910026958333,
                             0.6900622907348385]})

In [108]:
trainset = data.build_full_trainset()
svd.train(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2104d4d0cc0>

In [113]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [116]:
svd.predict(1, 310, 3)

Prediction(uid=1, iid=310, r_ui=3, est=2.6995416583901966, details={'was_impossible': False})

In [122]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [135]:
id_map = pd.read_csv('input/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
print(id_map.head(3))
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')
id_map.head(3)

   movieId       id
0        1    862.0
1        2   8844.0
2        3  15602.0


Unnamed: 0_level_0,movieId,id
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story,1,862.0
Jumanji,2,8844.0
Grumpier Old Men,3,15602.0


In [148]:
id_map

Unnamed: 0_level_0,movieId,id
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story,1,862.0
Jumanji,2,8844.0
Grumpier Old Men,3,15602.0
Waiting to Exhale,4,31357.0
Father of the Bride Part II,5,11862.0
Heat,6,949.0
Sabrina,7,11860.0
Tom and Huck,8,45325.0
Sudden Death,9,9091.0
GoldenEye,10,710.0


In [138]:
def hybrid(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    #print(idx)
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [150]:
hybrid(3, 'The Dark Knight')

Unnamed: 0,title,vote_count,vote_average,year,id,est
6623,The Prestige,4510.0,8.0,2006,1124,4.099173
8613,Interstellar,11187.0,8.1,2014,157336,4.063391
3381,Memento,4168.0,8.1,2000,77,4.055961
7648,Inception,14075.0,8.1,2010,27205,3.773408
8031,The Dark Knight Rises,9263.0,7.6,2012,49026,3.661777
6218,Batman Begins,7511.0,7.5,2005,272,3.654163
5943,Thursday,84.0,7.0,1998,9812,3.640293
5809,Point Blank,97.0,7.1,1967,26039,3.610704
8001,Batman: Year One,255.0,7.1,2011,69735,3.519598
7582,Defendor,197.0,6.5,2009,34769,3.516293
