In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.metrics.pairwise import cosine_similarity

# Data 불러오기

In [3]:
data = pd.read_csv('./dataset/merged_data.csv')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2291 entries, 0 to 2290
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   genres             2291 non-null   object 
 1   id                 2291 non-null   int64  
 2   original_language  2291 non-null   object 
 3   overview           2291 non-null   object 
 4   popularity         2291 non-null   float64
 5   spoken_languages   2290 non-null   object 
 6   title              2291 non-null   object 
 7   vote_average       2291 non-null   float64
 8   vote_count         2291 non-null   int64  
 9   score              2291 non-null   float64
 10  cast               2290 non-null   object 
 11  crew               2291 non-null   object 
 12  keywords           2266 non-null   object 
 13  movieId            2291 non-null   int64  
 14  rating             2291 non-null   float64
dtypes: float64(4), int64(3), object(8)
memory usage: 268.6+ KB


In [5]:
data.drop(['rating'], axis=1, inplace=True)

- 이미 ratings 파일과 merge되었기 때문에, 해당 열을 다시 삭제

In [6]:
len(data) == len(data['title'].unique())

False

- title이 같은 영화가 있음
- movieId를 사용하는 것이 가장 적절하나, 편의상 중복을 삭제하여 title을 사용

In [7]:
data.drop_duplicates('title')

Unnamed: 0,genres,id,original_language,overview,popularity,spoken_languages,title,vote_average,vote_count,score,cast,crew,keywords,movieId
0,Animation Comedy Family,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,English,Toy Story,7.7,5415,7.545539,TomHanks TimAllen DonRickles JimVarney Wallace...,JohnLasseter,jealousy toy boy friendship friends rivalry bo...,1
1,Adventure Fantasy Family,8844,en,When siblings Judy and Peter discover an encha...,17.015539,English Français,Jumanji,6.9,2413,6.704621,RobinWilliams JonathanHyde KirstenDunst Bradle...,JoeJohnston,board game disappearance based on children's b...,2
2,Action Crime Drama Thriller,949,en,"Obsessive master thief, Neil McCauley leads a ...",17.924927,English Español,Heat,7.7,1886,7.310584,AlPacino RobertDeNiro ValKilmer JonVoight TomS...,MichaelMann,robbery detective bank obsession chase shootin...,6
3,Adventure Action Thriller,710,en,James Bond must unmask the mysterious head of ...,14.686036,English Pусский Español,GoldenEye,6.6,1194,6.338302,PierceBrosnan SeanBean IzabellaScorupco FamkeJ...,MartinCampbell,cuba falsely accused secret identity computer ...,10
4,Drama Crime,524,en,The life of the gambling paradise – Las Vegas ...,10.137389,English,Casino,7.8,1343,7.267167,RobertDeNiro SharonStone JoePesci JamesWoods D...,MartinScorsese,poker drug abuse 1970s overdose illegal prosti...,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2286,ScienceFiction Thriller,406990,en,In a world where families are limited to one c...,60.581223,English,What Happened to Monday,7.3,598,6.592786,NoomiRapace MarwanKenzari ChristianRubeck Glen...,TommyWirkola,chase false identity overpopulation investigat...,173925
2287,Action Thriller,341013,en,An undercover MI6 agent is sent to Berlin duri...,14.455104,svenska English Deutsch Pусский,Atomic Blonde,6.1,748,5.923143,CharlizeTheron JamesMcAvoy SofiaBoutella Eddie...,DavidLeitch,berlin spy undercover cold war double agent un...,173941
2288,Action Drama History Thriller War,374720,en,The miraculous evacuation of Allied soldiers f...,30.938854,English Français Deutsch,Dunkirk,7.5,2712,7.240418,FionnWhitehead TomGlynn-Carney JackLowden Harr...,ChristopherNolan,france beach world war ii evacuation german pi...,174055
2289,Action ScienceFiction Thriller Adventure,335988,en,"Autobots and Decepticons are at war, with huma...",39.186819,English,Transformers: The Last Knight,6.2,1440,6.065291,MarkWahlberg JoshDuhamel LauraHaddock AnthonyH...,MichaelBay,knight transformers,174585


In [8]:
ratings = pd.read_csv('./dataset/ratings.csv')

In [9]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


지금은 userId 컬럼과 movieId 컬럼이 따로따로 존재합니다.   
아이템 기반 협업 필터링(item based collaborative filtering) 기반으로 추천 시스템을 만드려면 user-item 테이블로 만들어주어야 합니다.

pivot_table을 이용해서 만들어주겠습니다.

In [10]:
ratings.drop('timestamp', axis=1, inplace=True)

# Pivoting

In [11]:
data = pd.merge(data, ratings)
data.shape

(16441173, 16)

In [12]:
pivot = data.pivot_table('rating', index = 'title', columns = 'userId').fillna(0)

In [13]:
pivot.shape

(2240, 268071)

In [14]:
pivot.head()

userId,1,2,3,4,5,6,7,8,9,10,...,270887,270888,270889,270890,270891,270892,270893,270894,270895,270896
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Cloverfield Lane,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"10,000 BC",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# cosine similarity

In [15]:
movie_sim = cosine_similarity(pivot, pivot)
movie_sim.shape

(2240, 2240)

In [16]:
movie_sim

array([[1.        , 0.16128458, 0.24146379, ..., 0.02490169, 0.05886088,
        0.12340987],
       [0.16128458, 1.        , 0.0749956 , ..., 0.07458865, 0.06283113,
        0.10884767],
       [0.24146379, 0.0749956 , 1.        , ..., 0.02283157, 0.07210706,
        0.13650028],
       ...,
       [0.02490169, 0.07458865, 0.02283157, ..., 1.        , 0.12385108,
        0.0821507 ],
       [0.05886088, 0.06283113, 0.07210706, ..., 0.12385108, 1.        ,
        0.19567401],
       [0.12340987, 0.10884767, 0.13650028, ..., 0.0821507 , 0.19567401,
        1.        ]])

In [17]:
movie_sim_df = pd.DataFrame(data = movie_sim, index = pivot.index, columns = pivot.index)

In [18]:
movie_sim_df.head()

title,(500) Days of Summer,10 Cloverfield Lane,10 Things I Hate About You,"10,000 BC",101 Dalmatians,12 Angry Men,12 Years a Slave,127 Hours,13 Going on 30,13 Hours: The Secret Soldiers of Benghazi,...,Zoolander,Zoolander 2,Zootopia,[REC],[REC]²,eXistenZ,xXx,xXx: Return of Xander Cage,xXx: State of the Union,Æon Flux
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer,1.0,0.161285,0.241464,0.126556,0.102766,0.169856,0.231415,0.301303,0.217456,0.065402,...,0.204045,0.065484,0.179281,0.126878,0.070502,0.087719,0.116647,0.024902,0.058861,0.12341
10 Cloverfield Lane,0.161285,1.0,0.074996,0.087457,0.044924,0.099667,0.227109,0.20781,0.08023,0.184597,...,0.131359,0.149906,0.27017,0.176289,0.115028,0.08567,0.099623,0.074589,0.062831,0.108848
10 Things I Hate About You,0.241464,0.074996,1.0,0.12177,0.213107,0.148263,0.092465,0.120639,0.314702,0.038903,...,0.251897,0.044438,0.100955,0.063457,0.040445,0.136318,0.180391,0.022832,0.072107,0.1365
"10,000 BC",0.126556,0.087457,0.12177,1.0,0.10909,0.071814,0.094253,0.121231,0.126269,0.069239,...,0.110842,0.046654,0.072829,0.094678,0.069171,0.068578,0.148535,0.049657,0.140346,0.203384
101 Dalmatians,0.102766,0.044924,0.213107,0.10909,1.0,0.118178,0.05896,0.074234,0.179013,0.024722,...,0.148296,0.03215,0.063366,0.057655,0.030357,0.070654,0.133436,0.022734,0.065987,0.108745


그러면 이제 특정 영화와 비교했을 때 그 영화와 유사한 영화들을 추천해주면 됩니다.

In [19]:
movie_sim_df["X-Men Origins: Wolverine"].sort_values(ascending=False)[1:11]

title
X-Men: The Last Stand                  0.493659
X-Men: First Class                     0.473584
Iron Man 2                             0.460773
The Incredible Hulk                    0.452307
Thor                                   0.444001
Captain America: The First Avenger     0.439527
The Wolverine                          0.434007
Transformers: Revenge of the Fallen    0.428829
Terminator Salvation                   0.428546
Hancock                                0.422578
Name: X-Men Origins: Wolverine, dtype: float64

In [20]:
movie_sim_df["Harry Potter and the Half-Blood Prince"].sort_values(ascending=False)[1:11]

title
Harry Potter and the Deathly Hallows: Part 1    0.748248
Harry Potter and the Order of the Phoenix       0.690604
Harry Potter and the Deathly Hallows: Part 2    0.675202
Harry Potter and the Goblet of Fire             0.633262
Harry Potter and the Prisoner of Azkaban        0.589819
Harry Potter and the Philosopher's Stone        0.505116
Harry Potter and the Chamber of Secrets         0.503854
The Hunger Games                                0.449095
The Hobbit: An Unexpected Journey               0.442008
Avatar                                          0.436682
Name: Harry Potter and the Half-Blood Prince, dtype: float64

In [21]:
movie_sim_df["King Kong"].sort_values(ascending=False)[1:11]

title
War of the Worlds                                                 0.470805
Spider-Man 2                                                      0.429493
Star Wars: Episode III - Revenge of the Sith                      0.414435
X-Men: The Last Stand                                             0.414001
Superman Returns                                                  0.411219
Pirates of the Caribbean: Dead Man's Chest                        0.404468
The Chronicles of Narnia: The Lion, the Witch and the Wardrobe    0.399526
Hellboy                                                           0.398361
I, Robot                                                          0.396771
Charlie and the Chocolate Factory                                 0.394558
Name: King Kong, dtype: float64

In [22]:
movie_sim_df["The Dark Knight Rises"].sort_values(ascending=False)[1:11]

title
The Avengers               0.609856
Django Unchained           0.574199
Inception                  0.571691
The Dark Knight            0.569218
Interstellar               0.520746
Guardians of the Galaxy    0.513397
Batman Begins              0.498658
X-Men: First Class         0.498474
Iron Man                   0.496751
Shutter Island             0.495844
Name: The Dark Knight Rises, dtype: float64

In [23]:
movie_sim_df["The Avengers"].sort_values(ascending=False)[1:11]

title
The Dark Knight Rises                  0.609856
Guardians of the Galaxy                0.599658
X-Men: First Class                     0.593308
Iron Man                               0.576998
Captain America: The Winter Soldier    0.572532
Iron Man 2                             0.571376
Iron Man 3                             0.567199
Captain America: The First Avenger     0.561280
X-Men: Days of Future Past             0.551384
Avengers: Age of Ultron                0.536395
Name: The Avengers, dtype: float64