In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')

# 1. Simple Recommender
## 장르를 이용한 추천

In [2]:
import os 
print(os.getcwd())
os.chdir("/home/piai")

/home/piai/POSCO-AI-Big-Data


In [3]:
md =  pd.read_csv("Data/movies_metadata.csv")
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [5]:
## 장르 데이터 리스트로 변경
md['genres'] = md['genres'].fillna('[]').apply(literal_eval)\
.apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

md['production_companies']= md['production_companies'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md['production_countries'] = md['production_countries'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md['spoken_languages'] = md['spoken_languages'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)


In [67]:
md.loc['popularity']

KeyError: 'popularity'

### IMDB's 가중치 비율
$$Weighted Rating(WR) = ({\frac{v}{v+m}\cdot R}) + ({\frac{m}{v+m}\cdot C}) $$

- v: 무비 리뷰 수
- m: 차트에 들어가기 위한 최소 리뷰 수
- R: 영화의 평균 rating
- C: 전체 리포트의 vote평균

In [7]:
vote_count = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_average = md[md['vote_average'].notnull()]['vote_average'].astype('int')

#quanitle은 4분위 데이터를 말한건데 0.95는 95% 부터의 데이터를 의미한다.
m = vote_count.quantile(0.96) 
C = vote_average.mean()

In [8]:
print(m, "/",C)

576.6399999999994 / 5.244896612406511


In [9]:
## qualfied df생성
qualified = md[(md['vote_count'].notnull()) & (md['vote_average'].notnull()) & 
               (md['vote_count'] >= m)][['title', 'year', 'vote_count',
                                         'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype(int)
qualified['vote_average'] = qualified['vote_average'].astype(int)

In [10]:
## 점수를 만드는 함수
def weight_rating(x):
    v = x['vote_count']
    r = x['vote_average']
    return (v/(v+m)*r) + (m/(m+v)*C)

In [11]:
## qualified['wr']을 만들어서 해당 칼럼을 기준으로 정렬한다.(weight_rating)
qualified['wr'] = qualified.apply(weight_rating, axis=1)
qualified = qualified.sort_values(by='wr', ascending=False).head(250)

In [12]:
qualified.head()

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.1081,"[Action, Thriller, Science Fiction, Mystery, A...",7.891568
12481,The Dark Knight,2008,12269,8,123.167,"[Drama, Action, Crime, Thriller]",7.876324
22879,Interstellar,2014,11187,8,32.2135,"[Adventure, Drama, Science Fiction]",7.864948
2843,Fight Club,1999,9678,8,63.8696,[Drama],7.845075
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,"[Adventure, Fantasy, Action]",7.832214


In [65]:
## genre라는 데이터를 빼서 이를 통해 한 데이터에 몰아넣는다.
s = md.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
# gen_md == genre movie data
gen_md = md.drop("genres", axis=1).join(s)

In [64]:
gen_md.head(10)

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,genre
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,81.0,[English],Released,,Toy Story,False,7.7,5415.0,1995,Animation
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,81.0,[English],Released,,Toy Story,False,7.7,5415.0,1995,Comedy
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,81.0,[English],Released,,Toy Story,False,7.7,5415.0,1995,Family
1,False,,65000000,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,104.0,"[English, Français]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,Adventure
1,False,,65000000,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,104.0,"[English, Français]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,Fantasy
1,False,,65000000,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,104.0,"[English, Français]",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,Family
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,...,101.0,[English],Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,Romance
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,...,101.0,[English],Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,Comedy
3,False,,16000000,,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,...,127.0,[English],Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,Comedy
3,False,,16000000,,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,...,127.0,[English],Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,Drama


In [14]:
def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) 
                   & (df['vote_average'].notnull())]\
                    [['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

### 결과

In [15]:
### top romance movie
build_chart('Horror').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
1213,The Shining,1980,3890,8,19.6116,7.901294
1176,Psycho,1960,2405,8,36.8263,7.843335
1171,Alien,1979,4564,7,23.3774,6.941936
41492,Split,2016,4461,7,28.920839,6.940631
14236,Zombieland,2009,3655,7,11.063,6.927969
1158,Aliens,1986,3282,7,21.7612,6.920081
21276,The Conjuring,2013,3169,7,14.9017,6.917338
42169,Get Out,2017,2978,7,36.894806,6.912248
1338,Jaws,1975,2628,7,19.7261,6.901088
8147,Shaun of the Dead,2004,2479,7,14.9029,6.895426


# 2. 콘텐츠 기반 알고리즘

## based on
- Movie Overviews and Taglines
- Movie Cast, Crew, Keywords and Genre

In [16]:
links_small = pd.read_csv("Data/links_small.csv")
links_small.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [17]:
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [18]:
links_small.head()

0      862
1     8844
2    15602
3    31357
4    11862
Name: tmdbId, dtype: int64

In [19]:
# 결측치
md = md.drop([19730, 29503, 35587])

In [20]:
# 새로운 데이터 만들기 tmdbid 연결
md['id'] = md['id'].astype('int')
smd = md[md['id'].isin(links_small)]
smd.shape

(9099, 25)

In [21]:
## 문장 분석을 위한 vectorized
# 결측치 제거

smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

$$tf - idf(d, t) = tf(d, t) \cdot idf(t)$$

In [22]:
# min_df는 토큰 무시 정도
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2), 
                     min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [23]:
tfidf_matrix.shape

(9099, 268124)

### Cosine Similarty
- 두 영화 사이의 유사성 계산
-$cosine(x, y) - (x.y^T)/(||x|| \cdot||y||)$

In [24]:
# 해당 코드의 변경으로 인해 업데이트 됨
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [25]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

## 추천 함수 (줄거리)

In [26]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:31] # 입력된 값과 비슷한 몇 개만 가져오
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [27]:
movie='3 Idiots'
print("Description of the Movie: ", movie)
print('---------------------------------------------------------------------')
print(smd[smd['title']==movie]['overview'])

Description of the Movie:  3 Idiots
---------------------------------------------------------------------
7422    In the tradition of “Ferris Bueller’s Day Off”...
Name: overview, dtype: object


In [28]:
get_recommendations('3 Idiots').head(10)

2336                             Ferris Bueller's Day Off
8161                                  Student of the Year
262                                              Outbreak
2658                                  The Next Best Thing
4378    Come Back to the 5 & Dime, Jimmy Dean, Jimmy Dean
1861                                   Enemy of the State
3098                                          Bring It On
7866                                            Contagion
4543                                    What a Girl Wants
5373                                              College
Name: title, dtype: object

In [29]:
movie='The Dark Knight'
print("Description of the Movie: ", movie)
print('---------------------------------------------------------------------')
print(smd[smd['title']==movie]['overview'])

Description of the Movie:  The Dark Knight
---------------------------------------------------------------------
6900    Batman raises the stakes in his war on crime. ...
Name: overview, dtype: object


In [30]:
get_recommendations('The Dark Knight').head(20)

7931                      The Dark Knight Rises
132                              Batman Forever
1113                             Batman Returns
8227    Batman: The Dark Knight Returns, Part 2
7565                 Batman: Under the Red Hood
524                                      Batman
7901                           Batman: Year One
2579               Batman: Mask of the Phantasm
2696                                        JFK
8165    Batman: The Dark Knight Returns, Part 1
6144                              Batman Begins
7933         Sherlock Holmes: A Game of Shadows
5511                            To End All Wars
4489                                      Q & A
7344                        Law Abiding Citizen
7242                  The File on Thelma Jordon
3537                               Criminal Law
2893                              Flying Tigers
1135                   Night Falls on Manhattan
8680                          The Young Savages
Name: title, dtype: object

### 감독, 배우 등을 고려해서 추천

In [31]:
credits = pd.read_csv('Data/credits.csv', error_bad_lines=False, engine = "python")
keywords = pd.read_csv('Data/keywords.csv', error_bad_lines=False, engine = "python")

In [32]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [33]:
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [34]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

## 데이터 개수 확인 
- 안에 있는 지를 확인 .isin

In [35]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9219, 28)

In [36]:
#cast데이터를 ''문자열안에 있는 document를 진짜 document로 만든다.
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [37]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [38]:
# 감독 이름 넣기
smd['director'] = smd['crew'].apply(get_director)

# 배우 이름 넣기
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

# 가장 영향력 있는 3명의 배우만 넣기
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

# 키워드도 넣어준다.
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [39]:
# 단어들을 원형으로 만들어주고 띄어쓰기를 없앤다
## director는 강조를 위해 세 번 반복해서 적는다.

smd['cast'] = smd['cast'].apply(lambda x : [str.lower(i.replace(" ", "")) for i in x])
smd['director'] = smd['director'].astype(str).apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x, x, x])

### Keyword 전처리

In [40]:
## keywords라는 인덱스를 기준으로 각 영화마다의 장르를 동일한 인덱스로 표현할 수 있다.
s = smd.apply(lambda x: pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keywords'
s = s.value_counts()

In [41]:
s[:10]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
violence                264
love                    222
musical                 219
sex                     219
suspense                212
Name: keywords, dtype: int64

In [42]:
# 키워드가 한 개밖에 없는 것은 제거
s = s[s>1]

In [43]:
# stem을 통해 원형을 받을 수 있다.
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [44]:
# 키워들르 골라내 인덱스에 담는다.
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [45]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)

In [46]:
# 단어를 원형으로 만들어주고 띄어쓰기를 없애준다.

smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [47]:
smd['keywords']

0        [jealousi, toy, boy, friendship, friend, rival...
1        [boardgam, disappear, basedonchildren'sbook, n...
2                   [fish, bestfriend, duringcreditssting]
3        [basedonnovel, interracialrelationship, single...
4        [babi, midlifecrisi, confid, age, daughter, mo...
                               ...                        
40952                                         [friendship]
41172                                          [bollywood]
41225                                          [bollywood]
41391     [monster, godzilla, giantmonst, destruct, kaiju]
41669                                 [music, documentari]
Name: keywords, Length: 9219, dtype: object

In [48]:
# 전체적인 데이터들을 합쳐 soup로 만들어준다.

smd['soup'] = smd['keywords']+smd['cast']+smd['director']+smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [49]:
# 단어별로 벡터화 해주기

count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [50]:
# 코사인 유사도

cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [51]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [52]:
get_recommendations('The Dark Knight').head(15)

8031                 The Dark Knight Rises
6218                         Batman Begins
6623                          The Prestige
2085                             Following
7648                             Inception
4145                              Insomnia
3381                               Memento
8613                          Interstellar
7659            Batman: Under the Red Hood
1134                        Batman Returns
8927               Kidnapping Mr. Heineken
5943                              Thursday
1260                        Batman & Robin
9024    Batman v Superman: Dawn of Justice
4021                  The Long Good Friday
Name: title, dtype: object

### 리뷰 수 기반으로 버림

In [53]:
## 영화 추천 중에 쓰레기 같은 영화는 버린다.

def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weight_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [54]:
improved_recommendations('The Dark Knight').head(15)

Unnamed: 0,title,vote_count,vote_average,year,wr
7648,Inception,14075,8,2010,7.891568
8613,Interstellar,11187,8,2014,7.864948
6623,The Prestige,4510,8,2006,7.687671
3381,Memento,4168,8,2000,7.665158
8031,The Dark Knight Rises,9263,7,2012,6.897144
6218,Batman Begins,7511,7,2005,6.874863
1134,Batman Returns,1706,6,1992,5.809246
132,Batman Forever,1529,5,1995,5.067066
9024,Batman v Superman: Dawn of Justice,7189,5,2016,5.018185
1260,Batman & Robin,1447,4,1997,4.354736


## 3. 협업 필터링

In [55]:
reader = Reader()
ratings = pd.read_csv('Data/ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [56]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'],cv=5)

{'test_rmse': array([0.89665857, 0.89161202, 0.90033477, 0.8949203 , 0.89760508]),
 'test_mae': array([0.69093663, 0.6864709 , 0.69087817, 0.68989363, 0.69144445]),
 'fit_time': (4.1287431716918945,
  4.140778541564941,
  4.130016803741455,
  4.152634382247925,
  4.1792895793914795),
 'test_time': (0.12847495079040527,
  0.12868213653564453,
  0.2648754119873047,
  0.1297614574432373,
  0.12995696067810059)}

- rmse가 0.8944이면 우리가 하려는 데이터에 충분하다.
- 이제 우리의 데이터 셋을 예측을 위해 train한다.

In [57]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f589f1aaa90>

In [58]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [59]:
svd.predict(1, 302, 3)

Prediction(uid=1, iid=302, r_ui=3, est=2.6121908342850695, details={'was_impossible': False})

In [60]:
## user_평점 만든다.

user_rating=pd.merge(ratings,md,left_on='movieId',right_on='id',how='inner')
user_ratings_final=user_rating[['userId', 'movieId', 'rating','original_title']]
user_ratings=user_ratings_final.sort_values(by='userId')
user_ratings.head()

Unnamed: 0,userId,movieId,rating,original_title
0,1,1371,2.5,Rocky III
93,1,2105,4.0,American Pie
140,1,2193,2.0,My Tutor
47,1,1405,1.0,Greed
182,1,2294,2.0,Jay and Silent Bob Strike Back


In [61]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [62]:
id_map = pd.read_csv('../input/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')
#id_map = id_map.set_index('tmdbId')

FileNotFoundError: [Errno 2] No such file or directory: '../input/links_small.csv'

In [None]:
indices_map = id_map.set_index('id')

In [None]:
def hybrid(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    #print(idx)
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [None]:
hybrid(1, 'Avatar')