데이터 사용을 위한 드라이브 마운트

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 단순 추천 시스템(Simple Recommender)

평점과 인기도에 따라 영화를 정렬하고 목록의 최상위 영화를 표시  

추가적으로 장르를 변수로 전달하여 특정 장르의 최상위 영화를 표시

In [2]:
# 그림을 노트북에서 볼 수 있도록 설정
%matplotlib inline

In [3]:
# 경고 문구 무시
import warnings
warnings.simplefilter('ignore') # 무시 = ignore / 표시 = default

데이터 형태를 확인

In [4]:
import pandas as pd

# 파일 경로
directory = '/content/drive/MyDrive/input/movies_metadata.csv'

# md = movie data
md = pd.read_csv(directory) # low_memory=False
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


csv의 데이터는 String 형으로 저장되어 있기 때문에  
비교적 안전한 literal_eval을 이용하여 dict 형으로 변환함

In [5]:
from ast import literal_eval

# null 값을 list로 변환
md['genres'] = md['genres'].fillna('[]')

# String 형을 List & Dictionary 형으로 변환
md['genres'] = md['genres'].apply(literal_eval)

# x가 list형인 경우 들어있는 dictionary 중 name에 해당하는 값을 list에 저장
md['genres'] = md['genres'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

정상적으로 변환 되었는지 확인

In [6]:
md['genres'].head()

Unnamed: 0,genres
0,"[Animation, Comedy, Family]"
1,"[Adventure, Fantasy, Family]"
2,"[Romance, Comedy]"
3,"[Comedy, Drama, Romance]"
4,[Comedy]


In [7]:
print(md[['vote_count', 'vote_average']].head())

   vote_count  vote_average
0      5415.0           7.7
1      2413.0           6.9
2        92.0           6.5
3        34.0           6.1
4       173.0           5.7


In [8]:
# md의 vote_count 중 null이 아닌 열의 전체 데이터를 가져옴
# 가져온 열 중의 vote_count 열만 뽑아서 int 형으로 변환함
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.244896612406511

총 45460개의 영화 중 5%는 2273 번째 이다.  
`45460 * 0.05 = 2273`

In [9]:
print(vote_counts.sort_values(ascending=False)[2273:2274])

11561    434
Name: vote_count, dtype: int64


quantile : 데이터를 크기대로 정렬하였을 때 분위수를 구하는 함수로 quantile(0.95)는 상위 5%에 해당하는 값을 찾는 것이다.

In [10]:
m = vote_counts.quantile(0.95)
m

434.0

In [11]:
import numpy as np

In [12]:
print(md['release_date'].head())

# 'release_date'를 split해서 year만 추출
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

print('year ::: \n', md['year'].head())

0    1995-10-30
1    1995-12-15
2    1995-12-22
3    1995-12-22
4    1995-02-10
Name: release_date, dtype: object
year ::: 
 0    1995
1    1995
2    1995
3    1995
4    1995
Name: year, dtype: object


In [13]:
# 평가 수가 상위 5%인(434보다 큰) 데이터 추출
qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2274, 6)

In [14]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [15]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [16]:
# Weighted Rating 상위 250개의 영화
qualified = qualified.sort_values('wr', ascending=False).head(250)

In [17]:
qualified.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167259,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.869599,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"[Adventure, Fantasy, Action]",7.871787
292,Pulp Fiction,1994,8670,8,140.950236,"[Thriller, Crime]",7.86866
314,The Shawshank Redemption,1994,8358,8,51.645403,"[Drama, Crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.324358,"[Adventure, Fantasy, Action]",7.861927
351,Forrest Gump,1994,8147,8,48.307194,"[Comedy, Drama, Romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.423537,"[Adventure, Fantasy, Action]",7.851924


In [18]:
s = md.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
print(s.head(10))

gen_md = md.drop('genres', axis=1).join(s)
print(gen_md.head(10))

0    Animation
0       Comedy
0       Family
1    Adventure
1      Fantasy
1       Family
2      Romance
2       Comedy
3       Comedy
3        Drama
Name: genre, dtype: object
   adult                              belongs_to_collection    budget  \
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1  False                                                NaN  65000000   
1  False                                                NaN  65000000   
1  False                                                NaN  65000000   
2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
3  False                                                NaN  16000000   
3  False                                                NaN  16000000   

                   

In [19]:
def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title','year','vote_count','vote_average','popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')

    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)

    return qualified

In [20]:
build_chart('Romance').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457024,8.565285
351,Forrest Gump,1994,8147,8,48.307194,7.971357
876,Vertigo,1958,1162,8,18.20822,7.811667
40251,Your Name.,2016,1030,8,34.461252,7.789489
883,Some Like It Hot,1959,835,8,11.845107,7.745154
1132,Cinema Paradiso,1988,834,8,14.177005,7.744878
19901,Paperman,2012,734,8,7.198633,7.713951
37863,Sing Street,2016,669,8,10.672862,7.689483
882,The Apartment,1960,498,8,11.994281,7.599317
38718,The Handmaiden,2016,453,8,16.727405,7.566166


# 콘텐츠 기반 추천(Content Based Recommender)

특정 측정 항목을 기반으로 영화 간 유사성을 계산하고 사용자가 좋아하는 특정 영화와 가장 유사한 영화를 제안하는 엔진을 구축

영화 개요 및 태그 라인, 영화 출연진, 제작진, 키워드 및 장르를 기반으로 두 가지 콘텐츠 기반 추천 시스템을 구축

In [21]:
links_small = pd.read_csv('/content/drive/MyDrive/input/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')
links_small.head()

Unnamed: 0,tmdbId
0,862
1,8844
2,15602
3,31357
4,11862


In [22]:
# Drop a row by index : 19730, 29503, 33587 행은 이상한 데이터들(md.iloc[19730], md.iloc[29503], md.iloc[33587])
md = md.drop([19730, 29503, 35587])

In [23]:
md['id'] = md['id'].astype('int')

In [24]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9099, 25)

In [25]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

smd['description'].head()

Unnamed: 0,description
0,"Led by Woody, Andy's toys live happily in his ..."
1,When siblings Judy and Peter discover an encha...
2,A family wedding reignites the ancient feud be...
3,"Cheated on, mistreated and stepped on, the wom..."
4,Just when George Banks has recovered from his ...


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# n-그램:단어장 생성에 사용할 토큰의 크기를 결정한다. 모노그램(1-그램)은 토큰 하나만 단어로 사용하며 바이그램(2-그램)은 두 개의 연결된 토큰을 하나의 단어로 사용한다.
# Stop Words:문서에서 단어장을 생성할 때 무시할 수 있는 단어를 말한다. 보통 영어의 관사나 접속사, 한국어의 조사 등이 여기에 해당한다. stop_words 인수로 조절할 수 있다.

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0.0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [27]:
print(tfidf_matrix[10])

  (0, 263896)	0.10019469108879508
  (0, 148481)	0.04868409542320117
  (0, 259711)	0.10755393398304396
  (0, 185549)	0.09065134912997763
  (0, 10484)	0.11361377972812747
  (0, 213976)	0.2692536716008833
  (0, 184517)	0.16690994746509125
  (0, 154477)	0.0664650153044035
  (0, 255905)	0.1431510425493934
  (0, 51806)	0.14118434571637972
  (0, 232767)	0.12636535034540433
  (0, 73227)	0.11778360077328379
  (0, 254988)	0.12212218310011227
  (0, 256936)	0.10099642406710589
  (0, 142127)	0.14118434571637972
  (0, 15707)	0.08551722235900833
  (0, 51582)	0.1375754875898496
  (0, 221296)	0.13213378623513558
  (0, 260302)	0.08551722235900833
  (0, 204257)	0.12997420847193447
  (0, 59409)	0.14583697304488694
  (0, 12513)	0.1375754875898496
  (0, 192601)	0.13213378623513558
  (0, 237762)	0.0807357273414165
  (0, 259727)	0.15239448296082497
  :	:
  (0, 10509)	0.15239448296082497
  (0, 213982)	0.15239448296082497
  (0, 264425)	0.13213378623513558
  (0, 184597)	0.14583697304488694
  (0, 154810)	0.145836

In [28]:
tfidf_matrix.shape

(9099, 268124)

In [29]:
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [30]:
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [31]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

print(titles.head(), indices.head())

0                      Toy Story
1                        Jumanji
2               Grumpier Old Men
3              Waiting to Exhale
4    Father of the Bride Part II
Name: title, dtype: object title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64


In [32]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [33]:
get_recommendations('The Godfather').head(10)

Unnamed: 0,title
973,The Godfather: Part II
8387,The Family
3509,Made
4196,Johnny Dangerously
29,Shanghai Triad
5667,Fury
2412,American Movie
1582,The Godfather: Part III
4221,8 Women
2159,Summer of Sam


In [34]:
get_recommendations('Inception').head(10)

Unnamed: 0,title
5239,Cypher
141,Crumb
6398,Renaissance
653,Lone Star
1703,House
4739,The Pink Panther
319,Cobb
2828,What Ever Happened to Baby Jane?
8867,Pitch Perfect 2
979,Once Upon a Time in America


In [35]:
credits = pd.read_csv('/content/drive/MyDrive/input/credits.csv')
keywords = pd.read_csv('/content/drive/MyDrive/input/keywords.csv')

In [36]:
credits['crew'][0]

'[{\'credit_id\': \'52fe4284c3a36847f8024f49\', \'department\': \'Directing\', \'gender\': 2, \'id\': 7879, \'job\': \'Director\', \'name\': \'John Lasseter\', \'profile_path\': \'/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f4f\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12891, \'job\': \'Screenplay\', \'name\': \'Joss Whedon\', \'profile_path\': \'/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f55\', \'department\': \'Writing\', \'gender\': 2, \'id\': 7, \'job\': \'Screenplay\', \'name\': \'Andrew Stanton\', \'profile_path\': \'/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f5b\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12892, \'job\': \'Screenplay\', \'name\': \'Joel Cohen\', \'profile_path\': \'/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f61\', \'department\': \'Writing\', \'gender\': 0, \'id\': 12893, \'job\': \'Screenplay\', \'name\': \'A

In [37]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [38]:
md.shape

(45463, 25)

In [39]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')
md.shape

(46628, 28)

In [40]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9219, 28)

In [41]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [42]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [43]:
smd['director'] = smd['crew'].apply(get_director)

In [44]:
# 출연진 중 상위에 노출되는 3명만 추출
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >= 3 else x)

In [45]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [46]:
# 출연진의 이름에서 공백 삭제
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [47]:
# 감독의 이름에서 공백 삭제 및 가중치 3배
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x, x, x])

In [48]:
s = smd.apply(lambda x: pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [49]:
s = s.value_counts()
s[:5]

Unnamed: 0_level_0,count
keyword,Unnamed: 1_level_1
independent film,610
woman director,550
murder,399
duringcreditsstinger,327
based on novel,318


In [50]:
# 2번 이상 등장한 키워드만 추출
s = s[s > 1]

In [51]:
from nltk.stem.snowball import SnowballStemmer

# 어근 추출을 통해 동일 의미&다른 형태의 단어(dogs&dog, imaging&image 등)를 동일한 단어로 인식
stemmer = SnowballStemmer('english')
print("dogs의 어근 : ", stemmer.stem('dogs'))
print("dog의 어근 : ", stemmer.stem('dog'))

dogs의 어근 :  dog
dog의 어근 :  dog


In [52]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [53]:
# 키워드의 어근을 찾아서 공백 제거 후 세팅
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [54]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [55]:
count = CountVectorizer(analyzer='word', ngram_range=(1,2), min_df=0.0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [56]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [57]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [58]:
get_recommendations('The Dark Knight').head(10)

Unnamed: 0,title
7991,The Dark Knight Rises
6186,Batman Begins
6587,The Prestige
2077,Following
7608,Inception
4125,Insomnia
3373,Memento
8573,Interstellar
7619,Batman: Under the Red Hood
1122,Batman Returns


In [59]:
get_recommendations('Mean Girls').head(10)

Unnamed: 0,title
3311,Head Over Heels
4735,Freaky Friday
1321,The House of Yes
6245,Just Like Heaven
7865,Mr. Popper's Penguins
7292,Ghosts of Girlfriends Past
6923,The Spiderwick Chronicles
8855,The DUFF
6662,It's a Boy Girl Thing
7333,"I Love You, Beth Cooper"


In [60]:
def improved_recommendations(title):
    print(title)
    idx = indices[title]
    print(idx)
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    print(movie_indices)

    movies = smd.iloc[movie_indices][['title','vote_count','vote_average','year']]
#     print(movies)

    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull())]
#     print(qualified)
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    print(qualified)
    return qualified

In [61]:
improved_recommendations('The Dark Knight')

The Dark Knight
6945
[7991, 6186, 6587, 2077, 7608, 4125, 3373, 8573, 7619, 1122, 8899, 5907, 1252, 9004, 4005, 5773, 7322, 7517, 7538, 7961, 2746, 132, 2123, 2440, 5070]
                                   title  vote_count  vote_average  year  \
7608                           Inception       14075           8.1  2010   
8573                        Interstellar       11187           8.1  2014   
3373                             Memento        4168           8.1  2000   
6587                        The Prestige        4510           8.0  2006   
7991               The Dark Knight Rises        9263           7.6  2012   
6186                       Batman Begins        7511           7.5  2005   
1122                      Batman Returns        1706           6.6  1992   
9004  Batman v Superman: Dawn of Justice        7189           5.7  2016   
132                       Batman Forever        1529           5.2  1995   
1252                      Batman & Robin        1447           4.2  1

Unnamed: 0,title,vote_count,vote_average,year,wr
7608,Inception,14075,8.1,2010,8.014597
8573,Interstellar,11187,8.1,2014,7.993373
3373,Memento,4168,8.1,2000,7.830744
6587,The Prestige,4510,8.0,2006,7.758148
7991,The Dark Knight Rises,9263,7.6,2012,7.494595
6186,Batman Begins,7511,7.5,2005,7.376814
1122,Batman Returns,1706,6.6,1992,6.32518
9004,Batman v Superman: Dawn of Justice,7189,5.7,2016,5.67409
132,Batman Forever,1529,5.2,1995,5.209926
1252,Batman & Robin,1447,4.2,1997,4.441087


In [62]:
improved_recommendations('Mean Girls')

Mean Girls
5179
[3311, 4735, 1321, 6245, 7865, 7292, 6923, 8855, 6662, 7333, 3696, 7450, 5510, 5135, 5064, 1539, 1997, 8812, 5124, 7048, 7392, 7652, 4968, 6417, 390]
                                        title  vote_count  vote_average  year  \
1539                       The Breakfast Club        2189           7.8  1985   
390                        Dazed and Confused         588           7.4  1993   
8855                                 The DUFF        1372           6.8  2015   
3696                     The Princess Diaries        1063           6.5  2001   
6245                         Just Like Heaven         595           6.5  2005   
6923                The Spiderwick Chronicles         593           6.3  2008   
4735                            Freaky Friday         919           6.0  2003   
7865                    Mr. Popper's Penguins         775           5.7  2011   
7292               Ghosts of Girlfriends Past         716           5.6  2009   
7450  American Pie Prese

Unnamed: 0,title,vote_count,vote_average,year,wr
1539,The Breakfast Club,2189,7.8,1985,7.377234
390,Dazed and Confused,588,7.4,1993,6.484819
8855,The DUFF,1372,6.8,2015,6.426293
3696,The Princess Diaries,1063,6.5,2001,6.136129
6245,Just Like Heaven,595,6.5,2005,5.970637
6923,The Spiderwick Chronicles,593,6.3,2008,5.854124
4735,Freaky Friday,919,6.0,2003,5.757786
7865,Mr. Popper's Penguins,775,5.7,2011,5.53663
7292,Ghosts of Girlfriends Past,716,5.6,2009,5.465987
7450,American Pie Presents: The Book of Love,454,5.1,2009,5.170817


# 협업 필터링(Collaborative Filtering)

In [69]:
# surprise 라이브러리 다운로드
!pip install scikit-surprise



In [None]:
# surprise 라이브러리의 Reader
from surprise import Reader, Dataset, SVD, accuracy

reader = Reader()

In [70]:
ratings = pd.read_csv('/content/drive/MyDrive/input/ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [71]:
data = Dataset.load_from_df(ratings[['userId', 'movieId','rating']], reader)
# data.split(n_folds=5)

trainset = data.build_full_trainset()
testset = trainset.build_testset()

In [72]:
svd = SVD()
# evaluate(svd, data, measures=['RMSE', 'MAE'])

####### 기존 커널대로 진행하면 오류나서 수정 #######
svd.fit(trainset)
predictions = svd.test(testset)
accuracy.rmse(predictions)

RMSE: 0.6481


0.6481171063920719

In [73]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [74]:
svd.predict(1, 302, 3)

Prediction(uid=1, iid=302, r_ui=3, est=2.5147457157691147, details={'was_impossible': False})

# 하이브리드 추천 시스템(Hybrid Recommender)

In [75]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [76]:
id_map = pd.read_csv('/content/drive/MyDrive/input/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')

In [77]:
indices_map = id_map.set_index('id')

In [78]:
def hybrid(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    movie_id = id_map.loc[title]['movieId']

    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]

    movies = smd.iloc[movie_indices][['title','vote_count','vote_average','year','id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [79]:
hybrid(1, 'Avatar')

Unnamed: 0,title,vote_count,vote_average,year,id,est
999,The Terminator,4208.0,7.4,1984,218,3.303641
522,Terminator 2: Judgment Day,4274.0,7.7,1991,280,2.994195
962,Aliens,3282.0,7.7,1986,679,2.990003
8357,Star Trek Into Darkness,4479.0,7.4,2013,54138,2.928693
8622,X-Men: Days of Future Past,6155.0,7.5,2014,127585,2.874363
1613,Darby O'Gill and the Little People,35.0,6.7,1959,18887,2.868087
2006,Fantastic Planet,140.0,7.6,1973,16306,2.819425
1368,Titanic,7770.0,7.5,1997,597,2.815868
7052,Star Wars: The Clone Wars,434.0,5.8,2008,12180,2.792132
1660,Return from Witch Mountain,38.0,5.6,1978,14822,2.726955


In [80]:
hybrid(500, 'Avatar')

Unnamed: 0,title,vote_count,vote_average,year,id,est
522,Terminator 2: Judgment Day,4274.0,7.7,1991,280,3.434733
999,The Terminator,4208.0,7.4,1984,218,3.337681
344,True Lies,1138.0,6.8,1994,36955,3.285641
962,Aliens,3282.0,7.7,1986,679,3.173475
910,The Abyss,822.0,7.1,1989,2756,3.163563
2006,Fantastic Planet,140.0,7.6,1973,16306,3.123085
8357,Star Trek Into Darkness,4479.0,7.4,2013,54138,3.108374
8622,X-Men: Days of Future Past,6155.0,7.5,2014,127585,3.072695
1613,Darby O'Gill and the Little People,35.0,6.7,1959,18887,3.059499
4001,Hawk the Slayer,13.0,4.5,1980,25628,3.059391


# 끝