# 유사도를 이용한 추천 시스템 구현하기

- kaggle data : https://www.kaggle.com/rounakbanik/the-movies-dataset?select=movies_metadata.csv
- TF-IDF와 코사인 유사도

In [113]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Load the Kaggle movies metadata CSV and preview the first two rows.
# low_memory=False makes pandas parse the file in one pass instead of in chunks,
# which avoids mixed-dtype inference on columns with irregular values.
data = pd.read_csv('movies_metadata.csv', low_memory=False)
data.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [114]:
data[['original_title','overview']][:2]

Unnamed: 0,original_title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...


In [115]:
# Same two columns as the previous cell, selected by position
# (column 8 is original_title, column 9 is overview in this dataset).
data.iloc[:2,[8,9]]

Unnamed: 0,original_title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...


In [116]:
data.shape

(45466, 24)

In [117]:
# data = data.head(20000)

In [118]:
data.overview[0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

- TF-IDF를 연산할 때 데이터에 Null 값이 들어있으면 에러가 발생

In [119]:
data.isna().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [120]:
# Count every missing value in the overview column and print the total
print('overview 열의 결측값의 수:',data['overview'].isnull().sum())

overview 열의 결측값의 수: 954


In [121]:
# count() leaves NaN values out of the tally
data.overview.count()

44512

In [122]:
data.overview.size

45466

In [123]:
data['overview'].notnull().sum()

44512

In [124]:
# Preprocessing:
# discard every row whose overview is missing, then renumber the rows from 0
data = data.dropna(subset=['overview']).reset_index(drop=True)
data.shape

(44512, 24)

In [125]:
# Work on a copy of the overview column
data_overview = data['overview'].copy()

In [126]:
# Replace missing values with empty strings
# NOTE(review): rows with a null overview were already filtered out earlier in
# this notebook, so this looks like a no-op safeguard — confirm before removing.
data['overview'] = data_overview.fillna('')

In [127]:
data.shape

(44512, 24)

## TF-IDF(Term Frequency-Inverse Document Frequency)

TF-IDF(Term Frequency-Inverse Document Frequency)는 단어의 빈도와 역 문서 빈도(문서의 빈도에 특정 식을 취함)를 사용하여 DTM 내의 각 단어들마다 중요한 정도를 가중치로 주는 방법이다. (참고: 단순 빈도 기반의 DTM은 `CountVectorizer()`로 만들 수 있다.)

In [128]:
# Fit a TF-IDF vectorizer on the overviews and print the resulting weights.
# NOTE(review): .toarray() turns the sparse result into a dense array with one
# row per overview and one column per vocabulary term; on the full dataset that
# is presumably tens of GB — printing the shape or a small slice would be cheaper.
tfidfv = TfidfVectorizer().fit(data['overview'])
print(tfidfv.transform(data['overview']).toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [129]:
# vocabulary_ maps each learned term to its column index in the TF-IDF matrix;
# print the column assigned to the term 'woody' (None if the term is absent).
tfidfv_dic = tfidfv.vocabulary_
print(f"값 : {tfidfv_dic.get('woody')}")

값 : 73767


In [130]:
# Build the TF-IDF matrix for the overview column (English stop words removed)
# and print its shape: one row per movie, one column per vocabulary term.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['overview'])
print('TF-IDF 행렬의 크기(shape) :',tfidf_matrix.shape)

TF-IDF 행렬의 크기(shape) : (44512, 75827)


In [131]:
# NOTE(review): this densifies the whole (44512, 75827) sparse matrix — roughly
# 27 GB assuming float64. A slice such as tfidf_matrix[:5].toarray() would show
# the same structure at a fraction of the memory.
tfidf_matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## 코사인 유사도(Cosine Similarity)

In [132]:
# Cosine similarity between every pair of overview TF-IDF vectors.
# NOTE(review): the result is a dense 44512 x 44512 array (about 16 GB assuming
# float64) — confirm it fits in memory before running on the full dataset.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print('코사인 유사도 연산 결과 :',cosine_sim.shape)

코사인 유사도 연산 결과 : (44512, 44512)


In [133]:
import numpy as np
np.round(cosine_sim,5)

array([[1.     , 0.01502, 0.     , ..., 0.     , 0.00593, 0.     ],
       [0.01502, 1.     , 0.0468 , ..., 0.     , 0.02196, 0.00924],
       [0.     , 0.0468 , 1.     , ..., 0.     , 0.01401, 0.     ],
       ...,
       [0.     , 0.     , 0.     , ..., 1.     , 0.     , 0.     ],
       [0.00593, 0.02196, 0.01401, ..., 0.     , 1.     , 0.     ],
       [0.     , 0.00924, 0.     , ..., 0.     , 0.     , 1.     ]])

In [134]:
# Lookup table between row positions and movie titles.
# Despite its name, movie2id maps row position -> title.
movie2id = dict(enumerate(data['title']))

In [135]:
# Inverse lookup: despite its name, id2movie maps title -> row position.
# When a title occurs more than once, the later row position overwrites the earlier one.
id2movie = {title: pos for pos, title in movie2id.items()}

In [None]:
movie2id.items()

In [None]:
movie2id.values()

In [138]:
# Look up the row position of Toy Story
idx = id2movie['Toy Story'] # Toy Story is at index 0

In [139]:
# Pair every other movie's row position with its similarity to the query movie,
# leaving out the query movie itself.
sim_scores = [pair for pair in enumerate(cosine_sim[idx]) if pair[0] != idx]

In [140]:
# Order the candidates from most to least similar (stable sort, done in place)
sim_scores.sort(key=lambda pair: pair[1], reverse=True)

In [141]:
# Show the 10 highest-scoring (row position, similarity) pairs
sim_scores[0:10]

[(15282, 0.5321733978946077),
 (2979, 0.47214559370670484),
 (10271, 0.274962516260823),
 (24316, 0.27322653023092314),
 (23646, 0.23543946958082806),
 (28893, 0.22397858775140161),
 (42572, 0.21761842522811847),
 (37778, 0.2159367770908928),
 (41893, 0.20190977282766223),
 (8303, 0.19868494439439036)]

In [142]:
# Swap each row position in the top 10 for the movie title stored under it.
# sim_scores is rebound to (title, similarity) pairs, so this cell is meant to
# run once per freshly sorted list.
top_ten = sim_scores[:10]
sim_scores = [(movie2id[pos], sim) for pos, sim in top_ten]
sim_scores

[('Toy Story 3', 0.5321733978946077),
 ('Toy Story 2', 0.47214559370670484),
 ('The 40 Year Old Virgin', 0.274962516260823),
 ('Small Fry', 0.27322653023092314),
 ("Andy Hardy's Blonde Trouble", 0.23543946958082806),
 ('Hot Splash', 0.22397858775140161),
 ('Andy Kaufman Plays Carnegie Hall', 0.21761842522811847),
 ('Superstar: The Life and Times of Andy Warhol', 0.2159367770908928),
 ('Andy Peters: Exclamation Mark Question Point', 0.20190977282766223),
 ('The Champ', 0.19868494439439036)]

In [143]:
# Map each title to its row label; a title that occurs more than once keeps the last row.
title_to_index = {title: row for title, row in zip(data['title'], data.index)}

# Row index of the movie titled 'Father of the Bride Part II'
idx = title_to_index['Father of the Bride Part II']
print(idx)

4


In [144]:
# title_to_index
title_to_index.get('Father of the Bride Part II')

4

In [145]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to *title*.

    Args:
        title: Movie title to look up in ``title_to_index``.
        cosine_sim: Similarity matrix indexed by row position; row ``i`` holds
            the similarity of movie ``i`` to every movie. Defaults to the
            module-level matrix.
        top_n: Number of recommendations to return (default 10).

    Returns:
        The ``data['title']`` entries of the ``top_n`` most similar movies,
        most similar first. The queried movie itself is never included.

    Raises:
        KeyError: If *title* is not a key of ``title_to_index``.
    """
    # Row position of the requested movie.
    query_idx = title_to_index[title]

    # Exclude the query by position instead of dropping whatever lands at
    # rank 0: a movie with an identical overview (similarity 1.0) or a query
    # whose vector is all zeros would otherwise let the query itself appear
    # in its own recommendations.
    candidates = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[query_idx])
        if pos != query_idx
    ]

    # Most similar first; the stable sort keeps row order among ties.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the best matches.
    movie_indices = [pos for pos, _ in candidates[:top_n]]

    return data['title'].iloc[movie_indices]

In [146]:
get_recommendations('Toy Story')

15282                                     Toy Story 3
2979                                      Toy Story 2
10271                          The 40 Year Old Virgin
24316                                       Small Fry
23646                     Andy Hardy's Blonde Trouble
28893                                      Hot Splash
42572                Andy Kaufman Plays Carnegie Hall
37778    Superstar: The Life and Times of Andy Warhol
41893    Andy Peters: Exclamation Mark Question Point
8303                                        The Champ
Name: title, dtype: object