In [None]:
# 영화의 overview를 TF-IDF를 통해 벡터화 시킨후
# 선택한 영화와 유사도가 높은 영화를 추천하는 프로그램

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
import os
print(os.listdir("C:/Users/chica/OneDrive/바탕 화면/it관련/data/movies/"))

['movies_metadata.csv', 'ratings.csv']


In [21]:
path  = 'C:/Users/chica/OneDrive/바탕 화면/it관련/data/movies/'

In [22]:
data = pd.read_csv(path + 'movies_metadata.csv', low_memory=False)
data.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [23]:
# columns 확인, overview 항목 추출
data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [None]:
# data 전처리
# overview에 결측피가 있는 항목 전부 제거

In [24]:
# null이 아닌 것들
data['overview'].notnull()

0        True
1        True
2        True
3        True
4        True
         ... 
45461    True
45462    True
45463    True
45464    True
45465    True
Name: overview, Length: 45466, dtype: bool

In [26]:
data = data[data['overview'].notnull()].reset_index(drop=True) # null 제외한 data의 index 재설정
data.shape

(44512, 24)

In [28]:
# 44512개의 data 중 20001개만 사용
data = data.loc[0:20000].reset_index(drop=True)

In [29]:
# 불용어: 유의미하지 않은 단어 토큰을 제거
tfidf = TfidfVectorizer(stop_words='english')

# overview에 대해서 tf-idf 수행
tfidf_matrix =tfidf.fit_transform(data['overview'])
print(tfidf_matrix.shape)

(20001, 47665)


In [None]:
# 20001개의 문서에서 단어 토큰 47665개가 생성

In [30]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_matrix

array([[1.        , 0.015775  , 0.        , ..., 0.00826973, 0.01721372,
        0.        ],
       [0.015775  , 1.        , 0.04921281, ..., 0.005673  , 0.00799   ,
        0.        ],
       [0.        , 0.04921281, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.00826973, 0.005673  , 0.        , ..., 1.        , 0.01443662,
        0.        ],
       [0.01721372, 0.00799   , 0.        , ..., 0.01443662, 1.        ,
        0.01828417],
       [0.        , 0.        , 0.        , ..., 0.        , 0.01828417,
        1.        ]])

In [31]:
# 20001개의 영화 overview 간의 유사도
cosine_matrix.shape

(20001, 20001)

In [None]:
# cosine_matrix의 인덱스는 단순 인덱스
# 자료의 순서를 의미할 뿐 영화의 id가 아님
# 자료해석의 용이함 위해 수정해 줄 필요가 있음

In [32]:
# movie title과 id를 매핑할 dictionary를 생성
movie2id = {}
for i, c in enumerate(data['title']):
    movie2id[i] = c

# id와 movie title을 매핑할 dictionary를 생성
id2moive = {}
for i, c in movie2id.items():
    id2moive[c] = i

In [33]:
movie2id

{0: 'Toy Story',
 1: 'Jumanji',
 2: 'Grumpier Old Men',
 3: 'Waiting to Exhale',
 4: 'Father of the Bride Part II',
 5: 'Heat',
 6: 'Sabrina',
 7: 'Tom and Huck',
 8: 'Sudden Death',
 9: 'GoldenEye',
 10: 'The American President',
 11: 'Dracula: Dead and Loving It',
 12: 'Balto',
 13: 'Nixon',
 14: 'Cutthroat Island',
 15: 'Casino',
 16: 'Sense and Sensibility',
 17: 'Four Rooms',
 18: 'Ace Ventura: When Nature Calls',
 19: 'Money Train',
 20: 'Get Shorty',
 21: 'Copycat',
 22: 'Assassins',
 23: 'Powder',
 24: 'Leaving Las Vegas',
 25: 'Othello',
 26: 'Now and Then',
 27: 'Persuasion',
 28: 'The City of Lost Children',
 29: 'Shanghai Triad',
 30: 'Dangerous Minds',
 31: 'Twelve Monkeys',
 32: 'Babe',
 33: 'Carrington',
 34: 'Dead Man Walking',
 35: 'Across the Sea of Time',
 36: 'It Takes Two',
 37: 'Clueless',
 38: 'Cry, the Beloved Country',
 39: 'Richard III',
 40: 'Dead Presidents',
 41: 'Restoration',
 42: 'Mortal Kombat',
 43: 'To Die For',
 44: 'How To Make An American Quilt',
 

In [36]:
# Toy Story를 추출했을때 유사도 확인
idx = id2moive['Toy Story']
idx

0

In [37]:
cosine_matrix[0]

array([1.        , 0.015775  , 0.        , ..., 0.00826973, 0.01721372,
       0.        ])

In [39]:
sim_scores = [(i,c) for i, c in enumerate(cosine_matrix[idx]) if i != idx] # 자기 자신을 제외한 영화들의 유사도 및 인덱스를 추출
sim_scores

[(1, 0.01577499623706559),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.041138683296865486),
 (18, 0.0),
 (19, 0.0),
 (20, 0.0099121496903153),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.01978034381431984),
 (33, 0.0),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.006321775635368981),
 (42, 0.0),
 (43, 0.0),
 (44, 0.009292791126667362),
 (45, 0.0),
 (46, 0.0),
 (47, 0.0),
 (48, 0.013838678611953216),
 (49, 0.009852367947354567),
 (50, 0.010928162091485132),
 (51, 0.0),
 (52, 0.0),
 (53, 0.02000467244181858),
 (54, 0.0),
 (55, 0.025263801435198463),
 (56, 0.02072192444202655),
 (57, 0.0),
 (58, 0.03420184247473588),
 (59, 0.0),
 (60, 0.0),
 (61, 0.00860353886947865),
 (62, 0.0),
 (63, 0.01019819462957017),
 (64, 0

In [41]:
# 유사도가 높은 순서로 정렬
sim_scores = sorted(sim_scores, key= lambda x: x[1],reverse=True)
sim_scores[0:10] # 상위 10개

[(15282, 0.5262275451171008),
 (2979, 0.463276799830381),
 (10271, 0.2797390476075632),
 (8303, 0.20078538664316947),
 (1058, 0.18287334034120212),
 (11367, 0.15712074193481165),
 (1916, 0.15288512626542436),
 (3039, 0.1433450408051554),
 (483, 0.13765225108436677),
 (11573, 0.1337032693869044)]

In [42]:
# id를 title로 변환
sim_scores = [(movie2id[i], score) for i , score in sim_scores[0:10]]
sim_scores

[('Toy Story 3', 0.5262275451171008),
 ('Toy Story 2', 0.463276799830381),
 ('The 40 Year Old Virgin', 0.2797390476075632),
 ('The Champ', 0.20078538664316947),
 ('Rebel Without a Cause', 0.18287334034120212),
 ('For Your Consideration', 0.15712074193481165),
 ('Condorman', 0.15288512626542436),
 ('Man on the Moon', 0.1433450408051554),
 ('Malice', 0.13765225108436677),
 ('Factory Girl', 0.1337032693869044)]