In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval

In [2]:
data = pd.read_csv('tmdb_5000_movies.csv')

In [3]:
data = data[['id', 'genres', 'keywords', 'title']]
data.head(2)

Unnamed: 0,id,genres,keywords,title
0,19995,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar
1,285,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",Pirates of the Caribbean: At World's End


In [4]:
data['genres'] = data['genres'].apply(literal_eval)
data['keywords'] = data['keywords'].apply(literal_eval)
data.head(2)

Unnamed: 0,id,genres,keywords,title
0,19995,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[{'id': 1463, 'name': 'culture clash'}, {'id':...",Avatar
1,285,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 270, 'name': 'ocean'}, {'id': 726, 'na...",Pirates of the Caribbean: At World's End


In [5]:
data['genres'] = data['genres'].apply(lambda x : [d['name'] for d in x]).apply(lambda x : " ".join(x))
data['keywords'] = data['keywords'].apply(lambda x : [d['name'] for d in x]).apply(lambda x : " ".join(x))
data.head(2)

Unnamed: 0,id,genres,keywords,title
0,19995,Action Adventure Fantasy Science Fiction,culture clash future space war space colony so...,Avatar
1,285,Adventure Fantasy Action,ocean drug abuse exotic island east india trad...,Pirates of the Caribbean: At World's End


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
c_vector_genres = cv.fit_transform(data['genres'])

In [7]:
print(f'CountVectorizer가 찾은 장르 갯수: {len(cv.get_feature_names())}')

CountVectorizer가 찾은 장르 갯수: 22


In [16]:
cv.get_feature_names()

['action',
 'adventure',
 'animation',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'family',
 'fantasy',
 'fiction',
 'foreign',
 'history',
 'horror',
 'movie',
 'music',
 'mystery',
 'romance',
 'science',
 'thriller',
 'tv',
 'war',
 'western']

In [8]:
c_vector_genres.toarray()

array([[1, 1, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [9]:
# 4803개의 영화, 22개의 장르
c_vector_genres.shape

(4803, 22)

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
similartiy_matrix = cosine_similarity(c_vector_genres, c_vector_genres)

In [11]:
similartiy_matrix

array([[1.        , 0.77459667, 0.51639778, ..., 0.        , 0.        ,
        0.        ],
       [0.77459667, 1.        , 0.66666667, ..., 0.        , 0.        ,
        0.        ],
       [0.51639778, 0.66666667, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [12]:
similartiy_matrix.shape

(4803, 4803)

In [13]:
titles = data['title']
indices = pd.Series(data.index, index=data['title'])

def get_recommend_movie_list(title, top=30):
    index = indices[title]
    sim_scores = list(enumerate(similartiy_matrix[index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top+1]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [14]:
get_recommend_movie_list('Pirates of the Caribbean: At World\'s End',2 )

5                          Spider-Man 3
9    Batman v Superman: Dawn of Justice
Name: title, dtype: object

In [15]:
# 사용자에게 영화 리뷰를 입력받고 추천 리스트를 출력해주는 함수
def recommend():
    

SyntaxError: unexpected EOF while parsing (<ipython-input-15-3ff715fddfb5>, line 3)