In [1]:
import pandas as pd

# 1. Data

In [53]:
path = "datas/tmdb_5000/tmdb_5000_movies.csv"

df = pd.read_csv(path)
crow, ccol = df.shape
print("[count] rows : {}, columns : {}".format(crow, ccol))
df.head(1)

[count] rows : 4803, columns : 20


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


# 2. Data Preprocessing

## 1. Column for using

In [54]:
cols = ['id', 'genres', 'vote_average', 'vote_count', 'popularity', 'title', 'keywords', 'overview']
data = df[cols]

crow, ccol = data.shape
print("[count] rows : {}, columns : {}".format(crow, ccol))
data.head(1)

[count] rows : 4803, columns : 8


Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,keywords,overview
0,19995,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",7.2,11800,150.437577,Avatar,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di..."


## 2. Vote Preprocessing
- 평균평점이 불공정하게 처리되어 있음.
- weight rating으로 평점을 다시 처리해주는 작업

In [55]:
# 500위 안에 데이터가 들어와야 할 때의 vote_count
m = data['vote_count'].quantile(0.9)
print("vote_count : {}".format(m))

data = data.loc[data['vote_count'] >= m]
crow, ccol = data.shape
print("[count] rows : {}, columns : {}".format(crow, ccol))
data.head(1)

vote_count : 1838.4000000000015
[count] rows : 481, columns : 8


Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,keywords,overview
0,19995,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",7.2,11800,150.437577,Avatar,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di..."


In [56]:
c = data['vote_average'].mean()
print("vote_average : {}".format(c))

vote_average : 6.9629937629937615


In [57]:
def weight_rating(x, m=m, c=c):
    v = x['vote_count']
    r = x['vote_average']
    
    return (v / (v + m) * r) + (m / (v + m) * c)

In [58]:
data['score'] = data.apply(weight_rating, axis=1)
crow, ccol = data.shape
print("[count] rows : {}, columns : {}".format(crow, ccol))
data.head(1)

[count] rows : 481, columns : 9


Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,keywords,overview,score
0,19995,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",7.2,11800,150.437577,Avatar,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",7.168053


## 3. Column Extraction

In [59]:
import ast
# ast.literal_eval : 문자열로 표현된 실제 식으로 표현해주는 기능
exam = ast.literal_eval("{'a': 1234, 'b': 'hello'}")
print(exam)

{'a': 1234, 'b': 'hello'}


In [60]:
ex_col = ['genres', 'keywords']

for c in ex_col:
    data[c] = data[c].apply(ast.literal_eval)
    
# 문자열로 정의된 dict를 실제 dict로 바꾸는 작업
data[ex_col].head(1)

Unnamed: 0,genres,keywords
0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[{'id': 1463, 'name': 'culture clash'}, {'id':..."


In [62]:
name_parse = lambda x: [d['name'] for d in x]
str_join = lambda x: " ".join(x)

for c in ex_col:
    data[c] = data[c].apply(name_parse).apply(str_join)
    
data.head(1)

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,keywords,overview,score
0,19995,Action Adventure Fantasy Science Fiction,7.2,11800,150.437577,Avatar,culture clash future space war space colony so...,"In the 22nd century, a paraplegic Marine is di...",7.168053


# 3. Contents Based Filtering

## 1. Genre Vectorization

### 1. sklearn.feature_extraction.text.CountVectorizer 간단 소개

In [143]:
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range = (1,3)
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
    'The last document?',
]
vect = CountVectorizer()
# 1. 훈련 시키면 문서에 존재하는 각 각의 단어에 토큰을 부여한다.
vect.fit(corpus)
print(vect.vocabulary_)

# 2. 출현빈도 카운트, BoW(Bag of Words) Encoding Vector 반환
# This의 토큰(9)에 True(1) 처리, is의 토큰(3)에 True 처리, the의 토큰(7)에 True 처리, second의 토큰(6)에 True 처리
# document의 토큰(1)에 True(1) 처리
# [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
# [0, 1, 0, 1, 0, 0, 1, 1, 0, 1]
print(vect.transform(['This is the second document.']).toarray())
print(vect.transform(['i am th']).toarray())

{'this': 9, 'is': 3, 'the': 7, 'first': 2, 'document': 1, 'second': 6, 'and': 0, 'third': 8, 'one': 5, 'last': 4}
[[0 1 0 1 0 0 1 1 0 1]]
[[0 0 0 0 0 0 0 0 0 0]]


### 2. 예제 적용

In [144]:
vect = CountVectorizer(ngram_range=(1,3))

print("Input Value : {} ...".format(data['genres'].values[:2]))
vect.fit(data['genres'])
print("Genre word by Token : {} ...\n".format(str(vect.vocabulary_)[:80]))

# 데이터 적용, BoW(Bag of Words) Encoding Vector 변환
genre_vector = vect.transform(data['genres']).toarray()
print("title | encoding vector")
for idx, title in enumerate(data['title'][:2]):
    print(title, genre_vector[idx][:10])

Input Value : ['Action Adventure Fantasy Science Fiction' 'Adventure Fantasy Action'] ...
Genre word by Token : {'action': 0, 'adventure': 48, 'fantasy': 225, 'science': 326, 'fiction': 259, ' ...

title | encoding vector
Avatar [1 1 0 0 0 1 0 0 0 0]
Pirates of the Caribbean: At World's End [1 0 0 0 0 0 0 0 0 0]


## 2. Similarity Value Extraction

In [145]:
from sklearn.metrics.pairwise import cosine_similarity as cos

# argsort 는 각 각의 영화들의 모든 영화들과의 배열안에서 적용되며, 오름차순으로 정렬된다.
# cosine_similarity는 높을수록 유사도가 높은 것이기 때문에 ::-1로 모든 행에 대하여 reverse를 적용해준다.
# 각 각의 영화는 자신과 장르상으로 유사한 영화들의 인덱스를 순위로 가지고 있게 되는 것
sim_genre = cos(genre_vector, genre_vector).argsort()[:,::-1]
crow, ccol = sim_genre.shape
print("[count] rows : {}, columns : {}".format(crow, ccol))
sim_genre

[count] rows : 481, columns : 481


array([[  0,  13,  42, ..., 298, 297, 240],
       [ 11,   1, 200, ..., 329, 330, 240],
       [  2, 376, 216, ..., 314, 304, 240],
       ...,
       [478, 187,  12, ..., 326, 327,   0],
       [479, 466, 383, ..., 220, 224,   0],
       [480, 468, 294, ..., 246, 248,   0]])

## 3. Usage

In [152]:
def recommend_movie_list(df, movie_title, top=30):
    idx = data[data['title'] == title].index[0]
    
    # sim_idx : 위에서 구한 장르 유사도 데이터 이용
    sim_idx = sim_genre[idx][1:top + 1]
    
    return data.iloc[sim_idx]

In [187]:
# score 내림차순 정렬 수행
ran_movie = reco_data.sample().iloc[0]

print("영화 '{}'에 대한 추천 데이터".format(ran_movie['title']))
reco_data = recommend_movie_list(data, movie_title=ran_movie['title']).sort_values('score', ascending=False)
reco_data.head(3)

영화 '2012'에 대한 추천 데이터


Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,keywords,overview,score
85,100402,Action Adventure Science Fiction,7.6,5764,72.225265,Captain America: The Winter Soldier,washington d.c. future shield marvel comic sup...,After the cataclysmic events in New York with ...,7.44596
46,127585,Action Adventure Fantasy Science Fiction,7.5,6032,118.078691,X-Men: Days of Future Past,1970s mutant time travel marvel comic based on...,The ultimate X-Men ensemble fights a war for t...,7.374564
158,13475,Science Fiction Action Adventure,7.4,4518,73.616808,Star Trek,spacecraft teleportation space mission parachu...,The fate of the galaxy rests in the hands of b...,7.273609


## 4. 장르 빈도 확인

In [191]:
import numpy as np

features = vect.get_feature_names_out()
count = vect.transform(reco_data['genres']).toarray().sum(axis=0)

count_df = pd.DataFrame(np.column_stack([features, count]),columns=['genres (1,3)', 'count'])

print("영화 '{}'의 장르 데이터".format(ran_movie['title']))
print(ran_movie['genres'])
count_df.sort_values(['count'], ascending=False)[:10]

영화 '2012'의 장르 데이터
Action Adventure Science Fiction


Unnamed: 0,"genres (1,3)",count
48,adventure,30
0,action,29
1,action adventure,28
326,science,23
327,science fiction,23
259,fiction,23
225,fantasy,14
79,adventure fantasy,12
5,action adventure fantasy,10
88,adventure science,10
