# 장르/키워드 데이터 가공

kaggle의 The Movies Dataset을 사용한다.
참고 교재인 '파이썬 머신러닝 완벽 가이드'에서는 다른 데이터셋을 사용하기에 데이터 가공 과정은 약간 달리한다.

In [1]:
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings('ignore')

movies = pd.read_csv('movies_metadata.csv')
print(movies.shape)
movies.head(1)

#데이터셋 로드

(45466, 24)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


In [2]:
movies_df = movies[['id', 'title', 'genres', 'vote_average', 'vote_count', 'popularity', 'overview']] #필요한 요소만 뽑아오기

In [3]:
pd.set_option('max_colwidth', 100)
movies_df[['genres']][:1]

#데이터 한 줄 살펴보기 

Unnamed: 0,genres
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"


In [4]:
import json

with open('keywords.json', encoding='utf-8') as f:
  js = json.loads(f.read())
keywords_df = pd.DataFrame(js)

#json으로 미리 변환해둔 csv 파일을 열며 바로 데이터프레임으로 만들기

In [5]:
pd.set_option('max_colwidth', 100)
keywords_df[['keywords']][:1]

#마찬가지로 데이터 형식 한 줄 뽑아서 살펴보기

Unnamed: 0,keywords
0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id..."


In [6]:
movies_df.id = pd.to_numeric(movies_df.id, errors='coerce')
keywords_df.id = pd.to_numeric(keywords_df.id, errors='coerce')

#둘 다 id를 숫자형으로 바꾸기

movies_metadata.csv의 'genres' 항목과 keywords.csv(.json)의 'keywords' 항목을 한 데이터프레임에 합치려 한다. 이때 공통되는 id를 이용해 합쳐야 하므로 데이터 형식을 숫자형으로 맞추었다.

In [7]:
movies_df = pd.merge(movies_df, keywords_df, on = 'id', how = 'inner') #영화 데이터프레임과 키워드 데이터프레임을 id로 합치기

movies_df.head() #형식 확인

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,overview,keywords
0,862.0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]",7.7,5415.0,21.946943,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear o...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id..."
1,8844.0,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]",6.9,2413.0,17.015539,When siblings Judy and Peter discover an enchanted board game that opens the door to a magical w...,"[{'id': 10090, 'name': 'board game'}, {'id': 10941, 'name': 'disappearance'}, {'id': 15101, 'nam..."
2,15602.0,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]",6.5,92.0,11.7129,A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John...,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392, 'name': 'best friend'}, {'id': 179431, 'name': '..."
3,31357.0,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]",6.1,34.0,3.859495,"Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusi...","[{'id': 818, 'name': 'based on novel'}, {'id': 10131, 'name': 'interracial relationship'}, {'id'..."
4,11862.0,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",5.7,173.0,8.387519,"Just when George Banks has recovered from his daughter's wedding, he receives the news that she'...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'name': 'midlife crisis'}, {'id': 2246, 'name': 'con..."


여기서 볼 수 있듯, genre와 keyword 모두 [리스트] 안에 {'key': value 형식의 딕셔너리}형태로 존재한다. 우리는 여기서 value 값들만 따로 사용해야 한다.

In [8]:
from ast import literal_eval

movies_df['genres']=movies_df['genres'].apply(literal_eval)
movies_df['keywords']=keywords_df['keywords'].apply(literal_eval) #genres, keywords 칼럼의 문자열을 리스트 객체로 변환
movies_df[['genres', 'keywords']][:1]

Unnamed: 0,genres,keywords
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]","[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id..."


In [9]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46482 entries, 0 to 46481
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            46482 non-null  float64
 1   title         46478 non-null  object 
 2   genres        46482 non-null  object 
 3   vote_average  46478 non-null  float64
 4   vote_count    46478 non-null  float64
 5   popularity    46478 non-null  object 
 6   overview      45487 non-null  object 
 7   keywords      46419 non-null  object 
dtypes: float64(3), object(5)
memory usage: 3.2+ MB


In [10]:
movies_df['genres']=movies_df['genres'].apply(lambda x : [y['name'] for y in x]) #name 키에 해당하는 value 추출하여 리스트 객체로 변환

In [11]:
movies_df['keywords'].head()

0    [{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id...
1    [{'id': 10090, 'name': 'board game'}, {'id': 10941, 'name': 'disappearance'}, {'id': 15101, 'nam...
2    [{'id': 1495, 'name': 'fishing'}, {'id': 12392, 'name': 'best friend'}, {'id': 179431, 'name': '...
3    [{'id': 818, 'name': 'based on novel'}, {'id': 10131, 'name': 'interracial relationship'}, {'id'...
4    [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'name': 'midlife crisis'}, {'id': 2246, 'name': 'con...
Name: keywords, dtype: object

In [12]:
movies_df['genres'].head()

0     [Animation, Comedy, Family]
1    [Adventure, Fantasy, Family]
2               [Romance, Comedy]
3        [Comedy, Drama, Romance]
4                        [Comedy]
Name: genres, dtype: object

In [24]:
key_list=[]

lines = movies_df['keywords']

for i in lines: # 라인별로 읽어옴
    for j in i: # 내부에서 쉼표로 구분
        if 'name' in j: # 문자열에 name이 포함된 경우
            if ']' in j: # 마지막 라인의 경우
                key_list.append(j(iloc[10:-3]).values)
            else:
                key_list.append(j(iloc[10:-2]).values)

key_list[['keywords']][:1]

NameError: name 'iloc' is not defined

In [None]:
movies_df[['genres', 'keywords']][:1]

# 장르 콘텐츠 유사도 측정

In [None]:
from sklearn.feature_extraction.text import CounterVectorizer

movies_df['genres_literal'] = movies_df['genres'].apply(lambda x : (' ').join(x))
count_vect = CountVectorizer(min_df=0, ngram_range=(1,2))
genre_mat=count_vect.fit_transform(movies_df['genre_literal'])
print(genre_mat.shape)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim.shape)
print(genre_sim[:1])

In [None]:
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    title_movie = df[df['title']] == title_name]
    
    title_index = title_movie.index.values
    similar_indexes = sorted_ind[title_index, :(top_n)]
    
    print(similar_indexes)
    similar_indexes = similar_indexes.reshape(-1)
    
    return df.iloc[similar_indexes]

In [None]:
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather', 10)
similar_movies[['title', 'vote_average']]

In [None]:
movies_df[['title', 'vote_average', 'vote_count']].sort_values('vote_average', ascending=False)[:10]

In [None]:
C = movies_df['vote_average'].mean()
m=movies_df['vote_count'].quantile(0.6)
print('C:', round(C,3), 'm:', round(m,3))

In [None]:
percentile = 0.6

m=movies['vote_count'].quantile(percentile)
C=movies['vote_average'].mean()

def weighted_vote_average(record):
    v = record['vote_count']
    R = record['vote_average']
    
    return( (v/(v+m)) * R) + ((m/(m+v)) * C)


movies['weighted_vote'] = movies.apply(weighted_vote_average, axis=1)

In [None]:
movies_df=[['title', 'vote_average', 'weighted_vote', 'vote_count']].sort_values('weighted_vote', ascending=False)[:10]

In [None]:
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    title_movie = df[df['title']] == title_name]
    title_index = title_movie.index.values
    
    similar_indexes = sorted_ind[title_index, :(top_n*2)]
    similar_indexes = similar_index.reshape(-1)
    
    similar_index = similar_indexes[similar_indexes != title_index]
    
    return df.iloc[similar_index].sort_values('weighted_vote', ascending=False)[:top_n]

similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The GodFather', 10)
similar_movies[['title', 'vote_average', 'weighted_vote']]


# 감독/배우 기반 콘텐츠 필터링