# 장르/키워드 데이터 가공

In [1]:
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings('ignore')

# Load the raw movie metadata and preview its size and first row.
# NOTE(review): this path starts with 'DataSet/' while the keywords file is read
# from 'Dataset/' — on a case-sensitive filesystem one of the two would fail;
# confirm the actual directory name.
movies = pd.read_csv(
    'DataSet/movies_metadata/movies_metadata.csv',
    low_memory=False,
)
print(movies.shape)
movies.head(1)

(45466, 24)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


In [2]:
# Keep only the columns used for content-based filtering. Take an explicit copy
# so that later column assignments modify an independent frame rather than a
# slice of `movies` (the resulting SettingWithCopyWarning would otherwise be
# hidden by the warnings filter set in the first cell).
movies_df = movies[['id', 'title', 'genres', 'vote_average', 'vote_count', 'popularity', 'overview']].copy()

In [3]:
# Widen the column display so the nested genre string of the first row is readable.
pd.set_option('display.max_colwidth', 100)
movies_df[['genres']].head(1)

Unnamed: 0,genres
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"


In [4]:
import json

# Parse the keywords JSON file and build a DataFrame from its records.
with open('Dataset/keywords.json', encoding='utf-8') as f:
    js = json.load(f)
keywords_df = pd.DataFrame(js)


In [5]:
# Same display width as for genres; show the keywords string of the first row.
pd.set_option('display.max_colwidth', 100)
keywords_df[['keywords']].head(1)

Unnamed: 0,keywords
0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id..."


In [6]:
# Coerce the join key to a numeric dtype in both frames; values that cannot be
# parsed become NaN (errors='coerce').
movies_df['id'] = pd.to_numeric(movies_df['id'], errors='coerce')
keywords_df['id'] = pd.to_numeric(keywords_df['id'], errors='coerce')

In [7]:
# Inner join on id: keep only movies whose id also appears in the keywords table.
movies_df = movies_df.merge(keywords_df, how='inner', on='id')

movies_df.head()

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,overview,keywords
0,862.0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]",7.7,5415.0,21.946943,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear o...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id..."
1,8844.0,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]",6.9,2413.0,17.015539,When siblings Judy and Peter discover an enchanted board game that opens the door to a magical w...,"[{'id': 10090, 'name': 'board game'}, {'id': 10941, 'name': 'disappearance'}, {'id': 15101, 'nam..."
2,15602.0,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]",6.5,92.0,11.7129,A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John...,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392, 'name': 'best friend'}, {'id': 179431, 'name': '..."
3,31357.0,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]",6.1,34.0,3.859495,"Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusi...","[{'id': 818, 'name': 'based on novel'}, {'id': 10131, 'name': 'interracial relationship'}, {'id'..."
4,11862.0,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",5.7,173.0,8.387519,"Just when George Banks has recovered from his daughter's wedding, he receives the news that she'...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'name': 'midlife crisis'}, {'id': 2246, 'name': 'con..."


In [8]:
# Summarise the merged frame: row count, per-column non-null counts and dtypes.
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46482 entries, 0 to 46481
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            46482 non-null  float64
 1   title         46478 non-null  object 
 2   genres        46482 non-null  object 
 3   vote_average  46478 non-null  float64
 4   vote_count    46478 non-null  float64
 5   popularity    46478 non-null  object 
 6   overview      45487 non-null  object 
 7   keywords      46482 non-null  object 
dtypes: float64(3), object(5)
memory usage: 3.2+ MB


In [9]:
# Display the merged frame (the notebook shows only the first and last rows).
movies_df

Unnamed: 0,id,title,genres,vote_average,vote_count,popularity,overview,keywords
0,862.0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]",7.7,5415.0,21.946943,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear o...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id..."
1,8844.0,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]",6.9,2413.0,17.015539,When siblings Judy and Peter discover an enchanted board game that opens the door to a magical w...,"[{'id': 10090, 'name': 'board game'}, {'id': 10941, 'name': 'disappearance'}, {'id': 15101, 'nam..."
2,15602.0,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]",6.5,92.0,11.7129,A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John...,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392, 'name': 'best friend'}, {'id': 179431, 'name': '..."
3,31357.0,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]",6.1,34.0,3.859495,"Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusi...","[{'id': 818, 'name': 'based on novel'}, {'id': 10131, 'name': 'interracial relationship'}, {'id'..."
4,11862.0,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",5.7,173.0,8.387519,"Just when George Banks has recovered from his daughter's wedding, he receives the news that she'...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'name': 'midlife crisis'}, {'id': 2246, 'name': 'con..."
...,...,...,...,...,...,...,...,...
46477,439050.0,Subdue,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'name': 'Family'}]",4.0,1.0,0.072051,Rising and falling between a man and woman.,"[{'id': 10703, 'name': 'tragic love'}]"
46478,111109.0,Century of Birthing,"[{'id': 18, 'name': 'Drama'}]",9.0,3.0,0.178241,An artist struggles to finish his work while a storyline about a cult plays in his head.,"[{'id': 2679, 'name': 'artist'}, {'id': 14531, 'name': 'play'}, {'id': 215397, 'name': 'pinoy'}]"
46479,67758.0,Betrayal,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'name': 'Drama'}, {'id': 53, 'name': 'Thriller'}]",3.8,6.0,0.903007,"When one of her hits goes wrong, a professional assassin ends up with a suitcase full of a milli...",[]
46480,227506.0,Satan Triumphant,[],0.0,0.0,0.003503,"In a small town live two brothers, one a minister and the other one a hunchback painter of the c...",[]


In [10]:
# Number of missing values in the keywords column.
movies_df['keywords'].isna().sum()

0

In [11]:
# Check the container type of the keywords column (not the type of its elements).
type(movies_df['keywords'])

pandas.core.series.Series

In [12]:
from ast import literal_eval
import ast

In [13]:
# Parse each genres string into the Python list of dicts it spells out.
movies_df['genres'] = movies_df['genres'].apply(ast.literal_eval)

In [14]:
# Preview the parsed keywords without storing the result; the column itself is
# converted in a following cell.
movies_df['keywords'].apply(literal_eval)

0        [{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id...
1        [{'id': 10090, 'name': 'board game'}, {'id': 10941, 'name': 'disappearance'}, {'id': 15101, 'nam...
2        [{'id': 1495, 'name': 'fishing'}, {'id': 12392, 'name': 'best friend'}, {'id': 179431, 'name': '...
3        [{'id': 818, 'name': 'based on novel'}, {'id': 10131, 'name': 'interracial relationship'}, {'id'...
4        [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'name': 'midlife crisis'}, {'id': 2246, 'name': 'con...
                                                        ...                                                 
46477                                                                 [{'id': 10703, 'name': 'tragic love'}]
46478       [{'id': 2679, 'name': 'artist'}, {'id': 14531, 'name': 'play'}, {'id': 215397, 'name': 'pinoy'}]
46479                                                                                                     []
46480              

In [15]:
# Parse each keywords string into the Python list of dicts it spells out.
movies_df['keywords'] = movies_df['keywords'].apply(ast.literal_eval)

In [16]:
# Reduce each list of {'id': ..., 'name': ...} dicts to the list of its names.
movies_df['genres'] = movies_df['genres'].apply(
    lambda entries: [entry['name'] for entry in entries]
)
movies_df['keywords'] = movies_df['keywords'].apply(
    lambda entries: [entry['name'] for entry in entries]
)
movies_df[['genres', 'keywords']].head(1)

Unnamed: 0,genres,keywords
0,"[Animation, Comedy, Family]","[jealousy, toy, boy, friendship, friends, rivalry, boy next door, new toy, toy comes to life]"


# 장르 콘텐츠 유사도 측정

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
#movies_df['genres_literal']=movies_df['genres']

In [22]:
# CountVectorizer expects one string per document, so join each movie's genre
# names into a single space-separated string.
movies_df['genres_literal'] = movies_df['genres'].apply(lambda x : (' ').join(x))

# Count unigrams and bigrams of the genre words. min_df=1 keeps every term that
# occurs in at least one document, i.e. the whole vocabulary. An integer min_df
# of 0 selects the same terms but is rejected by parameter validation in recent
# scikit-learn releases.
count_vect = CountVectorizer(min_df=1, ngram_range=(1,2))
genre_mat=count_vect.fit_transform(movies_df['genres_literal'])
print(genre_mat.shape)

(46482, 401)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the genre count vectors of all movies.
# NOTE(review): cosine_similarity returns a dense array by default. With the
# 46482 rows reported for genre_mat above, that is a 46482 x 46482 matrix
# (about 17 GB assuming 8-byte floats), and the argsort below allocates an
# index array of the same shape — confirm the machine has enough memory, or
# reduce the number of movies first.
genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim.shape)
print(genre_sim[:1])

# For every movie (row), the row positions of all movies ordered from most to
# least similar. The recommendation cells below pass this array to
# find_sim_movie; it was not defined anywhere in the notebook before.
genre_sim_sorted_ind = genre_sim.argsort()[:, ::-1]

In [None]:
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Return the rows of `df` ranked closest to the movie titled `title_name`.

    Args:
        df: DataFrame with a 'title' column; its row order must match the
            rows of `sorted_ind`.
        sorted_ind: 2-D integer array whose row i lists row positions of `df`
            (presumably ordered from most to least similar to row i; the
            ordering is decided by whoever builds the array).
        title_name: Title to look up, compared with exact equality.
        top_n: How many leading entries of each matching row to return.

    Returns:
        DataFrame with the selected rows, in the order given by `sorted_ind`.
        It is empty when no row has the requested title. The looked-up movie
        itself is not filtered out.
    """
    # Work with row positions rather than index labels: both `sorted_ind` and
    # `df.iloc` are positional, so this stays correct for a non-default index.
    title_index = np.flatnonzero((df['title'] == title_name).to_numpy())

    # One row of candidates per matching title, flattened into a 1-D array.
    similar_indexes = sorted_ind[title_index, :top_n].reshape(-1)

    return df.iloc[similar_indexes]

In [None]:
# Look up the 10 entries ranked closest to 'The Godfather' by genre and show
# their titles and average votes.
# NOTE(review): genre_sim_sorted_ind must already exist in the kernel when this
# cell runs — confirm it is computed from genre_sim in an earlier cell.
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather', 10)
similar_movies[['title', 'vote_average']]

In [None]:
# Ten movies with the highest raw average vote, shown with their vote counts.
(
    movies_df[['title', 'vote_average', 'vote_count']]
    .sort_values(by='vote_average', ascending=False)
    .head(10)
)

In [None]:
# C: mean of vote_average over all rows; m: 0.6 quantile of vote_count.
C = movies_df['vote_average'].mean()
m = movies_df['vote_count'].quantile(0.6)
print('C:', round(C, 3), 'm:', round(m, 3))

In [None]:
percentile = 0.6

# Compute both constants from movies_df, the frame the scores are attached to
# and the same frame the preceding cell reports C and m for.
m = movies_df['vote_count'].quantile(percentile)
C = movies_df['vote_average'].mean()

def weighted_vote_average(record):
    """Return the vote average of `record` pulled toward the overall mean.

    Computes (v / (v + m)) * R + (m / (v + m)) * C, where v is the record's
    'vote_count', R is its 'vote_average', and m and C are the module-level
    vote-count quantile and mean vote average defined above.

    Args:
        record: A row exposing 'vote_count' and 'vote_average'.

    Returns:
        The weighted score. The result is NaN when v + m is zero.
    """
    v = record['vote_count']
    R = record['vote_average']

    return ((v / (v + m)) * R) + ((m / (m + v)) * C)


# Store the score on movies_df: the ranking and recommendation cells below read
# 'weighted_vote' from movies_df, not from the raw `movies` frame.
movies_df['weighted_vote'] = movies_df.apply(weighted_vote_average, axis=1)

In [None]:
# Ten movies with the highest weighted vote. Select the columns from movies_df
# and display the result; it is deliberately not assigned back, since that
# would overwrite movies_df with a 10-row summary.
movies_df[['title', 'vote_average', 'weighted_vote', 'vote_count']].sort_values('weighted_vote', ascending=False)[:10]

In [None]:
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Return up to `top_n` candidate rows for `title_name`, best weighted vote first.

    Takes the first `top_n * 2` entries of the `sorted_ind` row(s) belonging to
    the requested title, removes the requested movie itself, and ranks what is
    left by the 'weighted_vote' column.

    Args:
        df: DataFrame with 'title' and 'weighted_vote' columns; its row order
            must match the rows of `sorted_ind`.
        sorted_ind: 2-D integer array whose row i lists row positions of `df`
            (presumably ordered from most to least similar to row i; the
            ordering is decided by whoever builds the array).
        title_name: Title to look up, compared with exact equality.
        top_n: Maximum number of rows to return.

    Returns:
        DataFrame of at most `top_n` rows sorted by 'weighted_vote' in
        descending order. It is empty when no row has the requested title.
    """
    # Row positions (not index labels) of every row carrying the title; both
    # `sorted_ind` and `df.iloc` are positional.
    title_index = np.flatnonzero((df['title'] == title_name).to_numpy())

    # Take twice as many candidates as requested so that enough remain after
    # the looked-up movie is removed and the rest is re-ranked.
    similar_indexes = sorted_ind[title_index, :(top_n * 2)].reshape(-1)

    # Drop the looked-up movie. isin copes with several rows sharing the
    # title, where an element-wise `!=` against title_index would not.
    similar_indexes = similar_indexes[~np.isin(similar_indexes, title_index)]

    # Several matching rows can contribute the same candidate; keep each once.
    similar_indexes = pd.unique(similar_indexes)

    return df.iloc[similar_indexes].sort_values('weighted_vote', ascending=False)[:top_n]

# Recommend movies for 'The Godfather', ranked by weighted vote. The title is
# matched with exact, case-sensitive equality, so it must be spelled
# 'The Godfather' (as in the earlier lookup), not 'The GodFather'.
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather', 10)
similar_movies[['title', 'vote_average', 'weighted_vote']]


# 감독/배우 기반 콘텐츠 필터링