<img src='logo.png'>

<font size=6><b>kaggle The Movies Dataset</b></font>
* Metadata on over 45,000 movies. 26 million ratings from over 270,000 users.
* ref : https://www.kaggle.com/rounakbanik/the-movies-dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel 
from ast import literal_eval

import warnings 
warnings.simplefilter('ignore')

from IPython.core.display import display, HTML
display(HTML("<style>.container{width:100% !important;}</style>"))
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('max_colwidth', None)


# Data Load
<pre>
genres : 영화 장르
keywords : 영화의 키워드
original_language : 영화 언어
title : 제목
vote_average : 평점 평균
vote_count : 평점 카운트
popularity : 인기도
overview : 개요 설명
</pre>

In [2]:
mdf = pd.read_csv("./dataset/movies_metadata_2.csv")

In [3]:
print(mdf.shape)
print(mdf.info())
mdf.head(1)

(45466, 25)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  obj

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
0,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}",30000000,"['Animation', 'Comedy', 'Family']",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995


## id 중 오데이터

In [4]:
idx = mdf[mdf['id'].str.len()>6].index
mdf = mdf.drop(idx, axis=0)
mdf = mdf.reset_index(drop=True)

In [5]:
mdf['id'] = mdf['id'].astype('int')

# Weighted Rarking  (Top %)
* vote, average

* url : https://www.quora.com/How-does-IMDbs-rating-system-work<br>
<img src = 'imdb_score.png' width=600>

## WR 구하기

* WR = (v / (v+m)) * R + (m/ (v+m)) *C
* R : 영화의 평점
* v : 영화의 투표수
* m : 상위 %안에 들어야 하는 최소 투표수
* C : 개별 영화의 평점

In [6]:
C = mdf['vote_average'].mean()
C

5.618207215134184

In [7]:
m = mdf['vote_count'].quantile(0.95)
m

434.0

In [8]:
def my_calc_wr(mdf):
    R = mdf['vote_average']
    v = mdf['vote_count']
    WR = (v / (v+m)) * R + (m/ (v+m)) *C
    return WR

In [9]:
mdf['wr'] = mdf.apply(my_calc_wr, axis=1)

In [10]:
mdf.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,wr
0,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}",30000000,"['Animation', 'Comedy', 'Family']",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,7.545529


## 상위 % 데이터만 가져오기

In [11]:
df5 = mdf[mdf['vote_count'] > 434.0][[ 'id','title', 'genres', 'vote_average', 'vote_count', 'year', 'wr' ]]
df5.shape

(2268, 7)

In [12]:
df5.head(2)

Unnamed: 0,id,title,genres,vote_average,vote_count,year,wr
0,862,Toy Story,"['Animation', 'Comedy', 'Family']",7.7,5415.0,1995,7.545529
1,8844,Jumanji,"['Adventure', 'Fantasy', 'Family']",6.9,2413.0,1995,6.704602


In [13]:
df5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2268 entries, 0 to 45011
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            2268 non-null   int32  
 1   title         2268 non-null   object 
 2   genres        2268 non-null   object 
 3   vote_average  2268 non-null   float64
 4   vote_count    2268 non-null   float64
 5   year          2268 non-null   object 
 6   wr            2268 non-null   float64
dtypes: float64(3), int32(1), object(3)
memory usage: 132.9+ KB


## 장르 검색

In [14]:
df5['genres'].head()

0            ['Animation', 'Comedy', 'Family']
1           ['Adventure', 'Fantasy', 'Family']
5     ['Action', 'Crime', 'Drama', 'Thriller']
9          ['Adventure', 'Action', 'Thriller']
15                          ['Drama', 'Crime']
Name: genres, dtype: object

In [15]:
df5.loc[0, 'genres'],  type(df5.loc[0, 'genres'])

("['Animation', 'Comedy', 'Family']", str)

In [16]:
'c' in 'AAA'

False

In [17]:
df5[df5['genres'].str.contains('Family')].head()

Unnamed: 0,id,title,genres,vote_average,vote_count,year,wr
0,862,Toy Story,"['Animation', 'Comedy', 'Family']",7.7,5415.0,1995,7.545529
1,8844,Jumanji,"['Adventure', 'Fantasy', 'Family']",6.9,2413.0,1995,6.704602
33,9598,Babe,"['Fantasy', 'Drama', 'Comedy', 'Family']",6.0,756.0,1995,5.860758
47,10530,Pocahontas,"['Adventure', 'Animation', 'Drama', 'Family']",6.7,1509.0,1995,6.458364
155,8839,Casper,"['Fantasy', 'Comedy', 'Family']",6.0,1045.0,1995,5.887966


## 최종 코드

In [18]:
def my_calc_wr_def(mdf):
    R = mdf['vote_average']
    v = mdf['vote_count']
    WR = (v / (v+m)) * R + (m/ (v+m)) *C
    return WR

mdf = pd.read_csv("./dataset/movies_metadata_2.csv")
idx = mdf[mdf['id'].str.len()>6].index
mdf = mdf.drop(idx, axis=0)
mdf = mdf.reset_index(drop=True)
mdf['id'] = mdf['id'].astype('int')

In [19]:
def my_search_wr_by_genres(search_genres ='Family', percnet=0.95):
    C = mdf['vote_average'].mean()
    m = mdf['vote_count'].quantile(percnet)
    mdf['wr'] = mdf.apply(my_calc_wr_def, axis=1)

    df5 = mdf[mdf['vote_count'] > m][[ 'id','title', 'genres', 'vote_average', 'vote_count', 'year', 'wr' ]]
    df5 = df5.sort_values('wr', ascending=False)
    return df5[df5['genres'].str.contains(search_genres)]

In [20]:
resdf = my_search_wr_by_genres('Fantasy', 0.97)
resdf.head()

Unnamed: 0,id,title,genres,vote_average,vote_count,year,wr
5481,129,Spirited Away,"['Fantasy', 'Adventure', 'Animation', 'Family']",8.3,3968.0,2001,8.035598
7000,122,The Lord of the Rings: The Return of the King,"['Adventure', 'Fantasy', 'Action']",8.1,8226.0,2003,7.975624
3030,497,The Green Mile,"['Fantasy', 'Drama', 'Crime']",8.2,4166.0,1999,7.956413
4863,120,The Lord of the Rings: The Fellowship of the Ring,"['Adventure', 'Fantasy', 'Action']",8.0,8892.0,2001,7.88916
5814,121,The Lord of the Rings: The Two Towers,"['Adventure', 'Fantasy', 'Action']",8.0,7641.0,2002,7.871988


# Review  based

## Data Load

In [None]:
ldf = pd.read_csv("./dataset/links_small.csv")

In [None]:
print(ldf.shape)
print(ldf.info())
ldf.head(2)

## ldf 결측처리 

In [None]:
ldf.isna().sum()

In [None]:
ldf = ldf.dropna(axis=0)
ldf = ldf.reset_index(drop=True)
print(ldf.shape)

In [None]:
ldf['tmdbId'] = ldf['tmdbId'].astype('int')

## mdf +  ldf join

In [None]:
mldf = pd.merge(mdf, ldf,  left_on="id", right_on='tmdbId', how="inner")
mldf.head(1)   # 추가 컬럼 movieId	imdbId	tmdbId

## mldf 결측처리

In [None]:
mldf['tagline'] = mldf['tagline'].fillna('')

In [None]:
mldf['overview'] = mldf['overview'].fillna('')

In [None]:
mldf[['overview', 'tagline']].tail(1)

In [None]:
mldf['view_tag'] = mldf['overview'] + mldf['tagline']

In [None]:
mldf['view_tag'].isna().sum()

In [None]:
idx = mldf[mldf['view_tag'].str.len() < 1].index
print(len(idx))
mldf = mldf.drop(idx, axis=0)
mldf = mldf.reset_index(drop=True)
print(mldf.shape)

## Tf-Idf
* from sklearn.feature_extraction.text import TfidfVectorizer
* from sklearn.feature_extraction.text import CountVectorizer
* from sklearn.metrics.pairwise import cosine_similarity
* from sklearn.metrics.pairwise import linear_kernel 

class sklearn.feature_extraction.text.<font color=red><b>TfidfVectorizer</b></font>(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern='(?u)\b\w\w+\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.float64'>, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)[source]¶

In [None]:
tfidf = TfidfVectorizer( stop_words='english') #, max_df=0.8, min_df=0.2)  ngram_range=(1, 2)
tfidf_matrix = tfidf.fit_transform(mldf['view_tag'])
# print(tfidf.vocabulary_)
print(tfidf_matrix.shape)

In [None]:
# tfidf_matrix[20].toarray()

## 유사도 계산

In [None]:
cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
# linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
cos_sim[0][:20]

## 영화 제목 색인 시리즈 생성
* 제목을 입력하면 (idx)번째 출력

In [None]:
s  = mldf['title']
title_s = pd.Series(s.index, index=s.values)  # 값 <--> 인덱스 서로 자리 변경
title_s.head(10)

## review 유사도 top-N 검색

In [None]:
def my_search_cossim_by_review(title = "Toy Story", topn=10):
    # 인덱스 출력하기
    s = mldf['title']
    title_s = pd.Series(s.index, index = s.values) # 값 <--> 인덱스 변경
    idx = title_s[title]
    idx_list = pd.Series(cos_sim[idx].reshape(-1)).sort_values(ascending = False).index[1:topn+1] # 0번재는 본인. 1~10번째
    title_list = mldf.loc[idx_list,'title'].values
    return title_list

In [None]:
mldf[mldf['title'].str.contains('Toy Story')][[ 'tagline', 'title']]

In [None]:
res = my_search_cossim_by_review('Toy Story',5)
res

## 최종코드

# Actor, Driect... based

credit.csv
* cast<br>
<pre>
[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'},
 {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}
]</pre>

* craw <br>
[{'credit_id': '52fe4284c3a36847f8024f49', 'department': 'Directing', 'gender': 2, 'id': 7879, 'job': 'Director', 'name': 'John Lasseter', 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}, 
 {'credit_id': '52fe4284c3a36847f8024f4f', 'department': 'Writing', 'gender': 2, 'id': 12891, 'job': 'Screenplay', 'name': 'Joss Whedon', 'profile_path': '/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg'},]

keywords.csv
* keywords<br>
<pre>
[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'},]


## Data Load

In [None]:
cdf = pd.read_csv("./dataset/credits.csv")
kdf = pd.read_csv("./dataset/keywords.csv")

In [None]:
print(cdf.shape)
print(cdf.info())
# cdf.head(1)

In [None]:
print(kdf.shape)
print(kdf.info())
kdf.head(1)

## mldf = mdf + ldf + cdf + kdf  JOIN

In [None]:
mldf = mldf.merge(cdf, on='id')
mldf = mldf.merge(kdf, on='id')  #cast	crew	keywords 추가

In [None]:
# mldf.head(1)

In [None]:
mldf.shape

## 가공
* <b>str --> list 객체타입으로 변경   :  .apply(literal_eval)</b>
<pre>
* cast  {name': 'Tom Hanks' }
[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'},

* craw  {'job': 'Director', 'name': 'John Lasseter'}
 [{'credit_id': '52fe4284c3a36847f8024f49', 'department': 'Directing', 'gender': 2, 'id': 7879, 'job': 'Director', 'name': 'John Lasseter', 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}, {'credit_id': '52fe4284c3a36847f8024f4f', 'department': 'Writing', 'gender': 2, 'id': 12891, 'job': 'Screenplay', 'name': 'Joss Whedon', 'profile_path': '/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg'},]

*  keywords {'name': 'jealousy'}
 [{'id': 931, 'name': 'jealousy'},]
 
 * genres

In [None]:
type(mldf.loc[0, 'keywords'])

In [None]:
mldf['cast']     = mldf['cast'].apply(literal_eval)      #배우
mldf['crew']     = mldf['crew'].apply(literal_eval)      #감독
mldf['keywords'] = mldf['keywords'].apply(literal_eval)  #대표키워드
mldf['genres']   = mldf['genres'].apply(literal_eval)    #장르

In [None]:
type(mldf.loc[0, 'keywords']),  mldf.loc[0, 'keywords'][:3]

## 파생변수 
* BoW = [감독] + [배우] + [장르] + [키워드]
* Tf-Idf : 유사도

###  감독 : mldf['director']

In [None]:
def my_get_director_def(s):       #[{'job': 'Director', 'name': 'John Lasseter'} , .... ]
    for dict in s:                #{'job': 'Director', 'name': 'John Lasseter'}
        if dict['job'] == 'Director':
            dict['name'] = dict['name'].replace(' ', '')
            return [dict['name'].lower()]  # [john lasseter]
    return np.nan

In [None]:
mldf['director'] = mldf['crew'].apply(my_get_director_def)

In [None]:
mldf['director'].head()

###  배우 : mldf['actor']

In [None]:
def my_get_name_def(s):      
    cast_list = []
    for dict in s:           
        dict['name'] = dict['name'].replace(' ', '')
        cast_list.append(dict['name'].lower())
    return cast_list[:3]

In [None]:
mldf['actor'] = mldf['cast'].apply(my_get_name_def)

In [None]:
mldf['actor'].head()

### 키워드 : mldf['key']

In [None]:
mldf['key'] = mldf['keywords'].apply(my_get_name_def)

In [None]:
mldf['key'].head()

###  장르

In [None]:
mldf['genres'].head()

### 합치기 : mldf['search4']

In [None]:
# mldf['search4'] =  mldf['director'] + mldf['actor'] + mldf['key'] + mldf['genres']
mldf['search4'] = mldf['director'] + mldf['actor'] + mldf['key'] + mldf['genres']
mldf['search4'].head()

---
<font size=4><b> review  유사도와 이하 상동

---

## 유사도 

In [None]:
type(mldf.loc[0, 'search4'])

In [None]:
mldf['search4'] = mldf['search4'].astype('str')

In [None]:
tfidf = CountVectorizer() #, max_df=0.8, min_df=0.2)  ngram_range=(1, 2)
matrix = tfidf.fit_transform(mldf['search4'])
print(matrix.shape)

cos_sim = cosine_similarity(matrix, matrix)
cos_sim[0][:20]


## 영화 제목 색인 시리즈 생성
* 제목을 입력하면 (idx)번째 출력

In [None]:
s  = mldf['title']
title_s = pd.Series(s.index, index=s.values)  # 값 <--> 인덱스 서로 자리 변경
title_s.head(10)

## review 유사도 top-N 검색

In [None]:
def my_search_cossim_by_search4(title = "Toy Story", topn=10):
    # 인덱스 출력하기
    s =  mldf['title']
    title_s = pd.Series(s.index, index = s.values) # 값 <--> 인덱스 변경
    idx = title_s[title]
    print(idx)
    idx_list = pd.Series(cos_sim[idx].reshape(-1)).sort_values(ascending = False).index[1:topn+1] # 0번재는 본인. 1~10번째
    title_list = mldf.loc[idx_list,'title'].values
    return title_list

In [None]:
res = my_search_cossim_by_search4('Toy Story',5)   #Batman Forever
res

## 검증

In [None]:
mldf.loc[0, ['title','search4']]

In [None]:
for r in res :
    print(mldf[mldf['title']==r][['title','search4']].values)

## 최종코드