# Data Preparation

- *unique*: cast, crew, keywords
- *vectors*: cast, crew, keywords
- *vectors_sum*: cast, crew, keywords

In [1]:
from collections import Counter
from itertools import combinations
import re

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
import seaborn as sns

import pickle as pk

from jupyterthemes import jtplot
# Configure plotting through jupyterthemes, passing a 15x9 figure size.
jtplot.style(figsize=(15, 9))

## Data

In [2]:
# Load the cleaned movies table, using the `id` column as the row index.
movies = pd.read_csv('data/movies_dataset_CLEAN_noNaN.csv', index_col='id')
# Preview the first rows.
movies.head()

Unnamed: 0_level_0,genre_ids,overview,popularity,release_date,title,vote_average,vote_count,genres,cast,crew,keywords
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
385687,"[28, 80, 53]",Over many missions and against impossible odds...,4654.279,2023-05-17,Fast X,7.3,2093,"Action, Crime, Thriller","[{'cast_id': 12835, 'name': 'Vin Diesel', 'cha...","[{'crew_id': 1302, 'name': 'Susie Figgis', 'de...","[{'id': 9663, 'name': 'sequel'}, {'id': 9748, ..."
697843,"[28, 53]",Tasked with extracting a family who is at the ...,2494.01,2023-06-09,Extraction 2,7.7,910,"Action, Thriller","[{'cast_id': 74568, 'name': 'Chris Hemsworth',...","[{'crew_id': 950, 'name': 'Pietro Scalia', 'de...","[{'id': 3070, 'name': 'mercenary'}, {'id': 966..."
603692,"[28, 53, 80]","With the price on his head ever increasing, Jo...",1920.127,2023-03-22,John Wick: Chapter 4,7.9,3344,"Action, Thriller, Crime","[{'cast_id': 6384, 'name': 'Keanu Reeves', 'ch...","[{'crew_id': 3615, 'name': 'Manfred Banach', '...","[{'id': 242, 'name': 'new york city'}, {'id': ..."
569094,"[28, 12, 16, 878]","After reuniting with Gwen Stacy, Brooklyn’s fu...",2013.795,2023-05-31,Spider-Man: Across the Spider-Verse,8.6,1796,"Action, Adventure, Animation, Science Fiction","[{'cast_id': 587506, 'name': 'Shameik Moore', ...","[{'crew_id': 7624, 'name': 'Stan Lee', 'depart...","[{'id': 2858, 'name': 'sacrifice'}, {'id': 328..."
502356,"[16, 10751, 12, 14, 35]","While working underground to fix a water main,...",1539.037,2023-04-05,The Super Mario Bros. Movie,7.8,5165,"Animation, Family, Adventure, Fantasy, Comedy","[{'cast_id': 73457, 'name': 'Chris Pratt', 'ch...","[{'crew_id': 70851, 'name': 'Jack Black', 'dep...","[{'id': 282, 'name': 'video game'}, {'id': 690..."


### DataFrame with lists

In [3]:
# Load the per-movie keyword lists, indexed by movie `id`.
# NOTE: each cell is read back from CSV as a string (the repr of a list), not a list.
keywords = pd.read_csv('DRAFT/data/keywords_list.csv', index_col='id')
keywords.head()

Unnamed: 0_level_0,keywords_list
id,Unnamed: 1_level_1
385687,"['sequel', 'revenge', 'racing', 'family', 'cars']"
697843,"['mercenary', 'sequel', 'rescue mission', 'lon..."
603692,"['new york city', 'martial arts', 'hitman', 's..."
569094,"['sacrifice', 'villain', 'comic book', 'sequel..."
502356,"['video game', 'gorilla', 'plumber', 'magic mu..."


In [4]:
# Load the per-movie crew lists, indexed by movie `id`.
# NOTE: each cell is read back from CSV as a string (the repr of a list), not a list.
crew = pd.read_csv('DRAFT/data/crew_list.csv', index_col='id')
crew.head()

Unnamed: 0_level_0,crew_
id,Unnamed: 1_level_1
385687,"['Zach Dean', 'Amanda Lewis', 'Gary Scott Thom..."
697843,"['Mike Larocca', 'Stephen McFeely', 'Joe Russo..."
603692,"['Michael Finch', 'Louise Rosner-Meyer', 'Henn..."
569094,"['Griffin Johnston', 'Phil Lord', 'Robert Jonk..."
502356,"['David D. Au', 'Michael Jelenic', 'Matt Fogel..."


In [5]:
# Load the per-movie cast lists, indexed by movie `id`.
# NOTE: each cell is read back from CSV as a string (the repr of a list), not a list.
cast = pd.read_csv('DRAFT/data/cast_list.csv', index_col='id')
cast.head()

Unnamed: 0_level_0,cast_list
id,Unnamed: 1_level_1
385687,"['Vin Diesel', 'Michelle Rodriguez', 'Tyrese G..."
697843,"['Chris Hemsworth', 'Golshifteh Farahani', 'Ad..."
603692,"['Keanu Reeves', 'Donnie Yen', 'Bill Skarsgård..."
569094,"['Shameik Moore', 'Hailee Steinfeld', 'Brian T..."
502356,"['Chris Pratt', 'Anya Taylor-Joy', 'Charlie Da..."


### One-Hot genres

In [20]:
# Load the genre vectors table (one column per genre), indexed by movie `id`.
oh_genres = pd.read_csv('data/genres_vectors.csv', index_col='id')
oh_genres.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
385687,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
697843,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
603692,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
569094,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
502356,0,1,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0


In [6]:
# Select and reorder the rows of each list frame by the ids in `movies.index`.
# `.loc` with a list of labels raises KeyError if any movie id is missing
# from a frame, so this also acts as a consistency check.
keywords = keywords.loc[movies.index]
cast     = cast.loc[movies.index]
crew     = crew.loc[movies.index]

In [21]:
# Display the (rows, columns) shape of every frame to compare their row counts.
movies.shape, keywords.shape, crew.shape, cast.shape, oh_genres.shape

((9130, 11), (9130, 1), (9130, 1), (9130, 1), (9130, 19))

### Unique lists

In [8]:
# Load the pickled list of unique genre names.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files produced by this project.
with open('DRAFT/data/unique_genres_list.pickle', 'rb') as f:
    u_genres = pk.load(f)

u_genres

['Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western']

In [26]:
# Load the pickled list of unique crew names.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files produced by this project.
with open('DRAFT/data/unique_crew_list.pickle', 'rb') as f:
    u_crew = pk.load(f)

# Show only the first ten entries.
u_crew[:10]

['Harmon Kaslow',
 'Gene Patterson',
 'Shirel Kozak',
 'Paul Freiberger',
 'Alex Theurer',
 'David Fountain',
 'Jean Skinner',
 'Gerardo Gatica',
 'Rich Thorne',
 'Iddo Lampton Enochs Jr.']

In [29]:
# Load the pickled list of unique cast names.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files produced by this project.
with open('DRAFT/data/unique_cast_list.pickle', 'rb') as f:
    u_cast = pk.load(f)

# Show only the first ten entries.
u_cast[:10]

['Pat Thomson',
 'John Matuszak',
 'Gustave Tassell',
 'Nathalie Richard',
 'Donald Petersen',
 'George DelHoyo',
 'Zane Cassidy',
 'Mikey Madison',
 'Donald MacBride',
 'Holly Horne']

In [30]:
# Load the pickled list of unique keywords.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files produced by this project.
with open('DRAFT/data/unique_keywords_list.pickle', 'rb') as f:
    u_keywords = pk.load(f)

# Show only the first ten entries.
u_keywords[:10]

['car crash',
 'liposuction',
 'stasis',
 'giant vegetable',
 'monster',
 'osama bin laden',
 'yonkers, new york',
 'jazz singer or musician',
 'chemical plant',
 'honesty']

## Vectors

In [22]:
def get_vectors(df: pd.DataFrame, unique_list: list, genres_onehot: pd.DataFrame = None)->pd.DataFrame:
    """Accumulate genre vectors for every name in ``unique_list``.

    For each movie id in ``df``, the movie's genre row is added to the running
    total of every name listed for that movie. Each output row therefore counts
    how often that name occurs in movies of each genre.

    Parameters
    ----------
    df : pd.DataFrame
        Indexed by movie id. The first column holds, per movie, either a real
        list of names or its string repr as read back from CSV
        (e.g. ``"['a', 'b']"``).
    unique_list : list
        Row labels of the result. Entries are expected to be unique.
    genres_onehot : pd.DataFrame, optional
        Genre vectors indexed by movie id, one column per genre. When omitted,
        the notebook-level ``oh_genres`` frame is used.

    Returns
    -------
    pd.DataFrame
        ``len(unique_list)`` rows, one column per column of ``genres_onehot``.

    Raises
    ------
    KeyError
        If a listed name is not in ``unique_list`` or a movie id is not in
        ``genres_onehot``.
    """
    import ast  # local import: only needed to parse the stringified lists

    if genres_onehot is None:
        genres_onehot = oh_genres  # notebook-level genre table

    # Map each name to its row position once, instead of a label lookup per item.
    row_of = {name: pos for pos, name in enumerate(unique_list)}
    totals = np.zeros(
        (len(unique_list), genres_onehot.shape[1]),
        dtype=np.result_type(np.int64, genres_onehot.to_numpy().dtype),
    )

    for _id, cell in df.iloc[:, 0].items():
        # literal_eval parses the list repr properly, so names containing an
        # apostrophe (repr'd with double quotes) and empty lists both work;
        # naive slicing and splitting on "', '" breaks on those.
        names = ast.literal_eval(cell) if isinstance(cell, str) else list(cell)
        if not names:
            continue
        genres_vector = genres_onehot.loc[_id].to_numpy()
        for name in names:
            totals[row_of[name]] += genres_vector

    # Columns come from the genre table itself so that labels and the added
    # values are guaranteed to line up.
    return pd.DataFrame(totals, index=unique_list, columns=genres_onehot.columns)

In [10]:
def get_sum_of_vectors(df: pd.DataFrame)->pd.DataFrame:
    """Placeholder: not implemented yet.

    The body is only ``pass``, so calling this currently returns ``None``
    rather than the annotated ``pd.DataFrame``.

    TODO: implement. NOTE(review): presumably meant to produce the
    "vectors_sum" data listed in the notebook header -- confirm.
    """
    pass