# Data Preparation

In [1]:
from collections import Counter
from itertools import combinations
import re

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
import seaborn as sns

import pickle as pk

from jupyterthemes import jtplot
# Apply the jupyterthemes plotting style, passing (15, 9) as the figure size.
jtplot.style(figsize=(15, 9))

## Data

In [10]:
# Load the cleaned movies table, using the `id` column as the row index.
movies = pd.read_csv('data/movies_dataset_CLEAN_noNaN.csv', index_col='id')
movies.head()

Unnamed: 0_level_0,genre_ids,overview,popularity,release_date,title,vote_average,vote_count,genres,cast,crew,keywords,label,label_encoder
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
385687,"[28, 80, 53]",Over many missions and against impossible odds...,4654.279,2023-05-17,Fast X,7.3,2093,"Action, Crime, Thriller","[{'cast_id': 12835, 'name': 'Vin Diesel', 'cha...","[{'crew_id': 1302, 'name': 'Susie Figgis', 'de...","[{'id': 9663, 'name': 'sequel'}, {'id': 9748, ...",Thriller,16
697843,"[28, 53]",Tasked with extracting a family who is at the ...,2494.01,2023-06-09,Extraction 2,7.7,910,"Action, Thriller","[{'cast_id': 74568, 'name': 'Chris Hemsworth',...","[{'crew_id': 950, 'name': 'Pietro Scalia', 'de...","[{'id': 3070, 'name': 'mercenary'}, {'id': 966...",Thriller,16
603692,"[28, 53, 80]","With the price on his head ever increasing, Jo...",1920.127,2023-03-22,John Wick: Chapter 4,7.9,3344,"Action, Thriller, Crime","[{'cast_id': 6384, 'name': 'Keanu Reeves', 'ch...","[{'crew_id': 3615, 'name': 'Manfred Banach', '...","[{'id': 242, 'name': 'new york city'}, {'id': ...",Thriller,16
569094,"[28, 12, 16, 878]","After reuniting with Gwen Stacy, Brooklyn’s fu...",2013.795,2023-05-31,Spider-Man: Across the Spider-Verse,8.6,1796,"Action, Adventure, Animation, Science Fiction","[{'cast_id': 587506, 'name': 'Shameik Moore', ...","[{'crew_id': 7624, 'name': 'Stan Lee', 'depart...","[{'id': 2858, 'name': 'sacrifice'}, {'id': 328...",Animation,2
502356,"[16, 10751, 12, 14, 35]","While working underground to fix a water main,...",1539.037,2023-04-05,The Super Mario Bros. Movie,7.8,5165,"Animation, Family, Adventure, Fantasy, Comedy","[{'cast_id': 73457, 'name': 'Chris Pratt', 'ch...","[{'crew_id': 70851, 'name': 'Jack Black', 'dep...","[{'id': 282, 'name': 'video game'}, {'id': 690...",Animation,2


### DataFrames with lists

In [11]:
# Per-movie keyword lists, indexed by movie id. Each cell is a list that was
# serialised to a string (e.g. "['sequel', 'revenge', ...]"), not a real list.
keywords = pd.read_csv('DRAFT/data/keywords_list.csv', index_col='id')
keywords.head()

Unnamed: 0_level_0,keywords_list
id,Unnamed: 1_level_1
385687,"['sequel', 'revenge', 'racing', 'family', 'cars']"
697843,"['mercenary', 'sequel', 'rescue mission', 'lon..."
603692,"['new york city', 'martial arts', 'hitman', 's..."
569094,"['sacrifice', 'villain', 'comic book', 'sequel..."
502356,"['video game', 'gorilla', 'plumber', 'magic mu..."


In [12]:
# Per-movie crew-name lists, indexed by movie id (cells are string-serialised lists).
crew = pd.read_csv('DRAFT/data/crew_list.csv', index_col='id')
crew.head()

Unnamed: 0_level_0,crew_
id,Unnamed: 1_level_1
385687,"['Zach Dean', 'Amanda Lewis', 'Gary Scott Thom..."
697843,"['Mike Larocca', 'Stephen McFeely', 'Joe Russo..."
603692,"['Michael Finch', 'Louise Rosner-Meyer', 'Henn..."
569094,"['Griffin Johnston', 'Phil Lord', 'Robert Jonk..."
502356,"['David D. Au', 'Michael Jelenic', 'Matt Fogel..."


In [13]:
# Per-movie cast-name lists, indexed by movie id (cells are string-serialised lists).
cast = pd.read_csv('DRAFT/data/cast_list.csv', index_col='id')
cast.head()

Unnamed: 0_level_0,cast_list
id,Unnamed: 1_level_1
385687,"['Vin Diesel', 'Michelle Rodriguez', 'Tyrese G..."
697843,"['Chris Hemsworth', 'Golshifteh Farahani', 'Ad..."
603692,"['Keanu Reeves', 'Donnie Yen', 'Bill Skarsgård..."
569094,"['Shameik Moore', 'Hailee Steinfeld', 'Brian T..."
502356,"['Chris Pratt', 'Anya Taylor-Joy', 'Charlie Da..."


### One-Hot genres

In [14]:
# One-hot genre matrix: one row per movie id, one 0/1 column per genre.
oh_genres = pd.read_csv('data/genres_vectors.csv', index_col='id')
oh_genres.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
385687,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
697843,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
603692,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
569094,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
502356,0,1,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0


In [15]:
# Restrict and reorder the list tables so they share the ids (and row order)
# of `movies`. Label-based `.loc` raises KeyError if an id from `movies` is
# missing in one of the tables, so this also acts as a consistency check.
keywords = keywords.loc[movies.index]
cast     = cast.loc[movies.index]
crew     = crew.loc[movies.index]

In [16]:
# Sanity check: every table should have the same number of rows.
movies.shape, keywords.shape, crew.shape, cast.shape, oh_genres.shape

((9130, 13), (9130, 1), (9130, 1), (9130, 1), (9130, 19))

### Unique lists

In [17]:
# Load the list of genre names.
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
with open('DRAFT/data/unique_genres_list.pickle', 'rb') as f:
    u_genres = pk.load(f)

u_genres

['Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western']

In [18]:
# Load the list of crew names (presumably de-duplicated, going by the file name).
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
with open('DRAFT/data/unique_crew_list.pickle', 'rb') as f:
    u_crew = pk.load(f)

u_crew[:10]

['Harmon Kaslow',
 'Gene Patterson',
 'Shirel Kozak',
 'Paul Freiberger',
 'Alex Theurer',
 'David Fountain',
 'Jean Skinner',
 'Gerardo Gatica',
 'Rich Thorne',
 'Iddo Lampton Enochs Jr.']

In [19]:
# Load the list of cast names (presumably de-duplicated, going by the file name).
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
with open('DRAFT/data/unique_cast_list.pickle', 'rb') as f:
    u_cast = pk.load(f)

u_cast[:10]

['Pat Thomson',
 'John Matuszak',
 'Gustave Tassell',
 'Nathalie Richard',
 'Donald Petersen',
 'George DelHoyo',
 'Zane Cassidy',
 'Mikey Madison',
 'Donald MacBride',
 'Holly Horne']

In [20]:
# Load the list of keywords (presumably de-duplicated, going by the file name).
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
with open('DRAFT/data/unique_keywords_list.pickle', 'rb') as f:
    u_keywords = pk.load(f)

u_keywords[:10]

['car crash',
 'liposuction',
 'stasis',
 'giant vegetable',
 'monster',
 'osama bin laden',
 'yonkers, new york',
 'jazz singer or musician',
 'chemical plant',
 'honesty']

## Vectors

### Simple Vectors

In [21]:
def get_vectors(df: pd.DataFrame, unique_list: list,
                genre_vectors: pd.DataFrame = None,
                genre_names: list = None)->pd.DataFrame:
    """Count, for each item, the genres of the movies in which it appears.

    For every movie in ``df`` the movie's genre vector is added to the row of
    each item (keyword, cast member, crew member, ...) listed for that movie.

    Parameters
    ----------
    df : pd.DataFrame
        Indexed by movie id. The first column holds a list serialised as a
        string in the form ``"['a', 'b', 'c']"``.
    unique_list : list
        Labels that become the row index of the result. Every item found in
        ``df`` must be present here.
    genre_vectors : pd.DataFrame, optional
        Genre matrix indexed by movie id. Defaults to the notebook-level
        ``oh_genres``.
    genre_names : list, optional
        Column labels of the result, in the same order as the columns of
        ``genre_vectors``. Defaults to the notebook-level ``u_genres``.

    Returns
    -------
    pd.DataFrame
        One row per entry of ``unique_list`` and one column per genre; each
        value is the summed genre indicator over the movies listing that item.

    Raises
    ------
    KeyError
        If a parsed item is not in ``unique_list`` or a movie id is not in
        ``genre_vectors``.
    """
    # Fall back to the notebook globals so existing two-argument calls still work.
    if genre_vectors is None:
        genre_vectors = oh_genres
    if genre_names is None:
        genre_names = u_genres

    df_vectors = pd.DataFrame(0, index=unique_list, columns=genre_names)

    for _id, cell in df.iloc[:, 0].items():
        # Strip the leading "['" and trailing "']" of the serialised list.
        # NOTE(review): this split-based parsing is kept as-is because the
        # labels in `unique_list` may depend on it; it does not handle items
        # that were serialised with double quotes (e.g. names containing an
        # apostrophe) -- confirm against how the unique lists were built.
        inner = cell[2:-2]
        if not inner:
            # An empty list ("[]") contributes nothing; without this guard it
            # would be parsed as [''] and fail the label lookup below.
            continue

        genres_vector = genre_vectors.loc[_id].to_list()
        for item in inner.split("', '"):
            df_vectors.loc[item] += genres_vector

    return df_vectors

In [22]:
# Genre counts per keyword: rows are keywords, columns are genres.
keywords_vectors = get_vectors(keywords, u_keywords)
keywords_vectors.head()

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
car crash,20,10,1,16,11,0,15,4,5,0,8,1,6,3,9,1,25,0,0
liposuction,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
stasis,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
giant vegetable,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
monster,48,43,22,41,0,0,17,27,50,0,71,1,12,0,55,8,29,1,0


In [24]:
# Persist the per-keyword genre counts (the index is written as the first column).
keywords_vectors.to_csv('data/keywords_vectors.csv')

In [25]:
# Genre counts per crew member: rows are crew names, columns are genres.
crew_vectors = get_vectors(crew, u_crew)
crew_vectors.head()

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
Harmon Kaslow,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
Gene Patterson,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0
Shirel Kozak,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
Paul Freiberger,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
Alex Theurer,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0


In [26]:
# Persist the per-crew-member genre counts (the index is written as the first column).
crew_vectors.to_csv('data/crew_vectors.csv')

In [27]:
# Genre counts per cast member: rows are cast names, columns are genres.
cast_vectors = get_vectors(cast, u_cast)
cast_vectors.head()

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
Pat Thomson,0,0,0,1,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0
John Matuszak,0,2,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
Gustave Tassell,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
Nathalie Richard,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0
Donald Petersen,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0


In [28]:
# Persist the per-cast-member genre counts (the index is written as the first column).
cast_vectors.to_csv('data/cast_vectors.csv')

In [36]:
# NOTE(review): leftover inspection cell that displays the whole `cast` frame;
# `cast.head()` would match the other cells, or the cell could be removed.
cast

Unnamed: 0_level_0,cast_list
id,Unnamed: 1_level_1
385687,"['Vin Diesel', 'Michelle Rodriguez', 'Tyrese G..."
697843,"['Chris Hemsworth', 'Golshifteh Farahani', 'Ad..."
603692,"['Keanu Reeves', 'Donnie Yen', 'Bill Skarsgård..."
569094,"['Shameik Moore', 'Hailee Steinfeld', 'Brian T..."
502356,"['Chris Pratt', 'Anya Taylor-Joy', 'Charlie Da..."
...,...
15017,"['Jessica Simpson', 'Luke Wilson', 'Rachael Le..."
79509,"['Eric Roberts', 'Megan Gallagher', 'James Ear..."
13370,"['Bob Balaban', 'Christopher Guest', 'John Mic..."
480623,"['Bridget Regan', 'Travis Van Winkle', 'Raven ..."


### Sum of Vectors

In [47]:
def get_sum_of_vectors(df: pd.DataFrame, df_vectors: pd.DataFrame, unique_list: list)->pd.DataFrame:
    """Sum, for each movie, the genre-count vectors of all items it lists.

    Parameters
    ----------
    df : pd.DataFrame
        Indexed by movie id. The first column holds a list serialised as a
        string in the form ``"['a', 'b', 'c']"``.
    df_vectors : pd.DataFrame
        Per-item vectors (rows are items, columns are genres), e.g. the
        output of ``get_vectors``.
    unique_list : list
        Unused; kept so existing three-argument calls keep working.

    Returns
    -------
    pd.DataFrame
        Same index as ``df`` and same columns as ``df_vectors``; each row is
        the column-wise sum of the vectors of the items listed for that movie.
        A movie with an empty list gets an all-zero row.

    Raises
    ------
    KeyError
        If a parsed item has no row in ``df_vectors``.
    """
    def get_sum(row):
        # Strip the leading "['" and trailing "']" of the serialised list.
        # NOTE(review): split-based parsing is kept to stay consistent with
        # the labels in `df_vectors`; it does not handle items serialised
        # with double quotes (e.g. names containing an apostrophe).
        inner = row.iloc[0][2:-2]
        if not inner:
            # Empty list ("[]"): nothing to sum, and looking up [''] would fail.
            return pd.Series(0, index=df_vectors.columns)

        return df_vectors.loc[inner.split("', '")].sum()

    # Each row yields a Series over the genre columns, so apply() assembles a
    # DataFrame with one row per movie id.
    return df.apply(get_sum, axis=1)
    

In [48]:
# Per-movie sum of the genre-count vectors of its keywords.
keywords_sum_vectors = get_sum_of_vectors(keywords, keywords_vectors, u_keywords)
keywords_sum_vectors.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
385687,395,243,104,302,178,2,258,184,123,17,225,14,73,88,140,23,353,11,27
697843,266,188,69,168,75,1,85,105,81,6,149,9,43,27,118,16,194,16,2
603692,678,389,143,534,432,7,475,209,197,27,216,37,156,178,239,20,562,15,8
569094,863,574,337,367,165,9,144,252,308,3,224,16,95,47,522,41,279,10,8
502356,314,346,207,519,81,4,171,262,180,20,88,38,44,102,163,3,135,12,11


In [56]:
# Persist the per-movie keyword sums (movie id is written as the first column).
keywords_sum_vectors.to_csv('data/keywords_sum_vectors.csv')

In [50]:
# Per-movie sum of the genre-count vectors of its crew members.
crew_sum_vectors = get_sum_of_vectors(crew, crew_vectors, u_crew)
crew_sum_vectors.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
385687,234,116,2,81,123,1,115,34,61,20,34,6,21,40,61,0,165,9,1
697843,153,85,3,35,33,0,52,16,33,3,14,0,11,14,67,0,88,2,1
603692,178,80,5,81,89,1,83,5,32,14,35,2,36,30,59,0,163,14,2
569094,295,269,202,122,25,2,59,94,102,10,25,7,16,16,202,7,53,4,1
502356,15,71,104,97,4,0,3,96,48,0,0,6,1,0,5,1,0,0,0


In [54]:
# Persist the per-movie crew sums (movie id is written as the first column).
crew_sum_vectors.to_csv('data/crew_sum_vectors.csv')

In [52]:
# Per-movie sum of the genre-count vectors of its cast members.
cast_sum_vectors = get_sum_of_vectors(cast, cast_vectors, u_cast)
cast_sum_vectors.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
385687,284,137,31,140,174,6,135,52,54,13,21,5,24,42,76,5,194,12,1
697843,119,57,4,33,27,1,34,7,27,3,8,0,9,4,30,1,72,3,1
603692,176,81,26,75,88,3,113,29,45,12,28,5,25,23,53,7,136,7,6
569094,369,435,406,402,77,6,243,245,160,29,30,33,48,92,257,9,116,12,14
502356,193,281,442,429,31,3,69,372,175,9,31,29,45,34,130,34,49,1,3


In [55]:
# Persist the per-movie cast sums (movie id is written as the first column).
cast_sum_vectors.to_csv('data/cast_sum_vectors.csv')