# Data Prepare

In [2]:
from collections import Counter
from itertools import combinations
import re

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
import seaborn as sns

import pickle as pk

from jupyterthemes import jtplot
jtplot.style(figsize=(15, 9))

## Genres

### One-Hot Encoding

In [22]:
genres = pd.read_csv('../data/popular_10000_movies/movies_dataset_CLEAN.csv', index_col='id')

genres = genres['genres'].to_frame()
genres.dropna(inplace=True)

genres.head()

Unnamed: 0_level_0,genres
id,Unnamed: 1_level_1
385687,"Action, Crime, Thriller"
697843,"Action, Thriller"
603692,"Action, Thriller, Crime"
569094,"Action, Adventure, Animation, Science Fiction"
502356,"Animation, Family, Adventure, Fantasy, Comedy"


In [23]:
genres.shape

(9144, 1)

In [24]:
genres.isna().sum()

genres    0
dtype: int64

In [25]:
genres['genres'].nunique()

2131

In [26]:
unique_genres = [
    'Action',
    'Adventure',
    'Animation',
    'Comedy',
    'Crime',
    'Documentary',
    'Drama',
    'Family',
    'Fantasy',
    'History',
    'Horror',
    'Music',
    'Mystery',
    'Romance',
    'Science Fiction',
    'TV Movie',
    'Thriller',
    'War',
    'Western'
]

In [27]:
genres_vectors = pd.DataFrame(index=genres.index, columns=unique_genres)
genres_vectors.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
385687,,,,,,,,,,,,,,,,,,,
697843,,,,,,,,,,,,,,,,,,,
603692,,,,,,,,,,,,,,,,,,,
569094,,,,,,,,,,,,,,,,,,,
502356,,,,,,,,,,,,,,,,,,,


In [28]:
for i in range(genres.shape[0]):
    df = np.full(len(unique_genres), 0)
    
    for j in str(genres['genres'].iloc[i]).split(', '):
        df[unique_genres.index(j)] = 1

    genres_vectors.iloc[i] = df
        
genres_vectors.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
385687,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
697843,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
603692,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
569094,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
502356,0,1,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0


### Labels for Genres

In [29]:
# if we have that genre in list it will be dominant, from most to least
labels = [
    'Documentary',
    'Music',
    'Animation',
    'Family',
    'Science Fiction',
    'Horror',
    'History',
    'Thriller',
]

In [30]:
labels_df = pd.DataFrame(index=genres.index, columns=['label'])
labels_df.head()

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
385687,
697843,
603692,
569094,
502356,


In [31]:
for i in range(genres.shape[0]):
    genres_l = str(genres['genres'].iloc[i]).split(', ')
    labels_df.iloc[i] = genres_l[0]
    for j in labels:
        if j in genres_l:
            labels_df.iloc[i] = j
            break
        
labels_df.head()

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
385687,Thriller
697843,Thriller
603692,Thriller
569094,Animation
502356,Animation


In [32]:
le = LabelEncoder()
le.fit(labels_df['label'])

labels_df['label_encoder'] = le.transform(labels_df['label'])
labels_df.head()

Unnamed: 0_level_0,label,label_encoder
id,Unnamed: 1_level_1,Unnamed: 2_level_1
385687,Thriller,16
697843,Thriller,16
603692,Thriller,16
569094,Animation,2
502356,Animation,2


In [33]:
labels_df.shape

(9144, 2)

### Add Labels to Genres

In [34]:
df = pd.merge(genres_vectors, labels_df, on='id', how='inner')
df.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,...,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,label,label_encoder
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
385687,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,Thriller,16
697843,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,Thriller,16
603692,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,Thriller,16
569094,1,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,Animation,2
502356,0,1,1,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,Animation,2


In [35]:
df.shape

(9144, 21)

In [37]:
df.to_csv('data/one-hot_genres_w_labels.csv')

### Cosine Similarity

In [43]:
genres_list = list(genres['genres'].unique())

In [44]:
for i, g in enumerate(genres_list):
    genres_list[i] = g.split(', ')

In [45]:
genres_list[:20]

[['Action', 'Crime', 'Thriller'],
 ['Action', 'Thriller'],
 ['Action', 'Thriller', 'Crime'],
 ['Action', 'Adventure', 'Animation', 'Science Fiction'],
 ['Animation', 'Family', 'Adventure', 'Fantasy', 'Comedy'],
 ['Action', 'Adventure', 'Science Fiction'],
 ['Animation', 'Comedy', 'Family', 'Fantasy', 'Romance'],
 ['Mystery', 'Thriller', 'Science Fiction'],
 ['Science Fiction', 'Action', 'Adventure'],
 ['Action', 'Science Fiction', 'Adventure'],
 ['Science Fiction', 'Adventure', 'Action'],
 ['Action', 'Horror', 'Thriller'],
 ['Adventure', 'Family', 'Fantasy', 'Romance'],
 ['Thriller', 'Horror'],
 ['Horror', 'Thriller'],
 ['War', 'Action', 'Thriller'],
 ['Drama', 'Romance'],
 ['Action', 'Fantasy', 'Thriller'],
 ['Comedy', 'Action', 'Fantasy'],
 ['Comedy', 'Drama']]

In [46]:
un_genres_list = [genre for genres in genres_list for genre in genres]

In [47]:
len(genres_list), len(un_genres_list)

(2131, 7856)

In [48]:
un_genres_list = list(set(un_genres_list))

In [49]:
len(un_genres_list)

19

In [50]:
un_genres_list

['TV Movie',
 'Thriller',
 'Action',
 'Mystery',
 'Crime',
 'Documentary',
 'Adventure',
 'Science Fiction',
 'Drama',
 'Animation',
 'Music',
 'Fantasy',
 'Family',
 'Romance',
 'Western',
 'War',
 'History',
 'Horror',
 'Comedy']

In [51]:
genres_list[:20]

[['Action', 'Crime', 'Thriller'],
 ['Action', 'Thriller'],
 ['Action', 'Thriller', 'Crime'],
 ['Action', 'Adventure', 'Animation', 'Science Fiction'],
 ['Animation', 'Family', 'Adventure', 'Fantasy', 'Comedy'],
 ['Action', 'Adventure', 'Science Fiction'],
 ['Animation', 'Comedy', 'Family', 'Fantasy', 'Romance'],
 ['Mystery', 'Thriller', 'Science Fiction'],
 ['Science Fiction', 'Action', 'Adventure'],
 ['Action', 'Science Fiction', 'Adventure'],
 ['Science Fiction', 'Adventure', 'Action'],
 ['Action', 'Horror', 'Thriller'],
 ['Adventure', 'Family', 'Fantasy', 'Romance'],
 ['Thriller', 'Horror'],
 ['Horror', 'Thriller'],
 ['War', 'Action', 'Thriller'],
 ['Drama', 'Romance'],
 ['Action', 'Fantasy', 'Thriller'],
 ['Comedy', 'Action', 'Fantasy'],
 ['Comedy', 'Drama']]

In [52]:
genres_counter = {}
l = []
for g in un_genres_list:
    for gs in genres_list:
        if g in gs:
            l.extend(gs)
    genres_counter[g] = dict(Counter(l))
    l = []

genres_counter['Animation']

{'Action': 129,
 'Adventure': 216,
 'Animation': 407,
 'Science Fiction': 99,
 'Family': 298,
 'Fantasy': 160,
 'Comedy': 215,
 'Romance': 26,
 'Music': 38,
 'Drama': 31,
 'TV Movie': 36,
 'Crime': 14,
 'Thriller': 12,
 'Horror': 18,
 'Mystery': 40,
 'Western': 7,
 'War': 5,
 'History': 4}

In [53]:
genres_df = pd.DataFrame(columns=un_genres_list, index=un_genres_list)
genres_df.head()

Unnamed: 0,TV Movie,Thriller,Action,Mystery,Crime,Documentary,Adventure,Science Fiction,Drama,Animation,Music,Fantasy,Family,Romance,Western,War,History,Horror,Comedy
TV Movie,,,,,,,,,,,,,,,,,,,
Thriller,,,,,,,,,,,,,,,,,,,
Action,,,,,,,,,,,,,,,,,,,
Mystery,,,,,,,,,,,,,,,,,,,
Crime,,,,,,,,,,,,,,,,,,,


In [54]:
for genre in un_genres_list:
    g = {}
    for u_genre in un_genres_list:
        if u_genre in genres_counter[genre].keys():
            g[u_genre] =  genres_counter[genre][u_genre]
        else:
            g[u_genre] = 0
    genres_df.loc[genre] = g

In [55]:
genres_df.head()

Unnamed: 0,TV Movie,Thriller,Action,Mystery,Crime,Documentary,Adventure,Science Fiction,Drama,Animation,Music,Fantasy,Family,Romance,Western,War,History,Horror,Comedy
TV Movie,176,25,38,12,10,3,44,39,55,36,23,46,77,22,3,5,10,20,79
Thriller,25,615,264,196,197,1,111,154,285,12,4,73,8,72,19,38,38,159,98
Action,38,264,759,73,153,5,348,234,208,129,8,152,137,52,38,66,50,115,210
Mystery,12,196,73,342,98,3,59,72,141,40,5,52,41,36,6,6,9,84,85
Crime,10,197,153,98,348,3,43,25,166,14,9,15,15,39,13,2,19,36,116


In [56]:
cs_df = pd.DataFrame(columns=genres_df.index, index=genres_df.index)
cs_df.head()

Unnamed: 0,TV Movie,Thriller,Action,Mystery,Crime,Documentary,Adventure,Science Fiction,Drama,Animation,Music,Fantasy,Family,Romance,Western,War,History,Horror,Comedy
TV Movie,,,,,,,,,,,,,,,,,,,
Thriller,,,,,,,,,,,,,,,,,,,
Action,,,,,,,,,,,,,,,,,,,
Mystery,,,,,,,,,,,,,,,,,,,
Crime,,,,,,,,,,,,,,,,,,,


In [69]:
def get_cosine_similarity_mertric(df1, df2):
    return np.dot(df1, df2)/ \
    (np.sqrt(np.power(df1, 2).sum()) * np.sqrt(np.power(df2, 2).sum()))

In [70]:
for i in genres_df.columns:
    for j in genres_df.columns:
        cs_df.loc[i][j] = get_cosine_similarity_mertric(genres_df[i], genres_df[j])

In [71]:
cs_df

Unnamed: 0,TV Movie,Thriller,Action,Mystery,Crime,Documentary,Adventure,Science Fiction,Drama,Animation,Music,Fantasy,Family,Romance,Western,War,History,Horror,Comedy
TV Movie,1.0,0.394733,0.513236,0.39673,0.364897,0.273702,0.592527,0.53523,0.511371,0.599006,0.580721,0.61596,0.676505,0.495483,0.354297,0.326719,0.332524,0.410583,0.677208
Thriller,0.394733,1.0,0.717784,0.803086,0.803562,0.252943,0.478655,0.632004,0.747659,0.276127,0.26195,0.419476,0.280956,0.495854,0.468874,0.527351,0.504018,0.717057,0.440748
Action,0.513236,0.717784,1.0,0.554684,0.650536,0.291962,0.797106,0.781199,0.623967,0.589238,0.364484,0.625936,0.571553,0.473524,0.555539,0.573059,0.479258,0.612385,0.641312
Mystery,0.39673,0.803086,0.554684,1.0,0.702164,0.263122,0.456686,0.554591,0.662259,0.365163,0.314863,0.449878,0.36879,0.471931,0.387748,0.379926,0.384377,0.646352,0.491783
Crime,0.364897,0.803562,0.650536,0.702164,1.0,0.289151,0.422055,0.450467,0.70113,0.281047,0.310511,0.34936,0.297321,0.481107,0.460968,0.420063,0.462907,0.507797,0.503363
Documentary,0.273702,0.252943,0.291962,0.263122,0.289151,1.0,0.247916,0.216827,0.337446,0.169529,0.257017,0.192158,0.198267,0.221919,0.214639,0.240606,0.312933,0.220365,0.274301
Adventure,0.592527,0.478655,0.797106,0.456686,0.422055,0.247916,1.0,0.719617,0.537524,0.801408,0.530574,0.794288,0.831916,0.530338,0.53405,0.451217,0.386237,0.464983,0.784823
Science Fiction,0.53523,0.632004,0.781199,0.554591,0.450467,0.216827,0.719617,1.0,0.512783,0.598825,0.354121,0.613756,0.575048,0.428716,0.385528,0.382743,0.295121,0.641385,0.601664
Drama,0.511371,0.747659,0.623967,0.662259,0.70113,0.337446,0.537524,0.512783,1.0,0.351065,0.497333,0.496137,0.432878,0.725442,0.603654,0.706889,0.748685,0.516213,0.542533
Animation,0.599006,0.276127,0.589238,0.365163,0.281047,0.169529,0.801408,0.598825,0.351065,1.0,0.590689,0.775147,0.926371,0.421453,0.339857,0.239284,0.19883,0.339585,0.789651


In [72]:
cs_df.to_csv('data/genres_cosine_similarity.csv')

---

## Keywords

In [2]:
keywords = pd.read_csv('../data/popular_10000_movies/movies_dataset_CLEAN.csv', index_col='id')
keywords = keywords['keywords'].to_frame()

In [3]:
keywords.head()

Unnamed: 0_level_0,keywords
id,Unnamed: 1_level_1
385687,"[{'id': 9663, 'name': 'sequel'}, {'id': 9748, ..."
697843,"[{'id': 3070, 'name': 'mercenary'}, {'id': 966..."
603692,"[{'id': 242, 'name': 'new york city'}, {'id': ..."
569094,"[{'id': 2858, 'name': 'sacrifice'}, {'id': 328..."
502356,"[{'id': 282, 'name': 'video game'}, {'id': 690..."


In [4]:
keywords.iloc[22]

keywords    [{'id': 160130, 'name': 'invisible person'}, {...
Name: 1131438, dtype: object

In [5]:
keywords.iloc[3398] = "[{'id': 5424, 'name': 'women's prison'}]"
keywords.iloc[8550] = "[{'id': 15101, 'name': 'based on children's book'}]"
keywords.iloc[9035] = "[{'id': 210450, 'name': 'noah's ark'}]"

In [6]:
keywords.iloc[2709].to_list()[0], keywords.iloc[4485].to_list()[0]

("[{'id': 18021, 'name': 'detroit, michigan'}]",
 "[{'id': 588, 'name': 'rome, italy'}]")

In [7]:
kw = keywords.iloc[2709].to_list()[0][1:-1]
l = re.findall("'name': '[\w()\-,\. ]+'", kw)
l2 = [i.replace("'name': ", "").replace("'", '') for i in l]
l, l2

(["'name': 'detroit, michigan'"], ['detroit, michigan'])

In [9]:
full_keywords_l = []
keywords_list_of_list = []
for i in range(keywords.shape[0]):
    kw = keywords.iloc[i].to_list()[0][1:-1]
    l = re.findall("'name': '[\w()\-,\. ]+'", kw)
    l2 = [i.replace("'name': ", "").replace("'", '') for i in l]
    if not l2:
        print(i)
    full_keywords_l.extend(l2)
    keywords_list_of_list.append(l2)

In [10]:
keywords_list_of_list[:5]

[['sequel', 'revenge', 'racing', 'family', 'cars'],
 ['mercenary',
  'sequel',
  'rescue mission',
  'long take',
  'based on graphic novel'],
 ['new york city',
  'martial arts',
  'hitman',
  'sequel',
  'organized crime',
  'osaka, japan',
  'aftercreditsstinger',
  'hunted',
  'professional assassin',
  'neo-noir',
  'berlin'],
 ['sacrifice',
  'villain',
  'comic book',
  'sequel',
  'superhero',
  'based on comic',
  'alternate dimension',
  'alternate version',
  'super power',
  'brooklyn, new york city',
  'superhero team',
  'spider bite',
  'super villain',
  'cliffhanger',
  'teen superhero',
  'alternate universe',
  'female superhero',
  'cartoon spider'],
 ['video game',
  'gorilla',
  'plumber',
  'magic mushroom',
  'anthropomorphism',
  'based on video game',
  'toad',
  'aftercreditsstinger',
  'duringcreditsstinger',
  'damsel in distress',
  'piano',
  'white gloves',
  'brother brother relationship',
  'evil king']]

In [160]:
keywords_list_of_str = [', '.join(l) for l in keywords_list_of_list] # bad idea with ', '
keywords_list_of_str[0]

'sequel, revenge, racing, family, cars'

In [20]:
len(full_keywords_l), len(keywords_list_of_list)

(82292, 9144)

In [21]:
unique_keywords = list(set(full_keywords_l))

In [22]:
for i in unique_keywords:
    if len(i)<3:
        print(i)

pc
gi
ax
ya
dj
بظ
3d
bp
iq
vr
rv
k2
살인
青春
uk


In [23]:
len(unique_keywords)

14239

In [24]:
keywords_counter = {}
l = []
for k in list(set(full_keywords_l)):
    for ks in keywords_list_of_list:
        if k in ks:
            l.extend(ks)
    keywords_counter[k] = dict(Counter(l))
    l = []

keywords_counter['sequel']

{'sequel': 457,
 'revenge': 31,
 'racing': 2,
 'family': 11,
 'cars': 1,
 'mercenary': 6,
 'rescue mission': 7,
 'long take': 1,
 'based on graphic novel': 1,
 'new york city': 12,
 'martial arts': 20,
 'hitman': 5,
 'organized crime': 2,
 'osaka, japan': 1,
 'aftercreditsstinger': 37,
 'hunted': 1,
 'professional assassin': 2,
 'neo-noir': 7,
 'berlin': 1,
 'sacrifice': 2,
 'villain': 10,
 'comic book': 1,
 'superhero': 31,
 'based on comic': 30,
 'alternate dimension': 5,
 'alternate version': 1,
 'super power': 16,
 'brooklyn, new york city': 2,
 'superhero team': 7,
 'spider bite': 1,
 'super villain': 2,
 'cliffhanger': 1,
 'teen superhero': 1,
 'alternate universe': 1,
 'female superhero': 1,
 'cartoon spider': 1,
 'hero': 7,
 'mad scientist': 4,
 'space opera': 5,
 'raccoon': 1,
 'duringcreditsstinger': 44,
 'marvel cinematic universe (mcu)': 12,
 'cosmic': 1,
 'outer space': 1,
 'chosen family': 1,
 'loss of loved one': 10,
 'dying and death': 3,
 'alien life-form': 4,
 'resurr

In [25]:
[] in keywords_list_of_list

False

In [50]:
with open('data/keywords_list_of_list.pickle', 'wb') as f:
    pk.dump(keywords_list_of_list, f, pk.HIGHEST_PROTOCOL)

In [51]:
with open('data/keywords_counter.pickle', 'wb') as f:
    pk.dump(keywords_counter, f, pk.HIGHEST_PROTOCOL)

In [28]:
df = pd.DataFrame(index=keywords.index, columns=['keywords_list'], data=np.array(keywords_list_of_list, dtype=object))
df.head()

Unnamed: 0_level_0,keywords_list
id,Unnamed: 1_level_1
385687,"[sequel, revenge, racing, family, cars]"
697843,"[mercenary, sequel, rescue mission, long take,..."
603692,"[new york city, martial arts, hitman, sequel, ..."
569094,"[sacrifice, villain, comic book, sequel, super..."
502356,"[video game, gorilla, plumber, magic mushroom,..."


In [29]:
df.shape

(9144, 1)

In [140]:
df.to_csv('data/keywords_list.csv')

In [142]:
df = pd.DataFrame(index=keywords.index, columns=['keywords_str'], data=np.array(keywords_list_of_str))
df.head()

Unnamed: 0_level_0,keywords_str
id,Unnamed: 1_level_1
385687,"sequel, revenge, racing, family, cars"
697843,"mercenary, sequel, rescue mission, long take, ..."
603692,"new york city, martial arts, hitman, sequel, o..."
569094,"sacrifice, villain, comic book, sequel, superh..."
502356,"video game, gorilla, plumber, magic mushroom, ..."


In [143]:
df.to_csv('data/keywords_str.csv')

### One-Hot Labeling

In [31]:
keywords_ind = pd.read_csv('data/keywords_list.csv', index_col='id')
kw_vectors = pd.DataFrame(index=keywords_ind.index, columns=unique_keywords)
kw_vectors.shape

(9144, 14239)

In [32]:
keywords_ind.head()

Unnamed: 0_level_0,keywords_list
id,Unnamed: 1_level_1
385687,"['sequel', 'revenge', 'racing', 'family', 'cars']"
697843,"['mercenary', 'sequel', 'rescue mission', 'lon..."
603692,"['new york city', 'martial arts', 'hitman', 's..."
569094,"['sacrifice', 'villain', 'comic book', 'sequel..."
502356,"['video game', 'gorilla', 'plumber', 'magic mu..."


In [146]:
keywords_ind.isna().sum()

keywords_str    0
dtype: int64

In [30]:
'osaka, japan' in unique_keywords

True

In [41]:
keywords_ind['keywords_list'].iloc[2][2:-2].split("', '")

['new york city',
 'martial arts',
 'hitman',
 'sequel',
 'organized crime',
 'osaka, japan',
 'aftercreditsstinger',
 'hunted',
 'professional assassin',
 'neo-noir',
 'berlin']

In [42]:
len_kw_v = len(unique_keywords)

for i in range(kw_vectors.shape[0]):
    df = np.array(np.array([0]*len_kw_v))
    
    for j in keywords_ind['keywords_list'].iloc[i][2:-2].split("', '"):
        df[unique_keywords.index(j)] = 1

    kw_vectors.iloc[i] = df
        
kw_vectors.iloc[0:10, 0:10]

Unnamed: 0_level_0,sea world,expulsion,storm at sea,one-sided love,mobile home,muscular dystrophy,street art,undead,classic,rude male lead
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
385687,0,0,0,0,0,0,0,0,0,0
697843,0,0,0,0,0,0,0,0,0,0
603692,0,0,0,0,0,0,0,0,0,0
569094,0,0,0,0,0,0,0,0,0,0
502356,0,0,0,0,0,0,0,0,0,0
667538,0,0,0,0,0,0,0,0,0,0
976573,0,0,0,0,0,0,0,0,0,0
536437,0,0,0,0,0,0,0,0,0,0
298618,0,0,0,0,0,0,0,0,0,0
447365,0,0,0,0,0,0,0,0,0,0


In [47]:
kw_vectors['martial arts'].sum()

200

In [48]:
kw_vectors.to_csv('data/one-hot_keywords.csv')

### Cosine Similarity

In [39]:
kw_vectors = pd.DataFrame(index=unique_keywords, )

array(['dangerous', 'dancefilm', 'antiquary', ..., 'sinkhole', 'guilty',
       'incident'], dtype='<U20')

---

## Cast

In [9]:
cast = pd.read_csv('../data/popular_10000_movies/movies_dataset_CLEAN.csv', index_col='id')
cast.head()

Unnamed: 0_level_0,genre_ids,overview,popularity,release_date,title,vote_average,vote_count,genres,cast,crew,keywords
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
385687,"[28, 80, 53]",Over many missions and against impossible odds...,4654.279,2023-05-17,Fast X,7.3,2093,"Action, Crime, Thriller","[{'cast_id': 12835, 'name': 'Vin Diesel', 'cha...","[{'crew_id': 1302, 'name': 'Susie Figgis', 'de...","[{'id': 9663, 'name': 'sequel'}, {'id': 9748, ..."
697843,"[28, 53]",Tasked with extracting a family who is at the ...,2494.01,2023-06-09,Extraction 2,7.7,910,"Action, Thriller","[{'cast_id': 74568, 'name': 'Chris Hemsworth',...","[{'crew_id': 950, 'name': 'Pietro Scalia', 'de...","[{'id': 3070, 'name': 'mercenary'}, {'id': 966..."
603692,"[28, 53, 80]","With the price on his head ever increasing, Jo...",1920.127,2023-03-22,John Wick: Chapter 4,7.9,3344,"Action, Thriller, Crime","[{'cast_id': 6384, 'name': 'Keanu Reeves', 'ch...","[{'crew_id': 3615, 'name': 'Manfred Banach', '...","[{'id': 242, 'name': 'new york city'}, {'id': ..."
569094,"[28, 12, 16, 878]","After reuniting with Gwen Stacy, Brooklyn’s fu...",2013.795,2023-05-31,Spider-Man: Across the Spider-Verse,8.6,1796,"Action, Adventure, Animation, Science Fiction","[{'cast_id': 587506, 'name': 'Shameik Moore', ...","[{'crew_id': 7624, 'name': 'Stan Lee', 'depart...","[{'id': 2858, 'name': 'sacrifice'}, {'id': 328..."
502356,"[16, 10751, 12, 14, 35]","While working underground to fix a water main,...",1539.037,2023-04-05,The Super Mario Bros. Movie,7.8,5165,"Animation, Family, Adventure, Fantasy, Comedy","[{'cast_id': 73457, 'name': 'Chris Pratt', 'ch...","[{'crew_id': 70851, 'name': 'Jack Black', 'dep...","[{'id': 282, 'name': 'video game'}, {'id': 690..."


In [13]:
cast = cast['cast'].to_frame()
cast.head()

Unnamed: 0_level_0,cast
id,Unnamed: 1_level_1
385687,"[{'cast_id': 12835, 'name': 'Vin Diesel', 'cha..."
697843,"[{'cast_id': 74568, 'name': 'Chris Hemsworth',..."
603692,"[{'cast_id': 6384, 'name': 'Keanu Reeves', 'ch..."
569094,"[{'cast_id': 587506, 'name': 'Shameik Moore', ..."
502356,"[{'cast_id': 73457, 'name': 'Chris Pratt', 'ch..."


In [23]:
cast.drop(cast[cast['cast']=='[]'].index, inplace=True)

In [24]:
cast[cast['cast']=='[]']

Unnamed: 0_level_0,cast
id,Unnamed: 1_level_1


In [20]:
cast.iloc[978].to_list()

['[]']

In [18]:
kw = cast.iloc[0].to_list()[0][1:-1]
l = re.findall("'name': '[\w()\-,\. ]+'", kw)
l2 = [i.replace("'name': ", "").replace("'", '') for i in l]
l2

['Vin Diesel',
 'Michelle Rodriguez',
 'Tyrese Gibson',
 'Ludacris',
 'John Cena',
 'Nathalie Emmanuel',
 'Jordana Brewster',
 'Sung Kang',
 'Jason Momoa',
 'Scott Eastwood',
 'Daniela Melchior',
 'Alan Ritchson',
 'Helen Mirren',
 'Brie Larson',
 'Jason Statham',
 'Charlize Theron',
 'Rita Moreno',
 'Joaquim de Almeida',
 'Leo A. Perry',
 'Luis Da Silva Jr.',
 'Jaz Hutchins',
 'Luka Hays',
 'Alexander Capon',
 'Pete Davidson',
 'Shadrach Agozino',
 'Ludmilla',
 'Miraj Grbić',
 'Meadow Walker Thornton-Allan',
 'Michael Irby',
 'Shahir Figueira',
 'Ben-Hur Santos',
 'Debby Ryan',
 'Josh Dun',
 'Robert Bastens',
 'Dwayne Johnson',
 'Gal Gadot']

In [25]:
full_cast_l = []
cast_list_of_list = []
for i in range(cast.shape[0]):
    kw = cast.iloc[i].to_list()[0][1:-1]
    l = re.findall("'name': '[\w()\-,\. ]+'", kw)
    l2 = [i.replace("'name': ", "").replace("'", '') for i in l]
    if not l2:
        print(i)
    full_cast_l.extend(l2)
    cast_list_of_list.append(l2)

In [26]:
len(full_cast_l), len(cast_list_of_list)

(311707, 9133)

In [27]:
unique_actors = list(set(full_cast_l))
len(unique_actors)

145215

In [None]:
actors_counter = {}
l = []
for k in list(set(full_cast_l)):
    for ks in cast_list_of_list:
        if k in ks:
            l.extend(ks)
    actors_counter[k] = dict(Counter(l))
    l = []

actors_counter['Jordana Brewster']

In [32]:
with open('data/cast_list_of_list.pickle', 'wb') as f:
    pk.dump(cast_list_of_list, f, pk.HIGHEST_PROTOCOL)

In [31]:
with open('data/actors_counter.pickle', 'wb') as f:
    pk.dump(actors_counter, f, pk.HIGHEST_PROTOCOL)

In [35]:
df = pd.DataFrame(index=cast.index, columns=['cast_list'], data=np.array(cast_list_of_list, dtype=object))

In [37]:
df.to_csv('data/cast_list.csv')