# Data Prepare

In [1]:
from collections import Counter
from itertools import combinations

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
import seaborn as sns

from jupyterthemes import jtplot
jtplot.style(figsize=(15, 9))

## Genres

### One-Hot Encoding

In [22]:
genres = pd.read_csv('../data/popular_10000_movies/movies_dataset_CLEAN.csv', index_col='id')

genres = genres['genres'].to_frame()
genres.dropna(inplace=True)

genres.head()

Unnamed: 0_level_0,genres
id,Unnamed: 1_level_1
385687,"Action, Crime, Thriller"
697843,"Action, Thriller"
603692,"Action, Thriller, Crime"
569094,"Action, Adventure, Animation, Science Fiction"
502356,"Animation, Family, Adventure, Fantasy, Comedy"


In [23]:
genres.shape

(9144, 1)

In [24]:
genres.isna().sum()

genres    0
dtype: int64

In [25]:
genres['genres'].nunique()

2131

In [26]:
unique_genres = [
    'Action',
    'Adventure',
    'Animation',
    'Comedy',
    'Crime',
    'Documentary',
    'Drama',
    'Family',
    'Fantasy',
    'History',
    'Horror',
    'Music',
    'Mystery',
    'Romance',
    'Science Fiction',
    'TV Movie',
    'Thriller',
    'War',
    'Western'
]

In [27]:
genres_vectors = pd.DataFrame(index=genres.index, columns=unique_genres)
genres_vectors.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
385687,,,,,,,,,,,,,,,,,,,
697843,,,,,,,,,,,,,,,,,,,
603692,,,,,,,,,,,,,,,,,,,
569094,,,,,,,,,,,,,,,,,,,
502356,,,,,,,,,,,,,,,,,,,


In [28]:
for i in range(genres.shape[0]):
    df = np.full(len(unique_genres), 0)
    
    for j in str(genres['genres'].iloc[i]).split(', '):
        df[unique_genres.index(j)] = 1

    genres_vectors.iloc[i] = df
        
genres_vectors.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
385687,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
697843,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
603692,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
569094,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
502356,0,1,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0


### Labels for Genres

In [29]:
# if we have that genre in list it will be dominant, from most to least
labels = [
    'Documentary',
    'Music',
    'Animation',
    'Family',
    'Science Fiction',
    'Horror',
    'History',
    'Thriller',
]

In [30]:
labels_df = pd.DataFrame(index=genres.index, columns=['label'])
labels_df.head()

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
385687,
697843,
603692,
569094,
502356,


In [31]:
for i in range(genres.shape[0]):
    genres_l = str(genres['genres'].iloc[i]).split(', ')
    labels_df.iloc[i] = genres_l[0]
    for j in labels:
        if j in genres_l:
            labels_df.iloc[i] = j
            break
        
labels_df.head()

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
385687,Thriller
697843,Thriller
603692,Thriller
569094,Animation
502356,Animation


In [32]:
le = LabelEncoder()
le.fit(labels_df['label'])

labels_df['label_encoder'] = le.transform(labels_df['label'])
labels_df.head()

Unnamed: 0_level_0,label,label_encoder
id,Unnamed: 1_level_1,Unnamed: 2_level_1
385687,Thriller,16
697843,Thriller,16
603692,Thriller,16
569094,Animation,2
502356,Animation,2


In [33]:
labels_df.shape

(9144, 2)

### Add Labels to Genres

In [34]:
df = pd.merge(genres_vectors, labels_df, on='id', how='inner')
df.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,...,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,label,label_encoder
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
385687,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,Thriller,16
697843,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,Thriller,16
603692,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,Thriller,16
569094,1,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,Animation,2
502356,0,1,1,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,Animation,2


In [35]:
df.shape

(9144, 21)

In [37]:
df.to_csv('data/one-hot_genres_w_labels.csv')

### Cosine Similarity

In [43]:
genres_list = list(genres['genres'].unique())

In [44]:
for i, g in enumerate(genres_list):
    genres_list[i] = g.split(', ')

In [45]:
genres_list[:20]

[['Action', 'Crime', 'Thriller'],
 ['Action', 'Thriller'],
 ['Action', 'Thriller', 'Crime'],
 ['Action', 'Adventure', 'Animation', 'Science Fiction'],
 ['Animation', 'Family', 'Adventure', 'Fantasy', 'Comedy'],
 ['Action', 'Adventure', 'Science Fiction'],
 ['Animation', 'Comedy', 'Family', 'Fantasy', 'Romance'],
 ['Mystery', 'Thriller', 'Science Fiction'],
 ['Science Fiction', 'Action', 'Adventure'],
 ['Action', 'Science Fiction', 'Adventure'],
 ['Science Fiction', 'Adventure', 'Action'],
 ['Action', 'Horror', 'Thriller'],
 ['Adventure', 'Family', 'Fantasy', 'Romance'],
 ['Thriller', 'Horror'],
 ['Horror', 'Thriller'],
 ['War', 'Action', 'Thriller'],
 ['Drama', 'Romance'],
 ['Action', 'Fantasy', 'Thriller'],
 ['Comedy', 'Action', 'Fantasy'],
 ['Comedy', 'Drama']]

In [46]:
un_genres_list = [genre for genres in genres_list for genre in genres]

In [47]:
len(genres_list), len(un_genres_list)

(2131, 7856)

In [48]:
un_genres_list = list(set(un_genres_list))

In [49]:
len(un_genres_list)

19

In [50]:
un_genres_list

['TV Movie',
 'Thriller',
 'Action',
 'Mystery',
 'Crime',
 'Documentary',
 'Adventure',
 'Science Fiction',
 'Drama',
 'Animation',
 'Music',
 'Fantasy',
 'Family',
 'Romance',
 'Western',
 'War',
 'History',
 'Horror',
 'Comedy']

In [51]:
genres_list[:20]

[['Action', 'Crime', 'Thriller'],
 ['Action', 'Thriller'],
 ['Action', 'Thriller', 'Crime'],
 ['Action', 'Adventure', 'Animation', 'Science Fiction'],
 ['Animation', 'Family', 'Adventure', 'Fantasy', 'Comedy'],
 ['Action', 'Adventure', 'Science Fiction'],
 ['Animation', 'Comedy', 'Family', 'Fantasy', 'Romance'],
 ['Mystery', 'Thriller', 'Science Fiction'],
 ['Science Fiction', 'Action', 'Adventure'],
 ['Action', 'Science Fiction', 'Adventure'],
 ['Science Fiction', 'Adventure', 'Action'],
 ['Action', 'Horror', 'Thriller'],
 ['Adventure', 'Family', 'Fantasy', 'Romance'],
 ['Thriller', 'Horror'],
 ['Horror', 'Thriller'],
 ['War', 'Action', 'Thriller'],
 ['Drama', 'Romance'],
 ['Action', 'Fantasy', 'Thriller'],
 ['Comedy', 'Action', 'Fantasy'],
 ['Comedy', 'Drama']]

In [52]:
genres_counter = {}
l = []
for g in un_genres_list:
    for gs in genres_list:
        if g in gs:
            l.extend(gs)
    genres_counter[g] = dict(Counter(l))
    l = []

genres_counter['Animation']

{'Action': 129,
 'Adventure': 216,
 'Animation': 407,
 'Science Fiction': 99,
 'Family': 298,
 'Fantasy': 160,
 'Comedy': 215,
 'Romance': 26,
 'Music': 38,
 'Drama': 31,
 'TV Movie': 36,
 'Crime': 14,
 'Thriller': 12,
 'Horror': 18,
 'Mystery': 40,
 'Western': 7,
 'War': 5,
 'History': 4}

In [53]:
genres_df = pd.DataFrame(columns=un_genres_list, index=un_genres_list)
genres_df.head()

Unnamed: 0,TV Movie,Thriller,Action,Mystery,Crime,Documentary,Adventure,Science Fiction,Drama,Animation,Music,Fantasy,Family,Romance,Western,War,History,Horror,Comedy
TV Movie,,,,,,,,,,,,,,,,,,,
Thriller,,,,,,,,,,,,,,,,,,,
Action,,,,,,,,,,,,,,,,,,,
Mystery,,,,,,,,,,,,,,,,,,,
Crime,,,,,,,,,,,,,,,,,,,


In [54]:
for genre in un_genres_list:
    g = {}
    for u_genre in un_genres_list:
        if u_genre in genres_counter[genre].keys():
            g[u_genre] =  genres_counter[genre][u_genre]
        else:
            g[u_genre] = 0
    genres_df.loc[genre] = g

In [55]:
genres_df.head()

Unnamed: 0,TV Movie,Thriller,Action,Mystery,Crime,Documentary,Adventure,Science Fiction,Drama,Animation,Music,Fantasy,Family,Romance,Western,War,History,Horror,Comedy
TV Movie,176,25,38,12,10,3,44,39,55,36,23,46,77,22,3,5,10,20,79
Thriller,25,615,264,196,197,1,111,154,285,12,4,73,8,72,19,38,38,159,98
Action,38,264,759,73,153,5,348,234,208,129,8,152,137,52,38,66,50,115,210
Mystery,12,196,73,342,98,3,59,72,141,40,5,52,41,36,6,6,9,84,85
Crime,10,197,153,98,348,3,43,25,166,14,9,15,15,39,13,2,19,36,116


In [56]:
cs_df = pd.DataFrame(columns=genres_df.index, index=genres_df.index)
cs_df.head()

Unnamed: 0,TV Movie,Thriller,Action,Mystery,Crime,Documentary,Adventure,Science Fiction,Drama,Animation,Music,Fantasy,Family,Romance,Western,War,History,Horror,Comedy
TV Movie,,,,,,,,,,,,,,,,,,,
Thriller,,,,,,,,,,,,,,,,,,,
Action,,,,,,,,,,,,,,,,,,,
Mystery,,,,,,,,,,,,,,,,,,,
Crime,,,,,,,,,,,,,,,,,,,


In [69]:
def get_cosine_similarity_mertric(df1, df2):
    return np.dot(df1, df2)/ \
    (np.sqrt(np.power(df1, 2).sum()) * np.sqrt(np.power(df2, 2).sum()))

In [70]:
for i in genres_df.columns:
    for j in genres_df.columns:
        cs_df.loc[i][j] = get_cosine_similarity_mertric(genres_df[i], genres_df[j])

In [71]:
cs_df

Unnamed: 0,TV Movie,Thriller,Action,Mystery,Crime,Documentary,Adventure,Science Fiction,Drama,Animation,Music,Fantasy,Family,Romance,Western,War,History,Horror,Comedy
TV Movie,1.0,0.394733,0.513236,0.39673,0.364897,0.273702,0.592527,0.53523,0.511371,0.599006,0.580721,0.61596,0.676505,0.495483,0.354297,0.326719,0.332524,0.410583,0.677208
Thriller,0.394733,1.0,0.717784,0.803086,0.803562,0.252943,0.478655,0.632004,0.747659,0.276127,0.26195,0.419476,0.280956,0.495854,0.468874,0.527351,0.504018,0.717057,0.440748
Action,0.513236,0.717784,1.0,0.554684,0.650536,0.291962,0.797106,0.781199,0.623967,0.589238,0.364484,0.625936,0.571553,0.473524,0.555539,0.573059,0.479258,0.612385,0.641312
Mystery,0.39673,0.803086,0.554684,1.0,0.702164,0.263122,0.456686,0.554591,0.662259,0.365163,0.314863,0.449878,0.36879,0.471931,0.387748,0.379926,0.384377,0.646352,0.491783
Crime,0.364897,0.803562,0.650536,0.702164,1.0,0.289151,0.422055,0.450467,0.70113,0.281047,0.310511,0.34936,0.297321,0.481107,0.460968,0.420063,0.462907,0.507797,0.503363
Documentary,0.273702,0.252943,0.291962,0.263122,0.289151,1.0,0.247916,0.216827,0.337446,0.169529,0.257017,0.192158,0.198267,0.221919,0.214639,0.240606,0.312933,0.220365,0.274301
Adventure,0.592527,0.478655,0.797106,0.456686,0.422055,0.247916,1.0,0.719617,0.537524,0.801408,0.530574,0.794288,0.831916,0.530338,0.53405,0.451217,0.386237,0.464983,0.784823
Science Fiction,0.53523,0.632004,0.781199,0.554591,0.450467,0.216827,0.719617,1.0,0.512783,0.598825,0.354121,0.613756,0.575048,0.428716,0.385528,0.382743,0.295121,0.641385,0.601664
Drama,0.511371,0.747659,0.623967,0.662259,0.70113,0.337446,0.537524,0.512783,1.0,0.351065,0.497333,0.496137,0.432878,0.725442,0.603654,0.706889,0.748685,0.516213,0.542533
Animation,0.599006,0.276127,0.589238,0.365163,0.281047,0.169529,0.801408,0.598825,0.351065,1.0,0.590689,0.775147,0.926371,0.421453,0.339857,0.239284,0.19883,0.339585,0.789651


In [72]:
cs_df.to_csv('data/genres_cosine_similarity.csv')

---

## Keywords

In [77]:
keywords = pd.read_csv('../data/popular_10000_movies/movies_dataset_CLEAN.csv', index_col='id')
keywords = keywords['keywords'].to_frame()

In [78]:
keywords.head()

Unnamed: 0_level_0,keywords
id,Unnamed: 1_level_1
385687,"[{'id': 9663, 'name': 'sequel'}, {'id': 9748, ..."
697843,"[{'id': 3070, 'name': 'mercenary'}, {'id': 966..."
603692,"[{'id': 242, 'name': 'new york city'}, {'id': ..."
569094,"[{'id': 2858, 'name': 'sacrifice'}, {'id': 328..."
502356,"[{'id': 282, 'name': 'video game'}, {'id': 690..."


In [102]:
keywords.iloc[0].to_list()[0][1:-1]

"{'id': 9663, 'name': 'sequel'}, {'id': 9748, 'name': 'revenge'}, {'id': 10039, 'name': 'racing'}, {'id': 18035, 'name': 'family'}, {'id': 286354, 'name': 'cars'}"