# Data Engineering

In [1]:

import pandas as pd
import requests
import os
import numpy as np
import tqdm
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from ast import literal_eval

## Data Related

In [2]:
# The project root is taken to be the parent of the current working directory.
working_dir = os.path.abspath(os.getcwd())
root = os.path.dirname(working_dir)

## Load Dataset

In [3]:
# Read the raw CSV from <root>/data; its first column is used as the index.
raw_data_path = os.path.join(root, "data", "raw_data.csv")
data = pd.read_csv(raw_data_path, index_col=0, lineterminator='\n')

In [4]:
# Preview the raw frame as loaded.
data

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,genres,genres_single
0,False,/9n2tJBplPbgR2ca05hS5CKXwP2c.jpg,"[16, 12, 10751, 14, 35]",502356,en,The Super Mario Bros. Movie,"While working underground to fix a water main,...",10058.912,qNBAXBIQlnOThrVvA6mA2B5ggV6.jpg,2023-04-05,The Super Mario Bros. Movie,False,7.5,410,"['Animation', 'Adventure', 'Family', 'Fantasy'...",['Animation']
1,False,/5i6SjyDbDWqyun8klUuCxrlFbyw.jpg,"[18, 28]",677179,en,Creed III,"After dominating the boxing world, Adonis Cree...",7413.386,vJU3rXSP9hwUuLeq8IpfsJShLOk.jpg,2023-03-01,Creed III,False,7.3,870,"['Drama', 'Action']",['Drama']
2,False,/ovM06PdF3M8wvKb06i4sjW3xoww.jpg,"[878, 12, 28]",76600,en,Avatar: The Way of Water,Set more than a decade after the events of the...,6686.292,t6HIqrRAclMCA60NsSmeqe9RmNV.jpg,2022-12-14,Avatar: The Way of Water,False,7.7,6873,"['Science Fiction', 'Adventure', 'Action']",['Science Fiction']
3,False,/vSUls0b7dNhC7tJoExF1MBYWWyh.jpg,"[16, 35, 10751, 12, 14]",816904,es,Momias,"Through a series of unfortunate events, three ...",3562.319,qVdrYN8qu7xUtsdEFeGiIVIaYd.jpg,2023-01-05,Mummies,False,7.1,125,"['Animation', 'Comedy', 'Family', 'Adventure',...",['Animation']
4,False,/bT3IpP7OopgiVuy6HCPOWLuaFAd.jpg,"[35, 9648, 28]",638974,en,Murder Mystery 2,"After starting their own detective agency, Nic...",4873.950,swzMoIVn6xjB857ziYJ8KBV440g.jpg,2023-03-28,Murder Mystery 2,False,6.4,564,"['Comedy', 'Mystery', 'Action']",['Comedy']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9734,False,/3WKrovKHfJTJzYOqpYtCftr9iDX.jpg,[35],121342,it,Giovannona Coscialunga disonorata con onore,When a judge shuts down a high profile cheese ...,14.619,vkF8VLrazGtk9OjdEhihG6kKAhP.jpg,1973-04-12,Giovannona Long-Thigh,False,4.9,31,['Comedy'],['Comedy']
9735,False,/vzjgieFzT8ySkMdxdu3kIf4caZS.jpg,[53],11983,en,Proof of Life,Alice hires a professional negotiator to obtai...,13.690,yw8x2i3vaHZZzpvqvF75E8q2N6M.jpg,2000-12-08,Proof of Life,False,6.1,682,['Thriller'],['Thriller']
9736,False,/tdgce7OwwUwF7atRiar1y1AS6eh.jpg,"[18, 28, 35, 53, 80]",26715,en,Bitch Slap,"Three bad girls (a down-and-out stripper, a dr...",15.441,bFOmE3zCFU01TuomOOwClAWdvOD.jpg,2009-05-16,Bitch Slap,False,4.7,252,"['Drama', 'Action', 'Comedy', 'Thriller', 'Cri...",['Drama']
9737,False,/6ZNqWfUXdxiNVqwgO2doLlkmgQc.jpg,"[28, 80, 18, 53]",139567,en,Fire with Fire,A fireman takes an unexpected course of action...,15.442,kziBJGQFo9f0Vkj9s37qI0G9I0I.jpg,2012-08-31,Fire with Fire,False,5.7,600,"['Action', 'Crime', 'Drama', 'Thriller']",['Action']


## Generate One-Hot Encodings

In [5]:
# Convert the stringified genre lists into real Python lists.
# Values that are not strings are passed through untouched, so re-running this
# cell on already-parsed lists does not raise.
def _parse_genre_list(value):
    """Return `value` parsed with `literal_eval` if it is a string, else `value` unchanged."""
    return literal_eval(value) if isinstance(value, str) else value


for genre_column in ("genres", "genres_single"):
    data[genre_column] = data[genre_column].apply(_parse_genre_list)

In [6]:
# Multi-label binarizer: each `genres_single` entry is a list of genre names.
one_hot = MultiLabelBinarizer()

In [7]:
# Fit the binarizer on the single-genre lists and report the number of encoded rows.
one_hot.fit_transform(data['genres_single']).shape[0]

9739

In [8]:
# Show the genre labels learned by the binarizer; these become the one-hot column names.
print(one_hot.classes_)

['Action' 'Adventure' 'Animation' 'Comedy' 'Crime' 'Documentary' 'Drama'
 'Family' 'Fantasy' 'History' 'Horror' 'Music' 'Mystery' 'Romance'
 'Science Fiction' 'TV Movie' 'Thriller' 'War' 'Western']


In [9]:
# Encode every row of `data` as one 0/1 column per genre label.
genre_matrix = one_hot.fit_transform(data['genres_single'])
genres_df = pd.DataFrame(genre_matrix, columns=one_hot.classes_)

## Check genre conversion

In [10]:
# Spot-check: genre list of the first row, to compare against its one-hot encoding.
data["genres_single"].iloc[0]

['Animation']

In [11]:
# First row of the one-hot frame, for comparison with the genre list above.
genres_df.iloc[:1]

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Join Genres DF with Raw Data

In [12]:
# Attach the one-hot genre columns to the movie rows.
# `genres_df` was built row-by-row from `data`, so the two frames are aligned by
# position (both indexes are reset) rather than by index label; a label-based
# join would silently mis-assign genres if `data`'s index were not 0..n-1.
# Any genre columns left over from a previous run of this cell are dropped first,
# which keeps the cell re-runnable.
data = pd.concat(
    [
        data.drop(columns=genres_df.columns, errors="ignore").reset_index(drop=True),
        genres_df.reset_index(drop=True),
    ],
    axis=1,
)


In [13]:
# Inspect the frame with the one-hot genre columns attached.
data

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,False,/9n2tJBplPbgR2ca05hS5CKXwP2c.jpg,"[16, 12, 10751, 14, 35]",502356,en,The Super Mario Bros. Movie,"While working underground to fix a water main,...",10058.912,qNBAXBIQlnOThrVvA6mA2B5ggV6.jpg,2023-04-05,...,0,0,0,0,0,0,0,0,0,0
1,False,/5i6SjyDbDWqyun8klUuCxrlFbyw.jpg,"[18, 28]",677179,en,Creed III,"After dominating the boxing world, Adonis Cree...",7413.386,vJU3rXSP9hwUuLeq8IpfsJShLOk.jpg,2023-03-01,...,0,0,0,0,0,0,0,0,0,0
2,False,/ovM06PdF3M8wvKb06i4sjW3xoww.jpg,"[878, 12, 28]",76600,en,Avatar: The Way of Water,Set more than a decade after the events of the...,6686.292,t6HIqrRAclMCA60NsSmeqe9RmNV.jpg,2022-12-14,...,0,0,0,0,0,1,0,0,0,0
3,False,/vSUls0b7dNhC7tJoExF1MBYWWyh.jpg,"[16, 35, 10751, 12, 14]",816904,es,Momias,"Through a series of unfortunate events, three ...",3562.319,qVdrYN8qu7xUtsdEFeGiIVIaYd.jpg,2023-01-05,...,0,0,0,0,0,0,0,0,0,0
4,False,/bT3IpP7OopgiVuy6HCPOWLuaFAd.jpg,"[35, 9648, 28]",638974,en,Murder Mystery 2,"After starting their own detective agency, Nic...",4873.950,swzMoIVn6xjB857ziYJ8KBV440g.jpg,2023-03-28,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9734,False,/3WKrovKHfJTJzYOqpYtCftr9iDX.jpg,[35],121342,it,Giovannona Coscialunga disonorata con onore,When a judge shuts down a high profile cheese ...,14.619,vkF8VLrazGtk9OjdEhihG6kKAhP.jpg,1973-04-12,...,0,0,0,0,0,0,0,0,0,0
9735,False,/vzjgieFzT8ySkMdxdu3kIf4caZS.jpg,[53],11983,en,Proof of Life,Alice hires a professional negotiator to obtai...,13.690,yw8x2i3vaHZZzpvqvF75E8q2N6M.jpg,2000-12-08,...,0,0,0,0,0,0,0,1,0,0
9736,False,/tdgce7OwwUwF7atRiar1y1AS6eh.jpg,"[18, 28, 35, 53, 80]",26715,en,Bitch Slap,"Three bad girls (a down-and-out stripper, a dr...",15.441,bFOmE3zCFU01TuomOOwClAWdvOD.jpg,2009-05-16,...,0,0,0,0,0,0,0,0,0,0
9737,False,/6ZNqWfUXdxiNVqwgO2doLlkmgQc.jpg,"[28, 80, 18, 53]",139567,en,Fire with Fire,A fireman takes an unexpected course of action...,15.442,kziBJGQFo9f0Vkj9s37qI0G9I0I.jpg,2012-08-31,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Column dtypes of `data`.
# NOTE(review): this cell carries execution count 28 and its stored output lists
# only the trimmed column set, so it was last run after the column-selection cell
# further down; on a fresh top-to-bottom run it reports every column present at
# this point instead.
data.dtypes

id                    int64
title                object
original_language    object
overview             object
release_date         object
adult                  bool
genres_single        object
poster_path          object
Action                int64
Adventure             int64
Animation             int64
Comedy                int64
Crime                 int64
Documentary           int64
Drama                 int64
Family                int64
Fantasy               int64
History               int64
Horror                int64
Music                 int64
Mystery               int64
Romance               int64
Science Fiction       int64
TV Movie              int64
Thriller              int64
War                   int64
Western               int64
dtype: object

## Drop unnecessary columns

In [14]:
# List every column currently in the frame, to decide which ones to keep.
list(data.columns)

['adult',
 'backdrop_path',
 'genre_ids',
 'id',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'poster_path',
 'release_date',
 'title',
 'video',
 'vote_average',
 'vote_count',
 'genres',
 'genres_single',
 'Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western']

In [15]:
# Descriptive movie fields retained in the prepared dataset.
metadata_columns = [
    'id',
    'title',
    'original_language',
    'overview',
    'release_date',
    'adult',
    'genres_single',
    'poster_path',
]

# One-hot genre indicator columns, in the same order as the binarizer's labels.
genre_columns = [
    'Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Family', 'Fantasy', 'History',
    'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction',
    'TV Movie', 'Thriller', 'War', 'Western',
]

columns_to_keep = metadata_columns + genre_columns

## Save prepped data

In [16]:
# Persist the trimmed frame (selected columns, fresh 0..n-1 index) as CSV.
single_genre_path = os.path.join(root, "data", "single_genre_data.csv")
prepped = data[columns_to_keep].reset_index(drop=True)
prepped.to_csv(single_genre_path)

## Create train, validation, and test splits

In [17]:
# Keep only the selected columns for splitting, with a fresh 0..n-1 index.
data = data.loc[:, columns_to_keep].reset_index(drop=True)

In [18]:
# Shuffle once with a fixed seed so the 80/10/10 split is reproducible across
# kernel restarts, then cut the shuffled frame by position.
# Plain `iloc` slices replace `np.split`, which recent pandas versions warn about
# when it is handed a DataFrame.
SPLIT_SEED = 42
shuffled = data.sample(frac=1, random_state=SPLIT_SEED)

train_end = int(.8 * len(data))   # first 80% of rows -> train
valid_end = int(.9 * len(data))   # next 10% -> validate, remainder -> test

train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [19]:
# Inspect the training split (the first 80% of the shuffled rows).
train

Unnamed: 0,id,title,original_language,overview,release_date,adult,genres_single,poster_path,Action,Adventure,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
3504,72808,The Flesh,it,A beautiful film which is basically about a ma...,1991-08-21,False,[Drama],5s9UarEgQWXHtZbm8fakXQnveRt.jpg,0,0,...,0,0,0,0,0,0,0,0,0,0
4706,688,The Bridges of Madison County,en,Photographer Robert Kincaid wanders into the l...,1995-05-28,False,[Drama],8TfLAfIh5Qxp2J4ZjOafHYhWtDb.jpg,0,0,...,0,0,0,0,0,0,0,0,0,0
4334,10957,The Black Cauldron,en,Taran is an assistant pigkeeper with boyish dr...,1985-07-24,False,[Animation],act8vtlXVEizdsUf9FcKbzSERew.jpg,0,0,...,0,0,0,0,0,0,0,0,0,0
9690,4787,Cassandra's Dream,en,The tale of two brothers with serious financia...,2007-06-18,False,[Crime],hydGWxW9VvRXMwpwFacl7XVtoeR.jpg,0,0,...,0,0,0,0,0,0,0,0,0,0
6720,521,Dial M for Murder,en,An ex-tennis pro carries out a plot to have hi...,1954-05-29,False,[Crime],2gDCAgl2iBQNuJuk6p2xtuS1ewg.jpg,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3839,8963,Crimson Tide,en,"After the Cold War, a breakaway Russian republ...",1995-05-12,False,[Thriller],21nqRJ6ofEgVvEl68J4O9V26Xzy.jpg,0,0,...,0,0,0,0,0,0,0,1,0,0
6007,1087617,Pride: A Seven Deadly Sins Story,en,Bakery owner and reality TV star Birdie Moore’...,2023-04-08,False,[Thriller],9e6uYx78fpkVysqOHzzCwLGW9SF.jpg,0,0,...,0,0,0,0,0,0,0,1,0,0
7360,12767,Cold Creek Manor,en,A family moves from New York into an old mansi...,2003-09-19,False,[Mystery],2AKitfwCoCmHU4PTtC5oF6LImlG.jpg,0,0,...,0,0,0,1,0,0,0,0,0,0
1058,8960,Hancock,en,Hancock is a down-and-out superhero who's forc...,2008-07-01,False,[Fantasy],7DyuV2G0hLEqHeueDfOqhZ2DVut.jpg,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Inspect the test split (the last 10% of the shuffled rows).
test

Unnamed: 0,id,title,original_language,overview,release_date,adult,genres_single,poster_path,Action,Adventure,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
9304,10253,Dragon Wars: D-War,ko,"Based on the Korean legend, unknown creatures ...",2007-08-01,False,[Fantasy],rEpvqWGu08coDLUpcpiz4sMoDYo.jpg,0,0,...,0,0,0,0,0,0,0,0,0,0
5588,13455,Push,en,"After his father, an assassin, is brutally mur...",2009-02-04,False,[Science Fiction],fHStfRjQGstuqtUz4Q22lbFU58M.jpg,0,0,...,0,0,0,0,0,1,0,0,0,0
9619,245168,Suffragette,en,Based on true events about the foot soldiers o...,2015-10-16,False,[Drama],vETQNfswR1R7SOIpGuFDv66xfb7.jpg,0,0,...,0,0,0,0,0,0,0,0,0,0
1844,239526,Buddha 2: The Endless Journey,ja,"2500 years ago, in India, Siddhartha was born ...",2014-02-08,False,[Animation],gG6c8TA6VsH5iDqAey8YNcwhQPt.jpg,0,0,...,0,0,0,0,0,0,0,0,0,0
4346,524071,Wai's Romance,cn,"Once a wealthy gentleman, Wai was flocked by m...",1994-01-01,False,[Comedy],duDOyfdgEOyd3e2LEzpG7dxA7WU.jpg,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9730,15267,The Beast of War,en,During the war in Afghanistan a Soviet tank cr...,1988-09-14,False,[War],LRVQAs14GFbr2vtZsaZht7dEqV.jpg,0,0,...,0,0,0,0,0,0,0,0,1,0
2053,49849,Cowboys & Aliens,en,A stranger stumbles into the desert town of Ab...,2011-07-29,False,[Action],9uZsGCP4rvOHVGCpMpYq5gNCuNI.jpg,1,0,...,0,0,0,0,0,0,0,0,0,0
2719,591120,The Pirates: The Last Royal Treasure,ko,A gutsy crew of Joseon pirates and bandits bat...,2022-01-26,False,[Action],d5oMUBCuDntOJBOertQ7aTSAGmP.jpg,1,0,...,0,0,0,0,0,0,0,0,0,0
6722,1125,Dreamgirls,en,A trio of female soul singers cross over to th...,2006-12-25,False,[Drama],sG5JyOj8Spe13QkNJMH8b5kzQUh.jpg,0,0,...,0,0,0,0,0,0,0,0,0,0


## Save data splits

In [21]:
# Write the training split to <root>/data/train/train.csv. The directory is
# created first because `to_csv` does not create missing parent folders.
train_dir = os.path.join(root, "data", "train")
os.makedirs(train_dir, exist_ok=True)
train.reset_index(drop=True).to_csv(os.path.join(train_dir, "train.csv"))

In [22]:
# Write the test split to <root>/data/test/test.csv. The directory is created
# first because `to_csv` does not create missing parent folders.
test_dir = os.path.join(root, "data", "test")
os.makedirs(test_dir, exist_ok=True)
test.reset_index(drop=True).to_csv(os.path.join(test_dir, "test.csv"))

In [23]:
# Write the validation split to <root>/data/valid/valid.csv. The directory is
# created first because `to_csv` does not create missing parent folders.
valid_dir = os.path.join(root, "data", "valid")
os.makedirs(valid_dir, exist_ok=True)
validate.reset_index(drop=True).to_csv(os.path.join(valid_dir, "valid.csv"))