# Data Engineering

In [7]:

import pandas as pd
import requests
import os
import numpy as np
import tqdm
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from ast import literal_eval

## Data Paths

In [4]:
# Project root is the parent of the directory the notebook is run from.
cwd = os.path.abspath(os.getcwd())
root = os.path.dirname(cwd)

## Load Dataset

In [5]:
# Raw dump is read from <root>/data/raw_data.csv; its first column becomes the index.
# NOTE(review): lineterminator='\n' presumably keeps stray carriage returns inside
# text fields from being treated as row breaks — confirm against the raw file.
raw_csv_path = os.path.join(root, "data", "raw_data.csv")
data = pd.read_csv(raw_csv_path, index_col=0, lineterminator='\n')

In [6]:
# Preview the raw frame exactly as loaded from the CSV.
data

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,genres
0,False,/a2tys4sD7xzVaogPntGsT1ypVoT.jpg,"[53, 35, 80]",804150,en,Cocaine Bear,"Inspired by a true story, an oddball group of ...",3474.498,gOnmaxHo0412UVr1QM5Nekv1xPi.jpg,2023-02-22,Cocaine Bear,False,6.5,506,"['Thriller', 'Comedy', 'Crime']"
1,False,/i8dshLvq4LE3s0v8PrkDdUyb1ae.jpg,"[28, 53, 80]",603692,en,John Wick: Chapter 4,"With the price on his head ever increasing, Jo...",3361.154,vZloFAK7NmvMGKE7VkF5UHaz0I.jpg,2023-03-22,John Wick: Chapter 4,False,8.2,384,"['Action', 'Thriller', 'Crime']"
2,False,/wD2kUCX1Bb6oeIb2uz7kbdfLP6k.jpg,"[27, 53]",980078,en,Winnie the Pooh: Blood and Honey,Christopher Robin is headed off to college and...,3800.257,zCdzPK6fJgQL0FeKeYkciZzjyOL.jpg,2023-01-27,Winnie the Pooh: Blood and Honey,False,5.9,205,"['Horror', 'Thriller']"
3,False,/ovM06PdF3M8wvKb06i4sjW3xoww.jpg,"[878, 12, 28]",76600,en,Avatar: The Way of Water,Set more than a decade after the events of the...,3464.178,t6HIqrRAclMCA60NsSmeqe9RmNV.jpg,2022-12-14,Avatar: The Way of Water,False,7.7,5982,"['Science Fiction', 'Adventure', 'Action']"
4,False,/ouB7hwclG7QI3INoYJHaZL4vOaa.jpg,"[16, 12, 35, 10751]",315162,en,Puss in Boots: The Last Wish,Puss in Boots discovers that his passion for a...,1862.289,kuf6dutpsT0vSVehic3EZIqkOBt.jpg,2022-12-07,Puss in Boots: The Last Wish,False,8.3,4854,"['Animation', 'Adventure', 'Comedy', 'Family']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9740,False,/5aLzi9TOpeItTIoXMcwUuAGs0wK.jpg,"[18, 28, 36, 10752]",10839,en,Cross of Iron,"It is 1943, and the German army—ravaged and de...",13.327,fR6pqh5aFn0fawsn7sR2zC15swx.jpg,1977-01-29,Cross of Iron,False,7.0,315,"['Drama', 'Action', 'History', 'War']"
9741,False,/22t90H0tXDvem199yjIkZnkX6KN.jpg,"[10751, 35, 18]",425194,de,Die kleine Hexe,The little witch is only 127 years old – too y...,12.836,vTRg6yy75VQ0yoszgLQgp3MTHtE.jpg,2018-02-01,The Little Witch,False,6.9,110,"['Family', 'Comedy', 'Drama']"
9742,False,/4zzuWYwejsrCTnqN3nM0XTf9B9w.jpg,"[10749, 35]",11472,en,Nine Months,When he finds out his longtime girlfriend is p...,11.577,z1nzYuoeTbGBrSicDxJEhJhEQz8.jpg,1995-07-12,Nine Months,False,5.7,576,"['Romance', 'Comedy']"
9743,False,/lu2WPWI9zzM0W2pivgXKIj4btRs.jpg,"[35, 12, 878]",964831,fr,Grand Paris,"Leslie, a young and idle suburbanite, drags hi...",19.386,zIqs0UnlILWV7ODxwewN1XogDPg.jpg,2023-03-29,Grand Paris,False,8.0,1,"['Comedy', 'Adventure', 'Science Fiction']"


## Generate One-Hot Encodings

In [11]:
# convert genre strings to list
# The CSV stores each genre list as its string repr (e.g. "['Thriller', 'Comedy']").
# Only string values are parsed, so re-running this cell once the column already
# holds lists is a no-op instead of a ValueError raised by literal_eval.
data["genres"] = data["genres"].apply(
    lambda genres: literal_eval(genres) if isinstance(genres, str) else genres
)

In [12]:
# Each movie carries a list of genres, so use a multi-label binarizer
# (one 0/1 column per genre) rather than a single-label encoder.
one_hot = MultiLabelBinarizer()

In [13]:
# Fit the binarizer on the genre lists and show how many rows were encoded.
len(one_hot.fit_transform(data['genres']))

9745

In [14]:
# Genre labels learned by the fitted binarizer; these become the indicator column names.
print(one_hot.classes_)

['Action' 'Adventure' 'Animation' 'Comedy' 'Crime' 'Documentary' 'Drama'
 'Family' 'Fantasy' 'History' 'Horror' 'Music' 'Mystery' 'Romance'
 'Science Fiction' 'TV Movie' 'Thriller' 'War' 'Western']


In [15]:
# Encode every movie's genre list as a row of 0/1 flags, then label the
# columns with the genre names the binarizer learned during fitting.
genre_matrix = one_hot.fit_transform(data['genres'])
genres_df = pd.DataFrame(genre_matrix, columns=one_hot.classes_)

## Check genre conversion

In [16]:
# Spot check: genre list of the first movie, to compare with its encoded row below.
data.iloc[0]["genres"]

['Thriller', 'Comedy', 'Crime']

In [17]:
# Spot check: indicator row for the first movie; the 1s should match its genre list.
genres_df.head(1)

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0


## Join Genres DF with Raw Data

In [18]:
# genres_df was built without an index, so it is labelled 0..n-1, while data still
# carries the index read from the CSV. DataFrame.join aligns on index labels, so
# reset both sides first: rows are then paired strictly by position and cannot be
# mismatched (or filled with NaN) if the CSV index is not a clean 0..n-1 range.
data = data.reset_index(drop=True).join(genres_df.reset_index(drop=True))


In [19]:
# Preview the frame after the genre indicator columns have been appended.
data

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,False,/a2tys4sD7xzVaogPntGsT1ypVoT.jpg,"[53, 35, 80]",804150,en,Cocaine Bear,"Inspired by a true story, an oddball group of ...",3474.498,gOnmaxHo0412UVr1QM5Nekv1xPi.jpg,2023-02-22,...,0,0,0,0,0,0,0,1,0,0
1,False,/i8dshLvq4LE3s0v8PrkDdUyb1ae.jpg,"[28, 53, 80]",603692,en,John Wick: Chapter 4,"With the price on his head ever increasing, Jo...",3361.154,vZloFAK7NmvMGKE7VkF5UHaz0I.jpg,2023-03-22,...,0,0,0,0,0,0,0,1,0,0
2,False,/wD2kUCX1Bb6oeIb2uz7kbdfLP6k.jpg,"[27, 53]",980078,en,Winnie the Pooh: Blood and Honey,Christopher Robin is headed off to college and...,3800.257,zCdzPK6fJgQL0FeKeYkciZzjyOL.jpg,2023-01-27,...,0,1,0,0,0,0,0,1,0,0
3,False,/ovM06PdF3M8wvKb06i4sjW3xoww.jpg,"[878, 12, 28]",76600,en,Avatar: The Way of Water,Set more than a decade after the events of the...,3464.178,t6HIqrRAclMCA60NsSmeqe9RmNV.jpg,2022-12-14,...,0,0,0,0,0,1,0,0,0,0
4,False,/ouB7hwclG7QI3INoYJHaZL4vOaa.jpg,"[16, 12, 35, 10751]",315162,en,Puss in Boots: The Last Wish,Puss in Boots discovers that his passion for a...,1862.289,kuf6dutpsT0vSVehic3EZIqkOBt.jpg,2022-12-07,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9740,False,/5aLzi9TOpeItTIoXMcwUuAGs0wK.jpg,"[18, 28, 36, 10752]",10839,en,Cross of Iron,"It is 1943, and the German army—ravaged and de...",13.327,fR6pqh5aFn0fawsn7sR2zC15swx.jpg,1977-01-29,...,1,0,0,0,0,0,0,0,1,0
9741,False,/22t90H0tXDvem199yjIkZnkX6KN.jpg,"[10751, 35, 18]",425194,de,Die kleine Hexe,The little witch is only 127 years old – too y...,12.836,vTRg6yy75VQ0yoszgLQgp3MTHtE.jpg,2018-02-01,...,0,0,0,0,0,0,0,0,0,0
9742,False,/4zzuWYwejsrCTnqN3nM0XTf9B9w.jpg,"[10749, 35]",11472,en,Nine Months,When he finds out his longtime girlfriend is p...,11.577,z1nzYuoeTbGBrSicDxJEhJhEQz8.jpg,1995-07-12,...,0,0,0,0,1,0,0,0,0,0
9743,False,/lu2WPWI9zzM0W2pivgXKIj4btRs.jpg,"[35, 12, 878]",964831,fr,Grand Paris,"Leslie, a young and idle suburbanite, drags hi...",19.386,zIqs0UnlILWV7ODxwewN1XogDPg.jpg,2023-03-29,...,0,0,0,0,0,1,0,0,0,0


## Drop unnecessary columns

In [22]:
# List every column currently in the frame to decide which ones to keep.
data.columns.tolist()

['adult',
 'backdrop_path',
 'genre_ids',
 'id',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'poster_path',
 'release_date',
 'title',
 'video',
 'vote_average',
 'vote_count',
 'genres',
 'Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western']

In [23]:
# Descriptive fields kept from the raw movie records.
metadata_columns = [
    'id', 'title', 'original_language', 'overview',
    'release_date', 'adult', 'poster_path',
]

# One 0/1 indicator column per genre, in the order produced by the binarizer.
genre_columns = [
    'Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Family', 'Fantasy', 'History',
    'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction',
    'TV Movie', 'Thriller', 'War', 'Western',
]

# Final column selection: metadata first, genre indicators after.
columns_to_keep = metadata_columns + genre_columns

## Save prepped data

In [26]:
# Persist the selected columns, with a fresh 0..n-1 index, to <root>/data/data.csv.
prepped_path = os.path.join(root, "data", "data.csv")
prepped = data[columns_to_keep].reset_index(drop=True)
prepped.to_csv(prepped_path)

## Create train, validation, and test splits

In [27]:
# Narrow the working frame to the kept columns and renumber rows from 0.
data = data.loc[:, columns_to_keep].reset_index(drop=True)

In [28]:
# Shuffle once with a fixed seed so the same rows land in the same split on every
# Restart & Run All, then cut 80% / 10% / 10% by position. Plain iloc slicing is
# used instead of np.split, which routes DataFrames through the deprecated
# DataFrame.swapaxes.
SPLIT_SEED = 42
shuffled = data.sample(frac=1, random_state=SPLIT_SEED)

train_end = int(.8 * len(shuffled))      # end of the training slice
validate_end = int(.9 * len(shuffled))   # end of the validation slice

train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

In [29]:
# Preview the training split; the index still shows each row's pre-shuffle position.
train

Unnamed: 0,id,title,original_language,overview,release_date,adult,poster_path,Action,Adventure,Animation,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
3021,804435,Vanquish,en,Victoria is a young mother trying to put her d...,2021-04-16,False,AoWY1gkcNzabh229Icboa1Ff0BM.jpg,1,0,0,...,0,0,0,0,0,0,0,1,0,0
8802,347183,Haikyuu!! The Movie: The End and the Beginning,ja,"Shouyou Hinata, after losing to Kitagawa Daiic...",2015-07-03,False,kWopdC6UbjcDGxCYwS2uIsoYl3b.jpg,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2785,59108,Tower Heist,en,A luxury condo manager leads a staff of worker...,2011-11-02,False,1HcOyMODYZXnncMikfhYkw3x2ss.jpg,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5224,2179,Tenacious D in The Pick of Destiny,en,"In Venice Beach, naive Midwesterner JB bonds w...",2006-11-22,False,7iixTGu21JEWP8QclwJLqMt2RBH.jpg,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7988,113,"Spring, Summer, Fall, Winter... and Spring",ko,"An isolated lake, where an old monk lives in a...",2003-09-19,False,6SQQ5REuAz7k0FMQ9mSCT40T2LN.jpg,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9106,11515,Goya's Ghosts,en,Painter Francisco Goya becomes involved with t...,2006-11-08,False,tyZhS8kA07oU5fO3ruamJkgAC0M.jpg,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5651,560044,The Willoughbys,en,When the four Willoughby children are abandone...,2020-04-22,False,9WrMmjdZvpxLQh1tCQ9tOd1asOb.jpg,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1437,537996,The Ballad of Buster Scruggs,en,Vignettes weaving together the stories of six ...,2018-11-09,False,voxl654m7p36y8FLu8oQD7dfwwK.jpg,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2313,30497,The Texas Chain Saw Massacre,en,When Sally hears that her grandfather's grave ...,1974-10-01,False,9s8uSm5K1W0vhGPHv2icM6SFib8.jpg,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [30]:
# Preview the test split; the index still shows each row's pre-shuffle position.
test

Unnamed: 0,id,title,original_language,overview,release_date,adult,poster_path,Action,Adventure,Animation,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
7805,529485,The Way Back,en,"A former basketball all-star, who has lost his...",2020-03-05,False,ylPnfaphW3FrLBUVwAREVtiL9My.jpg,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5850,633951,Confessions of a Hitman,fr,A paid assassin working for the biker gangs of...,2022-07-20,False,sFGkKtWq9rvb7AriX39369FRssP.jpg,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8039,10098,The Kid,en,A tramp cares for a boy after he's abandoned a...,1921-01-21,False,drgMcyTsySQBnUPGaBThCHGdlWT.jpg,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7836,472553,Zoo,en,Young Tom and his misfit friends fight to save...,2018-06-08,False,tUlO39EcZzPlKS4JOhlUansgKFJ.jpg,0,0,0,...,1,0,0,0,0,0,0,0,1,0
7338,64807,Grudge Match,en,A pair of aging boxing rivals are coaxed out o...,2013-12-25,False,jTq9jYTKKo6EhQPso4xTMpPYYXW.jpg,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3363,137,Groundhog Day,en,"A narcissistic TV weatherman, along with his a...",1993-02-11,False,katMtDlCbCIi7KheS2ENUjLOben.jpg,0,0,0,...,0,0,0,0,1,0,0,0,0,0
9307,16456,The Crow: Wicked Prayer,en,When ex-convict Jimmy and his girlfriend are b...,2005-05-13,False,l6TfZVVzvrlGjxDlZK3CoHwaccc.jpg,1,0,0,...,0,1,0,0,0,0,0,1,0,0
9190,2210,Earth Girls Are Easy,en,"In this musical comedy, Valerie is dealing wit...",1988-09-08,False,pO9Edq8mgp9kD3N6EkCGYLnIVaA.jpg,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4695,957461,Chile '76,es,"Chile, 1976. Carmen heads off to her beach hou...",2022-10-20,False,e8cO013usqWgawu520n1OdmX3jp.jpg,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Save data splits

In [31]:
# Write the training split to <root>/data/train/train.csv with a fresh 0..n-1 index.
# to_csv does not create missing folders, so make sure the target directory exists.
train_dir = os.path.join(root, "data", "train")
os.makedirs(train_dir, exist_ok=True)
train.reset_index(drop=True).to_csv(os.path.join(train_dir, "train.csv"))

In [32]:
# Write the test split to <root>/data/test/test.csv with a fresh 0..n-1 index.
# to_csv does not create missing folders, so make sure the target directory exists.
test_dir = os.path.join(root, "data", "test")
os.makedirs(test_dir, exist_ok=True)
test.reset_index(drop=True).to_csv(os.path.join(test_dir, "test.csv"))

In [33]:
# Write the validation split to <root>/data/valid/valid.csv with a fresh 0..n-1 index.
# to_csv does not create missing folders, so make sure the target directory exists.
valid_dir = os.path.join(root, "data", "valid")
os.makedirs(valid_dir, exist_ok=True)
validate.reset_index(drop=True).to_csv(os.path.join(valid_dir, "valid.csv"))