In [5]:
import pandas as pd
import os
from sqlalchemy import create_engine
import sys
sys.path.append("Resources/")
import config as c

In [6]:
# Import movies CSV
movies_csv = os.path.join("Resources", "streaming_movies.csv")
movies_df = pd.read_csv(movies_csv)
movies_df.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,Directors,Genres,Country,Language,Runtime
0,0,1,Inception,2010,13+,8.8,87%,1,0,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0
1,1,2,The Matrix,1999,18+,8.7,87%,1,0,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0
2,2,3,Avengers: Infinity War,2018,13+,8.5,84%,1,0,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0
3,3,4,Back to the Future,1985,7+,8.5,96%,1,0,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0
4,4,5,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,1,0,1,0,0,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0


In [7]:
# Drop unnamed column (used Stack Overflow for this)
movies_df.drop(movies_df.columns[movies_df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
movies_df.head()

Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,Directors,Genres,Country,Language,Runtime
0,1,Inception,2010,13+,8.8,87%,1,0,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0
1,2,The Matrix,1999,18+,8.7,87%,1,0,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0
2,3,Avengers: Infinity War,2018,13+,8.5,84%,1,0,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0
3,4,Back to the Future,1985,7+,8.5,96%,1,0,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0
4,5,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,1,0,1,0,0,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0


In [8]:
# Find missing data
movies_df.count()

ID                 16744
Title              16744
Year               16744
Age                 7354
IMDb               16173
Rotten Tomatoes     5158
Netflix            16744
Hulu               16744
Prime Video        16744
Disney+            16744
Type               16744
Directors          16018
Genres             16469
Country            16309
Language           16145
Runtime            16152
dtype: int64

In [9]:
# Drop rows missing data
movies_df.dropna(axis=0, how="any", inplace=True)
movies_df.count()

ID                 3301
Title              3301
Year               3301
Age                3301
IMDb               3301
Rotten Tomatoes    3301
Netflix            3301
Hulu               3301
Prime Video        3301
Disney+            3301
Type               3301
Directors          3301
Genres             3301
Country            3301
Language           3301
Runtime            3301
dtype: int64

In [10]:
# Find column types
movies_df.dtypes

ID                   int64
Title               object
Year                 int64
Age                 object
IMDb               float64
Rotten Tomatoes     object
Netflix              int64
Hulu                 int64
Prime Video          int64
Disney+              int64
Type                 int64
Directors           object
Genres              object
Country             object
Language            object
Runtime            float64
dtype: object

In [11]:
# Combine title and year which we will then use to create a unique ID for each film 
# Used code from "https://cmdlinetips.com/2018/11/how-to-join-two-text-columns-into-a-single-column-in-pandas/" to help with this
movies_df["Movie_Title"] = movies_df["Year"].astype(str).str.cat(movies_df["Title"],sep="-")
movies_df

Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,Directors,Genres,Country,Language,Runtime,Movie_Title
0,1,Inception,2010,13+,8.8,87%,1,0,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0,2010-Inception
1,2,The Matrix,1999,18+,8.7,87%,1,0,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0,1999-The Matrix
2,3,Avengers: Infinity War,2018,13+,8.5,84%,1,0,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0,2018-Avengers: Infinity War
3,4,Back to the Future,1985,7+,8.5,96%,1,0,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0,1985-Back to the Future
4,5,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,1,0,1,0,0,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0,"1966-The Good, the Bad and the Ugly"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16657,16658,A Kid in King Arthur's Court,1995,7+,4.7,5%,0,0,0,1,0,Michael Gottlieb,"Adventure,Comedy,Family,Fantasy,Romance","United States,Hungary,United Kingdom",English,89.0,1995-A Kid in King Arthur's Court
16671,16672,George of the Jungle 2,2003,7+,3.3,17%,0,0,0,1,0,David Grossman,"Adventure,Comedy,Family","United States,Australia",English,87.0,2003-George of the Jungle 2
16677,16678,That Darn Cat,1997,7+,4.7,13%,0,0,0,1,0,Robert Stevenson,"Comedy,Crime,Family,Thriller",United States,"English,French",116.0,1997-That Darn Cat
16687,16688,Kazaam,1996,7+,3.0,6%,0,0,0,1,0,Paul Michael Glaser,"Comedy,Family,Fantasy,Musical",United States,English,93.0,1996-Kazaam


In [12]:
# Use hash func to create unique ID, then add to dataframe
unique_movie_id = (movies_df['Movie_Title'].apply(hash))
movies_df.insert(0,"Unique_ID", unique_movie_id)
del movies_df["ID"]
del movies_df["Type"]
movies_df

Unnamed: 0,Unique_ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Directors,Genres,Country,Language,Runtime,Movie_Title
0,1303346622062471725,Inception,2010,13+,8.8,87%,1,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0,2010-Inception
1,-2336806198951893306,The Matrix,1999,18+,8.7,87%,1,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0,1999-The Matrix
2,2643726182000215063,Avengers: Infinity War,2018,13+,8.5,84%,1,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0,2018-Avengers: Infinity War
3,-3469068411298189364,Back to the Future,1985,7+,8.5,96%,1,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0,1985-Back to the Future
4,7008026840038389736,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,1,0,1,0,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0,"1966-The Good, the Bad and the Ugly"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16657,8502255239208301237,A Kid in King Arthur's Court,1995,7+,4.7,5%,0,0,0,1,Michael Gottlieb,"Adventure,Comedy,Family,Fantasy,Romance","United States,Hungary,United Kingdom",English,89.0,1995-A Kid in King Arthur's Court
16671,8846620306501842815,George of the Jungle 2,2003,7+,3.3,17%,0,0,0,1,David Grossman,"Adventure,Comedy,Family","United States,Australia",English,87.0,2003-George of the Jungle 2
16677,-7579263832803095153,That Darn Cat,1997,7+,4.7,13%,0,0,0,1,Robert Stevenson,"Comedy,Crime,Family,Thriller",United States,"English,French",116.0,1997-That Darn Cat
16687,-9055113055461608869,Kazaam,1996,7+,3.0,6%,0,0,0,1,Paul Michael Glaser,"Comedy,Family,Fantasy,Musical",United States,English,93.0,1996-Kazaam


In [13]:
movies_df.dtypes

Unique_ID            int64
Title               object
Year                 int64
Age                 object
IMDb               float64
Rotten Tomatoes     object
Netflix              int64
Hulu                 int64
Prime Video          int64
Disney+              int64
Directors           object
Genres              object
Country             object
Language            object
Runtime            float64
Movie_Title         object
dtype: object

In [14]:
movies_df = movies_df.rename(columns={"Unique_ID":"unique_id", "Title":"title", "Year":"year", "Age":"age", "IMDb":"imdb", "Rotten Tomatoes":"rotten_tomatoes", 
                                      "Netflix":"netflix", "Hulu":"hulu", "Prime Video":"prime_video", "Disney+":"disney_plus", "Directors":"directors", "Genres":"genres", 
                                      "Country":"country", "Language":"language", "Runtime":"runtime", "Movie_Title":"movie_title"})
movies_df

Unnamed: 0,unique_id,title,year,age,imdb,rotten_tomatoes,netflix,hulu,prime_video,disney_plus,directors,genres,country,language,runtime,movie_title
0,1303346622062471725,Inception,2010,13+,8.8,87%,1,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0,2010-Inception
1,-2336806198951893306,The Matrix,1999,18+,8.7,87%,1,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0,1999-The Matrix
2,2643726182000215063,Avengers: Infinity War,2018,13+,8.5,84%,1,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0,2018-Avengers: Infinity War
3,-3469068411298189364,Back to the Future,1985,7+,8.5,96%,1,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0,1985-Back to the Future
4,7008026840038389736,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,1,0,1,0,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0,"1966-The Good, the Bad and the Ugly"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16657,8502255239208301237,A Kid in King Arthur's Court,1995,7+,4.7,5%,0,0,0,1,Michael Gottlieb,"Adventure,Comedy,Family,Fantasy,Romance","United States,Hungary,United Kingdom",English,89.0,1995-A Kid in King Arthur's Court
16671,8846620306501842815,George of the Jungle 2,2003,7+,3.3,17%,0,0,0,1,David Grossman,"Adventure,Comedy,Family","United States,Australia",English,87.0,2003-George of the Jungle 2
16677,-7579263832803095153,That Darn Cat,1997,7+,4.7,13%,0,0,0,1,Robert Stevenson,"Comedy,Crime,Family,Thriller",United States,"English,French",116.0,1997-That Darn Cat
16687,-9055113055461608869,Kazaam,1996,7+,3.0,6%,0,0,0,1,Paul Michael Glaser,"Comedy,Family,Fantasy,Musical",United States,English,93.0,1996-Kazaam


In [15]:
movies_df.set_index("unique_id", inplace=True)
movies_df

Unnamed: 0_level_0,title,year,age,imdb,rotten_tomatoes,netflix,hulu,prime_video,disney_plus,directors,genres,country,language,runtime,movie_title
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1303346622062471725,Inception,2010,13+,8.8,87%,1,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0,2010-Inception
-2336806198951893306,The Matrix,1999,18+,8.7,87%,1,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0,1999-The Matrix
2643726182000215063,Avengers: Infinity War,2018,13+,8.5,84%,1,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0,2018-Avengers: Infinity War
-3469068411298189364,Back to the Future,1985,7+,8.5,96%,1,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0,1985-Back to the Future
7008026840038389736,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,1,0,1,0,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0,"1966-The Good, the Bad and the Ugly"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8502255239208301237,A Kid in King Arthur's Court,1995,7+,4.7,5%,0,0,0,1,Michael Gottlieb,"Adventure,Comedy,Family,Fantasy,Romance","United States,Hungary,United Kingdom",English,89.0,1995-A Kid in King Arthur's Court
8846620306501842815,George of the Jungle 2,2003,7+,3.3,17%,0,0,0,1,David Grossman,"Adventure,Comedy,Family","United States,Australia",English,87.0,2003-George of the Jungle 2
-7579263832803095153,That Darn Cat,1997,7+,4.7,13%,0,0,0,1,Robert Stevenson,"Comedy,Crime,Family,Thriller",United States,"English,French",116.0,1997-That Darn Cat
-9055113055461608869,Kazaam,1996,7+,3.0,6%,0,0,0,1,Paul Michael Glaser,"Comedy,Family,Fantasy,Musical",United States,English,93.0,1996-Kazaam


In [16]:
# Import series CSV
series_csv = os.path.join("Resources", "streaming_series.csv")
series_df = pd.read_csv(series_csv)
series_df.head()

Unnamed: 0.1,Unnamed: 0,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,type
0,0,Breaking Bad,2008,18+,9.5,96%,1,0,0,0,1
1,1,Stranger Things,2016,16+,8.8,93%,1,0,0,0,1
2,2,Money Heist,2017,18+,8.4,91%,1,0,0,0,1
3,3,Sherlock,2010,16+,9.1,78%,1,0,0,0,1
4,4,Better Call Saul,2015,18+,8.7,97%,1,0,0,0,1


In [17]:
# Drop unnamed column (used Stack Overflow for this)
series_df.drop(series_df.columns[series_df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
series_df.head()

Unnamed: 0,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,type
0,Breaking Bad,2008,18+,9.5,96%,1,0,0,0,1
1,Stranger Things,2016,16+,8.8,93%,1,0,0,0,1
2,Money Heist,2017,18+,8.4,91%,1,0,0,0,1
3,Sherlock,2010,16+,9.1,78%,1,0,0,0,1
4,Better Call Saul,2015,18+,8.7,97%,1,0,0,0,1


In [18]:
# Find missing data
series_df.count()

Title              5611
Year               5611
Age                3165
IMDb               4450
Rotten Tomatoes    1011
Netflix            5611
Hulu               5611
Prime Video        5611
Disney+            5611
type               5611
dtype: int64

In [19]:
# Drop rows missing data
series_df.dropna(axis=0, how="any", inplace=True)
series_df.count()

Title              931
Year               931
Age                931
IMDb               931
Rotten Tomatoes    931
Netflix            931
Hulu               931
Prime Video        931
Disney+            931
type               931
dtype: int64

In [20]:
# Find column types
series_df.dtypes

Title               object
Year                 int64
Age                 object
IMDb               float64
Rotten Tomatoes     object
Netflix              int64
Hulu                 int64
Prime Video          int64
Disney+              int64
type                 int64
dtype: object

In [21]:
# Combine title and year which we will then use to create a unique ID for each film 
# Used code from "https://cmdlinetips.com/2018/11/how-to-join-two-text-columns-into-a-single-column-in-pandas/" to help with this
series_df["Movie_Title"] = series_df["Year"].astype(str).str.cat(series_df["Title"],sep="-")
series_df

Unnamed: 0,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,type,Movie_Title
0,Breaking Bad,2008,18+,9.5,96%,1,0,0,0,1,2008-Breaking Bad
1,Stranger Things,2016,16+,8.8,93%,1,0,0,0,1,2016-Stranger Things
2,Money Heist,2017,18+,8.4,91%,1,0,0,0,1,2017-Money Heist
3,Sherlock,2010,16+,9.1,78%,1,0,0,0,1,2010-Sherlock
4,Better Call Saul,2015,18+,8.7,97%,1,0,0,0,1,2015-Better Call Saul
...,...,...,...,...,...,...,...,...,...,...,...
5509,Diary of a Future President,2020,7+,5.5,100%,0,0,0,1,1,2020-Diary of a Future President
5517,Encore!,2019,7+,7.4,68%,0,0,0,1,1,2019-Encore!
5522,Spider-Man Unlimited,1999,7+,6.5,50%,0,0,0,1,1,1999-Spider-Man Unlimited
5530,The Super Hero Squad Show,2009,7+,6.1,50%,0,0,0,1,1,2009-The Super Hero Squad Show


In [22]:
# Use hash func to create unique ID, then add to dataframe
unique_id = (series_df['Movie_Title'].apply(hash))
series_df.insert(0,"Unique_ID", unique_id)
del series_df["type"]
series_df

Unnamed: 0,Unique_ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Movie_Title
0,-3965765654625591364,Breaking Bad,2008,18+,9.5,96%,1,0,0,0,2008-Breaking Bad
1,-6340398744249624664,Stranger Things,2016,16+,8.8,93%,1,0,0,0,2016-Stranger Things
2,7843417095753012185,Money Heist,2017,18+,8.4,91%,1,0,0,0,2017-Money Heist
3,912085409955697397,Sherlock,2010,16+,9.1,78%,1,0,0,0,2010-Sherlock
4,8319423134046153458,Better Call Saul,2015,18+,8.7,97%,1,0,0,0,2015-Better Call Saul
...,...,...,...,...,...,...,...,...,...,...,...
5509,-3199749247778268383,Diary of a Future President,2020,7+,5.5,100%,0,0,0,1,2020-Diary of a Future President
5517,6109180671582603841,Encore!,2019,7+,7.4,68%,0,0,0,1,2019-Encore!
5522,1145300978588487705,Spider-Man Unlimited,1999,7+,6.5,50%,0,0,0,1,1999-Spider-Man Unlimited
5530,3010507830707463366,The Super Hero Squad Show,2009,7+,6.1,50%,0,0,0,1,2009-The Super Hero Squad Show


In [23]:
# Rename columns
series_df = series_df.rename(columns={"Unique_ID":"unique_id", "Title":"title", "Year":"year", "Age":"age", "IMDb":"imdb", "Rotten Tomatoes":"rotten_tomatoes", 
                                      "Netflix":"netflix", "Hulu":"hulu", "Prime Video":"prime_video", "Disney+":"disney_plus", "Movie_Title":"movie_title"})
series_df

Unnamed: 0,unique_id,title,year,age,imdb,rotten_tomatoes,netflix,hulu,prime_video,disney_plus,movie_title
0,-3965765654625591364,Breaking Bad,2008,18+,9.5,96%,1,0,0,0,2008-Breaking Bad
1,-6340398744249624664,Stranger Things,2016,16+,8.8,93%,1,0,0,0,2016-Stranger Things
2,7843417095753012185,Money Heist,2017,18+,8.4,91%,1,0,0,0,2017-Money Heist
3,912085409955697397,Sherlock,2010,16+,9.1,78%,1,0,0,0,2010-Sherlock
4,8319423134046153458,Better Call Saul,2015,18+,8.7,97%,1,0,0,0,2015-Better Call Saul
...,...,...,...,...,...,...,...,...,...,...,...
5509,-3199749247778268383,Diary of a Future President,2020,7+,5.5,100%,0,0,0,1,2020-Diary of a Future President
5517,6109180671582603841,Encore!,2019,7+,7.4,68%,0,0,0,1,2019-Encore!
5522,1145300978588487705,Spider-Man Unlimited,1999,7+,6.5,50%,0,0,0,1,1999-Spider-Man Unlimited
5530,3010507830707463366,The Super Hero Squad Show,2009,7+,6.1,50%,0,0,0,1,2009-The Super Hero Squad Show


In [24]:
series_df.set_index("unique_id", inplace=True)
series_df

Unnamed: 0_level_0,title,year,age,imdb,rotten_tomatoes,netflix,hulu,prime_video,disney_plus,movie_title
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
-3965765654625591364,Breaking Bad,2008,18+,9.5,96%,1,0,0,0,2008-Breaking Bad
-6340398744249624664,Stranger Things,2016,16+,8.8,93%,1,0,0,0,2016-Stranger Things
7843417095753012185,Money Heist,2017,18+,8.4,91%,1,0,0,0,2017-Money Heist
912085409955697397,Sherlock,2010,16+,9.1,78%,1,0,0,0,2010-Sherlock
8319423134046153458,Better Call Saul,2015,18+,8.7,97%,1,0,0,0,2015-Better Call Saul
...,...,...,...,...,...,...,...,...,...,...
-3199749247778268383,Diary of a Future President,2020,7+,5.5,100%,0,0,0,1,2020-Diary of a Future President
6109180671582603841,Encore!,2019,7+,7.4,68%,0,0,0,1,2019-Encore!
1145300978588487705,Spider-Man Unlimited,1999,7+,6.5,50%,0,0,0,1,1999-Spider-Man Unlimited
3010507830707463366,The Super Hero Squad Show,2009,7+,6.1,50%,0,0,0,1,2009-The Super Hero Squad Show


In [25]:
# Create database connection
streaming_connection = f"postgresql://{c.username}:{c.password}@localhost:5432/streaming_db"
engine = create_engine(streaming_connection)

In [26]:
engine.table_names()

['movies', 'series']

In [None]:
# Load dataframes into database
movies_df.to_sql(name="movies", con=engine, if_exists="append", index=True)
series_df.to_sql(name="series", con=engine, if_exists="append", index=True)