## Extract

In [39]:
#Import dependencies
import os
import pandas as pd
import time
import psycopg2
from sql_queries import *
from sqlalchemy import create_engine

In [40]:
# Start the extract-stage timer, then build the three input CSV paths
# (all of them live in the "Data Branch" folder).
ExtractStart = time.time()
titlesFile, moviesFile, tvshowsFile = (
    os.path.join("Data Branch", csv_name)
    for csv_name in (
        "netflix_titles.csv",
        "movies_all_streaming.csv",
        "tv_shows_all_streaming.csv",
    )
)

In [41]:
# Load each source CSV into its own DataFrame (titles, movies, TV shows).
titles_df, movies_df, tvshows_df = (
    pd.read_csv(csv_path) for csv_path in (titlesFile, moviesFile, tvshowsFile)
)

# Close the extract timer; the same timestamp marks the start of the transform stage.
ExtractEnd = TransformStart = time.time()
ExtractTime = ExtractEnd - ExtractStart

## Transform

In [42]:
# Keep only the columns needed downstream, rename release_year -> year so it
# lines up with the merge key used later, and parse date_added into datetimes.
titlesClean = (
    titles_df.loc[:, ["type", "title", "country", "date_added", "release_year", "rating"]]
    .rename(columns={"release_year": "year"})
    .assign(date_added=lambda frame: pd.to_datetime(frame["date_added"], format="mixed"))
)

print(titlesClean.head())

      type  title        country date_added  year rating
0  TV Show     3%         Brazil 2020-08-14  2020  TV-MA
1    Movie   7:19         Mexico 2016-12-23  2016  TV-MA
2    Movie  23:59      Singapore 2018-12-20  2011      R
3    Movie      9  United States 2017-11-16  2009  PG-13
4    Movie     21  United States 2020-01-01  2008  PG-13


Keep only the rows flagged as Netflix (`Netflix == 1`).

In [43]:
# Select the movie columns of interest.
moviesClean = movies_df.loc[
    :, ["Title", "Year", "Age", "IMDb", "Rotten Tomatoes", "Netflix", "Country"]
]
# Row count before the Netflix filter.
mC_all = len(moviesClean)
# Keep only the rows whose Netflix flag equals 1.
moviesClean = moviesClean[moviesClean["Netflix"].eq(1)]
# Row count after the Netflix filter.
mC_Netflix = len(moviesClean)
# The flag is constant after filtering, so drop it; then normalise the headers:
# lower-case with spaces replaced by underscores
# (Title -> title, IMDb -> imdb, Rotten Tomatoes -> rotten_tomatoes, ...).
moviesClean = moviesClean.drop(columns="Netflix").rename(
    columns=lambda header: header.lower().replace(" ", "_")
)
print(moviesClean.head())

                            title  year  age  imdb rotten_tomatoes  \
0                       Inception  2010  13+   8.8             87%   
1                      The Matrix  1999  18+   8.7             87%   
2          Avengers: Infinity War  2018  13+   8.5             84%   
3              Back to the Future  1985   7+   8.5             96%   
4  The Good, the Bad and the Ugly  1966  18+   8.8             97%   

                        country  
0  United States,United Kingdom  
1                 United States  
2                 United States  
3                 United States  
4      Italy,Spain,West Germany  


In [44]:
# Inner-join the Netflix titles with the Netflix movie ratings on (title, year).
#
# Only the country from titlesClean is kept. The movies-side country column is
# removed BEFORE merging so that dropna() cannot discard a row merely because
# that soon-to-be-dropped column is null (previously it was merged in as
# country_y, took part in dropna(), and was only dropped afterwards).
movieMerge = (
    pd.merge(
        titlesClean,
        moviesClean.drop(columns="country"),
        on=["title", "year"],
        how="inner",
    )
    .dropna()                 # remove rows with a missing value in any kept column
    .reset_index(drop=True)   # renumber rows 0..n-1 after the drops
)

print(movieMerge.head())

    type       title                         country date_added  year rating  \
0  Movie           9                   United States 2017-11-16  2009  PG-13   
1  Movie          21                   United States 2020-01-01  2008  PG-13   
2  Movie        1922                   United States 2017-10-20  2017  TV-MA   
3  Movie      22-Jul  Norway, Iceland, United States 2018-10-10  2018      R   
4  Movie  13 Cameras                   United States 2016-08-13  2015     NR   

   age  imdb rotten_tomatoes  
0  13+   7.1             57%  
1  13+   6.8             36%  
2  18+   6.3             90%  
3  18+   6.8             80%  
4  18+   5.1             77%  


In [45]:
# Select the TV-show columns of interest.
tvshowsClean = tvshows_df.loc[
    :, ["Title", "Year", "Age", "IMDb", "Rotten Tomatoes", "Netflix"]
]

# Row count before the Netflix filter.
tvC_all = len(tvshowsClean)

# Keep only the rows whose Netflix flag equals 1, and record how many remain.
tvshowsClean = tvshowsClean[tvshowsClean["Netflix"].eq(1)]
tvC_Netflix = len(tvshowsClean)

# Drop the now-constant flag and normalise the headers: lower-case with spaces
# replaced by underscores (Title -> title, Rotten Tomatoes -> rotten_tomatoes, ...).
tvshowsClean = tvshowsClean.drop(columns="Netflix").rename(
    columns=lambda header: header.lower().replace(" ", "_")
)

print(tvshowsClean.head())

              title  year  age  imdb rotten_tomatoes
0      Breaking Bad  2008  18+   9.5             96%
1   Stranger Things  2016  16+   8.8             93%
2       Money Heist  2017  18+   8.4             91%
3          Sherlock  2010  16+   9.1             78%
4  Better Call Saul  2015  18+   8.7             97%


In [46]:
# Inner-join the Netflix titles with the Netflix TV-show ratings on (title, year).
tvshowsMerge = titlesClean.merge(tvshowsClean, how="inner", on=["title", "year"])
tvM_before = len(tvshowsMerge)

# Drop rows with any missing value, renumber the index, and record how many rows were lost.
tvshowsMerge = tvshowsMerge.dropna().reset_index(drop=True)
tvM_after = len(tvshowsMerge)
tvLoss = tvM_before - tvM_after

# Close the transform timer; the same timestamp marks the start of the load stage.
TransformEnd = LoadStart = time.time()
TransformTime = TransformEnd - TransformStart

## Load

Create the `netflixdb` database in PostgreSQL before running the cells below; they connect to it but do not create it.

In [53]:
# Open a psycopg2 connection to the local netflixdb database as user postgres.
#
# The password is deliberately NOT embedded in the DSN: libpq falls back to the
# PGPASSWORD environment variable or the ~/.pgpass password file, which keeps
# the secret out of the notebook and its version history.
conn = None  # stays None when the connection attempt fails
try:
    conn = psycopg2.connect("host=127.0.0.1 dbname=netflixdb user=postgres")
    print("Connected to the database!")
except psycopg2.Error as e:
    # Report the failure without raising.
    print("Unable to connect to the database:", e)


Connected to the database!


In [55]:
# SQLAlchemy engine for the local netflixdb database (user postgres).
# The URL carries no password on purpose: the driver's libpq layer picks it up
# from the PGPASSWORD environment variable or the ~/.pgpass file, so no
# credential is stored in the notebook.
engine = create_engine('postgresql://postgres@127.0.0.1/netflixdb')

In [57]:
# Append both merged frames to the shared 'final' table, movies first.
# if_exists='append' adds rows to an existing table, so running this cell again
# inserts the same rows a second time.
for frame_name, frame in (("movieMerge", movieMerge), ("tvshowsMerge", tvshowsMerge)):
    frame.to_sql('final', engine, if_exists='append', index=False)
    print(f"{frame_name} DataFrame written to PostgreSQL table 'final' successfully!")

movieMerge DataFrame written to PostgreSQL table 'final' successfully!
tvshowsMerge DataFrame written to PostgreSQL table 'final' successfully!


In [58]:
# Read the whole 'final' table back from PostgreSQL.
final_df = pd.read_sql(sql='final', con=engine)

# Make sure the output folder exists so to_csv does not fail on a fresh checkout,
# and build the path with os.path.join like the input paths.
os.makedirs("Output", exist_ok=True)
final_df.to_csv(os.path.join("Output", "finalNetflix.csv"))

# Number of rows read back from the table.
final_len = len(final_df)