In [1]:
# Import dependencies
import pandas as pd

# Reading data in Pandas

### OMDb API data

In [2]:
# Reading json file from raw data to pandas
df_raw = pd.read_json('Output/OMDb_250.json')
# df_raw.head()

# Cleaning data by removing rows with no data available
omdb_df = df_raw.loc[df_raw['Error'].isna()]

omdb_df.head(2)

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response,Error
0,The Shawshank Redemption,1994.0,R,14 Oct 1994,142 min,Drama,Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...,...,9.3,2217195,tt0111161,movie,27 Jan 1998,,Columbia Pictures,,True,
1,The Godfather,1972.0,R,24 Mar 1972,175 min,"Crime, Drama",Francis Ford Coppola,"Mario Puzo (screenplay by), Francis Ford Coppo...","Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...,...,9.2,1532092,tt0068646,movie,09 Oct 2001,,Paramount Pictures,,True,


### Utelly API data

In [3]:
# Reading csv file from raw data to pandas
df_raw_service = pd.read_csv('Output/streaming_df.csv')
df_raw_service.head(2)

Unnamed: 0,IMDb ID,Title,Streaming Service,Streaming URL
0,tt0111161,The Shawshank Redemption,Netflix,https://www.netflix.com/title/70005379
1,tt0111161,The Shawshank Redemption,Google Play,https://play.google.com/store/movies/details/T...


In [4]:
# Cleaning data by removing rows with no data available
df_raw_service = df_raw_service.dropna(how = 'any')

### Rapid API data

In [5]:
# Reading csv file from raw data to pandas
df_raw_netflix = pd.read_csv('Output/netflix_all_pages.csv')
df_raw_netflix.head(2)

Unnamed: 0.1,Unnamed: 0,netflixid,title,image,synopsis,rating,type,released,runtime,largeimage,unogsdate,imdbid,download
0,0,80192064,Luciano Mellera: Infantiloide,http://occ-0-2851-38.1.nflxso.net/dnm/api/v6/e...,Argentina&#39;s Luciano Mellera emphasizes the...,9.6,movie,2018,1h6m,,2018-07-07,tt7607400,0.0
1,1,81206389,Oththa Seruppu Size 7,https://occ-0-2851-38.1.nflxso.net/dnm/api/v6/...,"Taken into custody, a murder suspect&#39;s the...",9.4,movie,2019,1h43m,,2019-11-03,tt10370116,0.0


In [6]:
# Cleaning data by removing rows with no data available
df_raw_netflix = df_raw_netflix.dropna(how = 'any')

### Google Webscraping data

In [7]:
# Reading csv file from raw data to pandas
df_raw_google = pd.read_csv('Output/Complete_Google_Scraping.csv')
df_raw_google.head(2)

Unnamed: 0.1,Unnamed: 0,Title,Streaming On,Price
0,0,The Shawshank Redemption,YouTube,3.99
1,1,The Shawshank Redemption,iTunes,3.99


In [8]:
# Cleaning data by removing rows with no data available
df_raw_google = df_raw_google.dropna(how = 'any')

# Transform

### Creating top_imdb  - 1NF

In [9]:
# Show IMDB ID, Title
df_title = omdb_df[['imdbID', 'Title']].copy()

# Renaming column headers
df_title = df_title.rename(columns = {'imdbID': 'imdb_id',
                                     'Title': 'title'})
df_title

Unnamed: 0,imdb_id,title
0,tt0111161,The Shawshank Redemption
1,tt0068646,The Godfather
2,tt0071562,The Godfather: Part II
3,tt0468569,The Dark Knight
4,tt0050083,12 Angry Men
...,...,...
245,tt0064115,Butch Cassidy and the Sundance Kid
246,tt0103639,Aladdin
247,tt2338151,PK
248,tt0094625,Akira


In [10]:
# Saving DataFrame to csv file
df_title.to_csv('Output/csv_files/top_imdb.csv', index=False)

### Creating movie - 1NF

In [11]:
# Show IMDB ID, Title, release year, runtime, movie rating, imdb rating, production
df_movie = omdb_df[['imdbID', 'Title', 'Year', 'Runtime', 'Rated', 'imdbRating', 'Production']].copy()

# Setting index as imdb_rank column
df_movie.reset_index(level=0, inplace=True)

# Tranforming float number from year.0 to year
year_df = df_movie['Year'].astype(float).map("{:.0f}".format)
year_df = pd.DataFrame(year_df)

# Tranforming runtime column by splitting string 
runtime_df = df_movie['Runtime'].str.split(" ", expand=True)

# Naming columns
runtime_df.columns = ['number', 'string']

# Dropping minutes
runtime_df = runtime_df.drop(columns = 'string')

# Renaming column headers
df_movie = df_movie.rename(columns = {'index': 'imdb_rank',
                                     'imdbID': 'imdb_id',
                                     'Title': 'title',
                                     'Year': 'year-old',
                                     'Runtime': 'runtime-old',
                                     'Rated': 'rated',
                                     'imdbRating': 'imdb_rating',
                                     'Production': 'production'
                                    })

# Combining dataframes by assign function
df_combined_movie = df_movie.assign(year = year_df)
df_combined_movie = df_combined_movie.assign(runtime = runtime_df)

# Show IMDB ID, Title, release year, runtime, movie rating, imdb rating, production
df_combined_movie = df_combined_movie [['imdb_id','imdb_rank', 'title', 'year', 'runtime', 'rated', 'imdb_rating', 'production']]

# Cleaning data by removing rows with no data available
df_combined_movie = df_combined_movie.dropna(how = 'any')
df_combined_movie

Unnamed: 0,imdb_id,imdb_rank,title,year,runtime,rated,imdb_rating,production
0,tt0111161,0,The Shawshank Redemption,1994,142,R,9.3,Columbia Pictures
1,tt0068646,1,The Godfather,1972,175,R,9.2,Paramount Pictures
2,tt0071562,2,The Godfather: Part II,1974,202,R,9.0,Paramount Pictures
3,tt0468569,3,The Dark Knight,2008,152,PG-13,9.0,Warner Bros. Pictures/Legendary
4,tt0050083,4,12 Angry Men,1957,96,Approved,8.9,Criterion Collection
...,...,...,...,...,...,...,...,...
240,tt0064115,245,Butch Cassidy and the Sundance Kid,1969,110,PG,8.0,20th Century Fox
241,tt0103639,246,Aladdin,1992,90,G,8.0,Walt Disney Pictures
242,tt2338151,247,PK,2014,153,Not Rated,8.1,UTV Communications
243,tt0094625,248,Akira,1988,124,R,8.0,Streamline Pictures


In [12]:
# Saving DataFrame to csv file
df_combined_movie.to_csv('Output/csv_files/movie.csv', index=False)

### Creating movie_actor  - 1NF

In [13]:
# Show IMDB ID, Actors
df_actor = omdb_df[['imdbID', 'Actors']].copy()

# Dropping an N/A values
df_actor = df_actor.dropna(how = 'any')

# Creating an actor list to set up a conversion into a DataFrame
movie_actor_list = []

# Iterate through actors dataframe to clean up data into a dictionary
for index, row in df_actor.iterrows():

    all_actors = df_actor.loc[index,'Actors']
    actors_list = all_actors.split(', ')

    for actor in actors_list:
        
        movie_actor_dict = {}
        
        movie_actor_dict['imdb_id'] = df_actor.loc[index,'imdbID']
        movie_actor_dict['actor'] = actor
        
        movie_actor_list.append(movie_actor_dict)

# Convert actor list into a Dataframe
movie_actor_df = pd.DataFrame(movie_actor_list)

# Cleaning data by removing rows with no data available
movie_actor_df = movie_actor_df.dropna(how = 'any')
movie_actor_df

Unnamed: 0,imdb_id,actor
0,tt0111161,Tim Robbins
1,tt0111161,Morgan Freeman
2,tt0111161,Bob Gunton
3,tt0111161,William Sadler
4,tt0068646,Marlon Brando
...,...,...
972,tt0094625,Tesshô Genda
973,tt0050613,Toshirô Mifune
974,tt0050613,Isuzu Yamada
975,tt0050613,Takashi Shimura


In [14]:
# Saving DataFrame to csv file
movie_actor_df.to_csv('Output/csv_files/movie_actor.csv', index=False)

### Creating movie_director  - 1NF

In [15]:
# Show IMDB ID, Directors
df_director = omdb_df[['imdbID', 'Director']].copy()

# Dropping any N/A values
df_director = df_director.dropna(how = 'any')

# Creating a director list to set up a conversion into a DataFrame
movie_director_list = []

# Iterate through directors dataframe to clean up data into a dictionary
for index, row in df_director.iterrows():

    all_directors = df_director.loc[index,'Director']
    directors_list = all_directors.split(', ')

    for director in directors_list:
        
        movie_director_dict = {}
        
        movie_director_dict['imdb_id'] = df_director.loc[index,'imdbID']
        movie_director_dict['director'] = director
        
        movie_director_list.append(movie_director_dict)

# Convert director list into a Dataframe
movie_director_df = pd.DataFrame(movie_director_list)

# Cleaning data by removing rows with no data available
movie_director_df = movie_director_df.dropna(how = 'any')
movie_director_df

Unnamed: 0,imdb_id,director
0,tt0111161,Frank Darabont
1,tt0068646,Francis Ford Coppola
2,tt0071562,Francis Ford Coppola
3,tt0468569,Christopher Nolan
4,tt0050083,Sidney Lumet
...,...,...
265,tt0103639,Ron Clements
266,tt0103639,John Musker
267,tt2338151,Rajkumar Hirani
268,tt0094625,Katsuhiro Ôtomo


In [16]:
# Saving DataFrame to csv file
movie_director_df.to_csv('Output/csv_files/movie_director.csv', index=False)

### Creating Streaming Service Utelly  - 1NF

In [17]:
# Show Streaming service
df_service = df_raw_service['Streaming Service']
df_service = pd.DataFrame(df_service)

# Renaming column header
df_service = df_service.rename(columns= {'Streaming Service': 'service_name'})

# To see unique values in Streaming Service column of df_raw_service
df_service.service_name.unique()

array(['Netflix', 'Google Play', 'Amazon Instant Video', 'iTunes',
       'FandangoMoviesIVAUS', 'Amazon Prime Video', 'HBO',
       'DisneyPlusIVAUS', 'Hulu', 'AtomTicketsIVAUS'], dtype=object)

In [18]:
# Spliting service name column by splitting string
df_service = df_service['service_name'].str.split("IVAUS", expand=True)

# Renaming columns
df_service.columns = ['service_name', 'info']

# Group Streaming services
grp_service = df_service.groupby('service_name')
df_service = pd.DataFrame(grp_service)

# Renaming columns
df_service.columns = ['service_name', 'info']

# Dropping info column
df_service = df_service.drop(columns='info')

# Declaring a list for unqiue service id column
service_id = ['ss_1', 'ss_2', 'ss_3', 'ss_4', 'ss_5',
             'ss_6', 'ss_7', 'ss_8', 'ss_9', 'ss_10']

# Creating a service id column and adding the service_id list
df_service['service_id']= service_id

# Show service_id and service_name
df_service = df_service[['service_id', 'service_name']]

# Cleaning data by removing rows with no data available
df_service = df_service.dropna(how = 'any')
df_service

Unnamed: 0,service_id,service_name
0,ss_1,Amazon Instant Video
1,ss_2,Amazon Prime Video
2,ss_3,AtomTickets
3,ss_4,DisneyPlus
4,ss_5,FandangoMovies
5,ss_6,Google Play
6,ss_7,HBO
7,ss_8,Hulu
8,ss_9,Netflix
9,ss_10,iTunes


In [19]:
# Saving DataFrame to csv file
df_service.to_csv('Output/csv_files/streaming_service.csv', index=False)

### Creating Streaming Service Google  - 1NF

In [20]:
# Show Streaming service
df_service_google = df_raw_google['Streaming On']
df_service_google = pd.DataFrame(df_service_google)

# Renaming column header
df_service_google = df_service_google.rename(columns= {'Streaming On': 'service_name'})

# To see unique values in Streaming Service column of df_raw_service
df_service_google.service_name.unique()


array(['YouTube', 'iTunes', 'Google Play Movies & TV', 'Vudu',
       'Amazon Prime Video', 'Philo', 'Hulu', 'Netflix', 'HBO Now',
       'Sling TV', 'Tubi', 'Disney+', 'Crackle', 'fuboTV', 'Showtime',
       'Cinemax', 'Starz'], dtype=object)

In [21]:
# Group Streaming services
grp_service = df_service_google.groupby('service_name')
df_service_google= pd.DataFrame(grp_service)
df_service_google

# Renaming columns
df_service_google.columns = ['service_name', 'info']

# Dropping info column
df_service_google = df_service_google.drop(columns='info')

df_service_google 

# Declaring a list for unqiue service id column
service_google_id = ['gg_1', 'gg_2', 'gg_3', 'gg_4', 'gg_5',
             'gg_6', 'gg_7', 'gg_8', 'gg_9', 'gg_10', 'gg_11',
             'gg_12', 'gg_13', 'gg_14', 'gg_15', 'gg_16', 'gg_17']

# Creating a service id column and adding the service_id list
df_service_google['service_id']= service_google_id

# Show service_id and service_name
df_service_google = df_service_google[['service_id', 'service_name']]
df_service_google

Unnamed: 0,service_id,service_name
0,gg_1,Amazon Prime Video
1,gg_2,Cinemax
2,gg_3,Crackle
3,gg_4,Disney+
4,gg_5,Google Play Movies & TV
5,gg_6,HBO Now
6,gg_7,Hulu
7,gg_8,Netflix
8,gg_9,Philo
9,gg_10,Showtime


In [22]:
# Saving DataFrame to csv file
df_service_google.to_csv('Output/csv_files/google_streaming_service.csv', index=False)

### Creating streaming_service_utelly - 1NF

In [23]:
# Show IMDB id, Title, Streaming service
df_streaming = df_raw_service[['IMDb ID', 'Title', 'Streaming Service']].copy()

# Tranforming streaming service column by splitting string 
service_df = df_streaming['Streaming Service'].str.split("IVAUS", expand=True)

# Naming columns
service_df.columns = ['number', 'string']

# Dropping string column
service_df = service_df.drop(columns = 'string')

# Renaming column headers
df_streaming = df_streaming.rename(columns = {'IMDb ID': 'imdb_id',
                                     'Title': 'title',
                                     'Streaming Service': 'service_name'
                                    })

# Combining dataframes by assign function
df_combined_streaming = df_streaming.assign(service_name = service_df)

# Creating a blank series for the service_id column
service_id = pd.Series([])

# Running for loop and assigning values to service_id series
for i in range(len(df_combined_streaming)):
    if df_combined_streaming['service_name'][i] == 'Amazon Instant Video':
        service_id[i] = "ss_1"
    elif df_combined_streaming['service_name'][i] == 'Amazon Prime Video':
        service_id[i] = "ss_2"
    elif df_combined_streaming['service_name'][i] == 'AtomTickets':
        service_id[i] = "ss_3"  
    elif df_combined_streaming['service_name'][i] == 'DisneyPlus':
        service_id[i] = "ss_4" 
    elif df_combined_streaming['service_name'][i] == 'FandangoMovies':
        service_id[i] = "ss_5"      
    elif df_combined_streaming['service_name'][i] == 'Google Play':
        service_id[i] = "ss_6"  
    elif df_combined_streaming['service_name'][i] == 'HBO':
        service_id[i] = "ss_7" 
    elif df_combined_streaming['service_name'][i] == 'Hulu':
        service_id[i] = "ss_8"       
    elif df_combined_streaming['service_name'][i] == 'Netflix':
        service_id[i] = "ss_9" 
    elif df_combined_streaming['service_name'][i] == 'iTunes':
        service_id[i] = "ss_10"    
        
# Inserting new column from the service_id values
df_combined_streaming.insert(3, "service_id", service_id)

# Show service id and IMDB id
df_streaming_id = df_combined_streaming [['imdb_id','service_id']]
df_streaming_id

# Cleaning data by removing rows with no data available
df_service_utelly = df_streaming_id.dropna(how = 'any')
df_service_utelly

Unnamed: 0,imdb_id,service_id
0,tt0111161,ss_9
1,tt0111161,ss_6
2,tt0111161,ss_1
3,tt0111161,ss_10
4,tt0068646,ss_6
...,...,...
694,tt0103639,ss_6
695,tt0103639,ss_4
696,tt2338151,ss_9
697,tt0094625,ss_8


In [24]:
# Saving DataFrame to csv file
df_streaming_id.to_csv('Output/csv_files/utelly_movie_streaming.csv', index=False)

### Creating streaming_service_google - 1NF

In [25]:
# Show Title, Streaming On, Price
df_service_google = df_raw_google[['Title', 'Streaming On', 'Price' ]].copy()

# Renaming column headers
df_service_google= df_service_google.rename(columns = {'Title': 'title',
                                     'Streaming On': 'service_name',
                                     'Price': 'price'
                                    })

# To see unique values in Streaming Service column of df_raw_service
df_service_google.service_name.unique()

array(['YouTube', 'iTunes', 'Google Play Movies & TV', 'Vudu',
       'Amazon Prime Video', 'Philo', 'Hulu', 'Netflix', 'HBO Now',
       'Sling TV', 'Tubi', 'Disney+', 'Crackle', 'fuboTV', 'Showtime',
       'Cinemax', 'Starz'], dtype=object)

In [26]:
# Creating a blank series for the service_id column
google_service_id = pd.Series([])

# Running for loop and assigning values to service_id series
for i in range(len(df_service_google)):
    if df_service_google['service_name'][i] == 'Amazon Prime Video':
        google_service_id[i] = "gg_1"
    elif df_service_google['service_name'][i] == 'Cinemax':
        google_service_id[i] = "gg_2"
    elif df_service_google['service_name'][i] == 'Crackle':
        google_service_id[i] = "gg_3"  
    elif df_service_google['service_name'][i] == 'Disney+':
        google_service_id[i] = "gg_4" 
    elif df_service_google['service_name'][i] == 'fuboTV':
        google_service_id[i] = "gg_5"      
    elif df_service_google['service_name'][i] == 'Google Play Movies & TV':
        google_service_id[i] = "gg_6"  
    elif df_service_google['service_name'][i] == 'Hulu':
        google_service_id[i] = "gg_7" 
    elif df_service_google['service_name'][i] == 'HBO Now':
        google_service_id[i] = "gg_8"       
    elif df_service_google['service_name'][i] == 'iTunes':
        google_service_id[i] = "gg_9" 
    elif df_service_google['service_name'][i] == 'Netflix':
        google_service_id[i] = "gg_10"    
    elif df_service_google['service_name'][i] == 'Philo':
        google_service_id[i] = "gg_11"    
    elif df_service_google['service_name'][i] == 'Showtime':
        google_service_id[i] = "gg_12"   
    elif df_service_google['service_name'][i] == 'Sling TV':
        google_service_id[i] = "gg_13"   
    elif df_service_google['service_name'][i] == 'Starz':
        google_service_id[i] = "gg_14"
    elif df_service_google['service_name'][i] == 'Tubi':
        google_service_id[i] = "gg_15"
    elif df_service_google['service_name'][i] == 'Vudu':
        google_service_id[i] = "gg_16"
    elif df_service_google['service_name'][i] == 'YouTube':
        google_service_id[i] = "gg_17"
        
# Inserting new column from the service_id values
df_service_google.insert(3, "google_service_id", google_service_id)

# Show title, google service id, service name, price
df_service_google = df_service_google [['title','google_service_id', 'price']]

# Cleaning data by removing rows with no data available
df_service_google = df_service_google.dropna(how = 'any')
df_service_google

Unnamed: 0,title,google_service_id,price
0,The Shawshank Redemption,gg_17,3.99
1,The Shawshank Redemption,gg_9,3.99
2,The Shawshank Redemption,gg_6,3.99
3,The Shawshank Redemption,gg_16,3.99
4,The Shawshank Redemption,gg_1,3.99
...,...,...,...
1171,Akira,gg_16,2.99
1172,Akira,gg_9,5.99
1173,Akira,gg_7,Subscription
1174,Throne Of Blood,gg_9,3.99


In [27]:
# Saving DataFrame to csv file
df_service_google.to_csv('Output/csv_files/google_movie_streaming.csv', index=False)

### Creating netflix_movie - 1NF

In [28]:
# Show Netflix ID, IMDB ID, Title
df_netflix = df_raw_netflix[['netflixid', 'imdbid', 'title' ]].copy()

# Renaming column headers
df_netflix = df_netflix.rename(columns = {'netflixid': 'netflix_id',
                                     'imdbid': 'imdb_id',
                                    })

# Cleaning data by removing rows with no data available
df_netflix = df_netflix.dropna(how = 'any')
df_netflix

Unnamed: 0,netflix_id,imdb_id,title
2,553500,tt0060196,"The Good, the Bad and the Ugly"
3,70131314,tt1375666,Inception
8,20557937,tt0133093,The Matrix
17,60010110,tt0088763,Back to the Future
18,60031884,tt0064116,Once Upon a Time in the West
...,...,...,...
863,70114021,tt1019452,A Serious Man
864,80017021,tt2937898,A Most Violent Year
866,70130432,tt0081494,Return To The 36th Chamber
867,80025390,tt3007512,The Water Diviner


In [29]:
# Saving DataFrame to csv file
df_netflix.to_csv('Output/csv_files/netflix_movie.csv', index=False)