In [1]:
import pandas as pd
from sqlalchemy import create_engine
from secret import username, password

### Extract CSVs into DataFrames

In [2]:
# Import the IMDB names CSV files

directors_file = "../etl_project_data/IMDb names.csv"
directors_df = pd.read_csv(directors_file)


In [3]:
directors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175715 entries, 0 to 175714
Data columns (total 20 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   imdb_name_id           175715 non-null  object 
 1   name                   175715 non-null  object 
 2   birth_name             175715 non-null  object 
 3   height                 30080 non-null   float64
 4   bio                    122832 non-null  object 
 5   birth_details          75229 non-null   object 
 6   birth_year             75111 non-null   float64
 7   date_of_birth          75102 non-null   object 
 8   place_of_birth         71122 non-null   object 
 9   death_details          26862 non-null   object 
 10  death_year             26838 non-null   float64
 11  date_of_death          26855 non-null   object 
 12  place_of_death         25036 non-null   object 
 13  reason_of_death        11972 non-null   object 
 14  spouses                175715 non-nu

### Transform premise DataFrame

In [4]:
# Create a filtered dataframe from specific columns

directors_cols = ["name","primary_profession"]

directors_transformed= directors_df[directors_cols].copy()

In [5]:
# normalize profession
directors_merge= directors_transformed.set_index('name').primary_profession.str.split(',', expand=True).stack().reset_index('name')




In [6]:
# Rename the column to name
directors_merge = directors_merge.rename(columns={0:"Profession"}).copy()
directors_merge

Unnamed: 0,name,Profession
0,Fred Astaire,soundtrack
1,Fred Astaire,actor
2,Fred Astaire,miscellaneous
0,Lauren Bacall,actress
1,Lauren Bacall,soundtrack
...,...,...
0,McMagic Cardenas,actor
0,Rashaduzzman Shohag,editorial_department
0,William Plyler,producer
1,William Plyler,actor


In [7]:
# Filter rows with profession as director

directors_filtered = directors_merge.loc[directors_merge['Profession'] == "director" ]
directors_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35652 entries, 1 to 0
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        35652 non-null  object
 1   Profession  35652 non-null  object
dtypes: object(2)
memory usage: 835.6+ KB


In [8]:
directors_clean = directors_filtered[["name"]].copy()


### Extract director facebook likes from metadata csv file

In [9]:
# Import the metadata CSV files

metadata_file = "../etl_project_data/movie_metadata.csv"
metadata_df = pd.read_csv(metadata_file)

In [10]:
metadata_df

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5038,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,Comedy|Drama,...,6.0,English,Canada,,,2013.0,470.0,7.7,,84
5039,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,Crime|Drama|Mystery|Thriller,...,359.0,English,USA,TV-14,,,593.0,7.5,16.00,32000
5040,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
5041,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660


In [11]:
 ## Create a filtered dataframe from specific columns

metadata_cols = ["director_name","director_facebook_likes"]

metadata_transformed= metadata_df[metadata_cols].copy()

In [12]:
# Rename the column to name
metadata_transformed = metadata_transformed.rename(columns={"director_name":"name"}).copy()
metadata_transformed

Unnamed: 0,name,director_facebook_likes
0,James Cameron,0.0
1,Gore Verbinski,563.0
2,Sam Mendes,0.0
3,Christopher Nolan,22000.0
4,Doug Walker,131.0
...,...,...
5038,Scott Smith,2.0
5039,,
5040,Benjamin Roberds,0.0
5041,Daniel Hsia,0.0


In [13]:
## left join to get director facebook likes

director_FB_merge = pd.merge(left=directors_clean, right=metadata_transformed, how='left', left_on='name', right_on='name')
director_FB_merge

Unnamed: 0,name,director_facebook_likes
0,Ingmar Bergman,0.0
1,Marlon Brando,
2,James Cagney,
3,Federico Fellini,
4,John Gielgud,
...,...,...
37950,Dominic Smith,
37951,Ricky Umberger,
37952,Ramesh Khanna,
37953,Premji,


In [14]:
#unique director names

director_FB_merge= director_FB_merge.groupby(["name"]).count().copy()
director_FB_merge= director_FB_merge.reset_index()
director_FB_merge

Unnamed: 0,name,director_facebook_likes
0,'Philthy' Phil Phillips,0
1,A Leslie Kies,0
2,A Normale Jef,0
3,A. Bhimsingh,0
4,A. Edward Sutherland,0
...,...,...
35503,Ümit Utku,0
35504,Ümit Ünal,0
35505,Þorsteinn Gunnar Bjarnason,0
35506,Þráinn Bertelsson,0


In [15]:
director_FB_merge['director_id'] = range(1, len(director_FB_merge)+1)
director_FB_merge.head()

Unnamed: 0,name,director_facebook_likes,director_id
0,'Philthy' Phil Phillips,0,1
1,A Leslie Kies,0,2
2,A Normale Jef,0,3
3,A. Bhimsingh,0,4
4,A. Edward Sutherland,0,5


In [16]:
#director_FB_merge.to_csv("Resources/director_FB_merge.csv", encoding='utf-8', index=True)

# Create dataframe for Movie_director table

In [17]:
#read IMBD Movies excel file
movies_file = "../etl_project_data/IMDb movies.csv"
movies_data_df = pd.read_csv(movies_file)
movies_data_df.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
1,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.9,171,,,,,4.0,2.0
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,420,$ 45000,,,,24.0,3.0
3,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2019,,,,,28.0,14.0
4,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,...,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,438,,,,,12.0,5.0


In [18]:
#filter data from 2000 to 2019
movies_data_df.sort_values(by=['year'], ascending=True, inplace=True)
movies_filtered= movies_data_df[~(movies_data_df['year'] < 2000)]
movies_filtered

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
32874,tt0162711,Trixie,Trixie,2000,2000-08-18,"Comedy, Crime, Mystery",116,USA,English,Alan Rudolph,...,"Emily Watson, Dermot Mulroney, Nick Nolte, Nat...","An eccentric, unconventional woman whose naive...",5.0,1376,,$ 295683,$ 295683,26.0,35.0,37.0
32948,tt0163983,Bless the Child,Bless the Child,2000,2001-01-05,"Crime, Drama, Horror",107,"USA, Germany",English,Chuck Russell,...,"Kim Basinger, Jimmy Smits, Holliston Coleman, ...","Cody, a little girl abandoned by her mother an...",5.1,13440,$ 65000000,$ 29381494,$ 40443010,17.0,193.0,103.0
35035,tt0201726,The Last Producer,The Last Producer,2000,2000-08-22,"Comedy, Drama",90,USA,English,Burt Reynolds,...,"Sean Astin, David Atkinson, Leslie Bega, Benja...",An old-time mogul struggles to reenter the clu...,4.7,265,,,,,9.0,3.0
35036,tt0201737,Lost in the Pershing Point Hotel,Lost in the Pershing Point Hotel,2000,2000-06-09,"Comedy, Drama",107,USA,English,Julia Jay Pierrepont III,...,"Leslie Jordan, Erin Chandler, Mark Pellegrino,...",Taken from the life story of Leslie Jordan. A ...,4.5,161,,,,,2.0,2.0
38686,tt0279809,Fatal Conflict,Fatal Conflict,2000,2000-11-10,"Action, Drama, Sci-Fi",92,"Czech Republic, Canada",English,Lloyd A. Simandl,...,"Kari Wuhrer, Jennifer Rubin, Leo Rossi, Miles ...",An ex-star pilot must stop a mad criminal and ...,3.6,220,,,,,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78770,tt7246106,Artik,Artik,2019,2019-08-11,"Horror, Thriller",78,,,Tom Botchii Skowronski,...,"Chase Williamson, Jerry G. Angelo, Lauren Ashl...",A comic book obsessed serial killer teaches hi...,4.0,283,,,,,16.0,20.0
78735,tt7225434,Kaaviyyan,Kaaviyyan,2019,2019-10-18,Thriller,109,India,Tamil,Sarathy,...,"L. Srinath, Athmeeya Rajan, Shaam, Sridevi","While in the US for training, Tamil Nadu cop A...",5.6,128,,,,,105.0,1.0
78707,tt7216068,See You Soon,See You Soon,2019,2019-07-26,"Drama, Romance",107,USA,English,David Mahmoudieh,...,"Harvey Keitel, Liam McIntyre, Poppy Drayton, O...",A U.S. soccer star suffers a career-threatenin...,4.7,265,,,$ 277126,29.0,4.0,3.0
78856,tt7294400,Mirreyes contra Godinez,Mirreyes contra Godinez,2019,2019-01-25,Comedy,109,Mexico,Spanish,Chava Cartas,...,"Diana Bovio, Pablo Lyle, Roberto Aguire, Danie...","Tells the story of Genaro Rodríguez, a young g...",5.2,846,MXN 28000000,,$ 11940579,,4.0,1.0


In [19]:
# Create a filtered dataframe from specific columns
directormovie_cols = ["imdb_title_id", "title", "director"]

directormovie_transformed= movies_filtered[directormovie_cols].copy()

directormovie_transformed.head()

Unnamed: 0,imdb_title_id,title,director
32874,tt0162711,Trixie,Alan Rudolph
32948,tt0163983,Bless the Child,Chuck Russell
35035,tt0201726,The Last Producer,Burt Reynolds
35036,tt0201737,Lost in the Pershing Point Hotel,Julia Jay Pierrepont III
38686,tt0279809,Fatal Conflict,Lloyd A. Simandl


In [20]:
directormovie_transformed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43881 entries, 32874 to 81272
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   imdb_title_id  43881 non-null  object
 1   title          43881 non-null  object
 2   director       43822 non-null  object
dtypes: object(3)
memory usage: 1.3+ MB


In [21]:
# Comma separated value in the column transform into multiple rows

directormoviecleancsv= directormovie_transformed.set_index('imdb_title_id').director.str.split(', ', expand=True).stack().reset_index('imdb_title_id')
directormoviecleancsv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46947 entries, 0 to 0
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   imdb_title_id  46947 non-null  object
 1   0              46947 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [22]:
directormoviecleancsv.info()
#directormoviecleancsv.to_csv("Resources/directormoviecleancsv.csv", encoding='utf-8', index=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46947 entries, 0 to 0
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   imdb_title_id  46947 non-null  object
 1   0              46947 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [23]:
# Rename column name

directormovie_renamed= directormoviecleancsv.rename(columns={0:"name"}).copy()


directormovie_renamed

Unnamed: 0,imdb_title_id,name
0,tt0162711,Alan Rudolph
0,tt0163983,Chuck Russell
0,tt0201726,Burt Reynolds
0,tt0201737,Julia Jay Pierrepont III
0,tt0279809,Lloyd A. Simandl
...,...,...
0,tt7246106,Tom Botchii Skowronski
0,tt7225434,Sarathy
0,tt7216068,David Mahmoudieh
0,tt7294400,Chava Cartas


In [24]:
# merge with actor table table to get actor ID 

directormovie_merge= pd.merge(directormovie_renamed,director_FB_merge, on="name")
directormovie_merge

Unnamed: 0,imdb_title_id,name,director_facebook_likes,director_id
0,tt0162711,Alan Rudolph,1,710
1,tt0243991,Alan Rudolph,1,710
2,tt0314630,Alan Rudolph,1,710
3,tt5813010,Alan Rudolph,1,710
4,tt0163983,Chuck Russell,6,5973
...,...,...,...,...
35623,tt7313348,Britt Poulton,0,4451
35624,tt7298886,György Mór Kárpáti,0,12000
35625,tt7281538,Adam Dick,0,213
35626,tt7272948,Sam Hardy,0,29292


In [25]:
directormovie_merge.drop(['name', 'director_facebook_likes'], axis=1, inplace=True)
directormovie_merge

Unnamed: 0,imdb_title_id,director_id
0,tt0162711,710
1,tt0243991,710
2,tt0314630,710
3,tt5813010,710
4,tt0163983,5973
...,...,...
35623,tt7313348,4451
35624,tt7298886,12000
35625,tt7281538,213
35626,tt7272948,29292


In [26]:
#directormovie_merge.to_csv("Resources/directormovie_merge.csv", encoding='utf-8', index=True)

### Create database connection

In [27]:
connection_string = f'{username}:{password}@localhost:5432/etl_project'
engine = create_engine(f'postgresql://{connection_string}')

In [28]:
select_imdb_movies_df = pd.read_sql_query('select * from "movie"',con=engine)

In [29]:
## Extract data from movies table

#select_imdb_movies_df = pd.read_csv("Resources/movie_output.csv")
select_imdb_movies_df.head()

Unnamed: 0,movie_id,title,year,duration,budget,worlwide_gross_income,netflix,hulu,prime,disney,imdb_title_id
0,1,Trixie,2000,116,,295683.0,False,False,False,False,tt0162711
1,2,Bless the Child,2000,107,65000000.0,40443010.0,False,False,True,False,tt0163983
2,3,The Last Producer,2000,90,,,False,False,False,False,tt0201726
3,4,Lost in the Pershing Point Hotel,2000,107,,,False,False,False,False,tt0201737
4,5,Fatal Conflict,2000,92,,,False,False,True,False,tt0279809


In [30]:
#merge dataframes on "IMDB title id"
directormovie_final= pd.merge(select_imdb_movies_df,directormovie_merge, on="imdb_title_id").copy()
directormovie_final

Unnamed: 0,movie_id,title,year,duration,budget,worlwide_gross_income,netflix,hulu,prime,disney,imdb_title_id,director_id
0,1,Trixie,2000,116,,295683.0,False,False,False,False,tt0162711,710
1,2,Bless the Child,2000,107,65000000.0,40443010.0,False,False,True,False,tt0163983,5973
2,3,The Last Producer,2000,90,,,False,False,False,False,tt0201726,4625
3,4,Lost in the Pershing Point Hotel,2000,107,,,False,False,False,False,tt0201737,17193
4,5,Fatal Conflict,2000,92,,,False,False,True,False,tt0279809,19764
...,...,...,...,...,...,...,...,...,...,...,...,...
35623,43874,Deadcon,2019,78,100000.0,,True,False,False,False,tt7262990,5025
35624,43875,Student of the Year 2,2019,146,800000000.0,1635907.0,False,False,False,False,tt7255568,26633
35625,43876,Lucky Day,2019,99,5600000.0,52369.0,False,False,False,False,tt7248248,28493
35626,43877,Artik,2019,78,,,False,False,False,False,tt7246106,32758


In [31]:
directormovie_final= directormovie_final[["movie_id","director_id"]]


In [32]:
directormovie_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35628 entries, 0 to 35627
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   movie_id     35628 non-null  int64
 1   director_id  35628 non-null  int32
dtypes: int32(1), int64(1)
memory usage: 695.9 KB


In [33]:
# Confirm tables
engine.table_names()

['movie',
 'movie_actor',
 'actor',
 'lang_movie',
 'lang',
 'movie_genre',
 'genre',
 'movie_country',
 'country',
 'movie_drt',
 'director',
 'rating']

### Load DataFrames into database

In [34]:
director_FB_merge.to_sql(name='director', con=engine, if_exists='append', index=False)

In [35]:
directormovie_final.to_sql(name='movie_drt', con=engine, if_exists='append', index=False)