In [13]:
import pandas as pd
from sqlalchemy import create_engine
from secret import username, password

### Extract CSVs into DataFrames

In [14]:
actors_file = "etl_project_data/IMDb names.csv"
actors_df = pd.read_csv(actors_file)


In [15]:
actors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175715 entries, 0 to 175714
Data columns (total 20 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   imdb_name_id           175715 non-null  object 
 1   name                   175715 non-null  object 
 2   birth_name             175715 non-null  object 
 3   height                 30080 non-null   float64
 4   bio                    122832 non-null  object 
 5   birth_details          75229 non-null   object 
 6   birth_year             75111 non-null   float64
 7   date_of_birth          75102 non-null   object 
 8   place_of_birth         71122 non-null   object 
 9   death_details          26862 non-null   object 
 10  death_year             26838 non-null   float64
 11  date_of_death          26855 non-null   object 
 12  place_of_death         25036 non-null   object 
 13  reason_of_death        11972 non-null   object 
 14  spouses                175715 non-nu

### Transform premise DataFrame

In [16]:
# Create a filtered dataframe from specific columns
#actors_cols = ["imdb_name_id", "name", "birth_name", "height", "date_of_birth", "date_of_death", "primary_profession", "known_for_titles"]

actors_cols = ["name", "birth_name", "height", "date_of_birth", "date_of_death","primary_profession"]

actors_transformed= actors_df[actors_cols].copy()

In [17]:
# if actor gender is male, if actress is female


In [18]:
# Comma separated value in the column transform into multiple columns

actorscleancsv = pd.concat([actors_transformed, actors_transformed["primary_profession"].str.split(',', expand=True)], axis=1)


In [19]:
# Rename the column headers
actorscleancsv = actorscleancsv.rename(columns={"date_of_birth": "birth_date", 
                                                0  : "Profession1",
                                                1 : "Profession2",
                                                2: "Profession3",
                                                          "date_of_death": "death_date"})


actorscleancsv.drop('primary_profession', axis=1, inplace=True)


In [20]:
# Retrieve only rows where profession is actor or actress

filter1 = actorscleancsv['Profession1'].isin(["actor", "actress"]) 
filter2 = actorscleancsv['Profession2'].isin(["actor", "actress"])  
filter3 = actorscleancsv['Profession3'].isin(["actor", "actress"])  
  
# displaying data with both filter applied and mandatory  
actors_filtered = actorscleancsv.loc[(filter1) | (filter2) | (filter3)] 

In [21]:
actorscleancsv.drop(['Profession1', 'Profession2', 'Profession3'], axis=1, inplace=True)


In [22]:
actorscleancsv['actor_id'] = range(1, len(actorscleancsv)+1)
actorscleancsv.head()


Unnamed: 0,name,birth_name,height,birth_date,death_date,actor_id
0,Fred Astaire,Frederic Austerlitz Jr.,177.0,1899-05-10,1987-06-22,1
1,Lauren Bacall,Betty Joan Perske,174.0,1924-09-16,2014-08-12,2
2,Brigitte Bardot,Brigitte Bardot,166.0,1934-09-28,,3
3,John Belushi,John Adam Belushi,173.0,1949-01-24,1982-03-05,4
4,Ingmar Bergman,Ernst Ingmar Bergman,179.0,1918-07-14,2007-07-30,5


In [23]:
actorscleancsv['birth_date'] = pd.to_datetime(actorscleancsv['birth_date'], format='%Y-%m-%d', errors='coerce')
actorscleancsv['death_date'] = pd.to_datetime(actorscleancsv['death_date'], format='%Y-%m-%d', errors='coerce')
actorscleancsv.dropna()


Unnamed: 0,name,birth_name,height,birth_date,death_date,actor_id
0,Fred Astaire,Frederic Austerlitz Jr.,177.0,1899-05-10,1987-06-22,1
1,Lauren Bacall,Betty Joan Perske,174.0,1924-09-16,2014-08-12,2
3,John Belushi,John Adam Belushi,173.0,1949-01-24,1982-03-05,4
4,Ingmar Bergman,Ernst Ingmar Bergman,179.0,1918-07-14,2007-07-30,5
5,Ingrid Bergman,Ingrid Bergman,175.0,1915-08-29,1982-08-29,6
...,...,...,...,...,...,...
163420,Tatsuo Inoue,Tatsuo Inoue,200.0,1941-11-08,2016-10-05,163421
166483,Salman Shah,Shahriar Chowdhury Emon,200.0,1971-09-19,1996-09-06,166484
167829,Jay Bowdy,Jay Bowdy,200.0,1983-08-26,2017-01-23,167830
169090,Aachi Manorama,Gopishantha,200.0,1937-05-26,2015-10-10,169091


In [25]:
actorscleancsv.info()
actorscleancsv.to_csv("Resources/ActorsClean.csv", encoding='utf-8', index=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175715 entries, 0 to 175714
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   name        175715 non-null  object        
 1   birth_name  175715 non-null  object        
 2   height      30080 non-null   float64       
 3   birth_date  75052 non-null   datetime64[ns]
 4   death_date  26822 non-null   datetime64[ns]
 5   actor_id    175715 non-null  int32         
dtypes: datetime64[ns](2), float64(1), int32(1), object(2)
memory usage: 7.4+ MB


# Create dataframe for Movie_actor table

In [26]:
#read IMBD Movies excel file
movies_file = "etl_project_data/IMBD Movies.xlsx"
movies_data_df = pd.read_excel(movies_file)
movies_data_df.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
1,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.9,171,,,,,4.0,2.0
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,420,$ 45000,,,,24.0,3.0
3,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2019,,,,,28.0,14.0
4,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,...,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,438,,,,,12.0,5.0


In [27]:
#filter data from 2000 to 2019
movies_data_df.sort_values(by=['year'], ascending=True, inplace=True)
movies_filtered= movies_data_df[~(movies_data_df['year'] < 2000)]
movies_filtered

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
32874,tt0162711,Trixie,Trixie,2000,2000-08-18,"Comedy, Crime, Mystery",116,USA,English,Alan Rudolph,...,"Emily Watson, Dermot Mulroney, Nick Nolte, Nat...","An eccentric, unconventional woman whose naive...",5.0,1376,,295683.0,295683.0,26.0,35.0,37.0
32948,tt0163983,Bless the Child,Bless the Child,2000,2001-01-05,"Crime, Drama, Horror",107,"USA, Germany",English,Chuck Russell,...,"Kim Basinger, Jimmy Smits, Holliston Coleman, ...","Cody, a little girl abandoned by her mother an...",5.1,13440,$ 65000000,29381494.0,40443010.0,17.0,193.0,103.0
35035,tt0201726,The Last Producer,The Last Producer,2000,2000-08-22,"Comedy, Drama",90,USA,English,Burt Reynolds,...,"Sean Astin, David Atkinson, Leslie Bega, Benja...",An old-time mogul struggles to reenter the clu...,4.7,265,,,,,9.0,3.0
35036,tt0201737,Lost in the Pershing Point Hotel,Lost in the Pershing Point Hotel,2000,2000-06-09,"Comedy, Drama",107,USA,English,Julia Jay Pierrepont III,...,"Leslie Jordan, Erin Chandler, Mark Pellegrino,...",Taken from the life story of Leslie Jordan. A ...,4.5,161,,,,,2.0,2.0
38686,tt0279809,Fatal Conflict,Fatal Conflict,2000,2000-11-10,"Action, Drama, Sci-Fi",92,"Czech Republic, Canada",English,Lloyd A. Simandl,...,"Kari Wuhrer, Jennifer Rubin, Leo Rossi, Miles ...",An ex-star pilot must stop a mad criminal and ...,3.6,220,,,,,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78770,tt7246106,Artik,Artik,2019,2019-08-11,"Horror, Thriller",78,,,Tom Botchii Skowronski,...,"Chase Williamson, Jerry G. Angelo, Lauren Ashl...",A comic book obsessed serial killer teaches hi...,4.0,283,,,,,16.0,20.0
78735,tt7225434,Kaaviyyan,Kaaviyyan,2019,2019-10-18,Thriller,109,India,Tamil,Sarathy,...,"L. Srinath, Athmeeya Rajan, Shaam, Sridevi","While in the US for training, Tamil Nadu cop A...",5.6,128,,,,,105.0,1.0
78707,tt7216068,See You Soon,See You Soon,2019,2019-07-26,"Drama, Romance",107,USA,English,David Mahmoudieh,...,"Harvey Keitel, Liam McIntyre, Poppy Drayton, O...",A U.S. soccer star suffers a career-threatenin...,4.7,265,,,277126.0,29.0,4.0,3.0
78856,tt7294400,Mirreyes contra Godinez,Mirreyes contra Godinez,2019,2019-01-25,Comedy,109,Mexico,Spanish,Chava Cartas,...,"Diana Bovio, Pablo Lyle, Roberto Aguire, Danie...","Tells the story of Genaro Rodríguez, a young g...",5.2,846,MXN 28000000,,11940579.0,,4.0,1.0


In [28]:
# Create a filtered dataframe from specific columns
actormovie_cols = ["imdb_title_id", "title", "actors"]

actormovie_transformed= movies_filtered[actormovie_cols].copy()

actormovie_transformed.head()

Unnamed: 0,imdb_title_id,title,actors
32874,tt0162711,Trixie,"Emily Watson, Dermot Mulroney, Nick Nolte, Nat..."
32948,tt0163983,Bless the Child,"Kim Basinger, Jimmy Smits, Holliston Coleman, ..."
35035,tt0201726,The Last Producer,"Sean Astin, David Atkinson, Leslie Bega, Benja..."
35036,tt0201737,Lost in the Pershing Point Hotel,"Leslie Jordan, Erin Chandler, Mark Pellegrino,..."
38686,tt0279809,Fatal Conflict,"Kari Wuhrer, Jennifer Rubin, Leo Rossi, Miles ..."


In [29]:
# Comma separated value in the column transform into multiple rows

actormoviecleancsv= actormovie_transformed.set_index('imdb_title_id').actors.str.split(', ', expand=True).stack().reset_index('imdb_title_id')
actormoviecleancsv.head()

Unnamed: 0,imdb_title_id,0
0,tt0162711,Emily Watson
1,tt0162711,Dermot Mulroney
2,tt0162711,Nick Nolte
3,tt0162711,Nathan Lane
4,tt0162711,Brittany Murphy


In [30]:
actormoviecleancsv.info()
actormoviecleancsv.to_csv("Resources/ActorMovieClean.csv", encoding='utf-8', index=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 591357 entries, 0 to 14
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   imdb_title_id  591357 non-null  object
 1   0              591357 non-null  object
dtypes: object(2)
memory usage: 13.5+ MB


In [31]:
# Rename column name

actormovie_renamed= actormoviecleancsv.rename(columns={0:"name"}).copy()


actormovie_renamed

Unnamed: 0,imdb_title_id,name
0,tt0162711,Emily Watson
1,tt0162711,Dermot Mulroney
2,tt0162711,Nick Nolte
3,tt0162711,Nathan Lane
4,tt0162711,Brittany Murphy
...,...,...
10,tt9914286,Arcan Bunial
11,tt9914286,Seval Hislisoy
12,tt9914286,Ergül Çolakoglu
13,tt9914286,Gülçin Ugur


In [32]:
# merge with actor table table to get actor ID 

actormovie_merge= pd.merge(actormovie_renamed,actorscleancsv, on="name")
actormovie_merge

Unnamed: 0,imdb_title_id,name,birth_name,height,birth_date,death_date,actor_id
0,tt0162711,Emily Watson,Emily Margaret Watson,173.0,1967-01-14,NaT,1757
1,tt0211492,Emily Watson,Emily Margaret Watson,173.0,1967-01-14,NaT,1757
2,tt0289765,Emily Watson,Emily Margaret Watson,173.0,1967-01-14,NaT,1757
3,tt0272338,Emily Watson,Emily Margaret Watson,173.0,1967-01-14,NaT,1757
4,tt0238380,Emily Watson,Emily Margaret Watson,173.0,1967-01-14,NaT,1757
...,...,...,...,...,...,...,...
306523,tt7272948,Scarlett Davies,Scarlett Davies,,NaT,NaT,174137
306524,tt7272948,Maria Louis,Maria Louis,,NaT,NaT,159189
306525,tt7262990,Kherrington Briggs,Kherrington Briggs,,NaT,NaT,174292
306526,tt7262990,Mai Brunelle,Mai Brunelle,200.0,NaT,NaT,168433


In [33]:
actormovie_merge.drop(['birth_name', 'height', 'birth_date', 'death_date'], axis=1, inplace=True)
actormovie_merge

Unnamed: 0,imdb_title_id,name,actor_id
0,tt0162711,Emily Watson,1757
1,tt0211492,Emily Watson,1757
2,tt0289765,Emily Watson,1757
3,tt0272338,Emily Watson,1757
4,tt0238380,Emily Watson,1757
...,...,...,...
306523,tt7272948,Scarlett Davies,174137
306524,tt7272948,Maria Louis,159189
306525,tt7262990,Kherrington Briggs,174292
306526,tt7262990,Mai Brunelle,168433


In [34]:
actormovie_merge.to_csv("Resources/actormovie_merge.csv", encoding='utf-8', index=True)

In [35]:
## Extract data from movies table

select_imdb_movies_df = pd.read_csv("Resources/movie_output.csv")
select_imdb_movies_df.head()

Unnamed: 0.1,Unnamed: 0,title,year,duration,budget,worlwide_gross_income,imdb_title_id,movie_id
0,32874,Trixie,2000,116,,295683.0,tt0162711,1
1,32948,Bless the Child,2000,107,$ 65000000,40443010.0,tt0163983,2
2,35035,The Last Producer,2000,90,,,tt0201726,3
3,35036,Lost in the Pershing Point Hotel,2000,107,,,tt0201737,4
4,38686,Fatal Conflict,2000,92,,,tt0279809,5


In [36]:
#merge dataframes on "IMDB title id"
actormovie_final= pd.merge(select_imdb_movies_df,actormovie_merge, on="imdb_title_id").copy()
actormovie_final

Unnamed: 0.1,Unnamed: 0,title,year,duration,budget,worlwide_gross_income,imdb_title_id,movie_id,name,actor_id
0,32874,Trixie,2000,116,,295683.0,tt0162711,1,Emily Watson,1757
1,32874,Trixie,2000,116,,295683.0,tt0162711,1,Dermot Mulroney,538
2,32874,Trixie,2000,116,,295683.0,tt0162711,1,Nick Nolte,547
3,32874,Trixie,2000,116,,295683.0,tt0162711,1,Nathan Lane,1385
4,32874,Trixie,2000,116,,295683.0,tt0162711,1,Brittany Murphy,3293
...,...,...,...,...,...,...,...,...,...,...
306523,78856,Mirreyes contra Godinez,2019,109,MXN 28000000,11940579.0,tt7294400,43880,Diana Bovio,144189
306524,81272,Sokagin Çocuklari,2019,98,,2833.0,tt9914286,43881,Metin Keçeci,130413
306525,81272,Sokagin Çocuklari,2019,98,,2833.0,tt9914286,43881,Orhan Aydin,95358
306526,81272,Sokagin Çocuklari,2019,98,,2833.0,tt9914286,43881,Orhan Aydin,95910


In [37]:
actormovie_final= actormovie_final[["movie_id","actor_id"]]


In [38]:
actormovie_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 306528 entries, 0 to 306527
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   movie_id  306528 non-null  int64
 1   actor_id  306528 non-null  int32
dtypes: int32(1), int64(1)
memory usage: 5.8 MB


### Create database connection

In [39]:
connection_string = f'{username}:{password}@localhost:5432/ETL_project_movies'
engine = create_engine(f'postgresql://{connection_string}')

In [40]:
# Confirm tables
engine.table_names()

['actor',
 'movie_actor',
 'movie',
 'lang_movie',
 'lang',
 'movie_genre',
 'genre',
 'movie_country',
 'country',
 'movie_drt',
 'director',
 'rating']

### Load DataFrames into database

In [41]:
actorscleancsv.to_sql(name='actor', con=engine, if_exists='append', index=False)

In [42]:
actormovie_final.to_sql(name='movie_actor', con=engine, if_exists='append', index=False)