In [1]:
import pandas as pd
from sqlalchemy import create_engine
from secret import username, password

### Extract CSVs into DataFrames

In [2]:
actors_file = "etl_project_data/IMDb names.csv"
actors_df = pd.read_csv(actors_file)


In [3]:
actors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175715 entries, 0 to 175714
Data columns (total 20 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   imdb_name_id           175715 non-null  object 
 1   name                   175715 non-null  object 
 2   birth_name             175715 non-null  object 
 3   height                 30080 non-null   float64
 4   bio                    122832 non-null  object 
 5   birth_details          75229 non-null   object 
 6   birth_year             75111 non-null   float64
 7   date_of_birth          75102 non-null   object 
 8   place_of_birth         71122 non-null   object 
 9   death_details          26862 non-null   object 
 10  death_year             26838 non-null   float64
 11  date_of_death          26855 non-null   object 
 12  place_of_death         25036 non-null   object 
 13  reason_of_death        11972 non-null   object 
 14  spouses                175715 non-nu

### Transform premise DataFrame

In [4]:
# Create a filtered dataframe from specific columns
#actors_cols = ["imdb_name_id", "name", "birth_name", "height", "date_of_birth", "date_of_death", "primary_profession", "known_for_titles"]

actors_cols = ["name", "birth_name", "height", "date_of_birth", "date_of_death","primary_profession"]

actors_transformed= actors_df[actors_cols].copy()

In [None]:
# if actor gender is male, if ac

In [5]:
# Comma separated value in the column transform into multiple columns

actorscleancsv = pd.concat([actors_transformed, actors_transformed["primary_profession"].str.split(',', expand=True)], axis=1)


In [6]:
# Rename the column headers
actorscleancsv = actorscleancsv.rename(columns={"date_of_birth": "birth_date", 
                                                0  : "Profession1",
                                                1 : "Profession2",
                                                2: "Profession3",
                                                          "date_of_death": "death_date"})


actorscleancsv.drop('primary_profession', axis=1, inplace=True)


In [7]:
# Retrieve only rows where profession is actor or actress

filter1 = actorscleancsv['Profession1'].isin(["actor", "actress"]) 
filter2 = actorscleancsv['Profession2'].isin(["actor", "actress"])  
filter3 = actorscleancsv['Profession3'].isin(["actor", "actress"])  
  
# displaying data with both filter applied and mandatory  
actors_filtered = actorscleancsv.loc[(filter1) | (filter2) | (filter3)] 

In [8]:
actorscleancsv.drop(['Profession1', 'Profession2', 'Profession3'], axis=1, inplace=True)


In [9]:
actorscleancsv['actor_id'] = range(1, len(actorscleancsv)+1)
actorscleancsv.head()


Unnamed: 0,name,birth_name,height,birth_date,death_date,actor_id
0,Fred Astaire,Frederic Austerlitz Jr.,177.0,1899-05-10,1987-06-22,1
1,Lauren Bacall,Betty Joan Perske,174.0,1924-09-16,2014-08-12,2
2,Brigitte Bardot,Brigitte Bardot,166.0,1934-09-28,,3
3,John Belushi,John Adam Belushi,173.0,1949-01-24,1982-03-05,4
4,Ingmar Bergman,Ernst Ingmar Bergman,179.0,1918-07-14,2007-07-30,5


In [10]:
actorscleancsv['birth_date'] = pd.to_datetime(actorscleancsv['birth_date'], format='%Y-%m-%d', errors='coerce')
actorscleancsv['death_date'] = pd.to_datetime(actorscleancsv['death_date'], format='%Y-%m-%d', errors='coerce')
actorscleancsv.dropna()


Unnamed: 0,name,birth_name,height,birth_date,death_date,actor_id
0,Fred Astaire,Frederic Austerlitz Jr.,177.0,1899-05-10,1987-06-22,1
1,Lauren Bacall,Betty Joan Perske,174.0,1924-09-16,2014-08-12,2
3,John Belushi,John Adam Belushi,173.0,1949-01-24,1982-03-05,4
4,Ingmar Bergman,Ernst Ingmar Bergman,179.0,1918-07-14,2007-07-30,5
5,Ingrid Bergman,Ingrid Bergman,175.0,1915-08-29,1982-08-29,6
...,...,...,...,...,...,...
163420,Tatsuo Inoue,Tatsuo Inoue,200.0,1941-11-08,2016-10-05,163421
166483,Salman Shah,Shahriar Chowdhury Emon,200.0,1971-09-19,1996-09-06,166484
167829,Jay Bowdy,Jay Bowdy,200.0,1983-08-26,2017-01-23,167830
169090,Aachi Manorama,Gopishantha,200.0,1937-05-26,2015-10-10,169091


In [11]:
actorscleancsv.tail(30)

Unnamed: 0,name,birth_name,height,birth_date,death_date,actor_id
175685,Arsel Arumugam,Arsel Arumugam,,NaT,NaT,175686
175686,Gowtham,Gowtham,,NaT,NaT,175687
175687,Shanthi Anand,Shanthi Anand,,NaT,NaT,175688
175688,Choi Yeong-Hwan,Choi Yeong-Hwan,,NaT,NaT,175689
175689,Sabrina Rose,Sabrina Rose,,NaT,NaT,175690
175690,Brian Prutch,Brian Prutch,,NaT,NaT,175691
175691,Bibriti Chatterjee,Bibriti Chatterjee,,NaT,NaT,175692
175692,Kenny Wong,Kenny Wong,,NaT,NaT,175693
175693,Suzy Spade,Suzy Spade,,NaT,NaT,175694
175694,James Simone,James Simone,,NaT,NaT,175695


### Create database connection

In [12]:
connection_string = f'{username}:{password}@localhost:5432/ETL_project_movies'
engine = create_engine(f'postgresql://{connection_string}')

In [13]:
# Confirm tables
engine.table_names()

['movie',
 'movie_actor',
 'actor',
 'lang_movie',
 'lang',
 'movie_genre',
 'genre',
 'movie_country',
 'country',
 'movie_drt',
 'director',
 'rating']

### Load DataFrames into database

In [14]:
actorscleancsv.to_sql(name='actor', con=engine, if_exists='append', index=False)