In [1]:
import pandas as pd
from sqlalchemy import create_engine
from secret import username, password

### Extract CSVs into DataFrames

In [2]:
actors_file = "../etl_project_data/IMDb names.csv"
actors_df = pd.read_csv(actors_file)


In [3]:
actors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175715 entries, 0 to 175714
Data columns (total 20 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   imdb_name_id           175715 non-null  object 
 1   name                   175715 non-null  object 
 2   birth_name             175715 non-null  object 
 3   height                 30080 non-null   float64
 4   bio                    122832 non-null  object 
 5   birth_details          75229 non-null   object 
 6   birth_year             75111 non-null   float64
 7   date_of_birth          75102 non-null   object 
 8   place_of_birth         71122 non-null   object 
 9   death_details          26862 non-null   object 
 10  death_year             26838 non-null   float64
 11  date_of_death          26855 non-null   object 
 12  place_of_death         25036 non-null   object 
 13  reason_of_death        11972 non-null   object 
 14  spouses                175715 non-nu

### Transform premise DataFrame

In [4]:
# Create a filtered dataframe from specific columns

actors_cols = ["name", "birth_name", "height", "date_of_birth", "date_of_death","primary_profession"]

actors_transformed= actors_df[actors_cols].copy()

In [5]:
actors_transformed.head()

Unnamed: 0,name,birth_name,height,date_of_birth,date_of_death,primary_profession
0,Fred Astaire,Frederic Austerlitz Jr.,177.0,1899-05-10,1987-06-22,"soundtrack,actor,miscellaneous"
1,Lauren Bacall,Betty Joan Perske,174.0,1924-09-16,2014-08-12,"actress,soundtrack"
2,Brigitte Bardot,Brigitte Bardot,166.0,1934-09-28,,"actress,soundtrack,producer"
3,John Belushi,John Adam Belushi,173.0,1949-01-24,1982-03-05,"actor,writer,soundtrack"
4,Ingmar Bergman,Ernst Ingmar Bergman,179.0,1918-07-14,2007-07-30,"writer,director,actor"


In [6]:
# normalize profession
actorscleancsv= actors_transformed.set_index('name').primary_profession.str.split(',', expand=True).stack().reset_index('name')



In [7]:
# Rename the column to name
actorscleancsv = actorscleancsv.rename(columns={0:"Profession"}).copy()
actorscleancsv

Unnamed: 0,name,Profession
0,Fred Astaire,soundtrack
1,Fred Astaire,actor
2,Fred Astaire,miscellaneous
0,Lauren Bacall,actress
1,Lauren Bacall,soundtrack
...,...,...
0,McMagic Cardenas,actor
0,Rashaduzzman Shohag,editorial_department
0,William Plyler,producer
1,William Plyler,actor


In [8]:
#merge dataframes on "name"
actors_merge= pd.merge(actors_transformed,actorscleancsv, on="name").copy()
#actors_merge= countr_movie_merge[["movie_id","title",0]]
actors_merge.info(15)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 380845 entries, 0 to 380844
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   name                380845 non-null  object 
 1   birth_name          380845 non-null  object 
 2   height              67386 non-null   float64
 3   date_of_birth       168817 non-null  object 
 4   date_of_death       57426 non-null   object 
 5   primary_profession  380804 non-null  object 
 6   Profession          380845 non-null  object 
dtypes: float64(1), object(6)
memory usage: 23.2+ MB


In [9]:
# Filter rows with profession as actor or actress

actors_filtered = actors_merge.loc[(actors_merge['Profession'] == "actor") | (actors_merge['Profession'] == "actress") ]
actors_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96416 entries, 1 to 380843
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                96416 non-null  object 
 1   birth_name          96416 non-null  object 
 2   height              27444 non-null  float64
 3   date_of_birth       50802 non-null  object 
 4   date_of_death       15643 non-null  object 
 5   primary_profession  96409 non-null  object 
 6   Profession          96416 non-null  object 
dtypes: float64(1), object(6)
memory usage: 5.9+ MB


In [10]:
# Determine geneder based on profession

def gender(row):
    if row['Profession'] == "actor":
        val = "Male"
    else:
        val= "Female"
    return val
    
actors_filtered['gender'] = actors_filtered.apply(gender, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actors_filtered['gender'] = actors_filtered.apply(gender, axis=1)


In [11]:
actors_clean = actors_filtered[["name", "birth_name", "height", "date_of_birth", "date_of_death", "gender"]].copy()
# renaming columns"
actors_clean= actors_clean.rename(columns={"date_of_birth":"birth_date", "date_of_death":"death_date"}).copy()

actors_clean

Unnamed: 0,name,birth_name,height,birth_date,death_date,gender
1,Fred Astaire,Frederic Austerlitz Jr.,177.0,1899-05-10,1987-06-22,Male
3,Lauren Bacall,Betty Joan Perske,174.0,1924-09-16,2014-08-12,Female
5,Brigitte Bardot,Brigitte Bardot,166.0,1934-09-28,,Female
8,John Belushi,John Adam Belushi,173.0,1949-01-24,1982-03-05,Male
13,Ingmar Bergman,Ernst Ingmar Bergman,179.0,1918-07-14,2007-07-30,Male
...,...,...,...,...,...,...
380830,Warisara Yu,Warisara Yu,,,,Female
380831,Apiwich Reardon,Apiwich Reardon,,,,Male
380836,Divyansha Kaushik,Divyansha Kaushik,,,,Female
380840,McMagic Cardenas,McMagic Cardenas,,,,Male


In [12]:
actors_clean['birth_date'] = pd.to_datetime(actors_clean['birth_date'], format='%Y-%m-%d', errors='coerce')
actors_clean['death_date'] = pd.to_datetime(actors_clean['death_date'], format='%Y-%m-%d', errors='coerce')
#actors_clean.dropna()


In [13]:
actors_clean.info()
#actors_clean.to_csv("../Resources/ActorsClean.csv", encoding='utf-8', index=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96416 entries, 1 to 380843
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   name        96416 non-null  object        
 1   birth_name  96416 non-null  object        
 2   height      27444 non-null  float64       
 3   birth_date  50793 non-null  datetime64[ns]
 4   death_date  15643 non-null  datetime64[ns]
 5   gender      96416 non-null  object        
dtypes: datetime64[ns](2), float64(1), object(3)
memory usage: 5.1+ MB


### Extract actor facebook likes from metadata csv file


In [14]:
# Import the metadata CSV files

metadata_file = "../etl_project_data/movie_metadata.csv"
metadata_df = pd.read_csv(metadata_file)

In [15]:
 ## Create a filtered dataframe from specific columns

metadata_cols = ["actor_1_name","actor_1_facebook_likes"]

metadata_transformed= metadata_df[metadata_cols].copy()

In [16]:
# Rename the column to name
metadata_transformed = metadata_transformed.rename(columns={"actor_1_name":"name", "actor_1_facebook_likes" : "actor_facebook_likes"}).copy()
metadata_transformed

Unnamed: 0,name,actor_facebook_likes
0,CCH Pounder,1000.0
1,Johnny Depp,40000.0
2,Christoph Waltz,11000.0
3,Tom Hardy,27000.0
4,Doug Walker,131.0
...,...,...
5038,Eric Mabius,637.0
5039,Natalie Zea,841.0
5040,Eva Boehnke,0.0
5041,Alan Ruck,946.0


In [17]:
## left join to get actor facebook likes

actor_FB_merge = pd.merge(left=actors_clean, right=metadata_transformed, how='left', left_on='name', right_on='name')
actor_FB_merge

Unnamed: 0,name,birth_name,height,birth_date,death_date,gender,actor_facebook_likes
0,Fred Astaire,Frederic Austerlitz Jr.,177.0,1899-05-10,1987-06-22,Male,
1,Lauren Bacall,Betty Joan Perske,174.0,1924-09-16,2014-08-12,Female,
2,Brigitte Bardot,Brigitte Bardot,166.0,1934-09-28,NaT,Female,984.0
3,John Belushi,John Adam Belushi,173.0,1949-01-24,1982-03-05,Male,1000.0
4,John Belushi,John Adam Belushi,173.0,1949-01-24,1982-03-05,Male,1000.0
...,...,...,...,...,...,...,...
99592,Warisara Yu,Warisara Yu,,NaT,NaT,Female,
99593,Apiwich Reardon,Apiwich Reardon,,NaT,NaT,Male,
99594,Divyansha Kaushik,Divyansha Kaushik,,NaT,NaT,Female,
99595,McMagic Cardenas,McMagic Cardenas,,NaT,NaT,Male,


In [18]:
actor_FB_merge['actor_id'] = range(1, len(actor_FB_merge)+1)
actor_FB_merge.tail()


Unnamed: 0,name,birth_name,height,birth_date,death_date,gender,actor_facebook_likes,actor_id
99592,Warisara Yu,Warisara Yu,,NaT,NaT,Female,,99593
99593,Apiwich Reardon,Apiwich Reardon,,NaT,NaT,Male,,99594
99594,Divyansha Kaushik,Divyansha Kaushik,,NaT,NaT,Female,,99595
99595,McMagic Cardenas,McMagic Cardenas,,NaT,NaT,Male,,99596
99596,William Plyler,William Plyler,,NaT,NaT,Male,,99597


In [19]:
#actor_FB_merge.to_csv("../Resources/actor_FB_merge.csv", encoding='utf-8', index=True)

# Create dataframe for Movie_actor table

In [20]:
# Import IMDB Movies data
xlsx_file = "../etl_project_data/IMDb movies.csv"
movies_data_df = pd.read_csv(xlsx_file)
movies_data_df.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
1,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.9,171,,,,,4.0,2.0
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,420,$ 45000,,,,24.0,3.0
3,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2019,,,,,28.0,14.0
4,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,...,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,438,,,,,12.0,5.0


In [21]:
#filter data from 2000 to 2019
movies_data_df.sort_values(by=['year'], ascending=True, inplace=True)
movies_filtered= movies_data_df[~(movies_data_df['year'] < 2000)]
movies_filtered

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
32874,tt0162711,Trixie,Trixie,2000,2000-08-18,"Comedy, Crime, Mystery",116,USA,English,Alan Rudolph,...,"Emily Watson, Dermot Mulroney, Nick Nolte, Nat...","An eccentric, unconventional woman whose naive...",5.0,1376,,$ 295683,$ 295683,26.0,35.0,37.0
32948,tt0163983,Bless the Child,Bless the Child,2000,2001-01-05,"Crime, Drama, Horror",107,"USA, Germany",English,Chuck Russell,...,"Kim Basinger, Jimmy Smits, Holliston Coleman, ...","Cody, a little girl abandoned by her mother an...",5.1,13440,$ 65000000,$ 29381494,$ 40443010,17.0,193.0,103.0
35035,tt0201726,The Last Producer,The Last Producer,2000,2000-08-22,"Comedy, Drama",90,USA,English,Burt Reynolds,...,"Sean Astin, David Atkinson, Leslie Bega, Benja...",An old-time mogul struggles to reenter the clu...,4.7,265,,,,,9.0,3.0
35036,tt0201737,Lost in the Pershing Point Hotel,Lost in the Pershing Point Hotel,2000,2000-06-09,"Comedy, Drama",107,USA,English,Julia Jay Pierrepont III,...,"Leslie Jordan, Erin Chandler, Mark Pellegrino,...",Taken from the life story of Leslie Jordan. A ...,4.5,161,,,,,2.0,2.0
38686,tt0279809,Fatal Conflict,Fatal Conflict,2000,2000-11-10,"Action, Drama, Sci-Fi",92,"Czech Republic, Canada",English,Lloyd A. Simandl,...,"Kari Wuhrer, Jennifer Rubin, Leo Rossi, Miles ...",An ex-star pilot must stop a mad criminal and ...,3.6,220,,,,,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78770,tt7246106,Artik,Artik,2019,2019-08-11,"Horror, Thriller",78,,,Tom Botchii Skowronski,...,"Chase Williamson, Jerry G. Angelo, Lauren Ashl...",A comic book obsessed serial killer teaches hi...,4.0,283,,,,,16.0,20.0
78735,tt7225434,Kaaviyyan,Kaaviyyan,2019,2019-10-18,Thriller,109,India,Tamil,Sarathy,...,"L. Srinath, Athmeeya Rajan, Shaam, Sridevi","While in the US for training, Tamil Nadu cop A...",5.6,128,,,,,105.0,1.0
78707,tt7216068,See You Soon,See You Soon,2019,2019-07-26,"Drama, Romance",107,USA,English,David Mahmoudieh,...,"Harvey Keitel, Liam McIntyre, Poppy Drayton, O...",A U.S. soccer star suffers a career-threatenin...,4.7,265,,,$ 277126,29.0,4.0,3.0
78856,tt7294400,Mirreyes contra Godinez,Mirreyes contra Godinez,2019,2019-01-25,Comedy,109,Mexico,Spanish,Chava Cartas,...,"Diana Bovio, Pablo Lyle, Roberto Aguire, Danie...","Tells the story of Genaro Rodríguez, a young g...",5.2,846,MXN 28000000,,$ 11940579,,4.0,1.0


In [22]:
# Create a filtered dataframe from specific columns
actormovie_cols = ["imdb_title_id", "title", "actors"]

actormovie_transformed= movies_filtered[actormovie_cols].copy()

actormovie_transformed.head()

Unnamed: 0,imdb_title_id,title,actors
32874,tt0162711,Trixie,"Emily Watson, Dermot Mulroney, Nick Nolte, Nat..."
32948,tt0163983,Bless the Child,"Kim Basinger, Jimmy Smits, Holliston Coleman, ..."
35035,tt0201726,The Last Producer,"Sean Astin, David Atkinson, Leslie Bega, Benja..."
35036,tt0201737,Lost in the Pershing Point Hotel,"Leslie Jordan, Erin Chandler, Mark Pellegrino,..."
38686,tt0279809,Fatal Conflict,"Kari Wuhrer, Jennifer Rubin, Leo Rossi, Miles ..."


In [23]:
# Comma separated value in the column transform into multiple rows

actormoviecleancsv= actormovie_transformed.set_index('imdb_title_id').actors.str.split(', ', expand=True).stack().reset_index('imdb_title_id')
actormoviecleancsv.head()

Unnamed: 0,imdb_title_id,0
0,tt0162711,Emily Watson
1,tt0162711,Dermot Mulroney
2,tt0162711,Nick Nolte
3,tt0162711,Nathan Lane
4,tt0162711,Brittany Murphy


In [24]:
actormoviecleancsv.info()
#actormoviecleancsv.to_csv("../Resources/ActorMovieClean.csv", encoding='utf-8', index=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 591357 entries, 0 to 14
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   imdb_title_id  591357 non-null  object
 1   0              591357 non-null  object
dtypes: object(2)
memory usage: 13.5+ MB


In [25]:
# Rename column name

actormovie_renamed= actormoviecleancsv.rename(columns={0:"name"}).copy()


actormovie_renamed

Unnamed: 0,imdb_title_id,name
0,tt0162711,Emily Watson
1,tt0162711,Dermot Mulroney
2,tt0162711,Nick Nolte
3,tt0162711,Nathan Lane
4,tt0162711,Brittany Murphy
...,...,...
10,tt9914286,Arcan Bunial
11,tt9914286,Seval Hislisoy
12,tt9914286,Ergül Çolakoglu
13,tt9914286,Gülçin Ugur


In [26]:
# merge with actor table table to get actor ID 

actormovie_merge= pd.merge(actormovie_renamed,actor_FB_merge, on="name")
actormovie_merge

Unnamed: 0,imdb_title_id,name,birth_name,height,birth_date,death_date,gender,actor_facebook_likes,actor_id
0,tt0162711,Emily Watson,Emily Margaret Watson,173.0,1967-01-14,NaT,Female,876.0,3208
1,tt0162711,Emily Watson,Emily Margaret Watson,173.0,1967-01-14,NaT,Female,876.0,3209
2,tt0162711,Emily Watson,Emily Margaret Watson,173.0,1967-01-14,NaT,Female,876.0,3210
3,tt0162711,Emily Watson,Emily Margaret Watson,173.0,1967-01-14,NaT,Female,876.0,3211
4,tt0211492,Emily Watson,Emily Margaret Watson,173.0,1967-01-14,NaT,Female,876.0,3208
...,...,...,...,...,...,...,...,...,...
388776,tt7272948,Scarlett Davies,Scarlett Davies,,NaT,NaT,Female,,98880
388777,tt7272948,Maria Louis,Maria Louis,,NaT,NaT,Female,,91322
388778,tt7262990,Kherrington Briggs,Kherrington Briggs,,NaT,NaT,Female,,98952
388779,tt7262990,Mai Brunelle,Mai Brunelle,200.0,NaT,NaT,Female,,96079


In [27]:
actormovie_merge.drop(['birth_name', 'height', 'birth_date', 'death_date', 'actor_facebook_likes'], axis=1, inplace=True)
actormovie_merge

Unnamed: 0,imdb_title_id,name,gender,actor_id
0,tt0162711,Emily Watson,Female,3208
1,tt0162711,Emily Watson,Female,3209
2,tt0162711,Emily Watson,Female,3210
3,tt0162711,Emily Watson,Female,3211
4,tt0211492,Emily Watson,Female,3208
...,...,...,...,...
388776,tt7272948,Scarlett Davies,Female,98880
388777,tt7272948,Maria Louis,Female,91322
388778,tt7262990,Kherrington Briggs,Female,98952
388779,tt7262990,Mai Brunelle,Female,96079


In [28]:
#actormovie_merge.to_csv("../Resources/actormovie_merge.csv", encoding='utf-8', index=True)

### Create database connection

In [29]:
connection_string = f'{username}:{password}@localhost:5432/etl_project'
engine = create_engine(f'postgresql://{connection_string}')

In [30]:
select_imdb_movies_df = pd.read_sql_query('select * from "movie"',con=engine)

In [31]:
## Extract data from movies table

#select_imdb_movies_df = pd.read_csv("../Resources/movie_output.csv")
select_imdb_movies_df.head()

Unnamed: 0,movie_id,title,year,duration,budget,worlwide_gross_income,netflix,hulu,prime,disney,imdb_title_id
0,1,Trixie,2000,116,,295683.0,False,False,False,False,tt0162711
1,2,Bless the Child,2000,107,65000000.0,40443010.0,False,False,True,False,tt0163983
2,3,The Last Producer,2000,90,,,False,False,False,False,tt0201726
3,4,Lost in the Pershing Point Hotel,2000,107,,,False,False,False,False,tt0201737
4,5,Fatal Conflict,2000,92,,,False,False,True,False,tt0279809


In [32]:
#merge dataframes on "IMDB title id"
actormovie_final= pd.merge(select_imdb_movies_df,actormovie_merge, on="imdb_title_id").copy()
actormovie_final

Unnamed: 0,movie_id,title,year,duration,budget,worlwide_gross_income,netflix,hulu,prime,disney,imdb_title_id,name,gender,actor_id
0,1,Trixie,2000,116,,295683.0,False,False,False,False,tt0162711,Emily Watson,Female,3208
1,1,Trixie,2000,116,,295683.0,False,False,False,False,tt0162711,Emily Watson,Female,3209
2,1,Trixie,2000,116,,295683.0,False,False,False,False,tt0162711,Emily Watson,Female,3210
3,1,Trixie,2000,116,,295683.0,False,False,False,False,tt0162711,Emily Watson,Female,3211
4,1,Trixie,2000,116,,295683.0,False,False,False,False,tt0162711,Dermot Mulroney,Male,1604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388776,43881,Sokagin Çocuklari,2019,98,,2833.0,False,False,False,False,tt9914286,Orhan Aydin,Male,58505
388777,43881,Sokagin Çocuklari,2019,98,,2833.0,False,False,False,False,tt9914286,Orhan Aydin,Male,58506
388778,43881,Sokagin Çocuklari,2019,98,,2833.0,False,False,False,False,tt9914286,Orhan Aydin,Male,58507
388779,43881,Sokagin Çocuklari,2019,98,,2833.0,False,False,False,False,tt9914286,Orhan Aydin,Male,58508


In [33]:
actormovie_final= actormovie_final[["movie_id","actor_id"]]


In [34]:
actormovie_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 388781 entries, 0 to 388780
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   movie_id  388781 non-null  int64
 1   actor_id  388781 non-null  int32
dtypes: int32(1), int64(1)
memory usage: 7.4 MB


In [35]:
# Confirm tables
engine.table_names()

['movie',
 'movie_actor',
 'actor',
 'lang_movie',
 'lang',
 'movie_genre',
 'genre',
 'movie_country',
 'country',
 'movie_drt',
 'director',
 'rating']

### Load DataFrames into database

In [36]:
actor_FB_merge.to_sql(name='actor', con=engine, if_exists='append', index=False)

In [38]:
actormovie_final.to_sql(name='movie_actor', con=engine, if_exists='append', index=False)