In [1]:
import pandas as pd
from sqlalchemy import create_engine
from secret import username, password

### Extract CSVs into DataFrames

In [2]:
#read IMBD Movies excel file
movies_file = "etl_project_data/IMBD Movies.xlsx"
movies_data_df = pd.read_excel(movies_file)
movies_data_df.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
1,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.9,171,,,,,4.0,2.0
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,420,$ 45000,,,,24.0,3.0
3,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2019,,,,,28.0,14.0
4,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,...,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,438,,,,,12.0,5.0


In [3]:
#filter data from 2000 to 2019
movies_data_df.sort_values(by=['year'], ascending=True, inplace=True)
movies_filtered= movies_data_df[~(movies_data_df['year'] < 2000)]
movies_filtered

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
32874,tt0162711,Trixie,Trixie,2000,2000-08-18,"Comedy, Crime, Mystery",116,USA,English,Alan Rudolph,...,"Emily Watson, Dermot Mulroney, Nick Nolte, Nat...","An eccentric, unconventional woman whose naive...",5.0,1376,,295683.0,295683.0,26.0,35.0,37.0
32948,tt0163983,Bless the Child,Bless the Child,2000,2001-01-05,"Crime, Drama, Horror",107,"USA, Germany",English,Chuck Russell,...,"Kim Basinger, Jimmy Smits, Holliston Coleman, ...","Cody, a little girl abandoned by her mother an...",5.1,13440,$ 65000000,29381494.0,40443010.0,17.0,193.0,103.0
35035,tt0201726,The Last Producer,The Last Producer,2000,2000-08-22,"Comedy, Drama",90,USA,English,Burt Reynolds,...,"Sean Astin, David Atkinson, Leslie Bega, Benja...",An old-time mogul struggles to reenter the clu...,4.7,265,,,,,9.0,3.0
35036,tt0201737,Lost in the Pershing Point Hotel,Lost in the Pershing Point Hotel,2000,2000-06-09,"Comedy, Drama",107,USA,English,Julia Jay Pierrepont III,...,"Leslie Jordan, Erin Chandler, Mark Pellegrino,...",Taken from the life story of Leslie Jordan. A ...,4.5,161,,,,,2.0,2.0
38686,tt0279809,Fatal Conflict,Fatal Conflict,2000,2000-11-10,"Action, Drama, Sci-Fi",92,"Czech Republic, Canada",English,Lloyd A. Simandl,...,"Kari Wuhrer, Jennifer Rubin, Leo Rossi, Miles ...",An ex-star pilot must stop a mad criminal and ...,3.6,220,,,,,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78770,tt7246106,Artik,Artik,2019,2019-08-11,"Horror, Thriller",78,,,Tom Botchii Skowronski,...,"Chase Williamson, Jerry G. Angelo, Lauren Ashl...",A comic book obsessed serial killer teaches hi...,4.0,283,,,,,16.0,20.0
78735,tt7225434,Kaaviyyan,Kaaviyyan,2019,2019-10-18,Thriller,109,India,Tamil,Sarathy,...,"L. Srinath, Athmeeya Rajan, Shaam, Sridevi","While in the US for training, Tamil Nadu cop A...",5.6,128,,,,,105.0,1.0
78707,tt7216068,See You Soon,See You Soon,2019,2019-07-26,"Drama, Romance",107,USA,English,David Mahmoudieh,...,"Harvey Keitel, Liam McIntyre, Poppy Drayton, O...",A U.S. soccer star suffers a career-threatenin...,4.7,265,,,277126.0,29.0,4.0,3.0
78856,tt7294400,Mirreyes contra Godinez,Mirreyes contra Godinez,2019,2019-01-25,Comedy,109,Mexico,Spanish,Chava Cartas,...,"Diana Bovio, Pablo Lyle, Roberto Aguire, Danie...","Tells the story of Genaro Rodríguez, a young g...",5.2,846,MXN 28000000,,11940579.0,,4.0,1.0


### Transform to create Genre dataframe

In [4]:
# Create a filtered dataframe from specific columns
genre_cols = ["imdb_title_id", "genre"]

genre_transformed= movies_filtered[genre_cols].copy()

genre_transformed.head()

Unnamed: 0,imdb_title_id,genre
32874,tt0162711,"Comedy, Crime, Mystery"
32948,tt0163983,"Crime, Drama, Horror"
35035,tt0201726,"Comedy, Drama"
35036,tt0201737,"Comedy, Drama"
38686,tt0279809,"Action, Drama, Sci-Fi"


In [5]:
# Comma separated value in the column transform into multiple rows

genrecleancsv= genre_transformed.set_index('imdb_title_id').genre.str.split(',', expand=True).stack().reset_index('imdb_title_id')
genrecleancsv.head()

Unnamed: 0,imdb_title_id,0
0,tt0162711,Comedy
1,tt0162711,Crime
2,tt0162711,Mystery
0,tt0163983,Crime
1,tt0163983,Drama


In [6]:
genrecleancsv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88468 entries, 0 to 1
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   imdb_title_id  88468 non-null  object
 1   0              88468 non-null  object
dtypes: object(2)
memory usage: 2.0+ MB


In [7]:
# Rename column name

genre_renamed= genrecleancsv.rename(columns={0:"name"}).copy()
genre_renamed= genre_renamed["name"].to_frame()

genre_renamed

Unnamed: 0,name
0,Comedy
1,Crime
2,Mystery
0,Crime
1,Drama
...,...
0,Drama
1,Romance
0,Comedy
0,Drama


In [8]:
# Drop duplicate values 

genre_df= genre_renamed.groupby(["name"]).count().copy()
genre_df= genre_df.reset_index()
genre_df


Unnamed: 0,name
0,Action
1,Adventure
2,Animation
3,Biography
4,Comedy
5,Crime
6,Drama
7,Family
8,Fantasy
9,Game-Show


In [9]:
# Create unique ID for Gnere table

genre_df['genre_id'] = range(1, len(genre_df)+1)



In [10]:
genre_df

Unnamed: 0,name,genre_id
0,Action,1
1,Adventure,2
2,Animation,3
3,Biography,4
4,Comedy,5
5,Crime,6
6,Drama,7
7,Family,8
8,Fantasy,9
9,Game-Show,10


### Create dataframe for Movie Genre table

In [11]:
## Extract data from movies table

select_imdb_movies_df = pd.read_csv("Resources/movie_output.csv")
select_imdb_movies_df.head()

Unnamed: 0.1,Unnamed: 0,title,year,duration,budget,worlwide_gross_income,imdb_title_id,movie_id
0,32874,Trixie,2000,116,,295683.0,tt0162711,1
1,32948,Bless the Child,2000,107,$ 65000000,40443010.0,tt0163983,2
2,35035,The Last Producer,2000,90,,,tt0201726,3
3,35036,Lost in the Pershing Point Hotel,2000,107,,,tt0201737,4
4,38686,Fatal Conflict,2000,92,,,tt0279809,5


In [12]:
genrecleancsv

Unnamed: 0,imdb_title_id,0
0,tt0162711,Comedy
1,tt0162711,Crime
2,tt0162711,Mystery
0,tt0163983,Crime
1,tt0163983,Drama
...,...,...
0,tt7216068,Drama
1,tt7216068,Romance
0,tt7294400,Comedy
0,tt9914286,Drama


In [13]:
#merge dataframes on "IMDB title id"
genre_merge= pd.merge(select_imdb_movies_df,genrecleancsv, on="imdb_title_id").copy()
genre_merge= genre_merge[["movie_id","title",0]]
genre_merge.tail(15)

Unnamed: 0,movie_id,title,0
88453,43874,Deadcon,Horror
88454,43875,Student of the Year 2,Drama
88455,43875,Student of the Year 2,Romance
88456,43875,Student of the Year 2,Sport
88457,43876,Lucky Day,Action
88458,43876,Lucky Day,Crime
88459,43876,Lucky Day,Thriller
88460,43877,Artik,Horror
88461,43877,Artik,Thriller
88462,43878,Kaaviyyan,Thriller


In [14]:
#rename column 0 to "name"

genre_merge.rename(columns={0:"name"}, inplace= True)
genre_merge

Unnamed: 0,movie_id,title,name
0,1,Trixie,Comedy
1,1,Trixie,Crime
2,1,Trixie,Mystery
3,2,Bless the Child,Crime
4,2,Bless the Child,Drama
...,...,...,...
88463,43879,See You Soon,Drama
88464,43879,See You Soon,Romance
88465,43880,Mirreyes contra Godinez,Comedy
88466,43881,Sokagin Çocuklari,Drama


In [15]:
# merge with genre table to get genre ID

Movie_genre_final= pd.merge(genre_merge,genre_df, on="name")


In [16]:
Movie_genre_final.drop(['title', 'name'], axis=1, inplace=True)

In [17]:
Movie_genre_final.head(30)

Unnamed: 0,movie_id,genre_id
0,1,28
1,3,28
2,4,28
3,10,28
4,14,28
5,15,28
6,17,28
7,21,28
8,24,28
9,25,28


### Create database connection

In [18]:
connection_string = f'{username}:{password}@localhost:5432/ETL_project_movies'
engine = create_engine(f'postgresql://{connection_string}')

In [19]:
# Confirm tables
engine.table_names()

['movie',
 'movie_actor',
 'actor',
 'lang_movie',
 'lang',
 'movie_genre',
 'genre',
 'movie_country',
 'country',
 'movie_drt',
 'director',
 'rating']

### Load DataFrames into database

In [20]:
#Load genere table to Database

genre_df.to_sql(name='genre', con=engine, if_exists='append', index=False)

In [21]:
# Load Movie genere table to database
Movie_genre_final.to_sql(name='movie_genre', con=engine, if_exists='append', index=False)