In [1]:
import pandas as pd
import os
from sqlalchemy import create_engine
import sys
sys.path.append("Resources/")
import config as c
import warnings
warnings.filterwarnings('ignore')

In [2]:
IMDB_movies = os.path.join("Resources/IMDb_movies.csv")

#create dataframe
IMDB_movies_df = pd.read_csv(IMDB_movies, low_memory=False)

#print data 
IMDB_movies_df.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,12/26/06,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,"$2,250",,,,7.0,7.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,8/19/11,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,,,5.0,2.0
3,tt0002101,Cleopatra,Cleopatra,1912,11/13/12,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,"$45,000",,,,25.0,3.0
4,tt0002130,L'Inferno,L'Inferno,1911,3/6/11,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2237,,,,,31.0,14.0


In [3]:
#select needed columns
imbd_df = IMDB_movies_df[['imdb_title_id','title','original_title','year','genre','duration','country']]
imbd_df.head()

Unnamed: 0,imdb_title_id,title,original_title,year,genre,duration,country
0,tt0000009,Miss Jerry,Miss Jerry,1894,Romance,45,USA
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",70,Australia
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,Drama,53,"Germany, Denmark"
3,tt0002101,Cleopatra,Cleopatra,1912,"Drama, History",100,USA
4,tt0002130,L'Inferno,L'Inferno,1911,"Adventure, Drama, Fantasy",68,Italy


In [4]:
#remove any null values 
imbd_df2=imbd_df.dropna()
imbd_df2.head()

Unnamed: 0,imdb_title_id,title,original_title,year,genre,duration,country
0,tt0000009,Miss Jerry,Miss Jerry,1894,Romance,45,USA
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",70,Australia
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,Drama,53,"Germany, Denmark"
3,tt0002101,Cleopatra,Cleopatra,1912,"Drama, History",100,USA
4,tt0002130,L'Inferno,L'Inferno,1911,"Adventure, Drama, Fantasy",68,Italy


In [5]:
#add a new column to dataframe that combines the year and movie title and for new movie_id
imbd_df2.insert(5,'unique_id','')
imbd_df2.insert(6, 'movie_title','')
imbd_df2.head()

Unnamed: 0,imdb_title_id,title,original_title,year,genre,unique_id,movie_title,duration,country
0,tt0000009,Miss Jerry,Miss Jerry,1894,Romance,,,45,USA
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",,,70,Australia
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,Drama,,,53,"Germany, Denmark"
3,tt0002101,Cleopatra,Cleopatra,1912,"Drama, History",,,100,USA
4,tt0002130,L'Inferno,L'Inferno,1911,"Adventure, Drama, Fantasy",,,68,Italy


In [7]:
imbd_df2.dtypes

imdb_title_id     object
title             object
original_title    object
year               int64
genre             object
unique_id         object
movie_title       object
duration           int64
country           object
dtype: object

In [10]:
#concat year and title to create formatted movie_title
#find out what error message means
imbd_df2['movie_title']=imbd_df2['year'].astype(str).str.cat(imbd_df2['title'],sep="-")
imbd_df2.head()

Unnamed: 0,imdb_title_id,title,original_title,year,genre,unique_id,movie_title,duration,country
0,tt0000009,Miss Jerry,Miss Jerry,1894,Romance,,1894-Miss Jerry,45,USA
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",,1906-The Story of the Kelly Gang,70,Australia
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,Drama,,1911-Den sorte drøm,53,"Germany, Denmark"
3,tt0002101,Cleopatra,Cleopatra,1912,"Drama, History",,1912-Cleopatra,100,USA
4,tt0002130,L'Inferno,L'Inferno,1911,"Adventure, Drama, Fantasy",,1911-L'Inferno,68,Italy


In [11]:
#use hash function to generate unique movie id based on movie title
imbd_df2['unique_id']=imbd_df2['imdb_title_id'].apply(hash)
imbd_df2.head()

Unnamed: 0,imdb_title_id,title,original_title,year,genre,unique_id,movie_title,duration,country
0,tt0000009,Miss Jerry,Miss Jerry,1894,Romance,2715549467657256263,1894-Miss Jerry,45,USA
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",-2643545104634104423,1906-The Story of the Kelly Gang,70,Australia
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,Drama,1441223756164931503,1911-Den sorte drøm,53,"Germany, Denmark"
3,tt0002101,Cleopatra,Cleopatra,1912,"Drama, History",-7003542871708066215,1912-Cleopatra,100,USA
4,tt0002130,L'Inferno,L'Inferno,1911,"Adventure, Drama, Fantasy",2714259647975847896,1911-L'Inferno,68,Italy


In [12]:
#rearrange column order
imdb=imbd_df2.reindex(columns=['unique_id','movie_title','imdb_title_id',
                                  'title','original_title','year',
                                  'genre','duration','country'])
imdb.head()

Unnamed: 0,unique_id,movie_title,imdb_title_id,title,original_title,year,genre,duration,country
0,2715549467657256263,1894-Miss Jerry,tt0000009,Miss Jerry,Miss Jerry,1894,Romance,45,USA
1,-2643545104634104423,1906-The Story of the Kelly Gang,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",70,Australia
2,1441223756164931503,1911-Den sorte drøm,tt0001892,Den sorte drøm,Den sorte drøm,1911,Drama,53,"Germany, Denmark"
3,-7003542871708066215,1912-Cleopatra,tt0002101,Cleopatra,Cleopatra,1912,"Drama, History",100,USA
4,2714259647975847896,1911-L'Inferno,tt0002130,L'Inferno,L'Inferno,1911,"Adventure, Drama, Fantasy",68,Italy


In [13]:
imdb.dtypes

unique_id          int64
movie_title       object
imdb_title_id     object
title             object
original_title    object
year               int64
genre             object
duration           int64
country           object
dtype: object

In [14]:
imdb.set_index("unique_id", inplace=True)
imdb

Unnamed: 0_level_0,movie_title,imdb_title_id,title,original_title,year,genre,duration,country
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2715549467657256263,1894-Miss Jerry,tt0000009,Miss Jerry,Miss Jerry,1894,Romance,45,USA
-2643545104634104423,1906-The Story of the Kelly Gang,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",70,Australia
1441223756164931503,1911-Den sorte drøm,tt0001892,Den sorte drøm,Den sorte drøm,1911,Drama,53,"Germany, Denmark"
-7003542871708066215,1912-Cleopatra,tt0002101,Cleopatra,Cleopatra,1912,"Drama, History",100,USA
2714259647975847896,1911-L'Inferno,tt0002130,L'Inferno,L'Inferno,1911,"Adventure, Drama, Fantasy",68,Italy
...,...,...,...,...,...,...,...,...
-7510570307468690752,2020-Le lion,tt9908390,Le lion,Le lion,2020,Comedy,95,"France, Belgium"
-395167703959321959,2020-De Beentjes van Sint-Hildegard,tt9911196,De Beentjes van Sint-Hildegard,De Beentjes van Sint-Hildegard,2020,"Comedy, Drama",103,Netherlands
-6924814916534830000,2019-Padmavyuhathile Abhimanyu,tt9911774,Padmavyuhathile Abhimanyu,Padmavyuhathile Abhimanyu,2019,Drama,130,India
-5056261346096702544,2019-Sokagin Çocuklari,tt9914286,Sokagin Çocuklari,Sokagin Çocuklari,2019,"Drama, Family",98,Turkey


In [15]:
#create connection to database
imdb_connection = f"postgresql://{c.username}:{c.password}@localhost:5433/imdb_db"

#set engine
engine = create_engine(imdb_connection)

In [16]:
#verify engine connection
engine

Engine(postgresql://etladmin:***@localhost:5433/imdb_db)

In [None]:
#imdb.to_csv("Resources/Movies_gb.csv")

In [17]:
#export to csv 
imdb.to_sql(name="imdb_movies", con=engine, if_exists="append", index=True, method="multi")

In [None]:
pd.read_sql("Select * from imdb_movies", con=engine)