In [1]:
# import all the things
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

## Load Data

In [2]:
# Relative path of the Golden Globes nominations CSV
gg_file = "Resources/golden_globe_awards.csv"

# Load the raw nominations table and preview its first rows
gg_df_orig = pd.read_csv(filepath_or_buffer=gg_file)
gg_df_orig.head()

Unnamed: 0,year_film,year_award,ceremony,category,nominee,film,win
0,1943,1944,1,Best Performance by an Actress in a Supporting...,Katina Paxinou,For Whom The Bell Tolls,True
1,1943,1944,1,Best Performance by an Actor in a Supporting R...,Akim Tamiroff,For Whom The Bell Tolls,True
2,1943,1944,1,Best Director - Motion Picture,Henry King,The Song Of Bernadette,True
3,1943,1944,1,Picture,The Song Of Bernadette,,True
4,1943,1944,1,Actress In A Leading Role,Jennifer Jones,The Song Of Bernadette,True


In [3]:
# Relative path of the movies CSV
movies_file = "Resources/AllMoviesDetailsCleaned.csv"

# Read the semicolon-delimited movies file and preview its first rows.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The saved output of this cell contains the tail of
# a warning raised during the read, which is consistent with pandas'
# mixed-type DtypeWarning; whole-file inference avoids it.
movies_df_orig = pd.read_csv(movies_file, sep=';', low_memory=False)
movies_df_orig.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,budget,genres,imdb_id,original_language,original_title,overview,popularity,production_companies,production_countries,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,production_companies_number,production_countries_number,spoken_languages_number
0,2,0,Drama|Crime,tt0094675,fi,Ariel,Taisto Kasurinen is a Finnish coal miner whose...,0.823904,Villealfa Filmproduction Oy,Finland,...,69.0,suomi,Released,,Ariel,7.1,40,2,1,2
1,3,0,Drama|Comedy,tt0092149,fi,Varjoja paratiisissa,"An episode in the life of Nikander, a garbage ...",0.47445,Villealfa Filmproduction Oy,Finland,...,76.0,English,Released,,Shadows in Paradise,7.0,32,1,1,3
2,5,4000000,Crime|Comedy,tt0113101,en,Four Rooms,It's Ted the Bellhop's first night on the job....,1.698,Miramax Films,United States of America,...,98.0,English,Released,Twelve outrageous guests. Four scandalous requ...,Four Rooms,6.5,485,2,1,1
3,6,0,Action|Thriller|Crime,tt0107286,en,Judgment Night,"While racing to a boxing match, Frank, Mike, J...",1.32287,Universal Pictures,Japan,...,110.0,English,Released,Don't move. Don't whisper. Don't even breathe.,Judgment Night,6.5,69,3,2,1
4,8,42000,Documentary,tt0825671,en,Life in Loops (A Megacities RMX),Timo Novotny labels his new project an experim...,0.054716,inLoops,Austria,...,80.0,English,Released,A Megacities remix.,Life in Loops (A Megacities RMX),6.4,4,1,1,5


## Transform Data

### Golden Globes

In [4]:
# Remove the ceremony column only; year_film stays because later cells
# filter and merge on it
gg_df = gg_df_orig.drop(columns=["ceremony"])
gg_df.head()

Unnamed: 0,year_film,year_award,category,nominee,film,win
0,1943,1944,Best Performance by an Actress in a Supporting...,Katina Paxinou,For Whom The Bell Tolls,True
1,1943,1944,Best Performance by an Actor in a Supporting R...,Akim Tamiroff,For Whom The Bell Tolls,True
2,1943,1944,Best Director - Motion Picture,Henry King,The Song Of Bernadette,True
3,1943,1944,Picture,The Song Of Bernadette,,True
4,1943,1944,Actress In A Leading Role,Jennifer Jones,The Song Of Bernadette,True


In [5]:
# Substrings of category names to exclude so that only movie-based categories
# remain. Matching is a case-sensitive substring test on the category text.
NON_MOVIE_CATEGORY_KEYWORDS = [
    'Actor', 'Actress', 'Director', 'International', 'Special', 'Screenplay',
    'Song', 'Score', 'Cinematography', 'Foreign', 'Star', 'Juvenile',
    'Color', 'Henrietta', 'Cecil', 'Documentary', 'Television', 'Hollywood',
]

# One alternation pattern replaces the chain of per-keyword filters
# (none of the keywords contains a regex metacharacter).
non_movie_pattern = '|'.join(NON_MOVIE_CATEGORY_KEYWORDS)

# na=True marks rows with a missing category as "non movie" so they are
# dropped, exactly as the previous `str.contains(...) == False` filters did.
is_non_movie = (
    gg_df['category']
    .str.contains(non_movie_pattern, na=True)
    .astype(bool)
)

# For the remaining categories the film title is stored in the nominee
# column (the film column is empty in the preview), so nominee replaces film.
gg_movies_df = (
    gg_df[~is_non_movie]
    .drop(columns="film")
    .rename(columns={"nominee": "film"})
)

# drop films from 2017 onwards (keep year_film <= 2016)
gg_movies_df = gg_movies_df[gg_movies_df['year_film'] <= 2016]

gg_movies_df.head()

Unnamed: 0,year_film,year_award,category,film,win
3,1943,1944,Picture,The Song Of Bernadette,True
9,1944,1945,Picture,Going My Way,True
16,1945,1946,Picture,The Lost Weekend,True
24,1946,1947,Picture,The Best Years Of Our Lives,True
35,1947,1948,Picture,Gentleman's Agreement,True


In [6]:
def _normalize_gg_title(raw_title):
    """Return the lower-cased matching key for a Golden Globes film title.

    Normalization steps, applied in order:
      1. lower-case the title (so the article checks are case-insensitive,
         like the movies-side cleaning, which lower-cases first);
      2. remove a trailing ", the";
      3. remove a leading "the ";
      4. move a trailing ", a" / ", an" to the front ("beautiful mind, a"
         becomes "a beautiful mind"). The movies table keeps a leading
         "a"/"an" in its normalized title, so simply deleting the article
         here would make such titles impossible to match.

    Values that are not strings (e.g. NaN) are returned unchanged.
    """
    if not isinstance(raw_title, str):
        return raw_title

    title = raw_title.lower()
    if title.endswith(", the"):
        title = title[:-len(", the")]
    if title.startswith("the "):
        title = title[len("the "):]
    for article in ("a", "an"):
        inverted_suffix = ", " + article
        if title.endswith(inverted_suffix):
            title = article + " " + title[:-len(inverted_suffix)]
    return title


# Keep the original spelling under old_film_title. The guard makes the cell
# safe to re-run: without it a second run would rename the already-cleaned
# film column and create a duplicate old_film_title column.
if "old_film_title" not in gg_movies_df.columns:
    gg_movies_df = gg_movies_df.rename(columns={"film": "old_film_title"})

# add the cleaned film title, always derived from the untouched original
gg_movies_df = gg_movies_df.assign(
    film=gg_movies_df["old_film_title"].map(_normalize_gg_title)
)

gg_movies_df.head()

Unnamed: 0,year_film,year_award,category,old_film_title,win,film
3,1943,1944,Picture,The Song Of Bernadette,True,song of bernadette
9,1944,1945,Picture,Going My Way,True,going my way
16,1945,1946,Picture,The Lost Weekend,True,lost weekend
24,1946,1947,Picture,The Best Years Of Our Lives,True,best years of our lives
35,1947,1948,Picture,Gentleman's Agreement,True,gentleman's agreement


### Movie List

In [9]:
# Columns of the raw movies table that are not needed downstream
UNUSED_MOVIE_COLUMNS = [
    'id', 'imdb_id', 'original_language', 'overview', 'production_companies',
    'production_countries', 'runtime', 'spoken_languages', 'status', 'tagline',
    'vote_average', 'vote_count', 'production_companies_number',
    'production_countries_number', 'spoken_languages_number', 'original_title',
]

# Delete rows with NaN in the title column (rebinding instead of inplace=True)
movies_df_orig = movies_df_orig.dropna(subset=['title'])

# Delete unused columns and keep the original spelling under old_title
movies_df = (
    movies_df_orig
    .drop(columns=UNUSED_MOVIE_COLUMNS)
    .rename(columns={"title": "old_title"})
)

# Normalized matching key: lower-case title without a trailing ", the" and
# without a leading "the ". \A and \Z anchor to the very start / end of the
# string, so only a true prefix / suffix is removed.
movies_df['title'] = (
    movies_df['old_title']
    .str.lower()
    .str.replace(r", the\Z", "", regex=True)
    .str.replace(r"\Athe ", "", regex=True)
)

# release_date is written day/month/year (e.g. 21/10/1988), so parse it
# day-first; Int64 keeps the year integral while still allowing missing values
movies_df['release_year'] = (
    pd.DatetimeIndex(movies_df['release_date'], dayfirst=True)
    .year
    .astype('Int64')
)

# A budget of 0 is treated as "unknown". np.nan is used because the np.NaN
# alias was removed in NumPy 2.0.
movies_df['budget'] = movies_df['budget'].replace(0, np.nan)

# NOTE(review): revenue == 0 is NOT treated as unknown here, so a film with a
# known budget and a 0 revenue gets profit == -budget. Confirm this is intended.
movies_df['profit'] = movies_df['revenue'] - movies_df['budget']

# drop movies from 2017 onwards (keep release_year <= 2016)
movies_df = movies_df[movies_df['release_year'] <= 2016]

movies_df.head()

Unnamed: 0,budget,genres,popularity,release_date,revenue,old_title,title,release_year,profit
0,,Drama|Crime,0.823904,21/10/1988,0,Ariel,ariel,1988,
1,,Drama|Comedy,0.47445,16/10/1986,0,Shadows in Paradise,shadows in paradise,1986,
2,4000000.0,Crime|Comedy,1.698,25/12/1995,4300000,Four Rooms,four rooms,1995,300000.0
3,,Action|Thriller|Crime,1.32287,15/10/1993,12136938,Judgment Night,judgment night,1993,
4,42000.0,Documentary,0.054716,01/01/2006,0,Life in Loops (A Megacities RMX),life in loops (a megacities rmx),2006,-42000.0


## Merge

In [10]:
# First merge: outer-join nominees to movies on (normalized title, year) so
# that rows from either side without a partner are kept for inspection
movies_gg_df = gg_movies_df.merge(
    movies_df,
    how="outer",
    left_on=['film', 'year_film'],
    right_on=['title', 'release_year'],
)

# Rows lacking a movie title came only from the Golden Globes side;
# rows lacking a film came only from the movies side
gg_nan_df = movies_gg_df[movies_gg_df['title'].isnull()]
movies_nan_df = movies_gg_df[movies_gg_df['film'].isnull()]

# Keep only the rows that carry a movie title in the merged frame
movies_gg_df = movies_gg_df[movies_gg_df['title'].notnull()]
movies_gg_df

Unnamed: 0,year_film,year_award,category,old_film_title,win,film,budget,genres,popularity,release_date,revenue,old_title,title,release_year,profit
0,1943,1944.0,Picture,The Song Of Bernadette,True,song of bernadette,,Drama,0.227639,21/12/1943,0.0,The Song of Bernadette,song of bernadette,1943,
1,1944,1945.0,Picture,Going My Way,True,going my way,,Drama|Comedy|Music,1.066151,15/05/1944,16300000.0,Going My Way,going my way,1944,
2,1945,1946.0,Picture,The Lost Weekend,True,lost weekend,1250000.0,Drama,1.197892,23/08/1945,11000000.0,The Lost Weekend,lost weekend,1945,9750000.0
3,1946,1947.0,Picture,The Best Years Of Our Lives,True,best years of our lives,2100000.0,Drama|History|Romance,0.639008,25/12/1946,23650000.0,The Best Years of Our Lives,best years of our lives,1946,21550000.0
4,1947,1948.0,Picture,Gentleman's Agreement,True,gentleman's agreement,2000000.0,Drama|Romance,0.776725,11/11/1947,7800000.0,Gentleman's Agreement,gentleman's agreement,1947,5800000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297731,,,,,,,,Documentary,0.2,02/08/1973,0.0,Ecologia,ecologia,1973,
297732,,,,,,,,Comedy,0.0003,08/03/2003,0.0,The Tom Green Show: Early Exposure - Raw Meat ...,tom green show: early exposure - raw meat and ...,2003,
297733,,,,,,,,Documentary|History|TV Movie,0.0045,21/01/2016,0.0,The Day Hitler Died,day hitler died,2016,
297734,,,,,,,,,0.00045,24/11/2006,0.0,Mobb Deep - Life of the Infamous: The Videos,mobb deep - life of the infamous: the videos,2006,


In [13]:
# Trim each unmatched frame back to the columns of its own source table
gg_columns = ["year_film", "year_award", "category", "old_film_title", "win", "film"]
movie_columns = ["budget", "genres", "popularity", "release_date", "revenue", "old_title",
                 "title", "release_year", "profit"]

# .copy() makes both frames independent of the merged frame they were sliced
# from, so assigning to one of their columns later does not raise pandas'
# SettingWithCopyWarning.
gg_nan_df = gg_nan_df[gg_columns].copy()
movies_nan_df = movies_nan_df[movie_columns].copy()
gg_nan_df

Unnamed: 0,year_film,year_award,category,old_film_title,win,film
11,1950,1951.0,Picture,Cyrano de Bergerac (1950),False,cyrano de bergerac (1950)
13,1951,1952.0,Best Motion Picture - Drama,Quo Vadis?,False,quo vadis?
14,1951,1952.0,Best Motion Picture - Drama,A Streetcar Named Desire (1951),False,a streetcar named desire (1951)
25,1952,1953.0,Best Motion Picture - Musical or Comedy,I'll See You in My Dreams,False,i'll see you in my dreams
27,1952,1953.0,Best Motion Picture - Musical or Comedy,Star and Stripes Forever,False,star and stripes forever
...,...,...,...,...,...,...
668,2012,2013.0,Best Motion Picture - Musical or Comedy,"Best Exotic Marigold Hotel, The",False,best exotic marigold hotel
670,2012,2013.0,Best Motion Picture - Musical or Comedy,Salmon Fishing in the Yemen,False,salmon fishing in the yemen
671,2012,2013.0,Best Motion Picture - Musical or Comedy,Silver Linings Playbook,False,silver linings playbook
676,2012,2013.0,Best Motion Picture - Animated,Wreck-It Ralph,False,wreck-it ralph


In [21]:
def _strip_year_and_article(film_title):
    """Clean an unmatched Golden Globes title for a second merge attempt.

    Drops everything from the first " (" onwards (this removes a trailing
    year such as " (1950)") and then removes a trailing ", the". Only a true
    suffix is removed: splitting on ", the" would also cut a title at a
    ", the" in the middle (e.g. "x, the y" would become "x").

    Values that are not strings (e.g. NaN) are returned unchanged.
    """
    if not isinstance(film_title, str):
        return film_title

    base_title = film_title.split(" (")[0]
    if base_title.endswith(", the"):
        base_title = base_title[:-len(", the")]
    return base_title


# clean gg_nan_df to match movies data; assign() builds a new frame instead
# of writing into a slice of the merged frame
gg_nan_df = gg_nan_df.assign(film=gg_nan_df["film"].map(_strip_year_and_article))

# second merge: unmatched nominees against the movies that are still unmatched
movies_gg_df2 = pd.merge(gg_nan_df, movies_nan_df, how="outer", left_on=['film','year_film'],
                        right_on = ['title','release_year'])

# TODO: clean question marks (and other punctuation) out of the titles
# TODO: try merging based on plus and minus one year
# TODO: handle title variants (original note: "star wars new hope is new hope")

# nominees that are still unmatched after the second merge
gg_nan_df2 = movies_gg_df2[movies_gg_df2['title'].isnull()]
gg_nan_df2

Unnamed: 0,year_film,year_award,category,old_film_title,win,film,budget,genres,popularity,release_date,revenue,old_title,title,release_year,profit
1,1951,1952.0,Best Motion Picture - Drama,Quo Vadis?,False,quo vadis?,,,,,,,,,
3,1952,1953.0,Best Motion Picture - Musical or Comedy,I'll See You in My Dreams,False,i'll see you in my dreams,,,,,,,,,
4,1952,1953.0,Best Motion Picture - Musical or Comedy,Star and Stripes Forever,False,star and stripes forever,,,,,,,,,
5,1956,1957.0,Best Motion Picture - Drama,Around the World in 80 Days,True,around the world in 80 days,,,,,,,,,
7,1958,1959.0,Picture - Musical,Damn Yankees,False,damn yankees,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,2019,2020.0,Best Motion Picture - Musical or Comedy,Rocketman,False,rocketman,,,,,,,,,
104,2019,2020.0,Best Motion Picture - Animated,Missing Link,True,missing link,,,,,,,,,
105,2019,2020.0,Best Motion Picture - Animated,How to Train Your Dragon: The Hidden World,False,how to train your dragon: the hidden world,,,,,,,,,
106,2019,2020.0,Best Motion Picture - Animated,Frozen II,False,frozen ii,,,,,,,,,


In [24]:
# Spot check: first ten merged rows whose normalized title contains "frozen"
movies_gg_df.loc[lambda frame: frame['title'].str.contains("frozen")].head(10)

Unnamed: 0,year_film,year_award,category,old_film_title,win,film,budget,genres,popularity,release_date,revenue,old_title,title,release_year,profit
687,2013.0,2014.0,Best Motion Picture - Animated,Frozen,True,frozen,150000000.0,Animation|Adventure|Family,12.638616,27/11/2013,1274219000.0,Frozen,frozen,2013,1124219000.0
5908,,,,,,,,Drama,1.12746,01/08/2008,0.0,Frozen River,frozen river,2008,
9726,,,,,,,,Drama|Comedy,0.71921,01/01/2002,0.0,Frozen Stiff,frozen stiff,2002,
11372,,,,,,,,Drama,0.954213,30/12/2008,0.0,A Frozen Flower,a frozen flower,2008,
11611,,,,,,,2000000.0,Drama,0.83061,14/01/2005,0.0,Frozen Land,frozen land,2005,-2000000.0
16189,,,,,,,,Thriller,0.648738,12/03/2005,0.0,Frozen,frozen,2005,
19395,,,,,,,,Crime|Drama|History|Thriller,0.24531,01/06/2009,0.0,Winter of Frozen Dreams,winter of frozen dreams,2009,
22910,,,,,,,,Drama|Thriller,0.042184,11/12/2009,0.0,Frozen Kiss,frozen kiss,2009,
25715,,,,,,,,Drama|Mystery|Thriller,0.011934,01/01/2005,0.0,Frozen Days,frozen days,2005,
29999,,,,,,,,Thriller,1.40282,05/02/2010,3065860.0,Frozen,frozen,2010,


In [None]:
# Display the Golden Globes rows that found no movie in the first merge
gg_nan_df

In [None]:
# movies_df.loc[(movies_df["title"] == "i'll see you in my dreams")]
# movies_gg_df = movies_gg_df[movies_gg_df['title'].isnull() == False]

In [None]:
# drop release date

In [None]:
# Display the merged frame (rows that carry a movie title after the first merge)
movies_gg_df

In [None]:
# How often each cleaned film title occurs among the Golden Globes rows
gg_movies_df.loc[:, 'film'].value_counts()

In [None]:
# Display the transformed movies table
movies_df

In [None]:

# connection_string = "postgres:postgres@localhost:5432/customer_db"
# engine = create_engine(f'postgresql://{connection_string}')

In [None]:
# Confirm tables
# engine.table_names()

In [None]:
# premise_transformed.to_sql(name='premise', con=engine, if_exists='append', index=True)

# county_transformed.to_sql(name='county', con=engine, if_exists='append', index=True)