In [1]:
import pandas as pd
from functools import reduce
from sqlalchemy import create_engine
import psycopg2 

In [2]:
# import and transform economic indicators

sainc_raw = pd.read_csv(r'SAINC_2000_2020.csv')
category_code = pd.read_csv(r'CATEGORY_CODE.csv')
data_type_code = pd.read_csv(r'DATA_TYPE_CODE.csv')
economic_indicators = pd.read_csv(r'economic_indicators_data.csv')


In [3]:
#create economic indicators dataframe & filter for per capita income, employment, and wages/salaries in USA

indicator_filter = ['Per capita personal income (dollars) 4/','Total employment',' Wages and salaries']
sainc_clean = pd.melt(sainc_raw, id_vars=['GeoFIPS','GeoName','Region','LineCode','Description','Unit'], var_name='Year', value_name='Metric', ignore_index=True)
economics = sainc_clean[sainc_clean['Description'].isin(indicator_filter)]
economics = economics[economics['GeoName'].isin(['United States'])] 
economics


Unnamed: 0,GeoFIPS,GeoName,Region,LineCode,Description,Unit,Year,Metric
4,"""00000""",United States,,30,Per capita personal income (dollars) 4/,Dollars,2000,30672.0
13,"""00000""",United States,,50,Wages and salaries,Millions of dollars,2000,4823765.0
20,"""00000""",United States,,7010,Total employment,Number of jobs,2000,165370800.0
1384,"""00000""",United States,,30,Per capita personal income (dollars) 4/,Dollars,2001,31617.0
1393,"""00000""",United States,,50,Wages and salaries,Millions of dollars,2001,4950126.0
...,...,...,...,...,...,...,...,...
26233,"""00000""",United States,,50,Wages and salaries,Millions of dollars,2019,9309720.0
26240,"""00000""",United States,,7010,Total employment,Number of jobs,2019,201644200.0
27604,"""00000""",United States,,30,Per capita personal income (dollars) 4/,Dollars,2020,59510.0
27613,"""00000""",United States,,50,Wages and salaries,Millions of dollars,2020,9425703.0


In [4]:
# import and transform yearly inflation factors

inflation = pd.read_csv(r'InflationTable.csv')
inflation = inflation.drop('Month', axis=1)
inflation = inflation.groupby(['Year']).mean()
inflation


Unnamed: 0_level_0,Inflation_Factor
Year,Unnamed: 1_level_1
2000,0.642856
2001,0.66102
2002,0.671513
2003,0.686767
2004,0.705133
2005,0.729019
2006,0.752543
2007,0.774049
2008,0.803586
2009,0.800904


In [5]:
# import reboot data

reboots = pd.read_csv(r'Movie Reboots + Remakes v2 (1).csv')
reboots['Film'] = reboots['Title']
reboots.drop('Title', axis=1)
reboots.head()

Unnamed: 0,Source,Position,Const,Created,Modified,Description,Title,URL,Title Type,IMDb Rating,Runtime (mins),Year,Genres,Num Votes,Release Date,Directors,Film
0,Source A,1,tt1155076,2/22/2018,2/22/2018,,The Karate Kid,https://www.imdb.com/title/tt1155076/,movie,6.2,140.0,2010.0,"Action, Drama, Family, Sport",172855.0,6/7/2010,Harald Zwart,The Karate Kid
1,Source A,2,tt0119137,2/22/2018,2/22/2018,,Flubber,https://www.imdb.com/title/tt0119137/,movie,5.3,93.0,1997.0,"Comedy, Family, Sci-Fi",87535.0,11/16/1997,Les Mayfield,Flubber
2,Source A,3,tt0115433,2/22/2018,2/22/2018,,101 Dalmatians,https://www.imdb.com/title/tt0115433/,movie,5.7,103.0,1996.0,"Adventure, Comedy, Crime, Family",108495.0,11/18/1996,Stephen Herek,101 Dalmatians
3,Source A,4,tt0367594,2/22/2018,2/22/2018,,Charlie and the Chocolate Factory,https://www.imdb.com/title/tt0367594/,movie,6.6,115.0,2005.0,"Adventure, Comedy, Family, Fantasy, Musical",452299.0,7/10/2005,Tim Burton,Charlie and the Chocolate Factory
4,Source A,5,tt1014759,2/22/2018,2/22/2018,,Alice in Wonderland,https://www.imdb.com/title/tt1014759/,movie,6.4,108.0,2010.0,"Adventure, Family, Fantasy, Mystery",399041.0,2/25/2010,Tim Burton,Alice in Wonderland


In [6]:
# import movie data

movies = pd.read_csv(r'tmdb_api_export_1996_2021.csv')
movies.head()

Unnamed: 0.1,Unnamed: 0,Film,Revenue,Year
0,0,Independence Day,817400891,1996
1,1,Twister,494471524,1996
2,2,Mission: Impossible,457731198,1996
3,3,The Rock,335062621,1996
4,4,The Hunchback of Notre Dame,325338851,1996


In [7]:
# create reboot flag in movies dataframe

movies = pd.read_csv(r'tmdb_api_export_1996_2021.csv')
movies = pd.merge(movies, reboots, how='left', on = ['Film','Year'], indicator = True)
movies['Reboot_Flag'] = (movies['_merge'] == 'both').astype(int)
movies.drop(movies.iloc[:,4:20], inplace=True, axis=1)
movies

Unnamed: 0.1,Unnamed: 0,Film,Revenue,Year,Reboot_Flag
0,0,Independence Day,817400891,1996,0
1,1,Twister,494471524,1996,0
2,2,Mission: Impossible,457731198,1996,0
3,3,The Rock,335062621,1996,0
4,4,The Hunchback of Notre Dame,325338851,1996,0
...,...,...,...,...,...
2595,2595,Chernobyl: Abyss,5370393,2021,0
2596,2596,Titane,5115725,2021,0
2597,2597,Family Swap,4849622,2021,0
2598,2598,Qismat 2,4700000,2021,0


In [8]:
# see share of reboots to total films

movies.groupby(['Reboot_Flag']).size()

Reboot_Flag
0    2400
1     200
dtype: int64

In [9]:
# join movies and inflation factor dataframes

dfs  = [inflation, movies]
df = reduce(lambda left,right: pd.merge(left,right,on='Year'), dfs)

In [10]:
# adjust revenue for inflation

df['Adjusted_Revenue'] = df['Revenue'] / df['Inflation_Factor'] 
df

Unnamed: 0.1,Year,Inflation_Factor,Unnamed: 0,Film,Revenue,Reboot_Flag,Adjusted_Revenue
0,2000,0.642856,400,Mission: Impossible II,546388105,0,8.499391e+08
1,2000,0.642856,401,Gladiator,465361176,0,7.238969e+08
2,2000,0.642856,402,Cast Away,429632142,0,6.683182e+08
3,2000,0.642856,403,What Women Want,374111707,0,5.819529e+08
4,2000,0.642856,404,Dinosaur,354248063,0,5.510538e+08
...,...,...,...,...,...,...,...
2195,2021,1.000000,2595,Chernobyl: Abyss,5370393,0,5.370393e+06
2196,2021,1.000000,2596,Titane,5115725,0,5.115725e+06
2197,2021,1.000000,2597,Family Swap,4849622,0,4.849622e+06
2198,2021,1.000000,2598,Qismat 2,4700000,0,4.700000e+06


In [11]:
# combine all economic indicator data with movies data

economics['Year'] = economics['Year'].astype(int)
merged  = [df, economics]
final_table = reduce(lambda left,right: pd.merge(left,right,on='Year'), merged)

In [12]:
# adjust economic indicators for inflation

final_table['Adjusted_Metric'] = final_table['Metric'] / final_table['Inflation_Factor']
final_table = final_table.drop(['Unnamed: 0','GeoFIPS'], axis=1)
final_table

Unnamed: 0,Year,Inflation_Factor,Film,Revenue,Reboot_Flag,Adjusted_Revenue,GeoName,Region,LineCode,Description,Unit,Metric,Adjusted_Metric
0,2000,0.642856,Mission: Impossible II,546388105,0,8.499391e+08,United States,,30,Per capita personal income (dollars) 4/,Dollars,30672.0,4.771211e+04
1,2000,0.642856,Mission: Impossible II,546388105,0,8.499391e+08,United States,,50,Wages and salaries,Millions of dollars,4823765.0,7.503652e+06
2,2000,0.642856,Mission: Impossible II,546388105,0,8.499391e+08,United States,,7010,Total employment,Number of jobs,165370800.0,2.572441e+08
3,2000,0.642856,Gladiator,465361176,0,7.238969e+08,United States,,30,Per capita personal income (dollars) 4/,Dollars,30672.0,4.771211e+04
4,2000,0.642856,Gladiator,465361176,0,7.238969e+08,United States,,50,Wages and salaries,Millions of dollars,4823765.0,7.503652e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6295,2020,0.966250,Radioactive,3507755,0,3.630277e+06,United States,,50,Wages and salaries,Millions of dollars,9425703.0,9.754932e+06
6296,2020,0.966250,Radioactive,3507755,0,3.630277e+06,United States,,7010,Total employment,Number of jobs,190776800.0,1.974404e+08
6297,2020,0.966250,18 Presents,3486375,0,3.608150e+06,United States,,30,Per capita personal income (dollars) 4/,Dollars,59510.0,6.158862e+04
6298,2020,0.966250,18 Presents,3486375,0,3.608150e+06,United States,,50,Wages and salaries,Millions of dollars,9425703.0,9.754932e+06


In [13]:
# connect to postgres

print('Connecting to the PostgreSQL database...')
conn = psycopg2.connect(
    host="localhost",
    port='5432',
    database="postgres",
    user="postgres",
    password="123")

Connecting to the PostgreSQL database...


In [14]:
# create postgres table from pandas dataframe

engine = create_engine('postgresql://postgres:123@localhost:5432/postgres')
final_table.to_sql('movies_and_economics_reboot', engine)

In [15]:
# query table

cur = conn.cursor()

query1 = 'SELECT * from movies_and_economics_reboot'
cur.execute(query1)
print("The number of rows: ", cur.rowcount)

row = cur.fetchone()
while row is not None:
    print("Year:", row[1], ",Country:", row[7],",Economic Indicator Name:", row[10], ",Film:", row[3], ",Reboot Flag:", row[5], ",Inf Adjusted Revenue:", row[6], ",Inf Adj Economic Indicator Value:", row[13])
    row = cur.fetchone()   

The number of rows:  6300
Year: 2000 ,Country: United States ,Economic Indicator Name: Per capita personal income (dollars) 4/ ,Film: Mission: Impossible II ,Reboot Flag: 0 ,Inf Adjusted Revenue: 849939063.7604128 ,Inf Adj Economic Indicator Value: 47712.112919550804
Year: 2000 ,Country: United States ,Economic Indicator Name:  Wages and salaries ,Film: Mission: Impossible II ,Reboot Flag: 0 ,Inf Adjusted Revenue: 849939063.7604128 ,Inf Adj Economic Indicator Value: 7503652.203226949
Year: 2000 ,Country: United States ,Economic Indicator Name: Total employment ,Film: Mission: Impossible II ,Reboot Flag: 0 ,Inf Adjusted Revenue: 849939063.7604128 ,Inf Adj Economic Indicator Value: 257244075.4824091
Year: 2000 ,Country: United States ,Economic Indicator Name: Per capita personal income (dollars) 4/ ,Film: Gladiator ,Reboot Flag: 0 ,Inf Adjusted Revenue: 723896875.9026785 ,Inf Adj Economic Indicator Value: 47712.112919550804
Year: 2000 ,Country: United States ,Economic Indicator Name:  Wa