In [21]:
import pandas as pd
import numpy as np
import os
import json
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm


In [22]:
#### Process Inflation Data ####

# Source: World Bank, indicator FP.CPI.TOTL.ZG
# https://data.worldbank.org/indicator/FP.CPI.TOTL.ZG?locations=US
# Note: the CSV starts with plain-text metadata lines; those lines (not the data
# that follows them) are skipped at load time.

inflation_data_path = "external_dataset/API_FP.CPI.TOTL.ZG_DS2_en_csv_v2_5994714.csv"

# The first 4 rows are the metadata header, so the real table starts after them
inflation_df = pd.read_csv(inflation_data_path, skiprows=4)

# Select the United States row and only the year columns in a single step.
# Slicing '1960':'2022' leaves out the non-year columns, including the empty
# trailing 'Unnamed: 67' column.
is_usa = inflation_df['Country Code'] == 'USA'
us_inflation_df = (
    inflation_df.loc[is_usa, '1960':'2022']
    .T
    .reset_index()
)

# After the transpose each year is a row: label the two columns and turn the
# year labels (strings taken from the CSV header) into integers
us_inflation_df.columns = ['year', 'Inflation Rate (%)']
us_inflation_df = us_inflation_df.astype({'year': int})

# Preview
us_inflation_df.head()

Unnamed: 0,year,Inflation Rate (%)
0,1960,1.457976
1,1961,1.070724
2,1962,1.198773
3,1963,1.239669
4,1964,1.278912


In [32]:
#### Process Movie Metadata ####

# Load the movie data
movie_data_path = 'MovieSummaries/movie.metadata.tsv'

# The file is read with header=None, so the column names are supplied here
column_names = [
    'Wikipedia movie ID', 'Freebase movie ID', 'Movie name', 'Movie release date',
    'Movie box office revenue', 'Movie runtime', 'Movie languages', 'Movie countries',
    'Movie genres'
]

# Load the dataset with the column names above
movie_df = pd.read_csv(movie_data_path, sep='\t', header=None, names=column_names)

# Extract the release year from 'Movie release date'.
# The column mixes full dates ("2001-08-24") with bare years ("1988"). Parsing it
# with pd.to_datetime(..., errors='coerce') is fragile: pandas >= 2.0 infers one
# format from the first value and silently turns every value in another format
# into NaT, which would drop the year for all year-only entries. Taking the
# leading four digits as text works the same on every pandas version.
# Values that do not start with four digits (including missing dates) become <NA>.
# Unlike to_datetime, this also keeps years outside the Timestamp range.
movie_df['year'] = pd.to_numeric(
    movie_df['Movie release date'].astype(str).str.extract(r'^(\d{4})', expand=False),
    errors='coerce',
).astype('Int64')  # nullable integer, so missing years do not force a float column

# Keep only the necessary columns for merging and the revenue adjustment
movie_df = movie_df[['Wikipedia movie ID', 'Movie name', 'Movie release date', 'Movie box office revenue', 'year']]

# Preview
movie_df.head()

Unnamed: 0,Wikipedia movie ID,Movie name,Movie release date,Movie box office revenue,year
0,975900,Ghosts of Mars,2001-08-24,14010832.0,2001
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,2000
2,28463795,Brun bitter,1988,,1988
3,9363483,White Of The Eye,1987,,1987
4,261236,A Woman in Flames,1983,,1983


In [29]:
#### Merge the movie dataset with the US inflation dataset ####

# Re-derive the merge key from 'Movie release date' (nothing is reloaded from disk).
# The key is taken from the leading four digits of the date string rather than via
# pd.to_datetime: the column mixes "YYYY-MM-DD" and bare "YYYY" values, and
# pandas >= 2.0 would coerce the bare years to NaT once it has inferred the format
# from the first value.
# The key is kept as float64 (NaN for missing years), matching the dtype of the
# 'year' column in this cell's recorded output.
# NOTE: this overwrites movie_df['year'] in place, as before; the assignment is
# idempotent because it depends only on 'Movie release date'.
movie_df['year'] = pd.to_numeric(
    movie_df['Movie release date'].astype(str).str.extract(r'^(\d{4})', expand=False),
    errors='coerce',
).astype('float64')

# Left merge: every movie is kept; movies whose year has no entry in
# us_inflation_df get NaN in 'Inflation Rate (%)'
us_movie_inflation_merged_df = pd.merge(movie_df, us_inflation_df, on='year', how='left')

# Preview
us_movie_inflation_merged_df.head()

Unnamed: 0,Wikipedia movie ID,Movie name,Movie release date,Movie box office revenue,year,Inflation Rate (%)
0,975900,Ghosts of Mars,2001-08-24,14010832.0,2001.0,2.826171
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,2000.0,3.376857
2,28463795,Brun bitter,1988,,1988.0,4.077741
3,9363483,White Of The Eye,1987,,1987.0,3.664563
4,261236,A Woman in Flames,1983,,1983.0,3.212435


In [30]:
#### Standardize movie revenue with inflation in the corresponding year ####

# Function to adjust movie box office revenue for inflation

def adjust_revenue_for_inflation(row, current_year=2023):
    """
    Adjust a movie's box office revenue for inflation.

    The revenue is scaled by ``(1 + rate / 100) ** (current_year - release_year)``,
    where ``rate`` is the inflation rate of the release year.

    NOTE(review): this compounds the *release year's* single inflation rate over
    every elapsed year, so it is only a rough approximation. An exact adjustment
    would chain the yearly rates (or use a price index), which needs data that is
    not available on a single row.

    :param row: A row (Series or mapping) with the keys 'Movie box office revenue',
        'Inflation Rate (%)' and 'year'.
    :param current_year: The year the revenue is expressed in. Defaults to 2023,
        the value previously hard-coded here.
    :return: The adjusted revenue, or NaN when the revenue, the inflation rate or
        the release year is missing.
    """
    revenue = row['Movie box office revenue']
    inflation_rate = row['Inflation Rate (%)']
    release_year = row['year']

    # Without all three inputs the revenue cannot be adjusted. Return NaN rather
    # than the raw revenue so that unadjusted nominal values never end up mixed
    # into a column of adjusted values.
    if pd.isna(revenue) or pd.isna(inflation_rate) or pd.isna(release_year):
        return float('nan')

    # 'year' may arrive as a float (e.g. 2001.0), so normalise it before subtracting
    years_elapsed = current_year - int(release_year)

    # Total inflation multiplier from the release year to current_year
    inflation_multiplier = (1 + inflation_rate / 100) ** years_elapsed

    return revenue * inflation_multiplier

In [31]:
# Compute the inflation-adjusted revenue row by row and store it as a new column
adjusted_revenue = us_movie_inflation_merged_df.apply(adjust_revenue_for_inflation, axis=1)
us_movie_inflation_merged_df['Adjusted box office revenue'] = adjusted_revenue

# Show the original and adjusted revenue side by side for the first few movies
preview_columns = [
    'Wikipedia movie ID',
    'Movie name',
    'Movie box office revenue',
    'Adjusted box office revenue',
    'year',
]
us_movie_inflation_merged_df[preview_columns].head()

Unnamed: 0,Wikipedia movie ID,Movie name,Movie box office revenue,Adjusted box office revenue,year
0,975900,Ghosts of Mars,14010832.0,25866910.0,2001.0
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,,,2000.0
2,28463795,Brun bitter,,,1988.0
3,9363483,White Of The Eye,,,1987.0
4,261236,A Woman in Flames,,,1983.0
