# Cleaning the Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sqlite3
from zipfile import ZipFile # for unzipping the db.zip file
pd.set_option('display.max_columns', None) # show all columns when printing out dataframe

## "The Numbers" Movie Budgets

In [2]:
# Load "The Numbers" movie budget data from the gzip-compressed CSV
# (the path is relative to the notebook's working directory) and preview the first rows.
tn_budgets_df = pd.read_csv('zippedData/tn.movie_budgets.csv.gz')
tn_budgets_df.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [3]:
# Strip the currency formatting ("$" and thousands separators) from the money columns
# so they can be converted to integers afterwards.
# regex=False forces a literal match: "$" is the end-of-string anchor in a regular
# expression, and the default value of `regex` differs between pandas versions
# (True before 2.0, False from 2.0 on), so relying on the default is fragile.
for money_col in ('production_budget', 'domestic_gross', 'worldwide_gross'):
    tn_budgets_df[money_col] = (
        tn_budgets_df[money_col]
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    )

In [5]:
# Convert the cleaned money strings to 64-bit integers.
# An explicit 'int64' is used instead of the builtin `int`: `int` maps to the
# platform default integer, which is 32-bit on Windows with NumPy < 2, and values
# such as Avatar's worldwide gross (2,776,345,279) do not fit in 32 bits.
tn_budgets_df['production_budget'] = tn_budgets_df['production_budget'].astype('int64')
tn_budgets_df['domestic_gross'] = tn_budgets_df['domestic_gross'].astype('int64')
tn_budgets_df['worldwide_gross'] = tn_budgets_df['worldwide_gross'].astype('int64')

In [6]:
# Gross profit of every movie: worldwide gross minus production budget.
# The column is created directly by the assignment, so no placeholder
# initialisation is needed beforehand.
tn_budgets_df['gross_profit'] = tn_budgets_df['worldwide_gross'] - tn_budgets_df['production_budget']

In [7]:
# Express every monetary column in millions of dollars, rounded to 3 decimals,
# to make the numbers easier to work with.
# NOTE: the columns are overwritten in place, so running this cell a second time
# divides the values by one million again.
tn_budgets_df['production_budget'] = tn_budgets_df['production_budget'].div(1_000_000).round(3)
tn_budgets_df['domestic_gross'] = tn_budgets_df['domestic_gross'].div(1_000_000).round(3)
tn_budgets_df['worldwide_gross'] = tn_budgets_df['worldwide_gross'].div(1_000_000).round(3)
tn_budgets_df['gross_profit'] = tn_budgets_df['gross_profit'].div(1_000_000).round(3)

In [8]:
# Release year of each movie: the last four characters of the release_date
# string (e.g. "Dec 18, 2009" -> 2009), stored as an integer.
tn_budgets_df['movie_year'] = tn_budgets_df['release_date'].str[-4:].astype(int)

In [9]:
# Parse release_date into a proper datetime column.
# The format is given explicitly ("Dec 18, 2009" -> abbreviated month, day, 4-digit
# year) so parsing does not depend on pandas' version-specific format inference
# and any value that deviates from this layout is reported instead of guessed at.
tn_budgets_df["date_time"] = pd.to_datetime(tn_budgets_df['release_date'], format='%b %d, %Y')

In [10]:
# Return on investment: gross profit expressed as a multiple of the production budget.
# NOTE(review): both inputs were already scaled to millions and rounded to 3 decimals
# by the time this runs, so ROI for very small budgets is computed from rounded
# values -- confirm this precision loss is acceptable.
tn_budgets_df['roi'] = tn_budgets_df['gross_profit'].div(tn_budgets_df['production_budget'])

In [11]:
# Flag (1/0) movies whose ROI is at least 1, i.e. whose worldwide gross is at least
# twice the production budget (roi = gross_profit / production_budget).
# A vectorized comparison replaces the per-row lambda, and the column is created
# directly by the assignment; 'int64' keeps the 0/1 integer dtype on every platform.
tn_budgets_df['two_x'] = tn_budgets_df['roi'].ge(1).astype('int64')

In [12]:
# Drop complete losses (ROI of exactly -1, i.e. gross_profit == -production_budget);
# these seem to be mostly streaming releases or movies that were never released.
# .copy() makes the filtered result an independent frame, so adding columns to it
# afterwards does not raise pandas' SettingWithCopyWarning.
tn_budgets_df = tn_budgets_df.loc[tn_budgets_df['roi'] != -1].copy()
# Show the remaining (rows, columns) as a sanity check on the filter.
tn_budgets_df.shape

(5410, 11)

In [13]:
# Bucket movies into four quantile-based groups by production budget.
# pd.qcut derives the bin edges from the data and returns a labelled categorical,
# so the column does not need to be pre-created with a placeholder value.
tn_budgets_df["quartile"] = pd.qcut(
    tn_budgets_df["production_budget"],
    q=4,
    labels=["Bottom 25", "25-50", "50-75", "Top 25"],
)

Export the cleaned data to a new CSV file (the export line below is currently commented out).

In [None]:
# CSV export is currently disabled (commented out). Uncomment the line below to
# write the cleaned frame to "cleaned_budget_data.csv" without the index column.
# tn_budgets_df.to_csv("cleaned_budget_data.csv", index=False)

In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) of the cleaned frame.
tn_budgets_df.describe()

Unnamed: 0,id,production_budget,domestic_gross,worldwide_gross,gross_profit,movie_year,roi,two_x
count,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0
mean,50.23475,33.338222,44.752602,97.77828,64.440058,2003.592421,4.133304,0.484473
std,28.762457,42.495577,69.628779,178.917092,149.914269,12.550054,30.51783,0.499805
min,1.0,0.001,0.0,0.001,-200.238,1915.0,-0.999889,0.0
25%,25.0,6.0,3.2095,7.01825,-1.87475,1999.0,-0.296339,0.0
50%,50.0,19.0,20.347,33.395,11.9325,2006.0,0.888318,0.0
75%,75.0,42.0,55.8375,104.50075,67.4245,2012.0,2.97442,1.0
max,100.0,425.0,936.662,2776.345,2351.345,2019.0,1799.0,1.0
