# Cleaning the Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sqlite3
from zipfile import ZipFile # for unzipping the db.zip file
pd.set_option('display.max_columns', None) # show all columns when printing out dataframe

## "The Numbers" Movie Budgets

In [2]:
# load the movie budget data from the gzip-compressed CSV
tn_budgets_df = pd.read_csv('zippedData/tn.movie_budgets.csv.gz')
# preview the first rows; the money columns arrive as "$1,234"-style strings
tn_budgets_df.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [3]:
# Remove the "$" prefix and the thousands separators from the money columns so
# they can be converted to numbers.
# regex=False is spelled out on purpose: "$" is a regex end-of-string anchor and
# the default value of `regex` differs between pandas versions, so being explicit
# guarantees a literal replacement everywhere (and avoids the FutureWarning that
# older pandas releases emit for single-character patterns).
for money_col in ('production_budget', 'domestic_gross', 'worldwide_gross'):
    tn_budgets_df[money_col] = (
        tn_budgets_df[money_col]
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    )

In [5]:
# Convert the cleaned numeric strings to 64-bit integers.
# An explicit 'int64' is used instead of plain `int`: with NumPy < 2.0 on Windows
# `int` means a 32-bit integer, which cannot hold values above 2,147,483,647
# (e.g. a worldwide gross of 2,776,345,279).
for money_col in ('production_budget', 'domestic_gross', 'worldwide_gross'):
    tn_budgets_df[money_col] = tn_budgets_df[money_col].astype('int64')

In [6]:
# gross profit of every movie: worldwide gross minus production budget
# (no placeholder column is needed first; the assignment creates the column)
tn_budgets_df['gross_profit'] = tn_budgets_df['worldwide_gross'] - tn_budgets_df['production_budget']

In [7]:
# Express every money column in millions (rounded to 3 decimals) so the numbers
# are easier to work with.
# NOTE(review): the columns are rescaled in place, so running this cell a second
# time divides by one million again.
# NOTE(review): roi is computed later from these rounded values; a production
# budget below $500 rounds to 0.0 here.
for money_col in ('production_budget', 'domestic_gross', 'worldwide_gross', 'gross_profit'):
    tn_budgets_df[money_col] = tn_budgets_df[money_col].div(1000000).round(3)

In [8]:
# release year of each movie: the last four characters of release_date, as an int
release_year_text = tn_budgets_df['release_date'].str[-4:]
tn_budgets_df['movie_year'] = release_year_text.astype(int)

In [9]:
# Parse release_date (e.g. "Dec 18, 2009", "Jun 7, 2019") into a datetime column.
# The format is given explicitly so parsing does not depend on pandas' format
# inference, whose behavior differs between versions.
# NOTE(review): assumes every release_date follows the "Mon D, YYYY" pattern seen
# in the preview; a row in another format will raise here.
tn_budgets_df["date_time"] = pd.to_datetime(tn_budgets_df['release_date'], format='%b %d, %Y')

In [10]:
# return on investment: gross profit relative to the production budget
tn_budgets_df['roi'] = tn_budgets_df['gross_profit'].div(tn_budgets_df['production_budget'])

In [11]:
# Flag (1/0) movies whose ROI is at least 1, i.e. whose worldwide gross is at
# least twice the production budget. The comparison is vectorised; rows with a
# missing ROI compare as False and therefore get 0.
tn_budgets_df['two_x'] = (tn_budgets_df['roi'] >= 1).astype('int64')

In [12]:
# Drop complete losses (roi of exactly -1, i.e. the loss equals the whole
# production budget); these seem to be mostly streaming-only or unreleased movies.
# .copy() makes the filtered result an independent frame, so adding columns to it
# afterwards does not trigger SettingWithCopyWarning.
tn_budgets_df = tn_budgets_df.loc[tn_budgets_df['roi'] != -1].copy()
tn_budgets_df.shape

(5410, 11)

In [13]:
# Label every movie with its production-budget quartile.
# pd.qcut splits the budgets into four equally populated bins, ordered from the
# cheapest ("Bottom 25") to the most expensive ("Top 25").
# assign() returns a new frame, so the column is not written into the filtered
# slice produced by the roi filter (no SettingWithCopyWarning), and no
# placeholder column has to be created first.
tn_budgets_df = tn_budgets_df.assign(
    quartile=pd.qcut(
        tn_budgets_df["production_budget"],
        q=4,
        labels=["Bottom 25", "25-50", "50-75", "Top 25"],
    )
)

Write the cleaned data to a new CSV file

In [None]:
# Export is currently disabled (commented out); uncomment the line below to write
# the cleaned frame to "cleaned_budget_data.csv" without the index column.
# tn_budgets_df.to_csv("cleaned_budget_data.csv", index=False)

In [14]:
# preview the cleaned frame, including the derived columns added above
tn_budgets_df.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,gross_profit,movie_year,date_time,roi,two_x,quartile
0,1,"Dec 18, 2009",Avatar,425.0,760.508,2776.345,2351.345,2009,2009-12-18,5.532576,1,Top 25
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410.6,241.064,1045.664,635.064,2011,2011-05-20,1.546673,1,Top 25
2,3,"Jun 7, 2019",Dark Phoenix,350.0,42.762,149.762,-200.238,2019,2019-06-07,-0.572109,0,Top 25
3,4,"May 1, 2015",Avengers: Age of Ultron,330.6,459.006,1403.014,1072.414,2015,2015-05-01,3.243842,1,Top 25
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317.0,620.181,1316.722,999.722,2017,2017-12-15,3.153697,1,Top 25
