In [204]:
import os
import importlib
import pandas as pd
import numpy as np
import code.data_preparation as dataprep
import code.utils as utils

### Exploratory Review and Prep of `tn.movie_budgets.csv`
* Note that this notebook is located in the project root folder `./`, so every path below is relative to it.
* Project layout: data folder - `./data`; code folder - `./code`; zipped data folder - `./zippedData`; config folder - `./config`

In [228]:
### Load config
# Paths are relative to the project root, where this notebook lives.
json_config = "./config/config.json"
user_config = "./config/user_config.json"

# Pick up any edits made to code/utils.py without restarting the kernel.
importlib.reload(utils)

# A user-level config is optional: convert it only when the file is present.
if os.path.exists(user_config):
    utils.convert_user_config_to_json(user_config)

# The JSON config is always what gets loaded.
config = utils.load_json_config_from_file(json_config)

In [229]:
# Read the raw budgets CSV exactly as delivered; every column except `id`
# comes back as text (see the info() summary below).
df = pd.read_csv(
    './data/tn.movie_budgets.csv',
    sep=',',
    header=0,
    encoding='utf-8',
    engine='python',
    quotechar='"',
    quoting=0,  # 0 is csv.QUOTE_MINIMAL
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


In [230]:
# Preview the first rows of the raw budgets table (head() defaults to five rows).
df.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [231]:
# release_date is text such as "Dec 18, 2009"; the last four characters are
# the year. Extract it once with the vectorized string accessor instead of
# running the same per-row lambda twice, then report the earliest and latest
# release years.
release_years = df["release_date"].str[-4:].astype(int)
print(release_years.min())
print(release_years.max())

1915
2020


In [232]:
# Inspect the row(s) behind the earliest release year found above.
# The year is parsed from the last four characters of release_date.
release_year = df["release_date"].apply(lambda date_text: np.uint16(date_text[-4:]))
df.loc[release_year == 1915]

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
5677,78,"Feb 8, 1915",The Birth of a Nation,"$110,000","$10,000,000","$11,000,000"


In [233]:
# Sanity check: list every row whose domestic_gross does not start with '$'.
lacks_dollar_prefix = df['domestic_gross'].map(lambda amount: not amount.startswith('$'))
df.loc[lacks_dollar_prefix]

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross


In [234]:
# Sanity check: list every row whose worldwide_gross does not start with '$'.
lacks_dollar_prefix = df['worldwide_gross'].map(lambda amount: not amount.startswith('$'))
df.loc[lacks_dollar_prefix]

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross


In [237]:
# Reload dataprep so edits to the module take effect without restarting the kernel.
importlib.reload(dataprep)
# Run the project's cleaning step, driven by the loaded config.
# NOTE(review): presumably this writes ./data/clean.tn.movie_budgets.csv, which
# the next cell reads — confirm in code/data_preparation.py.
dataprep.prepare_clean_data(config)

In [238]:
# Read the cleaned budgets file back in with the same parser settings used
# for the raw file. Note that this rebinds `df` to the cleaned data.
df = pd.read_csv(
    './data/clean.tn.movie_budgets.csv',
    sep=',',
    header=0,
    encoding='utf-8',
    engine='python',
    quotechar='"',
    quoting=0,  # 0 is csv.QUOTE_MINIMAL
)

In [239]:
# Preview the first rows of the cleaned data read from ./data/clean.tn.movie_budgets.csv.
df.head()

Unnamed: 0,title,year,domestic_gross,foreign_gross
0,Pirates of the Caribbean: On Stranger Tides,2011,241063875,804600000
1,Dark Phoenix,2019,42762350,107000000
2,Avengers: Age of Ultron,2015,459005868,944008095
3,Star Wars Ep. VIII: The Last Jedi,2017,620181382,696540365
4,Star Wars Ep. VII: The Force Awakens,2015,936662225,1116648995


In [240]:
# Check row count, column dtypes and non-null counts of the cleaned data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2191 entries, 0 to 2190
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           2191 non-null   object
 1   year            2191 non-null   int64 
 2   domestic_gross  2191 non-null   int64 
 3   foreign_gross   2191 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 68.6+ KB


In [241]:
# NOTE(review): this repeats the reload + prepare_clean_data call made a few
# cells earlier. Presumably it was re-run after editing code/data_preparation.py;
# confirm whether both calls are needed for a clean top-to-bottom run.
importlib.reload(dataprep)
dataprep.prepare_clean_data(config)

In [242]:
# Load the two cleaned revenue tables through the dataprep loaders.
# NOTE(review): "bom" / "tn" presumably name the two source datasets — confirm
# in code/data_preparation.py.
dfB = dataprep.load_clean_bom_movie_gross(config)
dfT = dataprep.load_clean_tn_movie_gross(config)

In [243]:
# Summarise dtypes and non-null counts of the table from load_clean_bom_movie_gross.
dfB.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           3387 non-null   object
 1   year            3387 non-null   uint16
 2   domestic_gross  3387 non-null   uint64
 3   foreign_gross   3387 non-null   uint64
dtypes: object(1), uint16(1), uint64(2)
memory usage: 86.1+ KB


In [244]:
# Summarise dtypes and non-null counts of the table from load_clean_tn_movie_gross.
dfT.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2191 entries, 0 to 2190
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           2191 non-null   object
 1   year            2191 non-null   int64 
 2   domestic_gross  2191 non-null   uint64
 3   foreign_gross   2191 non-null   uint64
dtypes: int64(1), object(1), uint64(2)
memory usage: 68.6+ KB


In [245]:
# Reload dataprep so the latest version of the module is used.
importlib.reload(dataprep)
# Build the combined revenue table from the cleaned bom and tn data, then
# summarise its dtypes and non-null counts.
dfRevenue = dataprep.combine_clean_bom_and_tn_revenue_data(config)
dfRevenue.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4306 entries, 0 to 4305
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           4306 non-null   object
 1   year            4306 non-null   uint16
 2   domestic_gross  4306 non-null   uint64
 3   foreign_gross   4306 non-null   uint64
dtypes: object(1), uint16(1), uint64(2)
memory usage: 143.0+ KB


In [246]:
# Show up to the first 25 rows of the combined revenue table.
dfRevenue.head(25)

Unnamed: 0,title,year,domestic_gross,foreign_gross
0,#HORROR,2015,0,0
1,'71,2015,1300000,355000
2,"1,000 TIMES GOOD NIGHT",2014,53900,0
3,10 CLOVERFIELD LANE,2016,72100000,38100000
4,10 DAYS IN A MADHOUSE,2015,14616,0
5,10 YEARS,2012,203000,0
6,1001 GRAMS,2015,11000,0
7,102 NOT OUT,2018,1300000,10900000
8,11-11-11,2011,32800,5700000
9,12 STRONG,2018,45819713,25298665


In [247]:
# Reload dataprep so the latest version of the module is used.
importlib.reload(dataprep)
# Run the merge step, then load its result back through the dataprep loader.
# NOTE(review): presumably merge_clean_data persists its output for
# load_merged_clean_data to read — confirm in code/data_preparation.py.
dataprep.merge_clean_data(config)
dfMerge = dataprep.load_merged_clean_data(config)
dfMerge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108629 entries, 0 to 108628
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   tconst           108629 non-null  object 
 1   title            108629 non-null  object 
 2   year             108629 non-null  uint16 
 3   runtime_minutes  108629 non-null  uint16 
 4   genres           108629 non-null  object 
 5   rating           27962 non-null   float16
 6   numvotes         27962 non-null   float32
 7   domestic_gross   4181 non-null    float64
 8   foreign_gross    4181 non-null    float64
dtypes: float16(1), float32(1), float64(2), object(3), uint16(2)
memory usage: 5.2+ MB


In [248]:
# Display the merged frame; pandas abbreviates the middle rows of a large frame.
# NOTE(review): consider dfMerge.head() / dfMerge.sample(5) to keep the saved output small.
dfMerge

Unnamed: 0,tconst,title,year,runtime_minutes,genres,rating,numvotes,domestic_gross,foreign_gross
0,tt0063540,SUNGHURSH,2013,175,"Action,Crime,Drama",,,,
1,tt0066787,ONE DAY BEFORE THE RAINY SEASON,2019,114,"Biography,Drama",,,,
2,tt0069049,THE OTHER SIDE OF THE WIND,2018,122,Drama,6.898438,4517.0,,
3,tt0100275,THE WANDERING SOAP OPERA,2017,80,"Comedy,Drama,Fantasy",6.500000,119.0,,
4,tt0111414,A THIN LIFE,2018,75,Comedy,,,,
...,...,...,...,...,...,...,...,...,...
108624,tt9916160,DRØMMELAND,2019,72,Documentary,,,,
108625,tt9916170,THE REHEARSAL,2019,51,Drama,,,,
108626,tt9916186,ILLENAU - DIE GESCHICHTE EINER EHEMALIGEN HEIL...,2017,84,Documentary,,,,
108627,tt9916190,SAFEGUARD,2019,90,"Drama,Thriller",,,,
