In [1]:
import os
import pandas as pd

In [2]:
# Notebooks do not define __file__, so dirname(abspath(__file__)) is unavailable;
# anchor all paths on the kernel's current working directory instead.
cwd = os.getcwd()

In [27]:
# Resolve project folders relative to the parent of the working directory.
BASE_DIR = os.path.dirname(cwd)
DATA_DIR, CACHE_DIR = (
    os.path.join(BASE_DIR, folder) for folder in ("data", "cache")
)

# Input CSV to clean and the destination for the cleaned CSV (both in the cache folder).
working_file, output_file = (
    os.path.join(CACHE_DIR, filename)
    for filename in (
        "movies-box-office-dataset.csv",
        "movies-box-office-dataset-cleaned.csv",
    )
)

In [4]:
# Load the box-office CSV from the cache folder into a DataFrame.
df = pd.read_csv(filepath_or_buffer=working_file)

In [5]:
# (rows, columns) of the frame as loaded.
df.shape

(10110, 8)

In [6]:
# Inspect a large slice: Rank does not follow the row position (row 1195 shows
# Rank 414), presumably because ranks restart for each Year — TODO confirm.
df.head(1200)

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,%,Foreign,%.1,Year
0,1,Mission: Impossible II,"$546,388,108","$215,409,889",39.4%,"$330,978,219",60.6%,2000
1,2,Gladiator,"$460,583,960","$187,705,427",40.8%,"$272,878,533",59.2%,2000
2,3,Cast Away,"$429,632,142","$233,632,142",54.4%,"$196,000,000",45.6%,2000
3,4,What Women Want,"$374,111,707","$182,811,707",48.9%,"$191,300,000",51.1%,2000
4,5,Dinosaur,"$349,822,765","$137,748,063",39.4%,"$212,074,702",60.6%,2000
...,...,...,...,...,...,...,...,...
1195,414,Girls Can't Swim,"$69,250","$69,250",100%,-,-,2002
1196,415,Bundy,"$68,716","$6,073",8.8%,"$62,643",91.2%,2002
1197,416,Spooky House,"$65,238","$65,238",100%,-,-,2002
1198,417,Gaudi Afternoon,"$65,115","$5,858",9%,"$59,257",91%,2002


In [7]:
# Overwrite the source Rank with a -1 placeholder; it is recomputed from the
# index later, once the rows have been sorted by Worldwide.
df['Rank'] = -1

In [8]:
# Verify that Rank now holds the -1 placeholder.
df.head()

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,%,Foreign,%.1,Year
0,-1,Mission: Impossible II,"$546,388,108","$215,409,889",39.4%,"$330,978,219",60.6%,2000
1,-1,Gladiator,"$460,583,960","$187,705,427",40.8%,"$272,878,533",59.2%,2000
2,-1,Cast Away,"$429,632,142","$233,632,142",54.4%,"$196,000,000",45.6%,2000
3,-1,What Women Want,"$374,111,707","$182,811,707",48.9%,"$191,300,000",51.1%,2000
4,-1,Dinosaur,"$349,822,765","$137,748,063",39.4%,"$212,074,702",60.6%,2000


In [9]:
# Copy the ambiguous '%' / '%.1' columns under descriptive names.
for source_col, target_col in (('%', 'Domestic %'), ('%.1', 'Foreign %')):
    df[target_col] = df[source_col]

In [10]:
# Check that 'Domestic %' and 'Foreign %' were added alongside the originals.
df.head()

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,%,Foreign,%.1,Year,Domestic %,Foreign %
0,-1,Mission: Impossible II,"$546,388,108","$215,409,889",39.4%,"$330,978,219",60.6%,2000,39.4%,60.6%
1,-1,Gladiator,"$460,583,960","$187,705,427",40.8%,"$272,878,533",59.2%,2000,40.8%,59.2%
2,-1,Cast Away,"$429,632,142","$233,632,142",54.4%,"$196,000,000",45.6%,2000,54.4%,45.6%
3,-1,What Women Want,"$374,111,707","$182,811,707",48.9%,"$191,300,000",51.1%,2000,48.9%,51.1%
4,-1,Dinosaur,"$349,822,765","$137,748,063",39.4%,"$212,074,702",60.6%,2000,39.4%,60.6%


In [11]:
# Remove the original '%' / '%.1' columns now that named copies exist.
df.drop(labels=['%', '%.1'], axis='columns', inplace=True)

In [12]:
# Confirm the '%' and '%.1' columns are gone.
df.head()

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,Foreign,Year,Domestic %,Foreign %
0,-1,Mission: Impossible II,"$546,388,108","$215,409,889","$330,978,219",2000,39.4%,60.6%
1,-1,Gladiator,"$460,583,960","$187,705,427","$272,878,533",2000,40.8%,59.2%
2,-1,Cast Away,"$429,632,142","$233,632,142","$196,000,000",2000,54.4%,45.6%
3,-1,What Women Want,"$374,111,707","$182,811,707","$191,300,000",2000,48.9%,51.1%
4,-1,Dinosaur,"$349,822,765","$137,748,063","$212,074,702",2000,39.4%,60.6%


In [13]:
# Columns holding "$1,234"-style currency strings that clean_col converts to int.
to_clean_cols = [
    "Worldwide",
    "Domestic",
    "Foreign",
]

def currency_str_to_int(current_val):
    """Convert a currency string such as "$546,388,108" into an int.

    Parameters
    ----------
    current_val : str or number
        Raw cell value. Strings may carry "$" and "," formatting. Values that
        are not strings (for example numbers produced by an earlier run of the
        cleaning step, or NaN/None for missing cells) are passed to ``int``
        directly.

    Returns
    -------
    int
        The parsed amount, or 0 when the value cannot be parsed — e.g. the
        "-" placeholder, an empty string, ``None`` or NaN.
    """
    if isinstance(current_val, str):
        candidate = current_val.replace("$", "").replace(",", "")
    else:
        # Non-string cells have no .replace(); calling it would raise
        # AttributeError before the fallback below could apply.
        candidate = current_val
    try:
        return int(candidate)
    except (TypeError, ValueError, OverflowError):
        # "-" / "" (ValueError), None (TypeError), NaN (ValueError) and
        # infinity (OverflowError) all map to 0.
        return 0

def clean_col(row):
    """Return a copy of ``row`` with the currency columns parsed to integers.

    Intended to be used as ``df.apply(clean_col, axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        One DataFrame row (a DataFrame is a collection of such Series). It
        must contain every column listed in ``to_clean_cols``.

    Returns
    -------
    pandas.Series
        A new Series in which each ``to_clean_cols`` entry has been passed
        through ``currency_str_to_int``; every other entry is unchanged.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by ``apply``.
    cleaned = row.copy()
    for col in to_clean_cols:
        cleaned[col] = currency_str_to_int(row[col])
    return cleaned

In [14]:
# Run the row-wise cleaner over every record (axis="columns" is equivalent to axis=1).
df_cleaned = df.apply(func=clean_col, axis="columns")

In [15]:
# The currency columns should now show plain integers instead of "$…" strings.
df_cleaned.head()

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,Foreign,Year,Domestic %,Foreign %
0,-1,Mission: Impossible II,546388108,215409889,330978219,2000,39.4%,60.6%
1,-1,Gladiator,460583960,187705427,272878533,2000,40.8%,59.2%
2,-1,Cast Away,429632142,233632142,196000000,2000,54.4%,45.6%
3,-1,What Women Want,374111707,182811707,191300000,2000,48.9%,51.1%
4,-1,Dinosaur,349822765,137748063,212074702,2000,39.4%,60.6%


In [16]:
# Worldwide / Domestic / Foreign should report an integer dtype after cleaning.
df_cleaned.dtypes

Rank              int64
Release Group    object
Worldwide         int64
Domestic          int64
Foreign           int64
Year              int64
Domestic %       object
Foreign %        object
dtype: object

In [17]:
# Order rows from highest to lowest worldwide gross, then preview the top rows.
df_cleaned.sort_values(by='Worldwide', ascending=False, inplace=True)
df_cleaned.head(n=5)

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,Foreign,Year,Domestic %,Foreign %
9387,-1,Avengers: Endgame,2797800564,858373000,1939427564,2019,30.7%,69.3%
6212,-1,Star Wars: Episode VII - The Force Awakens,2068223624,936662225,1131561399,2015,45.3%,54.7%
8487,-1,Avengers: Infinity War,2048359754,678815482,1369544272,2018,33.1%,66.9%
6213,-1,Jurassic World,1670400637,652270625,1018130012,2015,39%,61%
9388,-1,The Lion King,1656943394,543638043,1113305351,2019,32.8%,67.2%


In [18]:
# Replace the pre-sort index labels with a fresh 0..n-1 index (old labels discarded).
df_cleaned.reset_index(drop=True, inplace=True)
df_cleaned.head(n=5)

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,Foreign,Year,Domestic %,Foreign %
0,-1,Avengers: Endgame,2797800564,858373000,1939427564,2019,30.7%,69.3%
1,-1,Star Wars: Episode VII - The Force Awakens,2068223624,936662225,1131561399,2015,45.3%,54.7%
2,-1,Avengers: Infinity War,2048359754,678815482,1369544272,2018,33.1%,66.9%
3,-1,Jurassic World,1670400637,652270625,1018130012,2015,39%,61%
4,-1,The Lion King,1656943394,543638043,1113305351,2019,32.8%,67.2%


In [19]:
# The index was reset after sorting by Worldwide (descending), so index + 1
# gives a 1-based overall rank. Must run after the sort and reset_index cells.
df_cleaned['Rank'] = df_cleaned.index + 1

In [20]:
# Rank should now start at 1 for the largest Worldwide value.
df_cleaned.head()

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,Foreign,Year,Domestic %,Foreign %
0,1,Avengers: Endgame,2797800564,858373000,1939427564,2019,30.7%,69.3%
1,2,Star Wars: Episode VII - The Force Awakens,2068223624,936662225,1131561399,2015,45.3%,54.7%
2,3,Avengers: Infinity War,2048359754,678815482,1369544272,2018,33.1%,66.9%
3,4,Jurassic World,1670400637,652270625,1018130012,2015,39%,61%
4,5,The Lion King,1656943394,543638043,1113305351,2019,32.8%,67.2%


In [25]:
# Replace the percentage strings with numeric shares of the worldwide gross
# (fractions such as 0.39 rather than "39%").
df_cleaned['Domestic %'] = df_cleaned['Domestic'].div(df_cleaned['Worldwide'])
df_cleaned['Foreign %'] = df_cleaned['Foreign'].div(df_cleaned['Worldwide'])

In [26]:
# The percent columns should now be fractions of Worldwide rather than strings.
df_cleaned.head()

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,Foreign,Year,Domestic %,Foreign %
0,1,Avengers: Endgame,2797800564,858373000,1939427564,2019,0.306803,0.693197
1,2,Star Wars: Episode VII - The Force Awakens,2068223624,936662225,1131561399,2015,0.452882,0.547118
2,3,Avengers: Infinity War,2048359754,678815482,1369544272,2018,0.331395,0.668605
3,4,Jurassic World,1670400637,652270625,1018130012,2015,0.390488,0.609512
4,5,The Lion King,1656943394,543638043,1113305351,2019,0.328097,0.671903


In [28]:
# Persist the cleaned frame; the positional index is omitted because Rank already encodes it.
df_cleaned.to_csv(path_or_buf=output_file, index=False)