In [6]:
import os
import typing as t
import pandas as pd

In [9]:
# A notebook has no __file__, so the kernel's working directory stands in
# for os.path.dirname(os.path.abspath(__file__)).
cwd: str = os.getcwd()
# Parent of the working directory; the data/ and cache/ folders hang off it.
BASE_DIR: str = os.path.split(cwd)[0]

DATA_DIR: str = os.path.join(BASE_DIR, "data")    # raw
CACHE_DIR: str = os.path.join(BASE_DIR, "cache")  # processed

# Input (combined dataset) and output (cleaned dataset) both live in CACHE_DIR.
working_file: str = os.path.join(CACHE_DIR, "movies-box-office-dataset.csv")
output_file: str = os.path.join(CACHE_DIR, "movies-box-office-dataset-cleaned.csv")

# Ensure the cache folder exists; exist_ok avoids an error when it already does.
os.makedirs(CACHE_DIR, exist_ok=True)

In [10]:
# Load the combined dataset from the cache folder and preview the first rows.
df = pd.read_csv(filepath_or_buffer=working_file)
df.head(n=5)

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,%,Foreign,%.1,year,filename
0,1,The Dark Knight,"$1,003,045,358","$533,345,358",53.2%,"$469,700,000",46.8%,2008,2008.csv
1,2,Indiana Jones and the Kingdom of the Crystal S...,"$790,653,942","$317,101,119",40.1%,"$473,552,823",59.9%,2008,2008.csv
2,3,Kung Fu Panda,"$631,744,560","$215,434,591",34.1%,"$416,309,969",65.9%,2008,2008.csv
3,4,Hancock,"$629,443,428","$227,946,274",36.2%,"$401,497,154",63.8%,2008,2008.csv
4,5,Mamma Mia!,"$609,841,637","$144,130,063",23.6%,"$465,711,574",76.4%,2008,2008.csv


In [14]:
# Tidy up the column labels.
# 1. Drop 'Unnamed: 0' (presumably an index column saved with the CSV --
#    TODO confirm). errors='ignore' keeps the cell re-runnable once the
#    column is already gone.
# 2. Give the two ambiguous '%' columns descriptive names and capitalise
#    the remaining lower-case labels.
df = (
    df
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .rename(columns={
        '%': 'Domestic %',
        '%.1': 'Foreign %',
        'year': 'Year',
        'filename': 'Filename'
    })
)

# Replace spaces with underscores in every column label.
df.columns = df.columns.str.replace(" ", "_")
df.columns

Index(['Rank', 'Release_Group', 'Worldwide', 'Domestic', 'Domestic_%',
       'Foreign', 'Foreign_%', 'Year', 'Filename'],
      dtype='object')

In [16]:
# Clean the symbols from the strings. Different than tutorial.
# Will use df.apply(axis=1) to apply this custom func to df.
def remove_symbols_from_str(str_value: str) -> str:
    """
    Strip every '$' and ',' character from a string.

    str_value = String to clean, e.g. "$144,130,063"
    Returns the same string with those two symbols removed,
    e.g. "144130063". All other characters are left as they are.
    """
    stripped: str = str_value
    for symbol in ("$", ","):
        stripped = stripped.replace(symbol, "")
    return stripped

def convert_str_to_int(str_value: str) -> int:
    """
    Parse a string as an integer, falling back to 0.

    str_value = String expected to hold an integer, e.g. "144130063"
    Returns int(str_value), or 0 when int() raises ValueError
    (for instance the "-" placeholder some currency values carry).
    """
    try:
        return int(str_value)
    except ValueError:
        # Not parseable as an integer -> treat as zero
        return 0

# Currency columns whose string values get converted to integers.
cols_to_clean: t.List[str] = ['Worldwide', 'Domestic', 'Foreign']
def remove_symbols_and_convert(row, cols: t.List[str] = cols_to_clean):
    """
    Cleans the currency columns of a single Pandas DataFrame row:
    removes the '$' and ',' symbols and converts the result to integer.
    Ex. $144,130,063 -> 144130063

    row = Pandas Series (one DataFrame row, as passed by df.apply(axis=1))
    cols = List of columns to clean within row

    Returns a new Series with the same labels as row, where every column
    in cols holds the converted integer. The row passed in is left
    untouched. Values that cannot be parsed as an integer become 0
    (see convert_str_to_int).
    """
    # DataFrame.apply does not support functions that mutate the object
    # they receive, so the cleaned values are written to a copy.
    cleaned_row = row.copy()
    for col in cols:
        current_val: str = row[col]
        cleaned_row[col] = convert_str_to_int(remove_symbols_from_str(current_val))

    return cleaned_row

# With axis="columns" (same as axis=1) the function is called once per row,
# receiving that row as a Series indexed by the column labels.
df_clean = df.apply(remove_symbols_and_convert, axis="columns")
df_clean.dtypes



Rank              int64
Release_Group    object
Worldwide         int64
Domestic          int64
Domestic_%       object
Foreign           int64
Foreign_%        object
Year              int64
Filename         object
dtype: object

In [19]:
# Replace the '%' string columns with floats by recomputing each share
# from the cleaned integer columns. The result is a fraction of the
# worldwide gross (Domestic / Worldwide), not a value scaled to 0-100.
worldwide_gross = df_clean['Worldwide']
domestic_share = df_clean['Domestic'] / worldwide_gross
foreign_share = df_clean['Foreign'] / worldwide_gross

df_clean['Domestic_%'] = domestic_share
df_clean['Foreign_%'] = foreign_share
df_clean.dtypes

Rank               int64
Release_Group     object
Worldwide          int64
Domestic           int64
Domestic_%       float64
Foreign            int64
Foreign_%        float64
Year               int64
Filename          object
dtype: object

In [26]:
# Order all rows by Worldwide gross, highest first, and rebuild the index
# so every movie shares a single 0..n-1 index.
# kind='mergesort' is a stable sort: rows with equal Worldwide values keep
# their incoming order, so the resulting Rank is reproducible.
df_clean = (
    df_clean
    .sort_values(by=['Worldwide'], ascending=False, kind='mergesort')
    .reset_index(drop=True)
)

# Rank is the 1-based position in the sorted frame (new index + 1).
df_clean['Rank'] = df_clean.index + 1
df_clean.dtypes

Rank               int64
Release_Group     object
Worldwide          int64
Domestic           int64
Domestic_%       float64
Foreign            int64
Foreign_%        float64
Year               int64
Filename          object
dtype: object

In [29]:
# Write the cleaned dataset to 'output_file' as CSV.
# index=False leaves the DataFrame index out of the file.
# To preview the export first, inspect df_clean[:5].to_csv() in a scratch cell.
df_clean.to_csv(path_or_buf=output_file, index=False)