# Correct Tabular Financial Statements Data

Handle outlier values of financial variables

In [None]:
import pandas as pd

In [None]:
# Inner join the three tabuler_fin_data files (balance sheet, cash flow statement,
# income statement) into one frame, after stripping the 'Q' from `period`.

# Each name below fills the parenthesised part of the CSV file name
files_to_load = [r'balance_sheet', r'cash_flow_statement', r'income_statement']

# The first file seeds `merged`; every later file is inner joined onto it
for i, file in enumerate(files_to_load):
    print('loading ' + file)
    csv_path = r'~\Box\STAT 222 Capstone\Intermediate Data\Tabular_Fin\tabuler_fin_data(' + file + r').csv'
    # Read the CSV and drop its first column (the saved index)
    df = pd.read_csv(csv_path).iloc[:, 1:]
    # Strip the 'Q' from period and store the remainder as an integer
    df['period'] = df['period'].str.replace('Q', '').astype(int)
    # Show which period values are present in this file
    print(df['period'].unique())
    if i > 0:
        # NOTE(review): the left keys are company/year/quarter while the right keys are
        # symbol/calendarYear/period -- confirm the first file really carries the left-hand columns
        merged = pd.merge(
            merged,
            df,
            left_on=['company', 'year', 'quarter'],
            right_on=['symbol', 'calendarYear', 'period'],
            how='inner',
        )
    else:
        merged = df
    # Drop '_x' and turn '_y' into '_<file>' so overlapping columns record which file they came from.
    # str.replace acts on every occurrence in a column name, not only on a trailing suffix.
    merged.columns = merged.columns.str.replace('_x', '').str.replace('_y', '_' + file)
    del df

merged.head(10)

In [None]:
# Before edits

# Display settings for the summary table:
# - show every row instead of truncating the output
# - two decimal places, thousands separators, no scientific notation
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', '{:,.2f}'.format)
# Descriptive statistics from describe(), transposed so each summarized column is a row
merged.describe().transpose()

In [None]:
# Checking outliers for a good cutoff


In [None]:
## Because many units in the financial documents are different (in the unit of 1000 or in the unit of 1),
# we try to deal with extreme values (caused by different units in webscraping) by checking for
# potential mis-multiplication by 1000
def deal_with_invalid_numbers(x, lower_bound, upper_bound):
    """Undo a suspected mis-multiplication by 1000 for a single value.

    The value is divided by 1000 when it is a non-zero whole multiple of 1000
    and lies strictly outside ``[lower_bound, upper_bound]``. Every other
    value (including NaN and non-numeric input such as ``None``) is returned
    unchanged. The rescaled value is NOT compared against the bounds again.

    Args:
        x: The value to inspect.
        lower_bound: Values below this are treated as suspicious.
        upper_bound: Values above this are treated as suspicious.

    Returns:
        ``x / 1000`` when the value looks mis-scaled, otherwise ``x`` itself.
    """
    try:
        # Arithmetic test rather than inspecting str(x): the text form switches to
        # exponent notation for large floats (e.g. '1e+16') and has no '.0' for ints,
        # so a string suffix check silently skips those values.
        is_multiple_of_1000 = x != 0 and x % 1000 == 0
    except TypeError:
        # Non-numeric input (e.g. None) cannot be a scaled amount; leave it untouched
        return x
    if is_multiple_of_1000 and (x < lower_bound or x > upper_bound):
        return x / 1000
    return x

# Apply the mis-multiplication check to every float column of `merged`
for column in merged.columns:
    # Columns whose dtype is not float are left untouched
    if merged[column].dtype != float:
        continue
    # Both cutoffs are taken from the column before any value is changed
    lower_bound = merged[column].quantile(0.025)  # 2.5% quantile
    upper_bound = merged[column].quantile(0.975)  # 97.5% quantile
    merged[column] = merged[column].apply(
        lambda value: deal_with_invalid_numbers(value, lower_bound, upper_bound)
    )

In [None]:
# After edits

# Display settings for the summary table:
# - show every row instead of truncating the output
# - two decimal places, thousands separators, no scientific notation
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', '{:,.2f}'.format)
# Descriptive statistics from describe(), transposed so each summarized column is a row
merged.describe().transpose()

In [None]:
# Write the corrected, combined financial statement table to parquet (row index not stored)
output_path = r'~\Box\STAT 222 Capstone\Intermediate Data\Tabular_Fin\combined_corrected_tabular_financial_statements_data.parquet'
merged.to_parquet(output_path, index=False)