# Check Filing Date Across Sources

In [19]:
# Packages
import pandas as pd

Do the standard inner join of the three statement files, but this time keep each file's filing date (stored in the `fillingDate` column) so that the dates can be compared across sources.

In [20]:
# Inner join the three tabuler_fin_data files on symbol, calendarYear, period (with the
# Q removed) and reportedCurrency, keeping one filing-date column per statement file.

files_to_load = [r'balance_sheet', r'cash_flow_statement', r'income_statement']
join_keys = ['symbol', 'calendarYear', 'period', 'reportedCurrency']


def load_filing_dates(file):
    """Load the join keys and the filing date of one statement file.

    Parameters
    ----------
    file : str
        Statement name used inside the CSV file name, e.g. 'balance_sheet'.

    Returns
    -------
    pandas.DataFrame
        Columns symbol, calendarYear, period (as int, 'Q' removed), reportedCurrency
        and 'fillingDate_<file>'.

    Prints the file being loaded and the distinct period values found in it.
    """
    print('loading ' + file)
    df = pd.read_csv(r'~\Box\STAT 222 Capstone\Intermediate Data\Tabular_Fin\tabuler_fin_data(' + file + r').csv')[join_keys + ['fillingDate']]
    # Remove Q from period and convert to int
    df = df.assign(period=df['period'].str.replace('Q', '').astype(int))
    # Print values of period as a quick sanity check
    print(df['period'].unique())
    # Name the filing date after its source file. Because of this rename, the only
    # columns shared between frames are the join keys, so pandas never adds _x/_y
    # suffixes and no suffix clean-up is needed after merging.
    return df.rename(columns={'fillingDate': 'fillingDate_' + file})


# Load each file and inner join it onto merged (the first file seeds merged).
# NOTE(review): if any file holds duplicate rows for a key combination, the inner join
# multiplies rows - confirm the keys are unique per file.
merged = None
for file in files_to_load:
    statement = load_filing_dates(file)
    if merged is None:
        merged = statement
    else:
        merged = pd.merge(merged, statement, on=join_keys, how='inner')
del statement

print('length of merged: ' + str(len(merged)))
merged.head(10)

loading balance_sheet
[3 2 1 4]
loading cash_flow_statement
[3 2 1 4]
loading income_statement
[3 2 1 4]
length of merged: 54109


Unnamed: 0,symbol,calendarYear,period,reportedCurrency,fillingDate_balance_sheet,fillingDate_cash_flow_statement,fillingDate_income_statement
0,BCE,2023,3,CAD,2023-09-30,2023-09-30,2023-09-30
1,BCE,2023,2,CAD,2023-06-30,2023-06-30,2023-06-30
2,BCE,2023,1,CAD,2023-03-31,2023-03-31,2023-03-31
3,BCE,2022,4,CAD,2022-12-31,2022-12-31,2022-12-31
4,BCE,2022,3,CAD,2022-09-30,2022-09-30,2022-09-30
5,BCE,2022,2,CAD,2022-06-30,2022-06-30,2022-06-30
6,BCE,2022,1,CAD,2022-03-31,2022-03-31,2022-03-31
7,BCE,2021,4,CAD,2021-12-31,2021-12-31,2021-12-31
8,BCE,2021,3,CAD,2021-09-30,2021-09-30,2021-09-30
9,BCE,2021,2,CAD,2021-06-30,2021-06-30,2021-06-30


In [21]:
# Keep only the rows whose statements are reported in USD
is_usd = merged['reportedCurrency'].eq('USD')
merged = merged.loc[is_usd]
merged

Unnamed: 0,symbol,calendarYear,period,reportedCurrency,fillingDate_balance_sheet,fillingDate_cash_flow_statement,fillingDate_income_statement
67,BEP,2023,3,USD,2023-11-03,2023-11-03,2023-11-03
68,BEP,2023,2,USD,2023-08-04,2023-08-04,2023-08-04
69,BEP,2023,1,USD,2023-05-05,2023-05-05,2023-05-05
70,BEP,2022,4,USD,2023-03-01,2023-03-01,2023-03-01
71,BEP,2022,3,USD,2022-11-04,2022-11-04,2022-11-04
...,...,...,...,...,...,...,...
54104,YORW,2008,1,USD,2008-05-09,2008-05-09,2008-05-09
54105,YORW,2007,4,USD,2008-03-11,2008-03-11,2008-03-11
54106,YORW,2007,3,USD,2007-11-08,2007-11-08,2007-11-08
54107,YORW,2007,2,USD,2007-08-09,2007-08-09,2007-08-09


In [22]:
# Load the distinct symbol / quarter / calendarYear combinations present in
# all_data_fixed_quarter_dates; 'quarter' is renamed to 'period' to match merged.
fixed_quarters_path = r'~\Box\STAT 222 Capstone\Intermediate Data\All_Data\all_data_fixed_quarter_dates.parquet'
all_data_fixed_quarters = (
    pd.read_parquet(fixed_quarters_path, columns=['symbol', 'quarter', 'calendarYear'])
    .drop_duplicates()
    .rename(columns={'quarter': 'period'})
)
all_data_fixed_quarters

Unnamed: 0,symbol,period,calendarYear
0,AAPL,2,2014
1,AAPL,3,2014
2,AAPL,4,2014
3,AAPL,1,2015
4,AAPL,2,2015
...,...,...,...
7329,ZTS,2,2015
7330,ZTS,3,2015
7331,ZTS,4,2015
7332,ZTS,1,2016


In [23]:
# Keep only the symbol / period / calendarYear rows that also appear in all_data_fixed_quarters
merged = merged.merge(
    all_data_fixed_quarters,
    how='inner',
    on=['symbol', 'period', 'calendarYear'],
)

In [24]:
def report_filing_date_mismatch(frame, left, right):
    """Print and return how often two filing-date columns disagree.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding both columns.
    left, right : str
        Names of the two filing-date columns to compare row by row.

    Returns
    -------
    int
        Number of rows where frame[left] != frame[right].

    Prints the share of such rows (nan when frame is empty, instead of raising
    ZeroDivisionError) followed by their count.
    """
    # NOTE(review): with `!=`, two missing values compare as unequal, so rows where both
    # dates are missing would be counted as mismatches - confirm there are no missing dates.
    cnt = len(frame[frame[left] != frame[right]])
    share = cnt / len(frame) if len(frame) else float('nan')
    label = left + ' is not equal to ' + right
    print('Share of cases where ' + label + ': ' + str(share))
    print('Count of cases where ' + label + ': ' + str(cnt))
    return cnt


# Compare the filing date of every pair of statement files
cnt_bs_ne_cf = report_filing_date_mismatch(merged, 'fillingDate_balance_sheet', 'fillingDate_cash_flow_statement')
cnt_bs_ne_is = report_filing_date_mismatch(merged, 'fillingDate_balance_sheet', 'fillingDate_income_statement')
cnt_cf_ne_is = report_filing_date_mismatch(merged, 'fillingDate_cash_flow_statement', 'fillingDate_income_statement')

Share of cases where fillingDate_balance_sheet is not equal to fillingDate_cash_flow_statement: 0.0020452686119443687
Count of cases where fillingDate_balance_sheet is not equal to fillingDate_cash_flow_statement: 15
Share of cases where fillingDate_balance_sheet is not equal to fillingDate_income_statement: 0.0009544586855740388
Count of cases where fillingDate_balance_sheet is not equal to fillingDate_income_statement: 7
Share of cases where fillingDate_cash_flow_statement is not equal to fillingDate_income_statement: 0.0013635124079629125
Count of cases where fillingDate_cash_flow_statement is not equal to fillingDate_income_statement: 10
