In [1]:
# Load packages, utilities, dotenv
import pandas as pd
import os

from etl_utils import extract_query

from dotenv import load_dotenv
load_dotenv()

# Build the mysql+pymysql connection URL (ending in /earnings) from the
# DB_USERNAME / DB_PASSWORD / DB_HOST / DB_PORT environment variables.
db_conn_str = "mysql+pymysql://{user}:{password}@{host}:{port}/earnings".format(
    user=os.getenv("DB_USERNAME"),
    password=os.getenv("DB_PASSWORD"),
    host=os.getenv("DB_HOST"),
    port=os.getenv("DB_PORT"),
)

In [6]:
# Load the yearly income statement by running the SQL script through extract_query.
# The path is assembled with os.path.join instead of the literal "SQL\y...":
# "\y" is not a valid string escape (recent Python versions warn about it) and a
# hard-coded backslash is only a path separator on Windows.
income_statement = extract_query(
    db_conn_str,
    os.path.join("SQL", "yearly_income_statement.sql"),
)

Connection Successful!
SQL script at yearly_income_statement.sql executed successfully!


In [7]:
# List income-statement rows with at least one missing value.
# The recorded output is empty, i.e. no row has a gap.
income_statement.loc[income_statement.isnull().any(axis="columns")]

Unnamed: 0,date,act_symbol,sales,gross_profit,net_income,diluted_net_eps


In [8]:
# Convert the `date` column to pandas datetimes (needed for the date arithmetic further down).
income_statement["date"] = income_statement["date"].pipe(pd.to_datetime)

In [9]:
# Show rows whose (date, act_symbol) key repeats an earlier row; empty output means the key is unique.
income_statement[
    income_statement.duplicated(subset=["date", "act_symbol"], keep="first")
]

Unnamed: 0,date,act_symbol,sales,gross_profit,net_income,diluted_net_eps


In [10]:
# Continuity check: symbols enter the dataset at different times, but once a symbol
# appears, is there a row for every year afterwards?
# Per symbol, collect the first date, the last date and the number of rows.
dates = (
    income_statement
    .loc[:, ["date", "act_symbol"]]
    .groupby("act_symbol")
    .aggregate(["min", "max", "size"])
)

In [11]:
# Expected number of yearly rows per symbol: whole years between the first and last
# date (365.25-day years, rounded) plus one so both endpoints are counted.
dates[("date", "date_diff")] = (
    (dates[("date", "max")] - dates[("date", "min")]) / pd.Timedelta(days=365.25)
).round() + 1

In [12]:
# Symbols whose actual row count differs from the expected number of years.
dates[dates[("date", "size")].ne(dates[("date", "date_diff")])]

Unnamed: 0_level_0,date,date,date,date
Unnamed: 0_level_1,min,max,size,date_diff
act_symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
AACG,2014-12-31,2024-12-31,10,11.0
ABCM,2020-06-30,2022-12-31,5,4.0
ABCO,2012-12-31,2016-12-31,4,5.0
ABIO,2012-12-31,2023-12-31,11,12.0
ABVC,2018-12-31,2024-12-31,6,7.0
...,...,...,...,...
YTRA,2016-03-31,2024-03-31,8,9.0
YVR,2014-02-28,2021-11-30,11,9.0
ZIVO,2018-12-31,2021-12-31,3,4.0
ZN,2013-12-31,2018-12-31,5,6.0


In [37]:
# Inspect the dtype of every income_statement column.
# NOTE(review): the saved output already lists `month_end`, which is only created in the
# following cell, so this cell was executed out of order; re-check after Restart & Run All.
income_statement.dtypes

date               datetime64[ns]
act_symbol                 object
sales                     float64
gross_profit              float64
net_income                float64
diluted_net_eps           float64
month_end          datetime64[ns]
dtype: object

In [38]:
# Add each date's month end: MonthEnd(0) rolls a date forward to the last day of its
# month and leaves dates that are already a month end unchanged.
income_statement["month_end"] = (
    income_statement["date"] + pd.offsets.MonthEnd(n=0)
)

In [40]:
# Rows whose statement date is not the last day of its month (empty output: every date is a month end).
income_statement.loc[
    income_statement["date"].ne(income_statement["month_end"])
]

Unnamed: 0,date,act_symbol,sales,gross_profit,net_income,diluted_net_eps,month_end


In [6]:
# Balance sheet components: one SQL script (and one resulting frame) per component.
balance_sheet_components = ["assets", "equity", "liabilities"]
bs_dfs = []

for component in balance_sheet_components:
    # Build the path with os.path.join instead of f"SQL\y...": "\y" is not a valid
    # string escape (recent Python versions warn about it) and a hard-coded backslash
    # is only a path separator on Windows.
    query_path = os.path.join("SQL", f"yearly_balance_sheet_{component}.sql")

    bs_dfs.append(extract_query(db_conn_str, query_path))

Connection Successful!
SQL script at yearly_balance_sheet_assets.sql executed successfully!
Connection Successful!
SQL script at yearly_balance_sheet_equity.sql executed successfully!
Connection Successful!
SQL script at yearly_balance_sheet_liabilities.sql executed successfully!


In [7]:
# Merge the income statement with every balance-sheet frame on (date, act_symbol).
df = income_statement.copy()

# Normalise the join key to datetime on the left side. This is a no-op when the column
# has already been parsed and guarantees a datetime key when it has not.
df["date"] = pd.to_datetime(df["date"])

for df_i in bs_dfs:
    # Parse the right-hand key as well so both sides share a dtype; `assign` returns a
    # new frame, so the frames stored in bs_dfs are left untouched.
    # NOTE(review): assumes the balance-sheet `date` column is parseable by
    # pd.to_datetime, like the income-statement one — confirm against extract_query.
    balance_part = df_i.assign(date=pd.to_datetime(df_i["date"]))

    # validate="many_to_one" makes pandas raise a MergeError when the right-hand keys
    # are not unique. Duplicate keys are the only way a left merge can add rows, so
    # this replaces the previous "print and break", which let later cells keep
    # running on a frame with duplicated rows.
    df = df.merge(
        balance_part,
        how="left",
        on=["date", "act_symbol"],
        validate="many_to_one",
    )

In [8]:
# Rows of the merged frame with at least one missing value (there are many of them).
df.loc[df.isnull().any(axis="columns")]

Unnamed: 0,date,act_symbol,sales,gross_profit,net_income,diluted_net_eps,total_current_assets,total_assets,total_equity,book_value_per_share,current_portion_long_term_debt,total_current_liabilities,long_term_debt,total_liabilities
22,2015-12-31,AAAP,98000000.0,78000000.0,-19000000.0,-0.56,,,,,,,,
29,2013-12-31,AAC,116000000.0,116000000.0,1000000.0,0.12,,,,,,,,
163,2024-12-31,AARD,0.0,0.0,-21000000.0,-5.15,,,,,,,,
183,2013-12-31,AAU,0.0,0.0,-6000000.0,-0.10,,,,,,,,
186,2016-12-31,AAU,0.0,0.0,-3000000.0,-0.04,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62466,2020-12-31,ZLAB,49000000.0,32000000.0,-269000000.0,-3.46,,,,,,,,
62513,2019-12-31,ZOM,0.0,0.0,-20000000.0,-0.19,,,,,,,,
62527,2013-12-31,ZSAN,4000000.0,4000000.0,-6000000.0,-1.10,,,,,,,,
62538,2019-03-31,ZTEK,0.0,0.0,-2000000.0,-0.03,,,,,,,,


In [22]:
# Make sure the merged frame's `date` column holds pandas datetimes.
df["date"] = pd.to_datetime(df.loc[:, "date"])

In [10]:
# Display the merged income-statement / balance-sheet frame; the recorded output shows
# pandas truncating it to the first and last rows.
# NOTE(review): consider df.head() to keep the saved output small.
df

Unnamed: 0,date,act_symbol,sales,gross_profit,net_income,diluted_net_eps,total_current_assets,total_assets,total_equity,book_value_per_share,current_portion_long_term_debt,total_current_liabilities,long_term_debt,total_liabilities
0,2012-10-31,A,6.858000e+09,3.604000e+09,1.153000e+09,3.27,4.629000e+09,1.053600e+10,5.185000e+09,14.88,0.0,1.893000e+09,2.112000e+09,5.351000e+09
1,2013-10-31,A,6.782000e+09,3.535000e+09,7.240000e+08,2.10,4.983000e+09,1.068600e+10,5.289000e+09,15.99,0.0,1.602000e+09,2.699000e+09,5.397000e+09
2,2014-10-31,A,6.981000e+09,3.593000e+09,5.040000e+08,1.49,5.500000e+09,1.083100e+10,5.301000e+09,15.90,0.0,1.702000e+09,2.762000e+09,5.530000e+09
3,2015-10-31,A,4.038000e+09,2.041000e+09,4.010000e+08,1.20,3.686000e+09,7.479000e+09,4.170000e+09,12.58,0.0,9.760000e+08,1.655000e+09,3.309000e+09
4,2016-10-31,A,4.202000e+09,2.197000e+09,4.620000e+08,1.40,3.635000e+09,7.802000e+09,4.246000e+09,13.09,0.0,9.450000e+08,1.912000e+09,3.556000e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62643,2020-12-31,ZYXI,8.000000e+07,6.300000e+07,9.000000e+06,0.24,6.300000e+07,7.200000e+07,5.700000e+07,1.64,0.0,1.000000e+07,0.000000e+00,1.500000e+07
62644,2021-12-31,ZYXI,1.300000e+08,1.030000e+08,1.700000e+07,0.44,8.300000e+07,1.330000e+08,7.400000e+07,1.94,5000000.0,2.300000e+07,1.100000e+07,5.900000e+07
62645,2022-12-31,ZYXI,1.580000e+08,1.260000e+08,1.700000e+07,0.44,7.000000e+07,1.160000e+08,6.600000e+07,1.77,5000000.0,2.100000e+07,5.000000e+06,5.000000e+07
62646,2023-12-31,ZYXI,1.840000e+08,1.460000e+08,1.000000e+07,0.27,8.800000e+07,1.370000e+08,4.600000e+07,1.36,0.0,1.900000e+07,0.000000e+00,9.100000e+07
