In [45]:
import sqlalchemy
import pandas as pd
import yahooquery as yq
import sqlite3 as lite
from dateutil.relativedelta import relativedelta
import numpy as np
import datetime as dt

In [3]:
# Fetch the balance sheet for one ticker (AAPL) so its columns can be inspected.
tick = yq.Ticker('aapl')
# frequency='a' presumably requests annual statements and trailing=False presumably
# leaves out trailing-twelve-month rows — TODO confirm against the yahooquery docs.
df_bal_tick = tick.balance_sheet(frequency='a', trailing=False)

In [4]:
# Show the report date next to the share-count and capital-stock columns.
df_bal_tick.loc[:, ['asOfDate','OrdinarySharesNumber', 'CapitalStock']]

Unnamed: 0_level_0,asOfDate,OrdinarySharesNumber,CapitalStock
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aapl,2019-09-30,17772940000.0,45174000000.0
aapl,2020-09-30,16976760000.0,50779000000.0
aapl,2021-09-30,16426790000.0,57365000000.0
aapl,2022-09-30,15943420000.0,64849000000.0


In [6]:
# Load the ticker universe from the local SEC filings database.
engine = sqlalchemy.create_engine('sqlite:///' + "SEC_Filings.db", execution_options={"sqlite_raw_colnames": True})
df_ticker = pd.read_sql_table('Ticker_Table', engine)

# Columns kept from each yahooquery result.
bal_vars = ['asOfDate', 'CashAndCashEquivalents','TotalAssets','TotalDebt','TangibleBookValue','OrdinarySharesNumber']
inc_vars = ['asOfDate', 'BasicEPS']
stock_vars = ['date', 'adjclose']

# Tickers whose statement was not a DataFrame or lacked a required column.
bal_sheet_error_tickers = []
inc_stat_error_tickers = []


def _select_columns(frame, columns):
    """Return ``frame[columns]``, or ``None`` when the selection is impossible.

    ``frame`` is whatever the statement call returned. Anything that is not a
    DataFrame, or a DataFrame missing one of ``columns``, yields ``None`` so the
    caller can record the ticker as an error instead of raising.
    """
    if not isinstance(frame, pd.DataFrame):
        return None
    if any(col not in frame.columns for col in columns):
        return None
    return frame[columns]


def _stack(frames):
    """Concatenate per-ticker frames once and move the index into a column."""
    stacked = pd.concat(frames, axis=0) if frames else pd.DataFrame()
    return stacked.reset_index(drop=False)


# Per-ticker pieces are collected in lists and concatenated once after the loop;
# concatenating inside the loop copies the accumulated frame on every iteration.
bal_frames, inc_frames, stock_frames = [], [], []

for ticker in df_ticker['Ticker']:
    tick = yq.Ticker(ticker)

    # Some listed companies have no pricing info; skip them entirely. The price
    # request comes first so skipped tickers do not cost two statement requests.
    # `except Exception` (not a bare except) lets Ctrl-C stop this long loop.
    try:
        df_stock_tick = tick.history(interval='1d', start='2019-04-01').reset_index(level=1)
        df_stock_tick['date'] = df_stock_tick['date'] - relativedelta(days=1)
    except Exception:
        continue

    df_bal_tick = _select_columns(tick.balance_sheet(frequency='a', trailing=False), bal_vars)
    if df_bal_tick is None:
        bal_sheet_error_tickers.append(ticker)
    else:
        bal_frames.append(df_bal_tick)

    df_inc_tick = _select_columns(tick.income_statement(frequency='a', trailing=False), inc_vars)
    if df_inc_tick is None:
        inc_stat_error_tickers.append(ticker)
    else:
        inc_frames.append(df_inc_tick)

    stock_frames.append(df_stock_tick[stock_vars])

# One row per (symbol, report date) for the statements, per (symbol, day) for prices.
df_fin_bal = _stack(bal_frames)
df_fin_inc = _stack(inc_frames)
df_fin_stock = _stack(stock_frames)



In [66]:
# Drop balance-sheet rows that report zero total assets or zero cash.
# Each comparison is parenthesised on its own: `&` binds tighter than `!=`, so
# `(a != 0) & (b) != 0` would AND the boolean mask with the raw cash column.
# Missing (NaN) values compare unequal to 0, so such rows are kept here.
has_assets = df_fin_bal['TotalAssets'] != 0
has_cash = df_fin_bal['CashAndCashEquivalents'] != 0
df_fin_bal = df_fin_bal.loc[has_assets & has_cash]

# Replace every missing value in the income-statement frame with 0.
df_fin_inc = df_fin_inc.fillna(value=0)

# Keep only (symbol, asOfDate) pairs present in both statements.
df_fin_joined = df_fin_inc.merge(df_fin_bal, how='inner', on=['symbol', 'asOfDate'])

In [81]:
# Rebuild the price table from monthly bars and attach it to the joined statements.
stock_frames = []

for ticker in df_ticker['Ticker']:
    tick = yq.Ticker(ticker)
    # Some listed companies have no pricing info; skip them. `except Exception`
    # (not a bare except) keeps Ctrl-C able to interrupt the download loop.
    try:
        df_stock_tick = tick.history(interval='1mo', start='2019-04-01').reset_index(level=1)
        # Shift each bar back one day so its date can line up with statement dates.
        # NOTE(review): a monthly bar's adjclose is presumably that month's close,
        # so the matched price may be a month later than asOfDate — confirm intent.
        df_stock_tick['date'] = df_stock_tick['date'] - relativedelta(days=1)
    except Exception:
        continue
    stock_frames.append(df_stock_tick[stock_vars])

# Concatenate once; growing a frame with concat inside the loop is quadratic.
if stock_frames:
    df_fin_stock = pd.concat(stock_frames, axis=0).reset_index(drop=False)
else:
    # No ticker returned prices: keep the expected columns so the merge yields
    # an empty result instead of a KeyError.
    df_fin_stock = pd.DataFrame(columns=['symbol'] + stock_vars)

# Keep only rows whose date is exactly a datetime.date; other types (presumably
# timestamped partial bars — TODO confirm) are discarded before the conversion.
is_plain_date = df_fin_stock['date'].apply(type) == dt.date
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice (SettingWithCopyWarning).
df_fin_stock = df_fin_stock.loc[is_plain_date].copy()
df_fin_stock['date'] = df_fin_stock['date'].astype('datetime64[ns]')

# Keep only statement rows that have a price on the same (shifted) date.
df_fin_joined = df_fin_joined.merge(df_fin_stock, how='inner', left_on=['symbol', 'asOfDate'], right_on=['symbol', 'date'])

In [84]:
# Forward-fill missing values within each company, in report-date order.
# groupby(sort=False) visits symbols in order of first appearance, and .ffill()
# replaces the deprecated fillna(method='ffill').
filled_frames = [
    company_rows.sort_values(by='asOfDate', ascending=True).ffill()
    for _, company_rows in df_fin_joined.groupby('symbol', sort=False)
]
# Concatenate once; fall back to an empty frame with the same columns.
df_fin_cleaned = pd.concat(filled_frames) if filled_frames else df_fin_joined.iloc[0:0]

# Drop every company that still has a missing value in any of its rows
# (the whole company is removed, not just the incomplete rows).
has_gap = df_fin_cleaned.isnull().any(axis=1)
missing_ticks = df_fin_cleaned.loc[has_gap, 'symbol'].unique()
df_fin_cleaned = df_fin_cleaned.loc[~df_fin_cleaned['symbol'].isin(missing_ticks)].reset_index(drop=True)

# Attach the ticker-table columns; 'symbol' duplicates 'Ticker' after the merge.
df_fin_cleaned = df_ticker.merge(df_fin_cleaned, how="right", left_on="Ticker", right_on="symbol").drop("symbol", axis=1)

In [85]:
# Derive the ratio features, then drop the raw inputs they were built from.
# Each lambda receives the frame with the columns assigned above it, so BTM and
# logMktCap can read the freshly computed MarketCap.
df_fin_cleaned = df_fin_cleaned.assign(
    MarketCap=lambda d: d['OrdinarySharesNumber'] * d['adjclose'],
    BTM=lambda d: d['TangibleBookValue'] / d['MarketCap'],
    Gearing=lambda d: d['TotalDebt'] / d['TotalAssets'],
    CashToAssets=lambda d: d['CashAndCashEquivalents'] / d['TotalAssets'],
    logMktCap=lambda d: d['MarketCap'].apply(np.log),
).drop(
    ['CashAndCashEquivalents', 'TotalAssets', 'TotalDebt', 'TangibleBookValue', 'OrdinarySharesNumber', 'adjclose', 'MarketCap'],
    axis=1,
)

In [87]:
# Persist the cleaned table to Financial.db, replacing any existing copy.
# Opening and closing a sqlite3 connection first makes sure the file exists.
conn = lite.connect('Financial.db')
conn.close()
engine_fin = sqlalchemy.create_engine(
    'sqlite:///' + 'Financial.db',
    execution_options={"sqlite_raw_colnames": True},
)
df_fin_cleaned.to_sql(
    name="FinancialDataJoined",
    con=engine_fin,
    if_exists='replace',
    index=False,
)

1573