In [1]:
# setup
import time
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
import yfinance as yf
from CalcBenchHandler import CalcBenchHandler as CBH

%load_ext autoreload
%autoreload 2

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


In [2]:
# Company reference table exported for the Calcbench universe.
dfcorps = pd.read_csv('data/ciks/calcbench_companies.csv')

# Directories that hold one CSV per ticker: financial statements and equity prices.
ticker_dir = 'data/financials'
eqy_px_dir = 'data/equities'

# A ticker is usable only when BOTH a financials file and an equity price file exist.
# The ticker name is whatever precedes the first '.csv' in the financials file name.
tickers = [
    stem
    for stem in (
        name.split('.csv')[0]
        for name in listdir(ticker_dir)
        if isfile(join(ticker_dir, name))
    )
    if isfile(join(eqy_px_dir, f'{stem}.csv'))
]

# Restrict the run to the first qualifying ticker.
# NOTE(review): raises IndexError when no ticker qualifies, and listdir() order is
# not guaranteed, so "first" may differ between machines -- confirm this is intended.
tickers = [tickers[0]]

# Calcbench helper; its INS / CFS / BS attributes list the statement field names.
cbh = CBH()
fs_cols = cbh.INS+cbh.CFS+cbh.BS

# Share-count / per-share fields: excluded from the fields that get scaled and flattened.
share_cols = [
    'sharesoutstandingendofperiod',
    'avgsharesoutstandingbasic',
    'avgdilutedsharesoutstanding',
    'stockrepurchasedduringperiodshares',
    'commonstockdividendspershare',
]

# Debt-related balance sheet fields.
debt_cols = [
    'currentlongtermdebt',
    'longtermdebt',
    'totaldebt',
    'lineofcreditfacilityamountoutstanding',
    'secureddebt',
    'seniornotes',
    'subordinateddebt',
    'convertibledebt',
    'termloan',
    'mortgagedebt',
    'unsecureddebt',
    'mediumtermnotes',
    'trustpreferredsecurities',
]

# Per-statement field lists with the share fields removed (original order kept).
ins_cols, cfs_cols, bs_cols = (
    [col for col in statement if col not in share_cols]
    for statement in (cbh.INS, cbh.CFS, cbh.BS)
)

# Matching year-over-year change column names, one per retained field.
ins_chg_cols, cfs_chg_cols, bs_chg_cols = (
    [f'{col}_yoy_chg' for col in kept]
    for kept in (ins_cols, cfs_cols, bs_cols)
)

base_cols = ins_cols+cfs_cols+bs_cols
chg_cols = ins_chg_cols+cfs_chg_cols+bs_chg_cols
out_cols = base_cols+chg_cols

# Enterprise value components: fields added to, and subtracted from, market cap.
adds = [
    'currentlongtermdebt',
    'longtermdebt',
    'restrictedcashandinvestmentscurrent',
    'trustpreferredsecurities',
]
subs = [
    'cash',
    'availableforsalesecurities',
    'totalinvestments',
]

# Descriptive bond transaction columns carried through to the flattened output.
tx_col_desc = [
    'mtrty_dt',
    'bond_sym_id',
    'company_symbol',
    'issuer_nm',
    'debt_type_cd',
    'scrty_ds',
    'cpn_rt',
    'close_pr',
    'close_yld',
]

# Number of reporting periods flattened into each output row.
pcnt = 8

# Flattened column names, field-major: <field>_0 .. <field>_{pcnt-1}, then the next field.
cols_flat = [f'{field}_{period}' for field in out_cols for period in range(pcnt)]

# load bond-equity links and bond trades
# (both paths are relative, so they resolve against the current working directory)
link_path = 'data/ciks/bonds_to_equities_link.csv'
bond_px_path = 'data/bonds/clean_bond_close_pxs.csv'

dflink = pd.read_csv(link_path)
dfpxs = pd.read_csv(bond_px_path)

# bond prices must have a transaction date
# errors='coerce' turns unparseable dates into NaT; those rows are then dropped
dfpxs.trans_dt = pd.to_datetime(dfpxs.trans_dt, errors='coerce')
dfpxs = dfpxs.dropna(subset=['trans_dt'])

# drop duplicate symbol/equity_cusip records
# one row per distinct (SYMBOL, EQUITY_CUSIP) pair found in the link table
dfdupes = dflink.groupby(['SYMBOL', 'EQUITY_CUSIP']).count()
# how many distinct EQUITY_CUSIP values each SYMBOL is paired with
sym_counts = dfdupes.index.get_level_values(0).value_counts()
# symbols paired with more than one EQUITY_CUSIP
sym_dupes = sym_counts[sym_counts>1]
dfdupes = dfdupes.reset_index()
# EQUITY_CUSIPs of the symbols that map to exactly one EQUITY_CUSIP
ser_eqy_cusip = dfdupes[~dfdupes.SYMBOL.isin(sym_dupes.index.values)].EQUITY_CUSIP
# keep only link rows whose EQUITY_CUSIP is in that set
# NOTE(review): the filter is on EQUITY_CUSIP rather than SYMBOL, so a row of a
# multi-cusip symbol survives if its cusip also appears under a single-cusip
# symbol -- confirm this is intended
dflink = dflink[dflink.EQUITY_CUSIP.isin(ser_eqy_cusip)]

	save_dir: None
	verbose: True


In [3]:
def get_financials(ticker):
    """Load one ticker's financial statements and append year-over-year changes.

    Reads ``<ticker_dir>/<ticker>.csv``, parses ``earnings_release_date``, drops
    rows whose date cannot be parsed, orders the rows newest-first, zero-fills
    missing statement fields (``fs_cols``) and adds one change column
    (``chg_cols``) per entry of ``base_cols``.

    Relies on the notebook-level names ``ticker_dir``, ``fs_cols``,
    ``base_cols`` and ``chg_cols``.

    Args:
        ticker: file stem of the CSV inside ``ticker_dir``.

    Returns:
        DataFrame sorted by ``earnings_release_date`` descending. Every row
        that still contains a missing value in ANY column is dropped, which
        includes the four oldest rows (they have no comparison period).

    Raises:
        AssertionError: if a missing value survives the cleaning steps.
    """
    ticker_path = join(ticker_dir, f'{ticker}.csv')
    dffin = pd.read_csv(ticker_path)

    # clean earnings_release_date
    dffin['earnings_release_date'] = pd.to_datetime(dffin['earnings_release_date'], errors='coerce')
    dffin = dffin.dropna(subset=['earnings_release_date'])
    dffin = dffin.sort_values(by='earnings_release_date', ascending=False)

    # fill na financial statement fields with 0
    dffin = dffin.fillna(value={k: 0 for k in fs_cols})

    # compute yoy chg
    # Rows are newest-first, so the period four reports EARLIER in time sits four
    # rows BELOW the current one: that is shift(-4). shift(4) would compare each
    # period with one reported a year later and would drop the newest four rows.
    # Assumes one row per quarter, so four rows span a year -- TODO confirm.
    prior_year = dffin[base_cols].shift(-4)
    # Assign positionally: the change frame carries the base column labels, not chg_cols.
    dffin[chg_cols] = (dffin[base_cols] - prior_year).to_numpy()
    dffin = dffin.dropna()

    # check all fields have values
    assert dffin.isna().sum().sum() == 0, f'{ticker} has na fields!'

    return dffin

def get_txs(ticker, txs, links):
    """Return the bond transactions linked to ``ticker``, newest first.

    NOTE(review): a later cell defines another ``get_txs`` with a different
    signature; once that cell runs, this definition is shadowed.

    Args:
        ticker: value matched against ``links.SYMBOL``.
        txs: bond transactions with ``cusip_id`` and ``trans_dt`` columns.
        links: link table with ``SYMBOL`` and ``cusip_id`` columns.

    Returns:
        The rows of ``txs`` whose ``cusip_id`` is linked to ``ticker``,
        sorted by ``trans_dt`` descending.
    """
    # bond cusips that the link table associates with this ticker
    linked_cusips = links.loc[links.SYMBOL == ticker, 'cusip_id']
    # transactions on any of those bonds
    matched = txs.loc[txs.cusip_id.isin(linked_cusips)]
    return matched.sort_values(by='trans_dt', ascending=False)

def get_eqy_prices(tickers, sd='2009-01-01', ed='2020-01-28', secs=1, verbose=True):
    """Download daily equity price history from Yahoo Finance, one CSV per ticker.

    For every ticker with at least one row of history, writes
    ``data/equities/<ticker>.csv`` (newest date first) containing Open, High,
    Low, Close, Volume and Adj Close.

    Args:
        tickers: iterable of ticker symbols.
        sd: start date passed to ``history``.
        ed: end date passed to ``history``.
        secs: seconds to sleep after each ticker.
        verbose: print a line for every file written.

    Returns:
        ``(hit, missed)``: ticker symbols that were saved, and ticker symbols
        for which no history rows came back.
    """
    keep_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
    hit, missed = [], []
    for ticker in tickers:
        ydata = yf.Ticker(ticker).history(start=sd, end=ed)
        if ydata.shape[0] > 0:
            # 'Adj Close' is a copy of Close; presumably history() already returns
            # adjusted closes -- TODO confirm against the yfinance version in use
            ydata['Adj Close'] = ydata.Close
            ydata = ydata[keep_cols].sort_index(ascending=False)
            out_path = f'data/equities/{ticker}.csv'
            ydata.to_csv(out_path, index=True)
            hit.append(ticker)
            if verbose:
                # report the path actually written (no case change)
                print(f'successfully saved to {out_path}')
        else:
            # record the symbol itself, matching what `hit` holds
            missed.append(ticker)
        # pause between requests
        time.sleep(secs)
    return hit, missed

In [66]:
def _ticker_bond_txs(ticker, txs, links):
    # Bond trades whose cusip_id is linked to `ticker`, newest trade first.
    # Kept private so it cannot be shadowed by the public get_txs below.
    linked = links[links.SYMBOL == ticker]
    matched = txs[txs.cusip_id.isin(linked.cusip_id)]
    return matched.sort_values(by='trans_dt', ascending=False)


def _scaled_period_vector(dffin, trans_dt, adj_close, max_age):
    # EV-scaled financials of the `pcnt` latest periods released on or before
    # `trans_dt`, flattened in per_cols order; None when the trade is unusable.
    window = dffin[dffin.earnings_release_date <= trans_dt].head(pcnt)
    if window.shape[0] != pcnt:
        return None

    # at least one period in the window must have been released within max_age
    if not ((trans_dt - window.earnings_release_date).min() <= max_age):
        return None

    per = window.iloc[0]
    mkt_cap = adj_close * per.avgdilutedsharesoutstanding
    ev = float(per[adds].sum() - per[subs].sum() + mkt_cap)
    if not np.isfinite(ev) or ev == 0:
        # dividing by a zero / undefined enterprise value would only yield inf or NaN
        return None

    scaled = window[out_cols].to_numpy(dtype=float) / ev
    # `scaled` is (period, field) but the output columns are field-major
    # (<field>_0 .. <field>_{pcnt-1}, then the next field): transpose before flattening.
    return scaled.T.ravel()


def get_txs(tickers, should_save=True):
    """Build EV-scaled, flattened financial features for each bond trade.

    For every ticker: loads its financials (``get_financials``), its linked bond
    trades (from ``dfpxs`` / ``dflink``) and its equity prices
    (``data/equities/<ticker>.csv``), joins trades to equity prices on the trade
    date, and for each trade flattens the ``pcnt`` most recent reporting periods
    of ``out_cols`` divided by enterprise value into one row. Enterprise value is
    ``sum(adds) - sum(subs) + AdjClose * avgdilutedsharesoutstanding`` of the
    latest period.

    A trade gets values only when ``pcnt`` periods are available, one of them was
    released no more than 90 days before the trade, and the enterprise value is
    finite and non-zero; other trades are left empty and removed before saving.

    Relies on the notebook-level names ``out_cols``, ``pcnt``, ``tx_col_desc``,
    ``adds``, ``subs``, ``dfpxs``, ``dflink`` and ``get_financials``.

    Args:
        tickers: iterable of ticker symbols to process.
        should_save: when True, write the complete rows of each ticker to
            ``data/bonds/transactions/<ticker>.csv``.

    Returns:
        None.
    """
    # EV calcs and flattening for each test ticker
    per_cols = [f'{c}_{i}' for c in out_cols for i in range(pcnt)]
    max_age = pd.Timedelta(days=90)

    for ticker in tickers:
        dffin = get_financials(ticker)
        dftxs = _ticker_bond_txs(ticker, dfpxs, dflink)

        eqy_px_path = f'data/equities/{ticker}.csv'
        df_eqypxs = pd.read_csv(eqy_px_path)
        df_eqypxs = df_eqypxs.rename(columns={'Adj Close': 'AdjClose'})

        if df_eqypxs.shape[0] <= 0:
            print(f'no equity price data found at {eqy_px_path}')
            continue

        # read_csv leaves Date as text; parse it so it can be joined to the
        # datetime trans_dt index (assumes timezone-naive dates -- TODO confirm)
        df_eqypxs['Date'] = pd.to_datetime(df_eqypxs['Date'], errors='coerce')
        df_eqypxs = df_eqypxs.dropna(subset=['Date'])

        # fetch prices for relevant transaction dates
        df_tx_in = pd.merge(left=dftxs.set_index('trans_dt'),
                            right=df_eqypxs.set_index('Date'),
                            how='inner', left_index=True,
                            right_index=True)
        df_tx_in.index.name = 'trans_dt'
        # include the trade date in the duplicate check (drop_duplicates ignores the index)
        df_tx_in = df_tx_in.reset_index().drop_duplicates().set_index('trans_dt')

        total = df_tx_in.shape[0]
        adj_close = df_tx_in['AdjClose'].to_numpy()
        # filled positionally, one row per trade; NaN marks trades without usable financials
        flat_vals = np.full((total, len(per_cols)), np.nan)

        for completed, trans_dt in enumerate(df_tx_in.index):
            if completed % 250 == 0:
                print(f'{ticker}: {completed}/{total} txs')

            row = _scaled_period_vector(dffin, trans_dt, adj_close[completed], max_age)
            if row is not None:
                flat_vals[completed] = row

        # assemble descriptive columns + flattened financials in one step
        mi = pd.MultiIndex.from_arrays([df_tx_in['cusip_id'].to_numpy(), df_tx_in.index],
                                       names=['cusip_id', 'trans_dt'])
        df_flat = pd.concat([df_tx_in[tx_col_desc].reset_index(drop=True),
                             pd.DataFrame(flat_vals, columns=per_cols)],
                            axis=1)
        df_flat.index = mi

        # save flattened transactions
        if should_save:
            tx_out_path = f'data/bonds/transactions/{ticker}.csv'
            df_flat = df_flat.dropna()
            txcnt = df_flat.shape[0]
            if txcnt > 0:
                df_flat.to_csv(tx_out_path, index=True)
                if txcnt > 1:
                    print(f'{txcnt} {ticker} transactions saved to {tx_out_path}')
                else:
                    print(f'{txcnt} {ticker} transaction saved to {tx_out_path}')

target: A
0/4438 txs




250/4438 txs


KeyboardInterrupt: 