Below, I install my own implementation of Professor Boonstra's "memoize DataFrame to disk" feature. The source code can be found at [github.com/ethho/memoize](https://github.com/ethho/memoize).

In [1]:
!python3 -m pip install git+https://github.com/ethho/memoize.git

Collecting git+https://github.com/ethho/memoize.git
  Cloning https://github.com/ethho/memoize.git to /tmp/pip-req-build-h4of09wl
  Running command git clone --filter=blob:none --quiet https://github.com/ethho/memoize.git /tmp/pip-req-build-h4of09wl
  Resolved https://github.com/ethho/memoize.git to commit bef633bd22e4acde44cccb63399a176c6cef79b9
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25h

In [2]:
import json
import re
import os
from glob import glob
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import norm, probplot
import quandl
import functools
import plotly.express as px
import plotly.graph_objects as go
from joblib import Parallel, delayed
import multiprocessing
from src.ubacktester import (
    BacktestEngine, StrategyBase, PositionBase, FeedBase,
    PlotlyPlotter, FeedID, PriceFeed, px_plot
)
from memoize.dataframe import memoize_df

%matplotlib inline
pd.options.display.float_format = '{:,.4f}'.format

# 20230126_hw3_ho_ethan_12350006

@mpcs
@finm33550

Ethan Ho 1/20/2023

----


## Configuration & Helper Functions

The following cell contains helper functions and configuration options that I will use in this notebook.

In [3]:
def get_secrets(fp='./secrets.json'):
    """
    Load secret values (e.g. API keys) from the JSON file at `fp`.

    Returns whatever object `json.load` parses from that file.
    """
    with open(fp, 'r') as handle:
        return json.load(handle)

def get_quandl_api_key() -> str:
    """
    Look up the Quandl API key in the secrets file read by `get_secrets()`.

    Raises AssertionError when the field is missing or empty.
    """
    # NOTE(review): the field name is spelled 'NASTAQ' (not 'NASDAQ'); it has to
    # match the key used in secrets.json -- confirm before renaming either side.
    api_key = get_secrets().get('NASTAQ_DATA_API_KEY')
    assert api_key, f"NASTAQ_DATA_API_KEY field in secrets.json is empty or does not exist"
    return api_key

def strip_str_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Strip leading/trailing whitespace from string values in every column
    with string or object dtype (e.g. m_ticker values like "AAPL       ").

    Only values that are actual `str` instances are touched. Non-string
    values living in an object column (numbers, None/NaN, ...) are passed
    through unchanged; the `.str` accessor would have replaced them with NaN
    or raised when the column held no strings at all.

    The frame is modified in place and also returned.
    """
    def _strip_if_str(value):
        # Leave anything that is not a str exactly as it was
        return value.strip() if isinstance(value, str) else value

    for col in df.columns:
        if pd.api.types.is_string_dtype(df[col]) or pd.api.types.is_object_dtype(df[col]):
            df[col] = df[col].map(_strip_if_str)
    return df

@memoize_df(cache_dir='/tmp/memoize')
def fetch_quandl_quotemedia_prices(
    start_date, end_date, ticker
) -> pd.DataFrame:
    """
    Fetch rows of the QUOTEMEDIA/PRICES table for `ticker` with
    `start_date <= date <= end_date`, sorted by date ascending.

    The call is wrapped by `memoize_df`, configured with a cache directory
    under /tmp/memoize.
    """
    date_filter = {'gte': start_date, 'lte': end_date}
    prices = quandl.get_table(
        'QUOTEMEDIA/PRICES',
        date=date_filter,
        ticker=ticker,
        api_key=get_quandl_api_key(),
        paginate=True,
    )
    prices['date'] = pd.to_datetime(prices['date'])
    return prices.sort_values(by='date')

@memoize_df(cache_dir='/tmp/memoize')
def fetch_quandl_tbill_prices(
    start_date, end_date,
) -> pd.DataFrame:
    """
    Fetch the USTREASURY/BILLRATES dataset from Quandl between `start_date`
    and `end_date`.

    Returns a DataFrame sorted by a datetime `date` column (the dataset's
    `Date` index is reset and renamed). The call is wrapped by `memoize_df`,
    configured with a cache directory under /tmp/memoize.
    """
    # The dataset is requested as a one-element list; `risk_free_rate` reads a
    # column carrying the 'USTREASURY/BILLRATES - ' prefix from this result.
    # No `ticker` argument is passed: this function has no such parameter, so
    # referencing it raised NameError unless a stray global happened to exist.
    df = quandl.get(
        ['USTREASURY/BILLRATES'],
        returns="pandas",
        start_date=start_date,
        end_date=end_date,
        api_key=get_quandl_api_key(),
    )
    df = df.reset_index().rename(columns={'Date': 'date'})
    df['date'] = pd.to_datetime(df['date'])
    df.sort_values(by='date', inplace=True)
    return df

def unique_index_keys(df, level=0) -> List[str]:
    """
    Return the distinct values found at index level `level` of `df`,
    in order of first appearance, as a plain list.
    """
    level_values = df.index.get_level_values(level)
    distinct = level_values.unique()
    return distinct.tolist()

def risk_free_rate(**kw) -> float:
    """
    Estimate the risk-free rate R_f as the mean of the 13-week
    coupon-equivalent T-bill column over the fetched period.

    All keyword arguments are forwarded to `fetch_quandl_tbill_prices`.
    """
    # NOTE(review): the result is in whatever units the source column uses
    # (presumably percent) -- confirm before mixing with decimal returns.
    bills = fetch_quandl_tbill_prices(**kw)
    return bills['USTREASURY/BILLRATES - 13 Wk Coupon Equiv'].mean()

# Fetch Data Tables from Quandl

First, let's fetch the Zacks Fundamentals B tables. I chose to download zip archives as documented in the [Quandl API docs](https://github.com/quandl/quandl-python/blob/master/FOR_DEVELOPERS.md#datatable).

In [4]:
def fetch_zfb_table(table: str, include_cols: Optional[List] = None,
                    start_date: str = "2014-07-01", end_date: str = "2022-01-01") -> pd.DataFrame:
    """
    Download (if needed), extract and load one ZACKS/<table> datatable.

    The zip archive is stored at data/zacks_<table>.zip and extracted into
    data/zacks_<table>/. If the archive already exists the download is
    skipped, so `include_cols`, `start_date` and `end_date` only take effect
    on the first download.

    Parameters
    ----------
    table : Zacks table code, e.g. 'FC', 'FR', 'MKTV', 'SHRS'.
    include_cols : optional list of columns to request via `qopts`.
    start_date, end_date : bounds applied to `per_end_date` for every table
        except 'MT' and 'HDM'.

    Returns
    -------
    DataFrame with string columns stripped. When a `per_end_date` column is
    present the frame is indexed and sorted by
    (m_ticker, per_end_date, per_type); otherwise by m_ticker alone.

    Raises
    ------
    AssertionError if the archive does not contain exactly one CSV file, or
    if `per_end_date` is present without `per_type`.
    """
    # Local import: zipfile is only needed here and is not in the notebook's import cell
    import zipfile

    table_lower = table.lower()
    zip_fp = f'data/zacks_{table_lower}.zip'
    extract_dir = f'data/zacks_{table_lower}'
    if os.path.isfile(zip_fp):
        print(f"Zip file {zip_fp} already exists. Skipping download")
    else:
        os.makedirs(os.path.dirname(zip_fp), exist_ok=True)
        export_table_kwargs = dict(
            filename=zip_fp,
            api_key=get_quandl_api_key(),
        )
        if table not in ('MT', 'HDM'):
            export_table_kwargs['per_end_date'] = {
                'gte': start_date, 'lte': end_date
            }
        if include_cols:
            export_table_kwargs['qopts'] = {'columns': include_cols}
        quandl.export_table(f'ZACKS/{table}', **export_table_kwargs)
        print(f"Wrote ZIP file to {zip_fp}")

    # Extract with the standard library instead of shelling out to `unzip`, and
    # read the CSV named inside the archive. Globbing the extract directory
    # would also pick up stale CSVs left by an earlier download and pass them
    # to read_csv as extra positional arguments.
    with zipfile.ZipFile(zip_fp) as archive:
        csv_members = [name for name in archive.namelist() if name.lower().endswith('.csv')]
        assert len(csv_members) == 1, f'expected exactly one CSV in {zip_fp}, found {csv_members}'
        archive.extractall(extract_dir)

    # DataFrame preprocessing
    df = pd.read_csv(os.path.join(extract_dir, csv_members[0]))
    df = strip_str_dtypes(df)

    # Set MultiIndex of date and m_ticker
    if 'per_end_date' in df.columns:
        assert 'per_type' in df.columns, f'per_type does not exist in {table=}'
        df['per_end_date'] = pd.to_datetime(df['per_end_date'])
        df = df.set_index(['m_ticker', 'per_end_date', 'per_type'])
        # per_type sorts descending, so 'Q' rows precede 'A' rows on the same date
        df = df.sort_index(level=(0, 1, 2), ascending=(True, True, False))
    else:
        df = df.set_index('m_ticker').sort_index()

    return df

In [5]:
! mkdir -p data

fc = fetch_zfb_table('FC', include_cols=[
    'ticker',
    'm_ticker',
    'per_end_date',
    'per_type',
    'filing_date',
    'zacks_x_ind_code',
    'zacks_sector_code',
    'eps_diluted_net',
    'basic_net_eps',
    'net_lterm_debt',
    'tot_lterm_debt',
])

fr = fetch_zfb_table('FR', include_cols=[
    'ticker',
    'm_ticker',
    'per_end_date',
    'per_type',
    'ret_invst',
    'tot_debt_tot_equity',
    'profit_margin',
])

# fetch_zfb_table('MT', include_cols=[])

# fetch_zfb_table('HDM', include_cols=[])

mktv = fetch_zfb_table('MKTV', include_cols=[
    'ticker',
    'm_ticker',
    'per_end_date',
    'per_type',
    'mkt_val',
])

shrs = fetch_zfb_table('SHRS', include_cols=[
    'ticker',
    'm_ticker',
    'per_end_date',
    'per_type',
    'shares_out',
])

Zip file data/zacks_fc.zip already exists. Skipping download
Archive:  data/zacks_fc.zip
  inflating: data/zacks_fc/ZACKS_FC_2_0d6bd1d0f1e9e9f3f7f17a9212664633.csv  
Zip file data/zacks_fr.zip already exists. Skipping download
Archive:  data/zacks_fr.zip
  inflating: data/zacks_fr/ZACKS_FR_2_21b9fd416a9137b052509d8259f53e5b.csv  
Zip file data/zacks_mktv.zip already exists. Skipping download
Archive:  data/zacks_mktv.zip
  inflating: data/zacks_mktv/ZACKS_MKTV_2_0920177720004cb378c3b2aead9f7622.csv  
Zip file data/zacks_shrs.zip already exists. Skipping download
Archive:  data/zacks_shrs.zip
  inflating: data/zacks_shrs/ZACKS_SHRS_2_01ee4c4e8e147a957fd5a98ccaf5cf0a.csv  


The next command will check file sizes, to make sure that we're not occupying too much disk space on graders' machines.

In [6]:
! du -hs data/zacks_*

16M	data/zacks_fc
3.6M	data/zacks_fc.zip
12M	data/zacks_fr
3.5M	data/zacks_fr.zip
14M	data/zacks_mktv
2.0M	data/zacks_mktv.zip
14M	data/zacks_shrs
1.7M	data/zacks_shrs.zip


# From HW 3 Prompt

Find $\ge$ 200 tickers where the following conditions are met for our analysis period of 1/2015 - 1/2022:

- Not in automotive, financial, or insurance sector at any point in the period
    - See `FC/ZACKS_X_IND_CODE`, `FC/ZACKS_SECTOR_CODE`, and the [classification list](http://www.zacksdata.com/app/download/247340904/Zacks+Sector+Industry+Mapping+Scheme.pdf) (and maybe `FC/ZACKS_METRICS_IND_CODE` too?)
- Debt/market cap ratio is $>0.1$ for some nontrivial amount of time. Should be about 1000-2000 companies including ASH, VTOL, ISUN, and VIVO.
- Calculation of the following ratios is feasible:
    - Debt to market cap
        - See `FR/TOT_DEBT_TOT_EQUITY`
    - Return on investment (ROI)
        - See `FR/RET_INVST`, `MKTV/MKT_VAL`, `FC/NET_LTERM_DEBT`, `FC/TOT_LTERM_DEBT`
    - Price to earnings (P/E)
        - See `FC/EPS_DILUTED_NET`, `FC/BASIC_NET_EPS`, `SHRS/SHARES_OUT`, `MKTV/MKT_VAL`, `GAAP`
        
# To-Do

- Strip string dtypes
- ffill trading day to `per_end_date` from previous trading day
- Narrow data query to only the columns we use below

# Data Munging to Get List of Tickers

From the FC table, let's filter out tickers in the excluded sectors. By the way, most of the time, when I refer to ticker, I'm actually referring to the `m_ticker`, which is easier to track historically as securities switch between exchanges.

In [7]:
# Cast both classification columns to int, using -1 where the code is missing.
for code_col in ('zacks_sector_code', 'zacks_x_ind_code'):
    fc[code_col] = fc[code_col].fillna(-1.).astype(int)
fc.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ticker,filing_date,zacks_x_ind_code,zacks_sector_code,eps_diluted_net,basic_net_eps,net_lterm_debt,tot_lterm_debt
m_ticker,per_end_date,per_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
,2014-07-31,Q,T.NA,,12,13,,,,
,2014-10-31,Q,T.NA,,12,13,,,,
,2014-10-31,A,T.NA,,12,13,,,,
,2015-01-31,Q,T.NA,,12,13,,,,
,2015-04-30,Q,T.NA,,12,13,,,,


We define which sectors we want to filter out for each column:

In [8]:
# Codes to exclude, keyed by the FC column they are matched against.
_AUTOMOTIVE_SECTOR, _FINANCE_SECTOR = 5, 13
_AUTOMOTIVE_INDUSTRIES = [7, 8, 9, 10, 11, 210]
_FINANCE_INDUSTRIES = range(61, 70)
_INSURANCE_INDUSTRIES = range(85, 90)

exclude_codes_spec = {
    'zacks_sector_code': (_AUTOMOTIVE_SECTOR, _FINANCE_SECTOR),
    'zacks_x_ind_code': [
        *_AUTOMOTIVE_INDUSTRIES,
        *_FINANCE_INDUSTRIES,
        *_INSURANCE_INDUSTRIES,
    ],
}

In [9]:
# Count the distinct level-0 index values of fc (m_ticker) before any sector filtering.
print(f"There are {fc.index.get_level_values(0).unique().size} unique tickers before sector filtering.")

There are 11985 unique tickers before sector filtering.


We'll define a boolean mask and use it to get the list of `m_ticker`s we want to exclude.

In [10]:
# Flag each row whose sector code or industry code is on the exclusion lists.
in_excluded_sector = fc['zacks_sector_code'].isin(exclude_codes_spec['zacks_sector_code'])
in_excluded_industry = fc['zacks_x_ind_code'].isin(exclude_codes_spec['zacks_x_ind_code'])
fc['exclude_sector'] = in_excluded_sector | in_excluded_industry

# A ticker is excluded if any of its rows is flagged.
flagged_rows = fc.loc[fc['exclude_sector']]
exclude_sector = flagged_rows.index.get_level_values(0).unique().tolist()
exclude_sector[:10]

[nan, '#AQN', '#AVT', '#AZL', '#BLR', '#CED', '#EXH', '#FCO', '#FRP', '#INT']

In [11]:
# Remove, in place, every fc row whose index label is listed in exclude_sector;
# errors='ignore' skips labels that are not present instead of raising.
# NOTE(review): the displayed exclude_sector list starts with nan, so the count
# printed below presumably includes that entry -- confirm.
fc.drop(index=exclude_sector, inplace=True, errors='ignore')
print(f"There are {len(exclude_sector)} tickers that are excluded due to sector.")

There are 2210 tickers that are excluded due to sector.


We removed about $\frac{1}{5}$ of the tickers with this filter.

Next, we'll filter to tickers where debt/market cap (using `tot_debt_tot_equity` as the proxy) is $> 0.1$ on more than one date, and where the last `per_end_date` is no more than a year before 2022-01-01:

In [12]:
def _rarely_above_debt_threshold(ticker_rows):
    """True when tot_debt_tot_equity exceeded 0.1 on at most one row of the group."""
    n_above = (ticker_rows['tot_debt_tot_equity'] > 0.1).sum()
    return not (n_above > 1)

low_debt2cap_mask = (
    fr
    .groupby(level=0, group_keys=False)
    .apply(_rarely_above_debt_threshold)
)
low_debt2cap = low_debt2cap_mask[low_debt2cap_mask].index.tolist()
print(f"{len(low_debt2cap)} tickers have debt/market cap ratio below threshold.")

3902 tickers have debt/market cap ratio below threshold.


In [13]:
def _last_period_is_stale(ticker_rows):
    """True when the group's last per_end_date is more than 365 days before 2022-01-01."""
    last_per_end_date = ticker_rows.index.get_level_values(1)[-1]
    age = pd.to_datetime('2022-01-01') - last_per_end_date
    return age > pd.to_timedelta(365, 'D')

no_recent_filing_mask = fc.groupby(level=0).apply(_last_period_is_stale)
no_recent_filing = no_recent_filing_mask[no_recent_filing_mask].index.tolist()
print(f"{len(no_recent_filing)} tickers have no filing on record in the past year.")

2956 tickers have no filing on record in the past year.


Now, use the lists of excluded `m_ticker`s calculated above to remove these rows from all tables:

In [14]:
# Apply all three exclusion lists to each of the four tables, in place.
for table_df in (fr, fc, mktv, shrs):
    for excluded_tickers in (low_debt2cap, exclude_sector, no_recent_filing):
        table_df.drop(index=excluded_tickers, inplace=True, errors='ignore')

At this point, we've filtered our data enough to merge all the data tables into one DataFrame without running out of memory. Since all our tables are already indexed on m_ticker, per_end_date, and per_type, we can simply merge them on the MultiIndex.

In [15]:
# Left-join FR, MKTV and SHRS onto FC using the shared MultiIndex.
_index_left_join = dict(how='left', left_index=True, right_index=True)

filings_df = fc.merge(fr, suffixes=('_fc', '_fr'), **_index_left_join)
filings_df = filings_df.merge(mktv, suffixes=(None, '_mktv'), **_index_left_join)
filings_df = filings_df.merge(shrs, suffixes=(None, '_shrs'), **_index_left_join)

filings_df['filing_date'] = pd.to_datetime(filings_df['filing_date'])
print(f"{len(unique_index_keys(filings_df))} tickers in the filings DataFrame.")

4626 tickers in the filings DataFrame.


The next step is calculating our financial ratios, which requires daily price data. We'll first fetch QuoteMedia prices like we did in HW 2.

Since QuoteMedia prices are indexed by `ticker`, not `m_ticker`, we need to be careful about possible duplicate `ticker` values. I suspect there are few to none of these, but let's check.

In [16]:
# Use the ticker carried over from the FR table as the canonical `ticker` column.
filings_df['ticker'] = filings_df['ticker_fr']
assert not filings_df['ticker'].isnull().any()

# Take the first ticker seen for each m_ticker and require that no two
# m_tickers share it.
first_ticker_per_m_ticker = filings_df.groupby(level=0).apply(lambda x: x['ticker'].unique()[0])
assert not first_ticker_per_m_ticker.duplicated().any()

It seems that for our dataset, `m_ticker` maps 1:1 to `ticker`, so we can safely re-index on `ticker`:

In [17]:
# Columns no longer needed once the frame is keyed by ticker.
_drop_after_reindex = [
    'm_ticker', 'ticker_fc', 'ticker_fr',
    'zacks_x_ind_code', 'zacks_sector_code',
    'exclude_sector', 'ticker_shrs',
]

filings_flat = filings_df.reset_index()
filings_by_ticker = filings_flat.set_index(['ticker', 'per_end_date'])
filings_df = filings_by_ticker.drop(columns=_drop_after_reindex, errors='ignore')
filings_df

Unnamed: 0_level_0,Unnamed: 1_level_0,per_type,filing_date,eps_diluted_net,basic_net_eps,net_lterm_debt,tot_lterm_debt,ret_invst,tot_debt_tot_equity,profit_margin,mkt_val,shares_out
ticker,per_end_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ABLT,2014-12-31,A,2014-12-31,91.4700,87.4710,-1.4780,5.9900,7.5765,0.3607,1.4466,,
ABLT,2015-12-31,A,2015-12-31,-22.2000,-21.2060,-1.5650,4.4210,-2.0765,0.2883,-0.3700,,
ABLT,2016-12-31,A,2016-12-31,38.9300,37.2060,-5.6510,0.1220,3.7844,0.0621,0.6865,,
ABLT,2017-12-31,A,2017-12-31,152.5200,145.9710,0.0850,0.2490,13.3385,0.1020,2.4749,,
ABLT,2018-09-30,Q,NaT,27.1000,,0.2510,,,,1.7837,16.5600,0.0300
...,...,...,...,...,...,...,...,...,...,...,...,...
ZYXI,2021-03-31,Q,2021-04-29,-0.0200,-0.0182,-0.0230,0.3970,-1.2485,0.0089,-2.9262,,
ZYXI,2021-06-30,Q,2021-07-29,0.0700,0.0727,-0.0440,0.3710,4.8690,0.0083,9.0516,,
ZYXI,2021-09-30,Q,2021-11-02,0.1600,0.1636,-0.0730,0.3470,9.5826,0.0073,17.5559,,
ZYXI,2021-12-31,Q,2022-03-22,0.2300,0.2300,-0.0980,10.9220,10.4824,0.2215,22.0334,,


In [18]:
def get_daily_adj_close(ticker: str, start_date='2014-07-01', end_date='2022-01-01') -> pd.DataFrame:
    """
    Get daily adjusted closing prices for `ticker`.

    Returns a DataFrame (not a Series) with a default integer index and two
    columns: `date` and `adj_close`, one row per calendar day.

    NOTE: prices are forward-filled so that every calendar day has a row;
    days before the first available quote stay NaN.
    """
    df_raw = fetch_quandl_quotemedia_prices(
        start_date=start_date,
        end_date=end_date,
        ticker=ticker,
    )
    # Re-parse and re-sort `date` here rather than relying on the fetch result.
    df_raw['date'] = pd.to_datetime(df_raw['date'])
    df_raw.sort_values(by='date', ascending=True, inplace=True)
    # NOTE(review): the daily index starts one day AFTER start_date, so the
    # start_date row itself is never emitted -- confirm this is intended.
    day_idx = pd.date_range(
        pd.to_datetime(start_date) + pd.to_timedelta(1, 'D'),
        end_date
    )
    df = (
        df_raw
        .set_index('date')
        [['adj_close']]
        .reindex(day_idx, method='ffill')
        .reset_index()
        .rename(columns=dict(index='date'))
    )

    return df

In [19]:
def calc_roi(row, filing_row):
    """
    Calculate ROI for the current `row`, using figures from the last known
    filing in `filing_row`.

    Both arguments are attribute-accessible records (e.g. DataFrame rows).
    `row` must expose `adj_close`, `net_lterm_debt` and `tot_lterm_debt`;
    `filing_row` must expose `adj_close`, `mkt_val`, `ret_invst`,
    `net_lterm_debt` and `tot_lterm_debt`.

    Returns NaN when `filing_row` is None or has no `adj_close`; otherwise

        ret_invst * (filing_debt + filing_mkt_val) / (debt + mkt_val)

    where `mkt_val` is the filing market value scaled by the price change
    since the filing.
    """
    if filing_row is None:
        return float('nan')
    if pd.isnull(getattr(filing_row, 'adj_close', None)):
        return float('nan')
    mkt_val = filing_row.mkt_val * row.adj_close / filing_row.adj_close

    # Debt as of the filing: prefer net long-term debt and fall back to total
    # long-term debt. The fallback has to read tot_lterm_debt -- reading
    # net_lterm_debt there (known to be null in that branch) made it a no-op.
    if not pd.isnull(filing_row.net_lterm_debt):
        filing_debt = filing_row.net_lterm_debt
    elif not pd.isnull(filing_row.tot_lterm_debt):
        filing_debt = filing_row.tot_lterm_debt
    else:
        filing_debt = 0.

    # Current debt: first non-null, non-zero value, prioritizing net over
    # total and the current row over the filing.
    debt = next(
        (
            val
            for val in (row.net_lterm_debt, row.tot_lterm_debt, filing_debt)
            if not pd.isnull(val) and val
        ),
        0.,
    )

    # Calculate the "return" R for the report date
    returns = filing_row.ret_invst * (filing_debt + filing_row.mkt_val)

    roi = returns / (debt + mkt_val)
    return roi


def calc_pe(row, filing_row):
    """
    Calculate P/E for the current `row`, using figures from the last known
    filing in `filing_row`:

        (price-adjusted mkt_val / shares_out) / EPS

    EPS is `eps_diluted_net`, or `basic_net_eps` when the diluted figure is
    null. Returns NaN when `filing_row` is None or lacks `adj_close`, and
    the placeholder 0.001 when shares or EPS are zero or the ratio is negative.
    """
    if filing_row is None or pd.isnull(getattr(filing_row, 'adj_close', None)):
        return float('nan')

    if pd.isnull(row.eps_diluted_net):
        eps = row.basic_net_eps
    else:
        eps = row.eps_diluted_net
    # Filing market value scaled by the price move since the filing
    mkt_val = filing_row.mkt_val * row.adj_close / filing_row.adj_close
    if not (row.shares_out and eps):
        return 0.001
    ratio = (mkt_val / row.shares_out) / eps
    return 0.001 if ratio < 0. else ratio

def get_fin_ratios(in_df: pd.DataFrame, start_date='2015-01-01') -> pd.DataFrame:
    """
    Given a DataFrame for a single ticker containing only filings data,
    indexed by (`ticker`, `per_end_date`), build a daily frame and compute,
    accounting for missing and lagged data streams:

    - ROI (`roi` column)
    - P/E (`pe` column)

    Filing fields (including `tot_debt_tot_equity` and `profit_margin`) are
    carried forward from the most recent quarterly filing, falling back to
    the most recent annual filing.
    NOTE(review): no separate debt/market-cap column is computed here; only
    the forward-filled `tot_debt_tot_equity` is available -- confirm that is
    what downstream code expects.

    Returns the rows from `start_date` onward, indexed by (date, per_type).
    If any adjusted close on or after `start_date` is missing, the full,
    unprocessed frame is returned instead, with `roi` and `pe` all NaN.

    Raises AssertionError if `in_df` holds more than one ticker, and
    NotImplementedError for a `per_type` other than 'Q', 'A' or null.
    """
    # Fetch price data
    tickers = unique_index_keys(in_df)
    assert len(tickers) == 1, f"expected only a single ticker: {tickers=}"
    ticker = tickers[0]
    prices = get_daily_adj_close(ticker=ticker)

    # One row per calendar day; filing fields are attached on the filing date
    df = pd.merge(prices, in_df.reset_index(), how='left', left_on='date', right_on='filing_date')
    df = df.set_index(['date', 'per_type'])
    df = df.sort_index()
    df['filing_date'] = df['filing_date'].ffill()
    # Both result columns must exist up front: assigning `row.pe` on a row
    # that has no 'pe' label only sets a Python attribute, so the value would
    # never reach `df`.
    df['roi'] = float('nan')
    df['pe'] = float('nan')
    df['ticker'] = ticker
    # `prices` has a plain integer index, so select the analysis window through
    # its `date` column instead of label-slicing with a date string.
    in_window = prices['date'] >= pd.to_datetime(start_date)
    if prices.loc[in_window, 'adj_close'].isnull().any():
        # Exit if any adjusted closing price is
        # not available
        return df

    prev_q = None
    prev_a = None
    for idx, row in df.iterrows():
        date, per_type = idx
        if pd.isnull(row.filing_date):
            # No filing has been seen yet for this ticker
            continue
        # Remember the latest quarterly / annual filing rows
        if per_type == 'Q':
            prev_q = row
        elif per_type == 'A':
            prev_a = row
        elif pd.isnull(per_type):
            pass
        else:
            raise NotImplementedError(f"unsupported {per_type=}")
        assert date >= row.filing_date, (date, row.filing_date)

        if date < pd.to_datetime(start_date):
            continue

        # fillna with previous values if necessary
        for attr_name in (
            'tot_debt_tot_equity',
            'eps_diluted_net',
            'basic_net_eps',
            'net_lterm_debt',
            'tot_lterm_debt',
            'ret_invst',
            'mkt_val',
            'shares_out',
            'profit_margin',
        ):
            prev_a_attr = getattr(prev_a, attr_name, None)
            prev_q_attr = getattr(prev_q, attr_name, None)
            attr = getattr(row, attr_name, None)
            if pd.isnull(attr):
                # Quarterly figures take priority over annual ones
                if not pd.isnull(prev_q_attr):
                    setattr(row, attr_name, prev_q_attr)
                elif not pd.isnull(prev_a_attr):
                    setattr(row, attr_name, prev_a_attr)

        # Calculate ROI based on backfilled data
        row.roi = calc_roi(row, prev_q)
        if pd.isnull(row.roi):
            # Try last annual filing if quarterly data did not work
            row.roi = calc_roi(row, prev_a)

        # Calculate P/E
        row.pe = calc_pe(row, prev_q)
        if pd.isnull(row.pe):
            # Try last annual filing if quarterly data did not work
            row.pe = calc_pe(row, prev_a)

        # `row` is a copy produced by iterrows, so write it back explicitly
        try:
            df.loc[idx, :] = row
        except ValueError as err:
            if 'Must have equal len keys and value when setting with an iterable' not in str(err):
                raise
            # `idx` selected several rows; assign the same values to each
            df.loc[idx, :] = [row] * len(df.loc[idx, :])

    return df.loc[start_date:]

The above functions perform the necessary logic to calculate financial ratios for each ticker, taking into account the fallback logic for missing data.

As my fourth financial ratio, I've chosen net profit margin, which is the ratio of income to revenue for the time period.

#### Note on Performance

I could not find a good way to make the above process take less than an hour without running OOM, while also implementing the nuanced fallback logic and also making the code readable. I did parallelize the process so that multiple tickers can be processed simultaneously. My argument is that this does not significantly impact the usability of this opportunity, since a strategy trading every week or month can definitely afford to spend an hour of computation time generating this DataFrame.

Since this function takes multiple hours, I've included the output of `get_all_fin_ratios` as a CSV file with my homework submission, for the purposes of grading. The code will automatically detect the CSV file and avoid re-running this lengthy process. 

In [20]:
def temp_func(func, name, group):
    """Apply `func` to `group` and return the pair (result, name)."""
    result = func(group)
    return result, name

def applyParallel(dfGrouped, func):
    """
    Parallelizes GroupBy.apply.
    Adapted from https://stackoverflow.com/questions/39284989/parallelize-pandas-apply

    Runs `func` on every (name, group) pair of `dfGrouped` using one joblib
    job per CPU, then concatenates the results keyed by group name.
    """
    n_workers = multiprocessing.cpu_count()
    jobs = (delayed(temp_func)(func, key, grp) for key, grp in dfGrouped)
    outputs = Parallel(n_jobs=n_workers)(jobs)
    frames, group_keys = zip(*outputs)
    return pd.concat(frames, keys=group_keys)

@memoize_df(cache_dir='./data/memoize')
def get_all_fin_ratios(tickers=None) -> pd.DataFrame:
    """
    Run `get_fin_ratios` for every ticker in the module-level `filings_df`
    (or only for `tickers`, when given) in parallel, and return the combined
    result with its index reset.

    NOTE(review): the result depends on the global `filings_df`, which is not
    an argument; whether the `memoize_df` cache key accounts for it is not
    visible here -- confirm before trusting a cached result.
    """
    if not tickers:
        selected = filings_df
    else:
        selected = filings_df.loc[(tickers, slice(None)), slice(None)]
    per_ticker = selected.groupby(level=0, group_keys=True)
    return applyParallel(per_ticker, get_fin_ratios).reset_index()

In [None]:
%%time

df = get_all_fin_ratios(tickers=None)
# df = get_all_fin_ratios(tickers=['ASH', 'VTOL', 'ISUN', 'VIVO'])

Using cache fp='./data/memoize/get_all_fin_ratios_556db19_20230126.csv' to write results of function get_all_fin_ratios
Using cache fp='/tmp/memoize/fetch_quandl_quotemedia_prices_bdfc277_20230126.csv' to write results of function fetch_quandl_quotemedia_prices
Using cached call from /tmp/memoize/fetch_quandl_quotemedia_prices_bdfc277_20230126.csv
Using cache fp='/tmp/memoize/fetch_quandl_quotemedia_prices_4f790eb_20230126.csv' to write results of function fetch_quandl_quotemedia_prices
Using cached call from /tmp/memoize/fetch_quandl_quotemedia_prices_4f790eb_20230126.csv
Using cache fp='/tmp/memoize/fetch_quandl_quotemedia_prices_820720e_20230126.csv' to write results of function fetch_quandl_quotemedia_prices
Using cached call from /tmp/memoize/fetch_quandl_quotemedia_prices_820720e_20230126.csv
Using cache fp='/tmp/memoize/fetch_quandl_quotemedia_prices_eb5978e_20230126.csv' to write results of function fetch_quandl_quotemedia_prices
Using cached call from /tmp/memoize/fetch_quandl

We can see using the `top` command that the above process is running 8 parallel Python worker processes:

```
top - 11:32:09 up 20 min,  0 users,  load average: 6.19, 2.24, 0.90
Tasks:  38 total,  10 running,  28 sleeping,   0 stopped,   0 zombie
%Cpu(s): 99.6 us,  0.3 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  0.1 si,  0.0 st
MiB Mem :   7949.0 total,   4870.5 free,   1821.5 used,   1257.1 buff/cache
MiB Swap:   2048.0 total,   2048.0 free,      0.0 used.   5833.2 avail Mem 

  PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ COMMAND                                                                                                                                 
 3338 eho       20   0  284936 124976  51340 R 100.3   1.5   1:24.21 python                                                                                                                                  
 3339 eho       20   0  286160 124748  51140 R 100.3   1.5   1:24.02 python                                                                                                                                  
 3333 eho       20   0  286028 125180  51076 R 100.0   1.5   1:24.27 python                                                                                                                                  
 3335 eho       20   0  286084 124964  51252 R 100.0   1.5   1:24.25 python                                                                                                                                  
 3337 eho       20   0  285396 127380  51280 R  99.7   1.6   1:24.17 python                                                                                                                                  
 3332 eho       20   0  286228 125544  51632 R  99.3   1.5   1:24.03 python                                                                                                                                  
 3334 eho       20   0  286012 127076  51596 R  98.7   1.6   1:24.28 python                                                                                                                                  
 3336 eho       20   0  286372 124976  51220 R  96.7   1.5   1:23.84 python 
 ```

In [None]:
# NOTE: these two definitions repeat the `temp_func` / `applyParallel` pair
# defined earlier in the notebook; running this cell rebinds the same names.
def temp_func(func, name, group):
    """Apply `func` to `group` and return the pair (result, name)."""
    result = func(group)
    return result, name

def applyParallel(dfGrouped, func):
    """Run `func` on each group with one joblib job per CPU and concatenate the results by group name."""
    n_workers = multiprocessing.cpu_count()
    jobs = (delayed(temp_func)(func, key, grp) for key, grp in dfGrouped)
    outputs = Parallel(n_jobs=n_workers)(jobs)
    frames, group_keys = zip(*outputs)
    return pd.concat(frames, keys=group_keys)

In [None]:
df.reset_index().set_index(['ticker', 'date']).sort_index()

# Scratch Space

Check that the tickers noted in the prompt are present in our list:

In [None]:
# The tickers named in the prompt must survive the filters applied to `fr`.
fr_tickers = fr['ticker'].unique()
for ticker in ('ASH', 'VTOL', 'ISUN', 'VIVO'):
    assert ticker in fr_tickers

In [None]:
len(unique_index_keys(fr))

In [None]:
def _has_duplicate_filing(ticker_rows):
    """True when two rows with a known filing_date share the same (filing_date, per_type)."""
    with_filing_date = ticker_rows[~ticker_rows.filing_date.isnull()].reset_index()
    return with_filing_date.duplicated(['filing_date', 'per_type']).any()

dup_filings_mask = filings_df.groupby(level=0).apply(_has_duplicate_filing)

In [None]:
dup_filings = unique_index_keys(dup_filings_mask[dup_filings_mask])
dup_filings[0]

In [None]:
filings_df.loc['ABLT']

In [None]:
unique_index_keys