# GetMetrics Notebook
#####  Uses Ticker Symbol lists from GetTickers, we collect dividend values using Yahoo Finance

In [1]:
# Import required modules
import yfinance as yf
import numpy as np
import pandas as pd
from collections import defaultdict

In [2]:
# Load Ticker data from GetTickers.ipynb
kept_status = np.load("data/numpy/kept_status.npy")
lost_status = np.load("data/numpy/lost_status.npy")

In [3]:
kept_status, lost_status

(array(['CINF', 'ADM', 'BANF', 'CWT', 'SCL', 'NC', 'SPGI', 'CAT', 'ADP',
        'BRO', 'WBA', 'SWK', 'FUL', 'EMR', 'ATO', 'EBTC', 'AFL', 'CHD',
        'MDT', 'SYY', 'ECL', 'PPG', 'GRC', 'ROP', 'CSL', 'NJR', 'HRL',
        'TRI', 'TROW', 'JNJ', 'SHW', 'PG', 'SBSI', 'TNC', 'PII', 'RNR',
        'WEYS', 'WMT', 'NDSN', 'FELE', 'CVX', 'NWN', 'T', 'AROW', 'UHT',
        'BDX', 'SJW', 'DCI', 'ABM', 'NEE', 'MO', 'LANC', 'XOM', 'APD',
        'WTRG', 'PNR', 'TGT', 'MGEE', 'THFF', 'WST', 'ERIE', 'FLIC',
        'FFMR', 'GWW', 'GD', 'O', 'MCD', 'SON', 'SEIC', 'MATW', 'NIDB',
        'AWR', 'ED', 'NUE', 'LIN', 'MDU', 'JKHY', 'LECO', 'UVV', 'MSEX',
        'UBSI', 'LOW', 'TMP', 'PSBQ', 'CTBI', 'KMB', 'MGRC', 'EFSI', 'BRC',
        'LEG', 'PH', 'NNN', 'CFR', 'ATR', 'UGI', 'ITW', 'BEN', 'WABC',
        'DOV', 'SYK', 'ENB', 'CTAS', 'BMI', 'PBCT', 'CLX', 'FRT', 'CPKF',
        'MKC', 'TDS', 'TR', 'PEP', 'VFC', 'MCY', 'BKH', 'UMBF', 'IBM',
        'AOS', 'CBU', 'ESS', 'MMM', 'EXPD', 'CSVI', 'ARTNA', '

In [4]:
# Create dict with keys set to ticker symbols
kept = defaultdict(None)
lost = defaultdict(None)

for tckr in kept_status:
    kept[tckr] = yf.Ticker(tckr)

for tckr in lost_status:
    lost[tckr] = yf.Ticker(tckr)

In [5]:
size = len(kept.keys()) + len(lost.keys())
print("Total Ticker Count:", size)

Total Ticker Count: 180


In [6]:
# Function for cleaning up data
# Want to fill as many gaps as possible
# by ensuring there are 4 dividend values for each year
# Some companies may simply not post as frequent, but we wanted
# to attempt to maintain a consistent dataset length for each company
def fixData(t,status):
    # figure out which years can be filled with more samples
    years = []
    for i in range(len(t.dividends)):
        years.append(t.dividends.index[i].year)
    year, ct = np.unique(np.array(years), return_counts=True)
    fix_years = year[ct<4]
    
    # Get dividend data for current ticker
    df = t.dividends.to_frame().reset_index()

    # fill in new values based on whether the company lost or kept status
    if fix_years.size > 0:
        for yr in fix_years:
            dont_fix_years = year[ct>4]
            if yr-1 in dont_fix_years or yr+1 in dont_fix_years:
                # Adjust expected month counts
                # Take into respect instances in which the prev or
                # following year had more than 4 dividend updates
                if ct[year==yr-1] > 4:
                    ct[year==yr-1] -= 1
                    ct[year==yr] += 1
                elif ct[year==yr+1] > 4:
                    ct[year==yr+1] -= 1
                    ct[year==yr] += 1
                continue
            # Collect all the years in which there 
            # is dividend data available for the current company
            years = []
            for i in range(len(df.Dividends)):
                years.append(df.Date[i].year)
            # Get Datetime value to fill in index column for month/year
            indx = df[years==yr].index[-1]
            for i in range(4-ct[year==yr][0]):
                # insert new dividend value with 0 if lost status
                # otherwise use the previous value
                val = 0 if status in "lost" else df.loc[indx].Dividends
                df.loc[indx + 0.5] = df.loc[indx].Date,val
                df = df.sort_index().reset_index(drop=True)
                indx += 1
    
    # Locate large outliers and replace with copy of previous value
    filt = np.convolve(np.array([1,0,-1]), df.values[:,1])
    for i in range(len(filt)):
        if filt[i] > 0.1 and filt[i+2] < 0:
            df.iloc[i,1] = df.iloc[i-1,1] 
        
    
    return df

In [7]:
# Loop through kept and lost status dictionaries
# Collect dividend data and fix datasets that don't
# contain 4 dividend updates per year
# Update kept/lost lists based on data that is able to be collected
# save collected samples as csvs that can be used in ML/DL algorithms
for t in kept_status:
    try:
        tick = kept[t]
        tick.history(start='1900-1-1', end='2020-12-31')
        div = fixData(tick,"kept")
        div = pd.Series(div.Dividends.values, index=div.Date)
        if not div.empty:
            div.to_csv(f'data/series/good/{t}_dividends_fixed.csv')
    except Exception as e:
        print(f'Skipping {t}, no data available')
        
for t in lost_status:
    try:
        tick = lost[t]
        tick.history(start='1900-1-1', end='2020-12-31')
        div = fixData(tick,"lost")
        div = pd.Series(div.Dividends.values, index=div.Date)
        if not div.empty:
            div.to_csv(f'data/series/good/{t}_dividends_fixed.csv')
    except Exception as e:
        print(f'Skipping {t}, no data available')

  if ct[year==yr-1] > 4:


- BXS: No data found, symbol may be delisted
- BXS: No data found, symbol may be delisted
- BXS: No data found, symbol may be delisted
Skipping BXS, no data available
Skipping TEG, no data available
Skipping WSC, no data available
- IRET: No data found, symbol may be delisted
- IRET: No data found, symbol may be delisted
- IRET: No data found, symbol may be delisted
Skipping IRET, no data available
- CTL: No data found, symbol may be delisted
- CTL: No data found, symbol may be delisted
- CTL: No data found, symbol may be delisted
Skipping CTL, no data available
- HGIC: 1d data not available for startTime=-2208970800 and endTime=1609390800. Only 100 years worth of day granularity data are allowed to be fetched per request.
- HGIC: No data found for this date range, symbol may be delisted
- HGIC: No data found for this date range, symbol may be delisted
Skipping HGIC, no data available
Skipping MHP, no data available
- WAG: 1d data not available for startTime=-2208970800 and endTime=160

### At this stage all necessary data has be collected, fixed, and stored as csv's. These will be the datasets used as the input to our algorithms