In [12]:
# This notebook loads the sanitized data produced by download-and-sanitize.ipynb,
# aggregates it per symbol and per day, and visualizes it.

import os
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

def get_or_create_folder(baseDir, folderName):
    """Return the path ``baseDir/folderName``, creating the directory if needed.

    Args:
        baseDir: Directory under which the folder is looked up or created.
        folderName: Name (or relative sub-path) of the folder.

    Returns:
        The result of ``os.path.join(baseDir, folderName)``.

    Raises:
        FileExistsError: If the path already exists but is not a directory.
    """
    path = os.path.join(baseDir, folderName)
    # makedirs with exist_ok=True creates missing parent directories as well
    # and does not fail when the directory already exists, so no separate
    # (and racy) os.path.exists() check is needed.
    os.makedirs(path, exist_ok=True)
    return path

# All folders are resolved relative to the directory the notebook is run from.
current_dir = os.getcwd()

# 'sanitized' holds the input CSVs, 'processed' receives the aggregated output;
# each folder is created on first use if it does not exist yet.
sanitized_folder = get_or_create_folder(current_dir, 'sanitized')
processed_folder = get_or_create_folder(current_dir, 'processed')
# Sub-folder of 'sanitized' that is searched for the yearly CSV files.
csv_yearly = get_or_create_folder(sanitized_folder, 'cnsfails-yearly')


In [20]:
def read_csv(folder, name):
    """Load every CSV below ``folder`` whose file name contains ``name``.

    The folder is searched recursively for ``*{name}*.csv``. Each file is
    parsed with a header row, ``DATE`` parsed as a date, and explicit dtypes
    for ``SYMBOL``, ``FAILS`` and ``PRICE``; the per-file frames are then
    concatenated with a fresh 0..n-1 index.

    Args:
        folder: Root directory to search.
        name: Substring that must occur in the CSV file name.

    Returns:
        A single ``pandas.DataFrame`` holding the rows of all matching files,
        in sorted-path order.

    Raises:
        ValueError: If no matching CSV file is found.
    """
    print("Reading all files in memory for pandas, this will take some time")

    # rglob does not promise any particular ordering; sorting keeps the row
    # order (and therefore the generated index) reproducible between runs.
    paths = sorted(Path(folder).rglob(f'*{name}*.csv'))
    if not paths:
        # Fail with an actionable message instead of pd.concat's generic
        # "No objects to concatenate" (same exception type as before).
        raise ValueError(f"No CSV files matching '*{name}*.csv' found under {folder}")

    frames = []
    for path in paths:
        print(f"Read file {path.name}")
        # The quantity column in these files is named FAILS; a dtype entry for
        # a column that is not present has no effect, so it is keyed on FAILS.
        # 'int64' is spelled out so the width does not depend on the platform.
        # NOTE(review): pandas' default NA handling also turns literal cell
        # values such as "NA" or "NULL" into missing values, even for a str
        # column. The df.count() output in this notebook shows fewer SYMBOL
        # than DATE values -- confirm whether that is the cause.
        frames.append(pd.read_csv(
            path,
            index_col=None,
            header=0,
            sep=',',
            decimal='.',
            parse_dates=['DATE'],
            dtype={'SYMBOL': 'str', 'FAILS': 'int64', 'PRICE': 'float'},
        ))

    print("Fles read, generating dataframe")
    df = pd.concat(frames, axis=0, ignore_index=True)
    print("dataframe generated")
    return df

# Load every CSV under the yearly folder whose name contains 'cns' into one
# frame, then show the non-null count per column as a quick sanity check.
df = read_csv(csv_yearly, 'cns')
df.count()

Reading all files in memory for pandas, this will take some time
Read file cnsfails-2004.csv
Read file cnsfails-2005.csv
Read file cnsfails-2006.csv
Read file cnsfails-2007.csv
Read file cnsfails-2008.csv
Read file cnsfails-2009.csv
Read file cnsfails-2010.csv
Read file cnsfails-2011.csv
Read file cnsfails-2012.csv
Read file cnsfails-2013.csv
Read file cnsfails-2014.csv
Read file cnsfails-2015.csv
Read file cnsfails-2016.csv
Read file cnsfails-2017.csv
Read file cnsfails-2018.csv
Read file cnsfails-2019.csv
Read file cnsfails-2020.csv
Read file cnsfails-2021.csv
Fles read, generating dataframe
dataframe generated


DATE      21348881
SYMBOL    21348871
FAILS     21348881
PRICE     21348881
dtype: int64

In [31]:
# Value of each fail position (FAILS x PRICE), scaled down to millions.
df['FAILVALUE'] = df['FAILS'].mul(df['PRICE']).div(1_000_000)
# Largest fail values first.
df = df.sort_values('FAILVALUE', ascending=False)
df.head()

Unnamed: 0,DATE,SYMBOL,FAILS,PRICE,FAILVALUE
2159998,2007-06-27,IWM,134504697,81.74,10994.413933
2156632,2007-06-26,IWM,126547156,82.49,10438.874898
2153448,2007-06-25,IWM,95656140,82.92,7931.807129
20826571,2021-06-21,SPY,14271236,414.92,5921.421241
7879982,2011-08-15,SPY,49563253,118.15,5855.898342


In [26]:
# Per-symbol totals: number of rows (DAYCOUNT), summed FAILS and summed
# FAILVALUE, ranked by FAILVALUE and then FAILS, largest first.
# Note: groupby leaves out rows whose SYMBOL is missing (default dropna=True).
df_symbols = (
    df.groupby('SYMBOL')
    .agg({'DATE': 'count', 'FAILS': 'sum', 'FAILVALUE': 'sum'})
    .rename(columns={"DATE": "DAYCOUNT"})
    .reset_index()
    .sort_values(by=['FAILVALUE', 'FAILS'], ascending=False)
)
df_symbols.head()

Unnamed: 0,SYMBOL,DAYCOUNT,FAILS,FAILVALUE
49284,SPY,4230,5435541282,745208.712181
29001,IWM,4199,7996626744,571703.885762
43728,QQQ,2677,1626435681,173305.768306
16771,EEM,3718,2271069431,98170.981215
47514,SHLD,2940,1135912728,91317.7014


In [34]:
# Per-day totals of FAILS and FAILVALUE across all symbols, in date order.
df_days = (
    df.groupby('DATE')
    .agg({'FAILS': 'sum', 'FAILVALUE': 'sum'})
    .reset_index()
    .sort_values('DATE')
)
df_days.head()

Unnamed: 0,DATE,FAILS,FAILVALUE
0,2004-03-22,771830516,0.0
1,2004-03-23,719870837,0.0
2,2004-03-24,700042286,0.0
3,2004-03-25,748808533,0.0
4,2004-03-26,953889171,0.0


In [36]:
def store_csv(df, filename, folder):
    """Write ``df`` as a CSV file called ``filename`` inside ``folder``.

    The file is comma-separated, uses '.' as the decimal mark, formats floats
    with five decimal places and omits the index column.

    Args:
        df: DataFrame to write.
        filename: Name of the CSV file to create.
        folder: Directory the file is written into.
    """
    print(f"Storing {filename} to output folder")
    destination = os.path.join(folder, filename)
    csv_options = {
        'index': False,
        'sep': ',',
        'decimal': '.',
        'float_format': '%.5f',
    }
    df.to_csv(destination, **csv_options)

# Persist both aggregates into the 'processed' folder.
store_csv(df_symbols, "cnsfails-by-symbol.csv", processed_folder)
store_csv(df_days, "cnsfails-by-day.csv", processed_folder)

Storing cnsfails-by-symbol.csv to output folder
Storing cnsfails-by-day.csv to output folder
