# Check market data

In [8]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from IPython.display import display, HTML, Markdown
from glob import glob
from time import time

# 1) NYSE stats

In [2]:
# Scan every NYSE CSV and record, per ticker, the first and the last date
# present in its file. The result is the `results` frame with one row per
# ticker and the columns "name", "first_day" and "last_day".
paths = sorted(glob("data/crsp/nyse/*csv"))

names = []
last_day = []
first_day = []
# Tickers whose file holds no rows once the two leading rows are removed.
skipped_nyse = []

for path in tqdm(paths):
    # Ticker name = file name up to its first dot (".../ABC.csv" -> "ABC").
    # os.path.basename keeps this independent of the OS path separator.
    name = os.path.basename(path).split(".")[0]

    # Read the file once.
    market = pd.read_csv(path)

    # Rows labelled 0 and 1 are discarded before the dates are parsed --
    # presumably extra header rows of the export; TODO confirm against the
    # CSV layout. `index=` is used because pandas >= 2.0 no longer accepts
    # `axis` as a positional argument of DataFrame.drop.
    market = market.drop(index=[0, 1])

    if market.empty:
        # Nothing to date: remember the ticker instead of failing on [0]/[-1].
        skipped_nyse.append(name)
        continue

    # The date strings are stored in the column labelled "ticker".
    dates = pd.to_datetime(market["ticker"])

    names.append(name)
    first_day.append(dates.iloc[0])
    last_day.append(dates.iloc[-1])

if skipped_nyse:
    print(f"Skipped {len(skipped_nyse)} NYSE file(s) without data rows "
          "(names are kept in `skipped_nyse`).")

# Building from a dict keeps one dtype per column (datetime64 for the two
# date columns) instead of coercing everything to object through np.stack.
results = pd.DataFrame({"name": names,
                        "first_day": first_day,
                        "last_day": last_day})

100%|██████████| 2163/2163 [00:17<00:00, 123.35it/s]


## NYSE: share of tickers by first year of available data (%)

In [3]:
# Percentage of tickers whose first available observation falls in each
# calendar year, shown as a one-row table (row label "percentage") with the
# years as chronologically ordered columns.
first = (
    pd.to_datetime(results.first_day).dt.year
    .value_counts(normalize=True)  # fraction of tickers per year
    .sort_index()                  # chronological column order
    .mul(100)
    .round(2)
    .to_frame()
    .transpose()
)
# Normalising inside value_counts avoids the former `first.sum(1)[0]`, an
# integer key on a label-indexed Series that pandas has deprecated.
first.index = ["percentage"]
display(HTML(first.to_html()))

Unnamed: 0,2000,2001,2002,2004,2005,2006,2007,2009,2010,2011
percentage,99.31,0.05,0.09,0.14,0.09,0.05,0.05,0.09,0.09,0.05


## NYSE: share of tickers by last year of available data (%)

In [4]:
# Percentage of tickers whose last available observation falls in each
# calendar year, shown as a one-row table (row label "percentage") with the
# years as chronologically ordered columns.
last = (
    pd.to_datetime(results.last_day).dt.year
    .value_counts(normalize=True)  # fraction of tickers per year
    .sort_index()                  # chronological column order
    .mul(100)
    .round(2)
    .to_frame()
    .transpose()
)
# Normalising inside value_counts avoids the former `last.sum(1)[0]`, an
# integer key on a label-indexed Series that pandas has deprecated.
last.index = ["percentage"]
display(HTML(last.to_html()))

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
percentage,7.54,5.18,2.4,2.82,3.01,2.91,4.02,4.62,2.4,1.66,2.59,2.68,2.4,2.5,1.71,2.03,3.33,1.25,1.62,1.29,42.02


# 2) Nasdaq stats

In [5]:
# Scan every Nasdaq CSV and record, per ticker, the first and the last date
# present in its file. The result is the `results` frame with one row per
# ticker and the columns "name", "first_day" and "last_day".
paths = sorted(glob("data/crsp/nasdaq/*csv"))

names = []
last_day = []
first_day = []
# Tickers whose file holds no rows once the two leading rows are removed.
# They were previously dropped silently through `except IndexError: pass`.
skipped_nasdaq = []

for path in tqdm(paths):
    # Ticker name = file name up to its first dot (".../ABC.csv" -> "ABC").
    # os.path.basename keeps this independent of the OS path separator.
    name = os.path.basename(path).split(".")[0]

    # Read the file once.
    market = pd.read_csv(path)

    # Rows labelled 0 and 1 are discarded before the dates are parsed --
    # presumably extra header rows of the export; TODO confirm against the
    # CSV layout. `index=` is used because pandas >= 2.0 no longer accepts
    # `axis` as a positional argument of DataFrame.drop.
    market = market.drop(index=[0, 1])

    if market.empty:
        # Explicit form of the old IndexError swallow: an empty frame is the
        # case in which taking the first/last index entry fails.
        skipped_nasdaq.append(name)
        continue

    # The date strings are stored in the column labelled "ticker".
    dates = pd.to_datetime(market["ticker"])

    names.append(name)
    first_day.append(dates.iloc[0])
    last_day.append(dates.iloc[-1])

if skipped_nasdaq:
    print(f"Skipped {len(skipped_nasdaq)} Nasdaq file(s) without data rows "
          "(names are kept in `skipped_nasdaq`).")

# Building from a dict keeps one dtype per column (datetime64 for the two
# date columns) instead of coercing everything to object through np.stack.
results = pd.DataFrame({"name": names,
                        "first_day": first_day,
                        "last_day": last_day})

100%|██████████| 10185/10185 [01:17<00:00, 132.24it/s]


## Nasdaq: share of tickers by first year of available data (%)

In [6]:
# Percentage of tickers whose first available observation falls in each
# calendar year, shown as a one-row table (row label "percentage") with the
# years as chronologically ordered columns.
first = (
    pd.to_datetime(results.first_day).dt.year
    .value_counts(normalize=True)  # fraction of tickers per year
    .sort_index()                  # chronological column order
    .mul(100)
    .round(2)
    .to_frame()
    .transpose()
)
# Normalising inside value_counts avoids the former `first.sum(1)[0]`, an
# integer key on a label-indexed Series that pandas has deprecated.
first.index = ["percentage"]
display(HTML(first.to_html()))

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
percentage,41.17,1.99,2.22,2.04,3.42,3.29,3.21,3.87,2.93,2.47,3.28,2.5,2.64,3.38,4.09,3.11,2.51,3.29,3.53,3.18,1.88


## Nasdaq: share of tickers by last year of available data (%)

In [7]:
# Percentage of tickers whose last available observation falls in each
# calendar year, shown as a one-row table (row label "percentage") with the
# years as chronologically ordered columns.
last = (
    pd.to_datetime(results.last_day).dt.year
    .value_counts(normalize=True)  # fraction of tickers per year
    .sort_index()                  # chronological column order
    .mul(100)
    .round(2)
    .to_frame()
    .transpose()
)
# Normalising inside value_counts avoids the former `last.sum(1)[0]`, an
# integer key on a label-indexed Series that pandas has deprecated.
last.index = ["percentage"]
display(HTML(last.to_html()))

Unnamed: 0,2000,2001,2003,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
percentage,0.01,0.01,0.01,0.03,0.02,0.04,0.04,0.02,0.07,0.05,0.05,0.02,0.12,0.11,0.16,0.32,0.39,1.71,96.81
