# Distribution Shifts

+ Consider our stock data. 
+ We are interested in testing for changes in the return distribution of our sample data around the onset of the COVID-19 pandemic.

In [1]:
# Load the dotenv IPython extension and run its %dotenv magic.
# NOTE(review): presumably this reads a local .env file into the process
# environment (the FEATURES_DATA lookup further down relies on it) — confirm.
%load_ext dotenv
%dotenv 
import sys
# Extend the import path so the project-local `logger` module imported on the
# next line can be resolved; the path is relative to the working directory.
sys.path.append("../../05_src")
from logger import get_logger
# Logger obtained from the project's get_logger helper, keyed by __name__.
_logs = get_logger(__name__)

In [2]:
import dask
dask.config.set({'dataframe.query-planning': True})
import dask.dataframe as dd
import pandas as pd
import numpy as np
import os
from glob import glob



In [3]:
# Location of the feature parquet files, taken from the FEATURES_DATA
# environment variable.
ft_dir = os.getenv("FEATURES_DATA")
if not ft_dir:
    # os.getenv returns None when the variable is unset; fail with a clear
    # message instead of a TypeError from concatenating None with a string.
    raise EnvironmentError(
        "FEATURES_DATA is not set; define it in the environment or .env file."
    )

# Sort the matches: glob returns paths in arbitrary order, and a stable order
# keeps the loaded frame reproducible across runs and machines.
ft_glob = sorted(glob(os.path.join(ft_dir, '*.parquet')))
if not ft_glob:
    raise FileNotFoundError(f"No parquet files found in {ft_dir!r}.")

# Read every file lazily with dask, materialise to pandas, and move the index
# into regular columns.
df = dd.read_parquet(ft_glob).compute().reset_index()

## Data Preparation

+ First, prepare five datasets, each with returns from March of a given year (inclusive) to March of the following year (exclusive), covering 2018 through 2022.
+ For each data set, we can compute some descriptive statistics.
+ We observe that there may be some distribution changes.

In [4]:
# Boundaries of the yearly windows. Each window is half-open, [start, end),
# so consecutive windows share a boundary date without overlapping.
period_edges = [
    '2018-03-01',
    '2019-03-01',
    '2020-03-01',
    '2021-03-01',
    '2022-03-01',
    '2023-03-01',
]

# Pair each boundary with the next one and slice one frame per window.
df_2018, df_2019, df_2020, df_2021, df_2022 = (
    df[(df['Date'] >= start) & (df['Date'] < end)]
    for start, end in zip(period_edges, period_edges[1:])
)

In [5]:
# Descriptive statistics of returns in the Mar 2018 – Feb 2019 window.
df_2018.loc[:, 'returns'].describe()

count    121734.000000
mean          0.008215
std           0.279490
min          -0.941309
25%          -0.007968
50%           0.000912
75%           0.009286
max          30.109710
Name: returns, dtype: float64

In [6]:
# Descriptive statistics of returns in the Mar 2019 – Feb 2020 window.
df_2019.loc[:, 'returns'].describe()

count    123583.000000
mean          0.007732
std           0.272404
min          -0.889266
25%          -0.007542
50%           0.001026
75%           0.008967
max          40.907243
Name: returns, dtype: float64

In [7]:
# Descriptive statistics of returns in the Mar 2020 – Feb 2021 window.
df_2020.loc[:, 'returns'].describe()

count    123753.000000
mean          0.009959
std           0.287764
min          -0.876122
25%          -0.012920
50%           0.001305
75%           0.016163
max          26.043731
Name: returns, dtype: float64

In [8]:
# Descriptive statistics of returns in the Mar 2021 – Feb 2022 window.
df_2021.loc[:, 'returns'].describe()

count    124990.000000
mean          0.013468
std           0.739445
min          -0.790283
25%          -0.008824
50%           0.000743
75%           0.010321
max         209.045513
Name: returns, dtype: float64

In [9]:
# Descriptive statistics of returns in the Mar 2022 – Feb 2023 window.
df_2022.loc[:, 'returns'].describe()

count    124269.000000
mean          0.009327
std           0.292528
min          -0.783247
25%          -0.012277
50%           0.000170
75%           0.012647
max          46.874226
Name: returns, dtype: float64

# Kolmogorov-Smirnov Test

+ The KS test can be accessed via the scipy library: `scipy.stats.kstest`
+ When both arguments are arrays of observations, this function performs a two-sample test (equivalent to `scipy.stats.ks_2samp`).
+ The null hypothesis is that the two distributions are identical.

In [10]:
from scipy.stats import kstest

# Two-sample KS test: Mar 2018 – Feb 2019 returns against Mar 2019 – Feb 2020
# returns. Missing returns are dropped from each sample before testing.
sample_2018 = df_2018['returns'].dropna()
sample_2019 = df_2019['returns'].dropna()
kstest(sample_2018, sample_2019)

KstestResult(statistic=0.011889447547005272, pvalue=5.8559597523592996e-08, statistic_location=0.017902198410837622, statistic_sign=-1)

In [11]:
# Two-sample KS test: Mar 2019 – Feb 2020 returns against Mar 2020 – Feb 2021
# returns. Missing returns are dropped from each sample before testing.
sample_2019 = df_2019['returns'].dropna()
sample_2020 = df_2020['returns'].dropna()
kstest(sample_2019, sample_2020)

KstestResult(statistic=0.13577941916702274, pvalue=0.0, statistic_location=0.01746190224532107, statistic_sign=1)

In [12]:
# Two-sample KS test: Mar 2020 – Feb 2021 returns against Mar 2021 – Feb 2022
# returns. Missing returns are dropped from each sample before testing.
sample_2020 = df_2020['returns'].dropna()
sample_2021 = df_2021['returns'].dropna()
kstest(sample_2020, sample_2021)

KstestResult(statistic=0.10026223115521549, pvalue=0.0, statistic_location=0.017609724541885585, statistic_sign=-1)

In [13]:
# Two-sample KS test: Mar 2021 – Feb 2022 returns against Mar 2022 – Feb 2023
# returns. Missing returns are dropped from each sample before testing.
sample_2021 = df_2021['returns'].dropna()
sample_2022 = df_2022['returns'].dropna()
kstest(sample_2021, sample_2022)

KstestResult(statistic=0.06208852716554075, pvalue=2.835408372305806e-209, statistic_location=-0.011759943380979188, statistic_sign=-1)