## Importing Packages

In [2]:
import pandas as pd
import numpy as np
import statsmodels.graphics.tsaplots as sgt
import statsmodels.tsa.stattools as sts
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import scipy.stats
sns.set()

import warnings
warnings.filterwarnings('ignore')

## Loading and Transforming Data

In [3]:
# Importing the Data
# The 'date' column becomes the index and is parsed as day-first dates.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where consistent-format inference is already the default.
file_name = Path('./Data/Index2018.csv')
df = pd.read_csv(file_name, parse_dates=True, index_col='date', dayfirst=True)
df.head()

Unnamed: 0_level_0,spx,dax,ftse,nikkei
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1994-01-07,469.9,2224.95,3445.98,18124.01
1994-01-10,475.27,2225.0,3440.58,18443.44
1994-01-11,474.13,2228.1,3413.77,18485.25
1994-01-12,474.17,2182.06,3372.02,18793.88
1994-01-13,472.47,2142.37,3360.01,18577.26


In [4]:
# Conform the index to business-day frequency; any business day missing from
# the data becomes a row of NaN.
# 'B' is the canonical BusinessDay alias (the lowercase spelling is deprecated
# in recent pandas releases).
df = df.asfreq('B')
df.head()

Unnamed: 0_level_0,spx,dax,ftse,nikkei
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1994-01-07,469.9,2224.95,3445.98,18124.01
1994-01-10,475.27,2225.0,3440.58,18443.44
1994-01-11,474.13,2228.1,3413.77,18485.25
1994-01-12,474.17,2182.06,3372.02,18793.88
1994-01-13,472.47,2142.37,3360.01,18577.26


In [5]:
# Forward-fill missing values with the last observed value in each column.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated
# since pandas 2.1.
df = df.ffill()
df.head()

Unnamed: 0_level_0,spx,dax,ftse,nikkei
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1994-01-07,469.9,2224.95,3445.98,18124.01
1994-01-10,475.27,2225.0,3440.58,18443.44
1994-01-11,474.13,2228.1,3413.77,18485.25
1994-01-12,474.17,2182.06,3372.02,18793.88
1994-01-13,472.47,2142.37,3360.01,18577.26


## S&P500 Data Only

In [6]:
# SPX data
spx_data = df['spx'].to_frame()

# Train and Test Split
# Chronological split (no shuffling): the first 80% of rows are the training
# set, the remaining rows are the test set.
# .copy() gives each split its own data, so columns added to train_spx later
# do not raise SettingWithCopyWarning or depend on spx_data.
train_size = int(len(spx_data)*0.8)
train_spx = spx_data.iloc[:train_size].copy()
test_spx = spx_data.iloc[train_size:].copy()

## White Noise

In [7]:
# Gaussian white noise with the same mean and standard deviation as the SPX
# training prices.
# The statistics are taken from the 'spx' column only (scalars), so this cell
# can be re-run after other columns have been added to train_spx without a
# broadcasting error. The fixed seed makes the draw, and the ADF result
# computed from it, reproducible across kernel restarts.
rng = np.random.default_rng(42)
wn_data = rng.normal(loc=train_spx['spx'].mean(), scale=train_spx['spx'].std(), size=len(train_spx))
train_spx['wn'] = wn_data

## Random Walk

In [8]:
# Load the pre-generated random-walk series, indexed by day-first parsed dates,
# and conform it to business-day frequency.
# `infer_datetime_format` is not passed (deprecated since pandas 2.0) and 'B'
# is the canonical BusinessDay alias.
rw_data = pd.read_csv(Path('./Data/RandWalk.csv'), parse_dates=True, index_col='date', dayfirst=True)
rw_data = rw_data.asfreq('B')
# Values are aligned on the date index when assigned.
# NOTE(review): assigning a DataFrame to a single column presumably relies on
# RandWalk.csv having exactly one value column — confirm.
train_spx['rw'] = rw_data

# Stationarity

Dickey-Fuller Test

H0 >> Non-Stationarity = One Lag Autocorrelation Coefficient = 1 (unit root)

H1 >> Stationarity = One Lag Autocorrelation Coefficient < 1

If Test statistic < critical value => Stationarity (Reject the Null Hypothesis H0)

#### Running a Test For SPX Data

In [10]:
# Augmented Dickey-Fuller test on the SPX training prices.
# Returned tuple: (test statistic, p-value, lags used, observations used,
# critical values, maximized information criterion).
sts.adfuller(train_spx['spx'])

(-1.7369847452352367,
 0.4121645696770657,
 18,
 5002,
 {'1%': -3.431658008603046,
  '5%': -2.862117998412982,
  '10%': -2.567077669247375},
 39904.880607487445)

Observations

1. Test Statistic = -1.7369847452352367
2. P-Value = 0.4121645696770657
3. Num of Lags = 18
4. Num observations used for ADF Regression and Critical values = 5002
5. Critical Values at 1% = -3.431658008603046 / 5% = -2.862117998412982 / 10% = -2.567077669247375
6. The maximized information criterion = 39904.880607487445

Here the Test Statistic (-1.7369847452352367) > every critical value (-3.431658008603046, -2.862117998412982 and -2.567077669247375)

>> So, we CANNOT REJECT the Null Hypothesis. SPX data is Non-Stationary 

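A p-value of 0.412 means that, if the series really were Non-Stationary (H0 true), there would be a 41.2% chance of observing a test statistic at least this extreme. That is far above the usual 1% / 5% / 10% significance levels, so we CANNOT REJECT the Null Hypothesis of Non-Stationarity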

The maximized information criterion: the LOWER the value, the easier it is to make predictions

#### Running a Test For White Noise Data

In [14]:
# Augmented Dickey-Fuller test on the generated white-noise column.
# Returned tuple: (test statistic, p-value, lags used, observations used,
# critical values, maximized information criterion).
sts.adfuller(train_spx['wn'])

(-70.97414852893955,
 0.0,
 0,
 5020,
 {'1%': -3.431653316130827,
  '5%': -2.8621159253018247,
  '10%': -2.5670765656497516},
 70779.9682875108)

Observations

1. Test Statistic = -70.97414852893955
2. P-Value = 0.0
3. Num of Lags = 0
4. Num observations used for ADF Regression and Critical values = 5020
5. Critical Values at 1% = -3.431653316130827 / 5% = -2.8621159253018247 / 10% = -2.5670765656497516
6. The maximized information criterion = 70779.9682875108

Here the Test Statistic (-70.97414852893955) < every critical value (-3.431653316130827, -2.8621159253018247 and -2.5670765656497516)

>> So, we REJECT the Null Hypothesis. White Noise data is Stationary as we expected

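A p-value of 0.0 (zero to the displayed precision) means that a test statistic this extreme would be practically impossible if the data came from a Non-Stationary process, so there is very strong evidence that it comes from a Stationary process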

The maximized information criterion is much HIGHER here than for SPX (70779.97 vs 39904.88): the higher the value, the harder it is to make predictions

#### Running a Test For Random Walk Data

In [16]:
# Augmented Dickey-Fuller test on the random-walk column.
# Returned tuple: (test statistic, p-value, lags used, observations used,
# critical values, maximized information criterion).
sts.adfuller(train_spx['rw'])

(-1.3286073927689708,
 0.6159849181617388,
 24,
 4996,
 {'1%': -3.4316595802782865,
  '5%': -2.8621186927706463,
  '10%': -2.567078038881065},
 46299.333497595144)

Observations

1. Test Statistic = -1.3286073927689708
2. P-Value = 0.6159849181617388
3. Num of Lags = 24
4. Num observations used for ADF Regression and Critical values = 4996
5. Critical Values at 1% = -3.4316595802782865 / 5% = -2.8621186927706463 / 10% = -2.567078038881065
6. The maximized information criterion = 46299.333497595144

Here the Test Statistic (-1.3286073927689708) > every critical value (-3.4316595802782865, -2.8621186927706463 and -2.567078038881065)

>> So, we CANNOT REJECT the Null Hypothesis. Random Walk data is Non-Stationary as we expected

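The p-value is 0.616, which means that, if the series really were Non-Stationary (H0 true), there would be a 61.6% chance of observing a test statistic at least this extreme, so there is no evidence against Non-Stationarity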

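In reality, for a Random Walk the covariances of two intervals of identical size taken from different parts of the series would very rarely be equal, which violates the constant-covariance requirement of stationarity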

We expect a Random Walk to be a non-stationary process