### Predicting Market Direction
---

In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
import statsmodels.api as sm

In [2]:
sp500_data = yf.download("^GSPC", start='2001-01-03', end='2005-12-31')
sp500_data

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2001-01-02,1320.280029,1320.280029,1276.050049,1283.270020,1283.270020,1129400000
2001-01-03,1283.270020,1347.760010,1274.619995,1347.560059,1347.560059,1880700000
2001-01-04,1347.560059,1350.239990,1329.140015,1333.339966,1333.339966,2131000000
2001-01-05,1333.339966,1334.770020,1294.949951,1298.349976,1298.349976,1430800000
2001-01-08,1298.349976,1298.349976,1276.290039,1295.859985,1295.859985,1115500000
...,...,...,...,...,...,...
2005-12-23,1268.119995,1269.760010,1265.920044,1268.660034,1268.660034,1285810000
2005-12-27,1268.660034,1271.829956,1256.540039,1256.540039,1256.540039,1540470000
2005-12-28,1256.540039,1261.099976,1256.540039,1258.170044,1258.170044,1422360000
2005-12-29,1258.170044,1260.609985,1254.180054,1254.420044,1254.420044,1382540000


In [3]:
# Calculate daily returns data
df = sp500_data['Adj Close'].pct_change()*100
df

Date
2001-01-02         NaN
2001-01-03    5.009861
2001-01-04   -1.055247
2001-01-05   -2.624236
2001-01-08   -0.191781
                ...   
2005-12-23    0.042586
2005-12-27   -0.955338
2005-12-28    0.129722
2005-12-29   -0.298052
2005-12-30   -0.488672
Name: Adj Close, Length: 1256, dtype: float64

In [4]:
df = df.rename("Today").reset_index()
df

Unnamed: 0,Date,Today
0,2001-01-02,
1,2001-01-03,5.009861
2,2001-01-04,-1.055247
3,2001-01-05,-2.624236
4,2001-01-08,-0.191781
...,...,...
1251,2005-12-23,0.042586
1252,2005-12-27,-0.955338
1253,2005-12-28,0.129722
1254,2005-12-29,-0.298052


In [5]:
# Calculate the Lags
for i in range(1, 6):
    df['Lag_' + str(i)] = df['Today'].shift(i)

df

Unnamed: 0,Date,Today,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5
0,2001-01-02,,,,,,
1,2001-01-03,5.009861,,,,,
2,2001-01-04,-1.055247,5.009861,,,,
3,2001-01-05,-2.624236,-1.055247,5.009861,,,
4,2001-01-08,-0.191781,-2.624236,-1.055247,5.009861,,
...,...,...,...,...,...,...,...
1251,2005-12-23,0.042586,0.422078,0.251667,-0.023815,-0.583902,-0.284828
1252,2005-12-27,-0.955338,0.042586,0.422078,0.251667,-0.023815,-0.583902
1253,2005-12-28,0.129722,-0.955338,0.042586,0.422078,0.251667,-0.023815
1254,2005-12-29,-0.298052,0.129722,-0.955338,0.042586,0.422078,0.251667


In [6]:
# Lets get the volume of the prior day
# Divide by 1,000,000,000 to scale
df['Volume'] = sp500_data.Volume.shift(1).values/1000000000

In [7]:
df = df.dropna()
df

Unnamed: 0,Date,Today,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,Volume
6,2001-01-10,0.958639,0.381219,-0.191781,-2.624236,-1.055247,5.009861,1.19130
7,2001-01-11,1.031770,0.958639,0.381219,-0.191781,-2.624236,-1.055247,1.29650
8,2001-01-12,-0.623287,1.031770,0.958639,0.381219,-0.191781,-2.624236,1.41120
9,2001-01-16,0.614309,-0.623287,1.031770,0.958639,0.381219,-0.191781,1.27600
10,2001-01-17,0.212561,0.614309,-0.623287,1.031770,0.958639,0.381219,1.20570
...,...,...,...,...,...,...,...,...
1251,2005-12-23,0.042586,0.422078,0.251667,-0.023815,-0.583902,-0.284828,1.88850
1252,2005-12-27,-0.955338,0.042586,0.422078,0.251667,-0.023815,-0.583902,1.28581
1253,2005-12-28,0.129722,-0.955338,0.042586,0.422078,0.251667,-0.023815,1.54047
1254,2005-12-29,-0.298052,0.129722,-0.955338,0.042586,0.422078,0.251667,1.42236


In [8]:
# Now add in the direction
# 1 for up and 0 for down
df['Direction'] = [1 if i > 0 else 0 for i in df['Today']]
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Direction'] = [1 if i > 0 else 0 for i in df['Today']]


Unnamed: 0,Date,Today,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,Volume,Direction
6,2001-01-10,0.958639,0.381219,-0.191781,-2.624236,-1.055247,5.009861,1.19130,1
7,2001-01-11,1.031770,0.958639,0.381219,-0.191781,-2.624236,-1.055247,1.29650,1
8,2001-01-12,-0.623287,1.031770,0.958639,0.381219,-0.191781,-2.624236,1.41120,0
9,2001-01-16,0.614309,-0.623287,1.031770,0.958639,0.381219,-0.191781,1.27600,1
10,2001-01-17,0.212561,0.614309,-0.623287,1.031770,0.958639,0.381219,1.20570,1
...,...,...,...,...,...,...,...,...,...
1251,2005-12-23,0.042586,0.422078,0.251667,-0.023815,-0.583902,-0.284828,1.88850,1
1252,2005-12-27,-0.955338,0.042586,0.422078,0.251667,-0.023815,-0.583902,1.28581,0
1253,2005-12-28,0.129722,-0.955338,0.042586,0.422078,0.251667,-0.023815,1.54047,1
1254,2005-12-29,-0.298052,0.129722,-0.955338,0.042586,0.422078,0.251667,1.42236,0


In [9]:
# Add a constant so that sm_api provides an intercept
df = sm.add_constant(df)
df

Unnamed: 0,const,Date,Today,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,Volume,Direction
6,1.0,2001-01-10,0.958639,0.381219,-0.191781,-2.624236,-1.055247,5.009861,1.19130,1
7,1.0,2001-01-11,1.031770,0.958639,0.381219,-0.191781,-2.624236,-1.055247,1.29650,1
8,1.0,2001-01-12,-0.623287,1.031770,0.958639,0.381219,-0.191781,-2.624236,1.41120,0
9,1.0,2001-01-16,0.614309,-0.623287,1.031770,0.958639,0.381219,-0.191781,1.27600,1
10,1.0,2001-01-17,0.212561,0.614309,-0.623287,1.031770,0.958639,0.381219,1.20570,1
...,...,...,...,...,...,...,...,...,...,...
1251,1.0,2005-12-23,0.042586,0.422078,0.251667,-0.023815,-0.583902,-0.284828,1.88850,1
1252,1.0,2005-12-27,-0.955338,0.042586,0.422078,0.251667,-0.023815,-0.583902,1.28581,0
1253,1.0,2005-12-28,0.129722,-0.955338,0.042586,0.422078,0.251667,-0.023815,1.54047,1
1254,1.0,2005-12-29,-0.298052,0.129722,-0.955338,0.042586,0.422078,0.251667,1.42236,0


In [10]:
# Independent vars
X = df[['const', 'Lag_1','Lag_2','Lag_3','Lag_4','Lag_5','Volume']]

In [11]:
# Dependent variable is market diretion
y = df.Direction

In [12]:
y

6       1
7       1
8       0
9       1
10      1
       ..
1251    1
1252    0
1253    1
1254    0
1255    0
Name: Direction, Length: 1250, dtype: int64

In [13]:
# Define a Logit model
model = sm.Logit(y, X)
# Fit the model
result = model.fit()

Optimization terminated successfully.
         Current function value: 0.691035
         Iterations 4


In [14]:
result.summary()

0,1,2,3
Dep. Variable:,Direction,No. Observations:,1250.0
Model:,Logit,Df Residuals:,1243.0
Method:,MLE,Df Model:,6.0
Date:,"Sun, 08 May 2022",Pseudo R-squ.:,0.002155
Time:,13:02:03,Log-Likelihood:,-863.79
converged:,True,LL-Null:,-865.66
Covariance Type:,nonrobust,LLR p-value:,0.713

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1268,0.241,-0.527,0.598,-0.599,0.345
Lag_1,-0.0778,0.050,-1.550,0.121,-0.176,0.021
Lag_2,-0.0390,0.050,-0.778,0.437,-0.137,0.059
Lag_3,0.0126,0.050,0.252,0.801,-0.085,0.110
Lag_4,0.0038,0.050,0.076,0.939,-0.094,0.102
Lag_5,0.0103,0.050,0.208,0.835,-0.087,0.107
Volume,0.1338,0.158,0.845,0.398,-0.177,0.444


None of the Lags are statistically significant as per p-values

In [15]:
# Predict if the market is going up or down
mkt_dir_prediction = result.predict(X)

In [16]:
mkt_dir_prediction

6       0.506256
7       0.483505
8       0.480380
9       0.515939
10      0.507901
          ...   
1251    0.519447
1252    0.505643
1253    0.539423
1254    0.523833
1255    0.517203
Length: 1250, dtype: float64

In [17]:
# Did our model perform well?
def confusion_matrix(act, pred):
    # In Logistic regresion a prediction > 0.5 is rounded up to 1
    # predicted_values_transform = ['Up' if i > 0.5 else 'Down' for i in pred]
    # actual_values = ['Up' if i > 0 else 'Down' for i in act]
    predicted_values_transform = np.where(pred > 0.5, 'Up', 'Down')
    actual_values = np.where(act > 0, 'Up', 'Down')
    # Convert dataframe to series for this to work
    confusion_matrix = pd.crosstab(pd.Series(actual_values.squeeze()), 
                                    pd.Series(predicted_values_transform.squeeze()),
                                    rownames=['Actual'],
                                    colnames=['Predicted'])
    return confusion_matrix

In [18]:
y.ndim

1

In [19]:
mkt_dir_prediction.ndim

1

In [20]:
confusion_matrix(y, mkt_dir_prediction)

Predicted,Down,Up
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,155,448
Up,143,504


In [21]:
len(df)

1250

In [22]:
# Cprrect prediction divided by total number of observations
(155+504)/1250

0.5272

While it seems as though we are doing better than a coin toss, we are training and testing the model on the same data!


#### Split into Train and test datasets

Train data pre-2005. Test data in 2005

In [23]:
X_train = df[df.Date.dt.year < 2005][['const', 'Lag_1','Lag_2','Lag_3','Lag_4','Lag_5','Volume']]
y_train = df[df.Date.dt.year < 2005][['Direction']]
X_test = df[df.Date.dt.year == 2005][['const', 'Lag_1','Lag_2','Lag_3','Lag_4','Lag_5','Volume']]
y_test = df[df.Date.dt.year == 2005][['Direction']]

In [24]:
y_test

Unnamed: 0,Direction
1004,0
1005,0
1006,0
1007,1
1008,0
...,...
1251,1
1252,0
1253,1
1254,0


In [25]:
model = sm.Logit(y_train, X_train)

In [26]:
result = model.fit()

Optimization terminated successfully.
         Current function value: 0.691898
         Iterations 4


In [27]:
prediction = result.predict(X_test).to_frame()
prediction

Unnamed: 0,0
1004,0.527823
1005,0.515182
1006,0.521402
1007,0.511214
1008,0.496862
...,...
1251,0.481845
1252,0.505709
1253,0.516199
1254,0.513358


In [28]:
print(y_test.ndim)
print(prediction.ndim)

2
2


In [29]:
print(y_test.squeeze().ndim)
print(prediction.squeeze().ndim)

1
1


In [30]:
confusion_matrix(y_test, prediction)

Predicted,Down,Up
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,81,30
Up,103,38


In [31]:
# How does this perform?
(81+38)/len(X_test)

0.4722222222222222

In [32]:
# What happens if you drop less relevant variables?
X_train = df[df.Date.dt.year < 2005][['const', 'Lag_1','Lag_2']]
y_train = df[df.Date.dt.year < 2005][['Direction']]
X_test = df[df.Date.dt.year == 2005][['const', 'Lag_1','Lag_2']]
y_test = df[df.Date.dt.year == 2005][['Direction']]

In [33]:
model = sm.Logit(y_train, X_train)
result = model.fit()

Optimization terminated successfully.
         Current function value: 0.692063
         Iterations 3


In [34]:
prediction = result.predict(X_test)

In [35]:
confusion_matrix(y_test, prediction)

Predicted,Down,Up
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,40,71
Up,37,104


In [36]:
(40 + 104)/len(X_test)

0.5714285714285714

#### We do better off by only taking the returns on the previous two days!