# Predict S&P500 price

Using daily record of prices from 1950 to 2012 to predict prices from 2013 to 2015.

The columns of the dataset are:

- `Date` -- The date of the record.
- `Open` -- The opening price of the day (when trading starts).
- `High` -- The highest trade price during the day.
- `Low` -- The lowest trade price during the day.
- `Close` -- The closing price for the day (when trading is finished).
- `Volume` -- The number of shares traded.
- `Adj Close` -- The daily closing price, adjusted retroactively to include any corporate actions.

In [49]:
import functools
import datetime

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

df = pd.read_csv('sphist.csv', parse_dates=['Date'], index_col='Date')
df = df.sort_index()

df.info()
df

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 16590 entries, 1950-01-03 to 2015-12-07
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       16590 non-null  float64
 1   High       16590 non-null  float64
 2   Low        16590 non-null  float64
 3   Close      16590 non-null  float64
 4   Volume     16590 non-null  float64
 5   Adj Close  16590 non-null  float64
dtypes: float64(6)
memory usage: 907.3 KB


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1950-01-03,16.660000,16.660000,16.660000,16.660000,1.260000e+06,16.660000
1950-01-04,16.850000,16.850000,16.850000,16.850000,1.890000e+06,16.850000
1950-01-05,16.930000,16.930000,16.930000,16.930000,2.550000e+06,16.930000
1950-01-06,16.980000,16.980000,16.980000,16.980000,2.010000e+06,16.980000
1950-01-09,17.080000,17.080000,17.080000,17.080000,2.520000e+06,17.080000
...,...,...,...,...,...,...
2015-12-01,2082.929932,2103.370117,2082.929932,2102.629883,3.712120e+09,2102.629883
2015-12-02,2101.709961,2104.270020,2077.110107,2079.510010,3.950640e+09,2079.510010
2015-12-03,2080.709961,2085.000000,2042.349976,2049.620117,4.306490e+09,2049.620117
2015-12-04,2051.239990,2093.840088,2051.239990,2091.689941,4.214910e+09,2091.689941


## Feature engineering



In [68]:

yX = pd.DataFrame(index=df.index)

yX['target'] = df['Close']

# f1: Open price.
yX['f1'] = df['Open']

# f2-6: Close price for last five days.
for i in range(1, 6):
    yX[f'f{i+1}'] = df['Close'].shift(i)

# f7-9: Price rolling averages.
yX['f7'] = df['Close'].rolling(5).mean().shift()
yX['f8'] = df['Close'].rolling(30).mean().shift()
yX['f9'] = df['Close'].rolling(365).mean().shift()

# f10-12: Volume rolling averages.
yX['f10'] = df['Volume'].rolling(5).mean().shift()
yX['f11'] = df['Volume'].rolling(30).mean().shift()
yX['f12'] = df['Volume'].rolling(365).mean().shift()

# f13-14 Change in averages.
yX['f13'] = yX['f7']/yX['f9']
yX['f14'] = yX['f10']/yX['f12']

# f10-12: Volume rolling averages.
yX['f15'] = df['Close'].rolling(5).std().shift()
yX['f16'] = df['Close'].rolling(30).std().shift()
yX['f17'] = df['Close'].rolling(365).std().shift()

# f10-12: Volume rolling averages.
yX['f18'] = df['Volume'].rolling(5).std().shift()
yX['f19'] = df['Volume'].rolling(30).std().shift()
yX['f20'] = df['Volume'].rolling(365).std().shift()

yX['f20'] = yX.index.year

yX = yX.dropna(axis=0)
yX

Unnamed: 0_level_0,target,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1951-06-19,22.020000,22.020000,22.049999,22.040001,21.840000,21.549999,21.520000,21.800000,21.703333,19.447726,...,1.707667e+06,1.989479e+06,1.120954,0.601162,0.256223,0.473595,1.790253,1.422322e+05,1.566790e+06,1951
1951-06-20,21.910000,21.910000,22.020000,22.049999,22.040001,21.840000,21.549999,21.900000,21.683000,19.462411,...,1.691667e+06,1.989041e+06,1.125246,0.591240,0.213659,0.444648,1.789307,1.484251e+05,1.570585e+06,1951
1951-06-21,21.780001,21.780001,21.910000,22.020000,22.049999,22.040001,21.840000,21.972000,21.659667,19.476274,...,1.675667e+06,1.986932e+06,1.128142,0.597907,0.092574,0.411452,1.788613,1.388164e+05,1.573993e+06,1951
1951-06-22,21.549999,21.549999,21.780001,21.910000,22.020000,22.049999,22.040001,21.960000,21.631000,19.489562,...,1.647000e+06,1.982959e+06,1.126757,0.578933,0.115108,0.368514,1.787659,1.267675e+05,1.576465e+06,1951
1951-06-25,21.290001,21.290001,21.549999,21.780001,21.910000,22.020000,22.049999,21.862000,21.599000,19.502082,...,1.636333e+06,1.981123e+06,1.121008,0.576441,0.204132,0.329130,1.786038,1.136662e+05,1.577456e+06,1951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-12-01,2102.629883,2082.929932,2080.409912,2090.110107,2088.870117,2089.139893,2086.590088,2087.024023,2073.984998,2035.531178,...,3.842181e+09,3.527800e+09,1.025297,0.909219,3.916109,24.654181,64.370261,1.099518e+09,5.985570e+08,2015
2015-12-02,2079.510010,2101.709961,2102.629883,2080.409912,2090.110107,2088.870117,2089.139893,2090.231982,2076.283993,2035.914082,...,3.856341e+09,3.526090e+09,1.026680,0.916702,7.956808,23.970453,64.352527,1.111591e+09,5.899408e+08,2015
2015-12-03,2049.620117,2080.709961,2079.510010,2102.629883,2080.409912,2090.110107,2088.870117,2088.306006,2077.908659,2036.234356,...,3.876979e+09,3.529468e+09,1.025573,0.919548,9.333599,22.378095,64.277554,1.121578e+09,5.817195e+08,2015
2015-12-04,2091.689941,2051.239990,2049.620117,2079.510010,2102.629883,2080.409912,2090.110107,2080.456006,2078.931331,2036.507343,...,3.899603e+09,3.532802e+09,1.021580,1.000969,19.599946,20.183769,64.121622,1.181180e+09,5.848831e+08,2015


In [69]:
#test, train = train_test_split(yX, test_size=0.4, shuffle=True)
train = yX.loc[:'2013-01-01']
test = yX.loc['2013-01-01':]

model = LinearRegression().fit(train.loc[:, 'f1':], train['target'])
predictions = model.predict(test.loc[:, 'f1':])

mean_squared_error(predictions, test['target'])

170.74867353315597