In [43]:
import pandas as pd
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [44]:
stocks = pd.read_csv(r"C:\Users\cthieme\Documents\JUNK\sphist.csv")

In [45]:
stocks.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,12/7/2015,2090.419922,2090.419922,2066.780029,2077.070068,4043820000,2077.070068
1,12/4/2015,2051.23999,2093.840088,2051.23999,2091.689941,4214910000,2091.689941
2,12/3/2015,2080.709961,2085.0,2042.349976,2049.620117,4306490000,2049.620117
3,12/2/2015,2101.709961,2104.27002,2077.110107,2079.51001,3950640000,2079.51001
4,12/1/2015,2082.929932,2103.370117,2082.929932,2102.629883,3712120000,2102.629883


In [46]:
stocks["Date"] = pd.to_datetime(stocks["Date"])

In [47]:
stocks.dtypes

Date         datetime64[ns]
Open                float64
High                float64
Low                 float64
Close               float64
Volume                int64
Adj Close           float64
dtype: object

In [48]:
stocks.sort_values("Date", ascending = True, inplace = True)
stocks.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
16589,1950-01-03,16.66,16.66,16.66,16.66,1260000,16.66
16588,1950-01-04,16.85,16.85,16.85,16.85,1890000,16.85
16587,1950-01-05,16.93,16.93,16.93,16.93,2550000,16.93
16586,1950-01-06,16.98,16.98,16.98,16.98,2010000,16.98
16585,1950-01-09,17.08,17.08,17.08,17.08,2520000,17.08


In [49]:
stocks["Prior 5 Day Avg"] = stocks["Close"].shift().rolling(window = 5).mean()
stocks["Prior 30 Day Avg"] = stocks["Close"].shift().rolling(window = 30).mean()
stocks["Prior 90 Day Avg"] = stocks["Close"].shift().rolling(window = 90).mean()

In [50]:
stocks["Prior 5 Day Std Dev"] = stocks["Close"].shift().rolling(window = 5).std()
stocks["Prior 30 Day Std Dev"] = stocks["Close"].shift().rolling(window = 30).std()
stocks["Prior 90 Day Std Dev"] = stocks["Close"].shift().rolling(window = 90).std()

In [51]:
stocks["Prior 5 Day Volume Avg"] = stocks["Close"].shift().rolling(window = 5).std()
stocks["Prior 30 Volume Avg"] = stocks["Close"].shift().rolling(window = 30).std()
stocks["Prior 90 Day Volume Avg"] = stocks["Close"].shift().rolling(window = 365).std()

In [52]:
stocks.head(10)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,Prior 5 Day Avg,Prior 30 Day Avg,Prior 90 Day Avg,Prior 5 Day Std Dev,Prior 30 Day Std Dev,Prior 90 Day Std Dev,Prior 5 Day Volume Avg,Prior 30 Volume Avg,Prior 90 Day Volume Avg
16589,1950-01-03,16.66,16.66,16.66,16.66,1260000,16.66,,,,,,,,,
16588,1950-01-04,16.85,16.85,16.85,16.85,1890000,16.85,,,,,,,,,
16587,1950-01-05,16.93,16.93,16.93,16.93,2550000,16.93,,,,,,,,,
16586,1950-01-06,16.98,16.98,16.98,16.98,2010000,16.98,,,,,,,,,
16585,1950-01-09,17.08,17.08,17.08,17.08,2520000,17.08,,,,,,,,,
16584,1950-01-10,17.030001,17.030001,17.030001,17.030001,2160000,17.030001,16.9,,,0.157956,,,0.157956,,
16583,1950-01-11,17.09,17.09,17.09,17.09,2630000,17.09,16.974,,,0.089051,,,0.089051,,
16582,1950-01-12,16.76,16.76,16.76,16.76,2970000,16.76,17.022,,,0.067602,,,0.067602,,
16581,1950-01-13,16.67,16.67,16.67,16.67,3330000,16.67,16.988,,,0.134796,,,0.134796,,
16580,1950-01-16,16.719999,16.719999,16.719999,16.719999,1460000,16.719999,16.926,,,0.196545,,,0.196545,,


In [53]:
stocks = stocks[stocks["Date"]> datetime(year = 1951, month = 1, day = 2)]

In [54]:
stocks.dropna(axis = 0, inplace = True)

In [55]:
train = stocks[stocks["Date"] < datetime(year = 2013, month = 1, day = 1)]
test = stocks[stocks["Date"] >= datetime(year = 2013, month = 1, day = 1)]

In [56]:
#test to make sure I still have all my rows
print(stocks.shape)
print(train.shape)
print(test.shape)
print(train.shape[0] + test.shape[0])

(16225, 16)
(15486, 16)
(739, 16)
16225


In [57]:
stocks.isnull().sum()

Date                       0
Open                       0
High                       0
Low                        0
Close                      0
Volume                     0
Adj Close                  0
Prior 5 Day Avg            0
Prior 30 Day Avg           0
Prior 90 Day Avg           0
Prior 5 Day Std Dev        0
Prior 30 Day Std Dev       0
Prior 90 Day Std Dev       0
Prior 5 Day Volume Avg     0
Prior 30 Volume Avg        0
Prior 90 Day Volume Avg    0
dtype: int64

In [58]:
train.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close',
       'Prior 5 Day Avg', 'Prior 30 Day Avg', 'Prior 90 Day Avg',
       'Prior 5 Day Std Dev', 'Prior 30 Day Std Dev', 'Prior 90 Day Std Dev',
       'Prior 5 Day Volume Avg', 'Prior 30 Volume Avg',
       'Prior 90 Day Volume Avg'],
      dtype='object')

In [59]:
features = ['Prior 5 Day Avg', 'Prior 30 Day Avg', 'Prior 90 Day Avg',
       'Prior 5 Day Std Dev', 'Prior 30 Day Std Dev', 'Prior 90 Day Std Dev',
       'Prior 5 Day Volume Avg', 'Prior 30 Volume Avg',
       'Prior 90 Day Volume Avg']
lr = LinearRegression()
lr.fit(train[features], train["Close"])
prediction = lr.predict(test[features])
mae = mean_absolute_error(test["Close"], prediction)
mse = mean_squared_error(test["Close"], prediction)
print(f"Mean absolute error is: {mae}")
print(f"Mean squared error is: {mse}")

Mean absolute error is: 16.206168047285523
Mean squared error is: 492.4297201219029
