# Predicting Stocks Exercise

Exercise in using Python Machine Learning to predict stock prices, using a dataset from 1950-2012 to make prediciton from 2013-2015.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Reading the dataset
data = pd.read_csv('sphist.csv')

## Converting the date column to datetime
data['Date'] = pd.to_datetime(data['Date'])

## Sorting all rows by the Date column
data = data.sort_values('Date', ascending=True)

## Display the first 10 rows
print(data.head(10))

## Display the last 10 rows
print(data.tail(10))

## Display all columns
pd.options.display.max_columns = None

            Date       Open       High        Low      Close     Volume  \
16589 1950-01-03  16.660000  16.660000  16.660000  16.660000  1260000.0   
16588 1950-01-04  16.850000  16.850000  16.850000  16.850000  1890000.0   
16587 1950-01-05  16.930000  16.930000  16.930000  16.930000  2550000.0   
16586 1950-01-06  16.980000  16.980000  16.980000  16.980000  2010000.0   
16585 1950-01-09  17.080000  17.080000  17.080000  17.080000  2520000.0   
16584 1950-01-10  17.030001  17.030001  17.030001  17.030001  2160000.0   
16583 1950-01-11  17.090000  17.090000  17.090000  17.090000  2630000.0   
16582 1950-01-12  16.760000  16.760000  16.760000  16.760000  2970000.0   
16581 1950-01-13  16.670000  16.670000  16.670000  16.670000  3330000.0   
16580 1950-01-16  16.719999  16.719999  16.719999  16.719999  1460000.0   

       Adj Close  
16589  16.660000  
16588  16.850000  
16587  16.930000  
16586  16.980000  
16585  17.080000  
16584  17.030001  
16583  17.090000  
16582  16.760000  
165

## Generating indicators

Creating new columns with rolling averages and standard deviations to generate indicators.

In [2]:
## Creating columns for rolling averages of the closing price (shifted down one row)
data['5_close_avg'] = data['Close'].rolling(5).mean().shift(1)
data['30_close_avg'] = data['Close'].rolling(30).mean().shift(1)
data['365_close_avg'] = data['Close'].rolling(365).mean().shift(1)

## Creating columns for the rolling STD of the closing price (shifted down one row)
data['5_close_std'] = data['Close'].rolling(5).std().shift(1)
data['30_close_std'] = data['Close'].rolling(30).std().shift(1)
data['365_close_std'] = data['Close'].rolling(365).std().shift(1)

## Finding the Ratio between 5 day average to 365 average closing price
data['ratio_5_365_avg_close'] = data['5_close_avg'] / data['365_close_avg']
data['ratio_5_365_std_close'] = data['5_close_std'] / data['365_close_std']

## Creating a column for rolling averages of the volume (shifted down one row)
data['5_volume_avg'] = data['Volume'].rolling(5).mean().shift(1)
data['30_volume_avg'] = data['Volume'].rolling(30).mean().shift(1)
data['365_volume_avg'] = data['Volume'].rolling(365).mean().shift(1)

## Creating a column for the rolling STD of the closing price (shifted down one row)
data['5_volume_std'] = data['Volume'].rolling(5).std().shift(1)
data['30_volume_std'] = data['Volume'].rolling(30).std().shift(1)
data['365_volume_std'] = data['Volume'].rolling(365).std().shift(1)

## Finding the Ratio between 5 day average to 365 average Volume
data['ratio_5_365_avg_volume'] = data['5_volume_avg'] / data['365_volume_avg']
data['ratio_5_365_std_volume'] = data['5_volume_std'] / data['365_volume_std']

## Display the last ten rows
data.tail(10)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,5_close_avg,30_close_avg,365_close_avg,5_close_std,30_close_std,365_close_std,ratio_5_365_avg_close,ratio_5_365_std_close,5_volume_avg,30_volume_avg,365_volume_avg,5_volume_std,30_volume_std,365_volume_std,ratio_5_365_avg_volume,ratio_5_365_std_volume
9,2015-11-23,2089.409912,2095.610107,2081.389893,2086.590088,3587980000.0,2086.590088,2071.523974,2061.892989,2033.60589,18.24694,32.699325,64.911334,1.018646,0.281106,3930538000.0,3883640000.0,3523622000.0,305833600.0,402240600.0,687310800.0,1.115482,0.444971
8,2015-11-24,2084.419922,2094.120117,2070.290039,2089.139893,3884930000.0,2089.139893,2078.204004,2064.197327,2034.018028,15.807754,31.885833,64.768328,1.021723,0.244066,3899886000.0,3906798000.0,3526334000.0,335764500.0,361156200.0,685600100.0,1.105932,0.489738
7,2015-11-25,2089.300049,2093.0,2086.300049,2088.870117,2852940000.0,2088.870117,2085.943994,2067.045658,2034.432712,3.491188,30.05861,64.634873,1.02532,0.054014,3791402000.0,3922898000.0,3528961000.0,168902900.0,348414200.0,685130800.0,1.074368,0.246526
6,2015-11-27,2088.820068,2093.290039,2084.129883,2090.110107,1466840000.0,2090.110107,2087.002002,2070.199996,2034.835123,3.395982,26.960525,64.514871,1.025637,0.052639,3576712000.0,3896510000.0,3528637000.0,431897800.0,396834300.0,685423300.0,1.013624,0.630118
5,2015-11-30,2090.949951,2093.810059,2080.409912,2080.409912,4245030000.0,2080.409912,2088.776025,2072.408333,2035.199864,1.309055,25.718597,64.4498,1.026325,0.020311,3144458000.0,3820528000.0,3524258000.0,1032091000.0,595221700.0,693451100.0,0.892233,1.488339
4,2015-12-01,2082.929932,2103.370117,2082.929932,2102.629883,3712120000.0,2102.629883,2087.024023,2073.984998,2035.531178,3.916109,24.654181,64.370261,1.025297,0.060837,3207544000.0,3842181000.0,3527800000.0,1099518000.0,598557000.0,693822700.0,0.909219,1.584724
3,2015-12-02,2101.709961,2104.27002,2077.110107,2079.51001,3950640000.0,2079.51001,2090.231982,2076.283993,2035.914082,7.956808,23.970453,64.352527,1.02668,0.123644,3232372000.0,3856341000.0,3526090000.0,1111591000.0,589940800.0,692592800.0,0.916702,1.60497
2,2015-12-03,2080.709961,2085.0,2042.349976,2049.620117,4306490000.0,2049.620117,2088.306006,2077.908659,2036.234356,9.333599,22.378095,64.277554,1.025573,0.145208,3245514000.0,3876979000.0,3529468000.0,1121578000.0,581719500.0,691645100.0,0.919548,1.62161
1,2015-12-04,2051.23999,2093.840088,2051.23999,2091.689941,4214910000.0,2091.689941,2080.456006,2078.931331,2036.507343,19.599946,20.183769,64.121622,1.02158,0.305668,3536224000.0,3899603000.0,3532802000.0,1181180000.0,584883100.0,692451500.0,1.000969,1.705795
0,2015-12-07,2090.419922,2090.419922,2066.780029,2077.070068,4043820000.0,2077.070068,2080.771973,2080.237329,2036.869425,19.806136,19.676415,64.058862,1.021554,0.309187,4085838000.0,3892405000.0,3535838000.0,249315400.0,579423000.0,693007500.0,1.15555,0.359759


## Splitting the Data

Split the dataset into a train set (for training the algorithm for predictions) and a test set (to test those predictions).

In [3]:
## Removing any dates before 01/02/1951 since we're using 365 day rolling indicators
data = data[data['Date'] > datetime(year=1951, month=1, day=2)]

## Removing any rows with null values
data = data.dropna(axis=0)

data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,5_close_avg,30_close_avg,365_close_avg,5_close_std,30_close_std,365_close_std,ratio_5_365_avg_close,ratio_5_365_std_close,5_volume_avg,30_volume_avg,365_volume_avg,5_volume_std,30_volume_std,365_volume_std,ratio_5_365_avg_volume,ratio_5_365_std_volume
16224,1951-06-19,22.02,22.02,22.02,22.02,1100000.0,22.02,21.8,21.703333,19.447726,0.256223,0.473595,1.790253,1.120954,0.143121,1196000.0,1707667.0,1989479.0,142232.204511,1566790.0,772310.649463,0.601162,0.184164
16223,1951-06-20,21.91,21.91,21.91,21.91,1120000.0,21.91,21.9,21.683,19.462411,0.213659,0.444648,1.789307,1.125246,0.119409,1176000.0,1691667.0,1989041.0,148425.065269,1570585.0,772771.102512,0.59124,0.192069
16222,1951-06-21,21.780001,21.780001,21.780001,21.780001,1100000.0,21.780001,21.972,21.659667,19.476274,0.092574,0.411452,1.788613,1.128142,0.051758,1188000.0,1675667.0,1986932.0,138816.425541,1573993.0,774092.100761,0.597907,0.179328
16221,1951-06-22,21.549999,21.549999,21.549999,21.549999,1340000.0,21.549999,21.96,21.631,19.489562,0.115108,0.368514,1.787659,1.126757,0.06439,1148000.0,1647000.0,1982959.0,126767.503722,1576465.0,774914.749625,0.578933,0.163589
16220,1951-06-25,21.290001,21.290001,21.290001,21.290001,2440000.0,21.290001,21.862,21.599,19.502082,0.204132,0.32913,1.786038,1.121008,0.114293,1142000.0,1636333.0,1981123.0,113666.177907,1577456.0,775643.72318,0.576441,0.146544


In [4]:
## Create a function for training and testing the data
def train_and_test(data, features):
    ## Split the data into train and test
    train = data[data['Date'] < datetime(year=2013, month=1, day=1)]
    test = data[data['Date'] >= datetime(year=2013, month=1, day=1)]
    
    ## Initialize a linear regression model
    lr = LinearRegression()
    
    ## Target is the close price
    target = 'Close'
    
    ## Train 
    lr.fit(train[features], train[target])
    
    ## Test
    predictions = lr.predict(test[features])
    
    ## Calculate the errors and return the rmse
    mse = mean_squared_error(test[target], predictions)
    rmse = np.sqrt(mse)
    return rmse

In [5]:
## Try predicting the close based on the rolling averages
features = ['5_close_avg', '30_close_avg', '365_close_avg']
rmse = train_and_test(data, features)
rmse

22.22006532421962

In [6]:
## Show all indicators
features = ['5_close_avg', '30_close_avg', '365_close_avg']
rmse = train_and_test(data, features)
print('Rolling Average Prices RMSE: ', rmse)

features = ['5_close_avg', '30_close_avg', '365_close_avg', 'ratio_5_365_avg_close', 'ratio_5_365_std_close']
rmse = train_and_test(data, features)
print('Rolling Average Prices and Ratio RMSE: ', rmse)

features = ['5_close_avg', '30_close_avg', '365_close_avg', '5_close_std', '30_close_std', '365_close_std', 'ratio_5_365_avg_close', 'ratio_5_365_std_close']
rmse = train_and_test(data, features)
print('Rolling Average Prices, Ratio, and STD RMSE: ', rmse)

features = ['5_volume_avg', '30_volume_avg', '365_volume_avg']
rmse = train_and_test(data, features)
print('Rolling Average Volume RMSE: ', rmse)

features = ['5_volume_avg', '30_volume_avg', '365_volume_avg', 'ratio_5_365_avg_volume', 'ratio_5_365_std_volume']
rmse = train_and_test(data, features)
print('Rolling Average Volume and Ratio RMSE: ', rmse)

features = ['5_volume_avg', '30_volume_avg', '365_volume_avg', '5_volume_std', '30_volume_std', '365_volume_std', 'ratio_5_365_avg_volume', 'ratio_5_365_std_volume']
rmse = train_and_test(data, features)
print('Rolling Average Volume, Ratio, and STD RMSE: ', rmse)

features = ['5_close_avg', '30_close_avg', '365_close_avg', '5_close_std', '30_close_std', '365_close_std', 'ratio_5_365_avg_close', 'ratio_5_365_std_close', '5_volume_avg', '30_volume_avg', '365_volume_avg', '5_volume_std', '30_volume_std', '365_volume_std', 'ratio_5_365_avg_volume', 'ratio_5_365_std_volume']
rmse = train_and_test(data, features)
print('All RMSE: ', rmse)

Rolling Average Prices RMSE:  22.22006532421962
Rolling Average Prices and Ratio RMSE:  22.208709181564682
Rolling Average Prices, Ratio, and STD RMSE:  22.20742791793938
Rolling Average Volume RMSE:  734.2049783075411
Rolling Average Volume and Ratio RMSE:  730.4823156094525
Rolling Average Volume, Ratio, and STD RMSE:  378.2631970459332
All RMSE:  22.28212614252765


In [7]:
## Show the correlation of each column to closing price
data.corr()['Close']

Open                      0.999900
High                      0.999953
Low                       0.999956
Close                     1.000000
Volume                    0.772817
Adj Close                 1.000000
5_close_avg               0.999793
30_close_avg              0.999189
365_close_avg             0.988870
5_close_std               0.722414
30_close_std              0.778583
365_close_std             0.816103
ratio_5_365_avg_close     0.047782
ratio_5_365_std_close     0.087018
5_volume_avg              0.780896
30_volume_avg             0.786908
365_volume_avg            0.784878
5_volume_std              0.617248
30_volume_std             0.684340
365_volume_std            0.684141
ratio_5_365_avg_volume   -0.012305
ratio_5_365_std_volume    0.070329
Name: Close, dtype: float64

## Indicator Results
None of our indicators has a strong predictive value, but rolling average prices are the closest. Volume has a negative impact on predictions (even when included with all). 

Let's look at a shorter time scale by writing a function for predicting the next day's price.

In [8]:
def next_day(data, features, row):
    ## Split into Train and Test
    train = data[data['Date'] < row['Date']]
    test = data[data['Date'] == row['Date']]
    
    ## If the train dataset is null then return null
    if len(train) == 0:
        return np.nan
    
    ## Otherwise run the function and return the RMSE
    else:
        ## Initialize a linear regression model
        lr = LinearRegression()
            
        ## Target is the close price
        target = 'Close'
            
        ## Train 
        lr.fit(train[features], train[target])
            
        ## Test
        predictions = lr.predict(test[features])
            
        ## Calculate the errors and return the rmse
        mse = mean_squared_error(test[target], predictions)
        rmse = np.sqrt(mse)
            
        return rmse

In [9]:
## Test the function with all of the rolling price indicators

features = ['5_close_avg', '30_close_avg', '365_close_avg', '5_close_std', '30_close_std', '365_close_std', 'ratio_5_365_avg_close', 'ratio_5_365_std_close']
rmses = data.apply(lambda row: next_day(data, features, row), axis=1)
rmse = np.mean(rmses)
print('Rolling Average Prices, Ratio, and STD RMSE: ', rmse)

Rolling Average Prices, Ratio, and STD RMSE:  5.489441399575582


In [10]:
## Test the function with all  indicators
features = ['5_close_avg', '30_close_avg', '365_close_avg', '5_close_std', '30_close_std', '365_close_std', 'ratio_5_365_avg_close', 'ratio_5_365_std_close', '5_volume_avg', '30_volume_avg', '365_volume_avg', '5_volume_std', '30_volume_std', '365_volume_std', 'ratio_5_365_avg_volume', 'ratio_5_365_std_volume']
rmses = data.apply(lambda row: next_day(data, features, row), axis=1)
rmse = np.mean(rmses)
print('All RMSE: ', rmse)

All RMSE:  5.4949229707600145


## Conclusions
Predictions vastly improved on shorter time scale. Rolling price averages are the best indicators to use (volume only seems to make the predictions worse). 