In [90]:
import pandas as pd
import quandl
import math
import numpy as np
from sklearn import preprocessing, svm
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.linear_model import LinearRegression

In [35]:
# Download the "WIKI/GOOGL" dataset through the quandl client; the result is
# held in `df` and reshaped by the cells that follow.
df = quandl.get("WIKI/GOOGL")

In [36]:
# Show the first rows of the raw download to see which columns are available.
print(df.head())

              Open    High     Low    Close      Volume  Ex-Dividend  \
Date                                                                   
2004-08-19  100.01  104.06   95.96  100.335  44659000.0          0.0   
2004-08-20  101.01  109.08  100.50  108.310  22834300.0          0.0   
2004-08-23  110.76  113.48  109.05  109.400  18256100.0          0.0   
2004-08-24  111.24  111.60  103.57  104.870  15247300.0          0.0   
2004-08-25  104.76  108.00  103.88  106.000   9188600.0          0.0   

            Split Ratio  Adj. Open  Adj. High   Adj. Low  Adj. Close  \
Date                                                                   
2004-08-19          1.0  50.159839  52.191109  48.128568   50.322842   
2004-08-20          1.0  50.661387  54.708881  50.405597   54.322689   
2004-08-23          1.0  55.551482  56.915693  54.693835   54.869377   
2004-08-24          1.0  55.792225  55.972783  51.945350   52.597363   
2004-08-25          1.0  52.542193  54.167209  52.100830   53.1

In [37]:
# Keep only the five "Adj." columns. The explicit .copy() makes the result an
# independent frame, so columns added to it afterwards do not trigger pandas'
# SettingWithCopyWarning.
df = df[["Adj. Open", "Adj. High", "Adj. Low", "Adj. Close", "Adj. Volume"]].copy()

In [38]:
# Display the frame to confirm that only the selected "Adj." columns remain.
df

Unnamed: 0_level_0,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-08-19,50.159839,52.191109,48.128568,50.322842,44659000.0
2004-08-20,50.661387,54.708881,50.405597,54.322689,22834300.0
2004-08-23,55.551482,56.915693,54.693835,54.869377,18256100.0
2004-08-24,55.792225,55.972783,51.945350,52.597363,15247300.0
2004-08-25,52.542193,54.167209,52.100830,53.164113,9188600.0
2004-08-26,52.637487,54.142132,52.492038,54.122070,7094800.0
2004-08-27,54.217364,54.478169,53.008633,53.239345,6211700.0
2004-08-30,52.802998,52.908323,51.162935,51.162935,5196700.0
2004-08-31,51.318415,52.015567,51.238167,51.343492,4917800.0
2004-09-01,51.509003,51.644421,49.989312,50.280210,9138200.0


In [39]:
# Derive two per-row percentage features. assign() returns a new frame rather
# than writing columns into a column-subset of another frame, which avoids
# pandas' SettingWithCopyWarning.
# NOTE(review): HL_PCT is computed from high minus *close*; the "HL" name
# suggests high minus low may have been intended -- confirm before changing,
# because the model's features depend on this column.
df = df.assign(
    # how far the high sits above the close, as a percentage of the close
    HL_PCT=(df["Adj. High"] - df["Adj. Close"]) / df["Adj. Close"] * 100,
    # open-to-close move, as a percentage of the open
    PCT_change=(df["Adj. Close"] - df["Adj. Open"]) / df["Adj. Open"] * 100,
)

In [51]:
# Narrow the frame to the columns used from here on. .copy() yields an
# independent frame so that later modifications of `df` do not trigger pandas'
# SettingWithCopyWarning.
df = df[["Adj. Close", "HL_PCT", "PCT_change", "Adj. Volume"]].copy()

In [52]:
# Inspect the first 20 rows of the feature frame.
df.head(20)

Unnamed: 0_level_0,Adj. Close,HL_PCT,PCT_change,Adj. Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-08-19,50.322842,3.712563,0.324968,44659000.0
2004-08-20,54.322689,0.710922,7.227007,22834300.0
2004-08-23,54.869377,3.729433,-1.22788,18256100.0
2004-08-24,52.597363,6.417469,-5.726357,15247300.0
2004-08-25,53.164113,1.886792,1.183658,9188600.0
2004-08-26,54.12207,0.037068,2.820391,7094800.0
2004-08-27,53.239345,2.326896,-1.803885,6211700.0
2004-08-30,51.162935,3.41143,-3.106003,5196700.0
2004-08-31,51.343492,1.308977,0.048866,4917800.0
2004-09-01,50.28021,2.713217,-2.385589,9138200.0


In [53]:
# Name of the column whose future values become the prediction target ("label").
forecast_column = "Adj. Close"

In [74]:
# Replace any NA/NaN with the sentinel -99999 so those rows are treated as
# outliers instead of being dropped. Reassigning (rather than inplace=True)
# avoids mutating a frame that may be a subset of another one.
df = df.fillna(-99999)

# Forecast horizon: 1% of the number of rows, rounded up.
# math.ceil already returns an int in Python 3, so no int() cast is needed.
forecast_out = math.ceil(0.01 * len(df))

In [75]:
# Build the prediction target: each row's label is the forecast column's value
# `forecast_out` rows later (the horizon is 1% of the frame's length).
# The last `forecast_out` rows have no future value and therefore get NaN.
# A single assignment is enough; copying the unshifted column into "label"
# first would be overwritten immediately.
df = df.assign(label=df[forecast_column].shift(-forecast_out))

In [76]:
df.head(10)

Unnamed: 0_level_0,Adj. Close,HL_PCT,PCT_change,Adj. Volume,label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-08-19,50.322842,3.712563,0.324968,44659000.0,67.739104
2004-08-20,54.322689,0.710922,7.227007,22834300.0,69.399229
2004-08-23,54.869377,3.729433,-1.22788,18256100.0,68.752232
2004-08-24,52.597363,6.417469,-5.726357,15247300.0,69.639972
2004-08-25,53.164113,1.886792,1.183658,9188600.0,69.078238
2004-08-26,54.12207,0.037068,2.820391,7094800.0,67.839414
2004-08-27,53.239345,2.326896,-1.803885,6211700.0,68.912727
2004-08-30,51.162935,3.41143,-3.106003,5196700.0,70.668146
2004-08-31,51.343492,1.308977,0.048866,4917800.0,71.219849
2004-09-01,50.28021,2.713217,-2.385589,9138200.0,72.278116


In [77]:
# Drop rows containing NaN -- after the shift these are the trailing rows that
# have no label. Reassigning instead of inplace=True keeps the step free of
# in-place mutation.
df = df.dropna()
df.tail(10)

Unnamed: 0_level_0,Adj. Close,HL_PCT,PCT_change,Adj. Volume,label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-08-02,800.12,0.274959,0.349918,1996354.0,801.23
2016-08-03,798.92,0.077605,0.307607,1461025.0,797.97
2016-08-04,797.25,0.370022,-0.124023,1076031.0,795.39
2016-08-05,806.93,0.035939,0.852383,1807271.0,799.78
2016-08-08,805.23,0.294326,-0.095533,1221609.0,805.03
2016-08-09,807.48,0.724476,0.371664,1607685.0,815.95
2016-08-10,808.49,0.295489,0.178428,918514.0,814.96
2016-08-11,808.2,0.702673,-0.280084,1282274.0,802.65
2016-08-12,807.05,0.017347,0.243451,897283.0,810.73
2016-08-15,805.96,0.670008,-0.154854,930074.0,810.06


In [89]:
# Remove any rows with NaN *before* deriving X and y, so both arrays are built
# from exactly the same rows and cannot end up with different lengths.
df = df.dropna()

# Features: every column except the label.
# drop(columns=...) replaces the positional axis argument (drop([...], 1)),
# which pandas 2.0 and later no longer accept.
X = np.array(df.drop(columns=["label"]))
# Labels: the value the model should predict for each row.
y = np.array(df["label"])

# Standardize the features: center to the mean and scale to unit variance.
# NOTE(review): scaling is applied before the train/test split, so the scaling
# statistics include the rows later used for testing -- confirm this is acceptable.
X = preprocessing.scale(X)

# validate that X and y data are the same length
print(len(X), len(y))

3019 3019


In [91]:
# Hold out 20% of the rows as test data: the model must be evaluated on rows it
# has not seen during training.
# train_test_split shuffles X and y together before splitting; a fixed
# random_state makes that shuffle -- and therefore the reported scores --
# reproducible from run to run.
# X_train / y_train are used to fit the models, X_test / y_test to score them.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [112]:
# Fit a linear regression model on the training split.
# n_jobs=-1 asks scikit-learn to use all available processors.
clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)  # train
# For a regressor, score() returns the coefficient of determination (R^2) of
# the predictions on the given data -- it is not a squared error, and not a
# classification accuracy despite the variable name.
accuracy = clf.score(X_test, y_test)  # evaluate on the held-out rows

print(forecast_out, accuracy)

31 0.9601162558034914


In [109]:
# Epsilon-Support Vector Regression (SVR) with a sigmoid kernel.
# gamma is the kernel coefficient for the 'rbf', 'poly' and 'sigmoid' kernels.
# gamma="auto" uses 1 / n_features; the alternative gamma="scale" additionally
# divides by a measure of the spread of X (see the documentation of the
# installed scikit-learn version for the exact formula).
# fit() returns the fitted estimator, so construction and training are chained.
clf = svm.SVR(kernel="sigmoid", gamma="auto").fit(X_train, y_train)

# score() on a regressor is the coefficient of determination (R^2), computed
# here on the held-out test rows.
accuracy = clf.score(X_test, y_test)

print(forecast_out, accuracy)

31 0.8508749910553254
