In [1]:
import pandas as pd 
import quandl
import math
import numpy as np
from sklearn import preprocessing, cross_validation, svm
from sklearn.linear_model import LinearRegression



In [2]:
df = quandl.get('WIKI/GOOGL') 

In [3]:
print(df.head())

              Open    High     Low    Close      Volume  Ex-Dividend  \
Date                                                                   
2004-08-19  100.01  104.06   95.96  100.335  44659000.0          0.0   
2004-08-20  101.01  109.08  100.50  108.310  22834300.0          0.0   
2004-08-23  110.76  113.48  109.05  109.400  18256100.0          0.0   
2004-08-24  111.24  111.60  103.57  104.870  15247300.0          0.0   
2004-08-25  104.76  108.00  103.88  106.000   9188600.0          0.0   

            Split Ratio  Adj. Open  Adj. High   Adj. Low  Adj. Close  \
Date                                                                   
2004-08-19          1.0  50.159839  52.191109  48.128568   50.322842   
2004-08-20          1.0  50.661387  54.708881  50.405597   54.322689   
2004-08-23          1.0  55.551482  56.915693  54.693835   54.869377   
2004-08-24          1.0  55.792225  55.972783  51.945350   52.597363   
2004-08-25          1.0  52.542193  54.167209  52.100830   53.1

In [4]:
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']]
print(df.head())

            Adj. Open  Adj. High   Adj. Low  Adj. Close  Adj. Volume
Date                                                                
2004-08-19  50.159839  52.191109  48.128568   50.322842   44659000.0
2004-08-20  50.661387  54.708881  50.405597   54.322689   22834300.0
2004-08-23  55.551482  56.915693  54.693835   54.869377   18256100.0
2004-08-24  55.792225  55.972783  51.945350   52.597363   15247300.0
2004-08-25  52.542193  54.167209  52.100830   53.164113    9188600.0


In [5]:
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Close']) / df['Adj. Close'] * 100.0
print(df.head())

            Adj. Open  Adj. High   Adj. Low  Adj. Close  Adj. Volume    HL_PCT
Date                                                                          
2004-08-19  50.159839  52.191109  48.128568   50.322842   44659000.0  3.712563
2004-08-20  50.661387  54.708881  50.405597   54.322689   22834300.0  0.710922
2004-08-23  55.551482  56.915693  54.693835   54.869377   18256100.0  3.729433
2004-08-24  55.792225  55.972783  51.945350   52.597363   15247300.0  6.417469
2004-08-25  52.542193  54.167209  52.100830   53.164113    9188600.0  1.886792


In [6]:
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Close'] * 100.0
print(df.head())

            Adj. Open  Adj. High   Adj. Low  Adj. Close  Adj. Volume  \
Date                                                                   
2004-08-19  50.159839  52.191109  48.128568   50.322842   44659000.0   
2004-08-20  50.661387  54.708881  50.405597   54.322689   22834300.0   
2004-08-23  55.551482  56.915693  54.693835   54.869377   18256100.0   
2004-08-24  55.792225  55.972783  51.945350   52.597363   15247300.0   
2004-08-25  52.542193  54.167209  52.100830   53.164113    9188600.0   

              HL_PCT  PCT_change  
Date                              
2004-08-19  3.712563    0.323915  
2004-08-20  0.710922    6.739913  
2004-08-23  3.729433   -1.243144  
2004-08-24  6.417469   -6.074187  
2004-08-25  1.886792    1.169811  


In [7]:
df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]
print(df.head())

            Adj. Close    HL_PCT  PCT_change  Adj. Volume
Date                                                     
2004-08-19   50.322842  3.712563    0.323915   44659000.0
2004-08-20   54.322689  0.710922    6.739913   22834300.0
2004-08-23   54.869377  3.729433   -1.243144   18256100.0
2004-08-24   52.597363  6.417469   -6.074187   15247300.0
2004-08-25   53.164113  1.886792    1.169811    9188600.0


In [8]:
forecast_col = 'Adj. Close'
df.fillna(-99999, inplace=True)

In [9]:
forecast_out = int(math.ceil(0.01*len(df)))
print(forecast_out)
len(df)

35


3424

In [10]:
df['label'] = df[forecast_col].shift(-forecast_out)
df.dropna(inplace=True)
print(df.head())
print(df.tail())

            Adj. Close    HL_PCT  PCT_change  Adj. Volume      label
Date                                                                
2004-08-19   50.322842  3.712563    0.323915   44659000.0  69.078238
2004-08-20   54.322689  0.710922    6.739913   22834300.0  67.839414
2004-08-23   54.869377  3.729433   -1.243144   18256100.0  68.912727
2004-08-24   52.597363  6.417469   -6.074187   15247300.0  70.668146
2004-08-25   53.164113  1.886792    1.169811    9188600.0  71.219849
            Adj. Close    HL_PCT  PCT_change  Adj. Volume    label
Date                                                              
2018-01-30     1177.37  0.896914   -0.029727    1792602.0  1094.00
2018-01-31     1182.22  0.346805   -0.134493    1643877.0  1053.15
2018-02-01     1181.59  0.495942    0.473938    2774967.0  1026.55
2018-02-02     1119.20  1.081129   -0.734453    5798880.0  1054.09
2018-02-05     1068.76  4.325574   -2.980089    3742469.0  1006.94


In [11]:
X = np.array(df.drop(['label'], 1))
y = np.array(df['label'])
print("x")
print(X)
print("y")
print(y)

x
[[ 5.03228418e+01  3.71256291e+00  3.23914885e-01  4.46590000e+07]
 [ 5.43226889e+01  7.10922353e-01  6.73991321e+00  2.28343000e+07]
 [ 5.48693765e+01  3.72943327e+00 -1.24314442e+00  1.82561000e+07]
 ...
 [ 1.18159000e+03  4.95941909e-01  4.73937660e-01  2.77496700e+06]
 [ 1.11920000e+03  1.08112938e+00 -7.34453181e-01  5.79888000e+06]
 [ 1.06876000e+03  4.32557356e+00 -2.98008908e+00  3.74246900e+06]]
y
[  69.0782379    67.83941377   68.91272699 ... 1026.55       1054.09
 1006.94      ]


In [13]:
X = preprocessing.scale(X)
print(X)

[[-1.41006544  2.35025798  0.2449376   4.44790502]
 [-1.39402696 -0.30866295  4.50625488  1.80881376]
 [-1.39183486  2.36520212 -0.79585708  1.25520764]
 ...
 [ 3.12606095 -0.49909747  0.34457831 -0.61680539]
 [ 2.87589111  0.01927479 -0.45799933 -0.25114711]
 [ 2.67363808  2.89327664 -1.94948461 -0.4998129 ]]


In [14]:
#x = x[:-forecast_out+1]

In [15]:
y = np.array(df['label'])
print(len(X),len(y))

3389 3389


In [16]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)

In [17]:
#linear regression
clf = LinearRegression()
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy)

0.9769566976026005


In [18]:
#suport vector machine
clf = svm.SVR()
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy)

0.7670628809373439
