In [1]:
import pandas as pd
import quandl, math, datetime
import numpy as np
from sklearn import preprocessing, model_selection, svm
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from matplotlib import style
import pickle

In [None]:
# Download the GOOGL table from Quandl's WIKI dataset once.
df = quandl.get('WIKI/GOOGL')
# Keep an independent second frame under the existing name without issuing a
# second, identical network request.
df2 = df.copy()

In [None]:
df.head()

In machine learning the columns are called 'features'. We want meaningful features, especially for linear regression. Linear regression is a simple model that cannot discover relationships between raw columns on its own, so we have to encode those relationships as features ourselves; the model then bases its predictions on them.

In [None]:
# Percent gap between the adjusted high and the adjusted close, relative to the close.
# NOTE(review): the name suggests a high-low spread, but the formula uses
# 'Adj. Close' rather than 'Adj. Low' — confirm this is intended.
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Close']) / df['Adj. Close'] * 100.0
# Percent move from the adjusted open to the adjusted close, relative to the open.
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0

In [None]:
# Keep only the close price, the two derived percentage columns and the volume.
df = df.loc[:, ['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]

In [None]:
df.head()

Features are attributes that make up the label and the label is some sort of prediction into the future.

In [None]:
df.isna().sum()

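We do not want missing values. Dropping the affected rows would throw data away, so one common approach is to replace each missing value with an extreme sentinel (here -9999) that the model can treat as an outlier.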

In [None]:
# Replace every missing value with the sentinel -9999 so the affected rows are kept.
df = df.fillna(-9999)

In [None]:
forecast_col = 'Adj. Close' # column that we are making predictions on

In [None]:
# Forecast horizon: 1% of the number of rows, rounded up (about 35 rows for this dataset).
forecast_out = math.ceil(len(df) * 0.01)

In [None]:
df.tail(50)

In [None]:
# Label for each row = the value of forecast_col found forecast_out rows later
# (a negative shift pulls future values up). The final forecast_out rows have no
# future value to pull from, so their label is NaN.
df['label'] = df[forecast_col].shift(-forecast_out)

In [None]:
df.tail(50)

In [None]:
df.isna().sum()

The label is our end goal: it is the value we want the model to predict.

In [None]:
df.isna().sum()

Features are 'X' and labels are 'y'

In [None]:
# Feature matrix: every column except 'label', converted to a NumPy array.
X = df.drop(columns=['label']).to_numpy()
print(X)

In [None]:
# Standardize the features: subtract the mean first, then divide by the standard deviation.
X = preprocessing.scale(X)
# Show the row just before the final forecast_out rows. The offset is derived from
# forecast_out instead of being hard-coded, so it stays correct if the dataset
# length (and therefore the horizon) changes.
print(X[-(forecast_out + 1)])

In [None]:
# Set aside the final forecast_out feature rows; these are the inputs passed to predict() later.
X_lately = X[-forecast_out:] # last forecast_out rows: the rows we are going to predict for
print(X_lately)

In [None]:
X = X[:-forecast_out] # everything except the last forecast_out rows
print(X)

In [None]:
# Discard rows that still contain NaN. Missing feature values were filled earlier,
# so these should only be the trailing rows whose shifted label is undefined.
df = df.dropna()

In [None]:
# Target vector taken from the 'label' column.
y = df['label'].to_numpy()
print(y)

In [None]:
df.describe()

In [None]:
X

In [None]:
len(X), len(y) # make sure they are equal len

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the shuffle,
# and therefore every score computed from this split, reproducible across kernel restarts.
# NOTE(review): the rows are shuffled before splitting, so for time-ordered data the
# test rows are interleaved with the training period — confirm that is acceptable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Despite the variable name, this is a regression model: it is fitted to predict
# the numeric 'label' values, not discrete classes.
classifier = LinearRegression()
classifier.fit(X_train, y_train) # fit for training data

In [None]:
# Persist the fitted model to disk so it can be reloaded later instead of retrained.
with open('linearregression.pickle', 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [None]:
# Evaluate the fitted model on the held-out rows.
# NOTE(review): scikit-learn documents score() on regressors as the R^2 coefficient
# of determination, not a classification accuracy — confirm the name is acceptable.
accuracy = classifier.score(X_test, y_test) # score for test data

Why do we want to train and test on separate data? If we tested the model on the same data it was trained on, it would already have seen the answers, so the score would tell us nothing about how well it handles new data.

In [None]:
print(f'{accuracy * 100:.2f}%')

In [None]:
print(forecast_out)

What if we want to use a different algorithm?

In [None]:
# Repeat the same fit/score steps with svm.SVR() (constructed with its default
# arguments) on the same train/test split, for comparison with the linear model.
classifier_svm = svm.SVR()
classifier_svm.fit(X_train, y_train) # fit for training data
accuracy_svm = classifier_svm.score(X_test, y_test) # score for test data

In [None]:
print(f'{accuracy_svm * 100:.2f}%')

Some algorithms, linear regression for example, can use several CPU threads. The n_jobs parameter controls this. It mostly affects the training step and can speed it up considerably. n_jobs=-1 runs as many threads as our processor can handle.

In [None]:
forecast_set = classifier.predict(X_lately) # we can pass single value or an array

In [None]:
forecast_set

In [None]:
style.use('ggplot')

In [None]:
df['Forecast'] = np.nan

In [None]:
# Index label of the last row; the frame is indexed by date, so this is the last date present.
last_date = df.index[-1]

In [None]:
last_unix = last_date.timestamp() # in unix-style time

In [None]:
one_day = 86400

In [None]:
next_unix = last_unix + one_day

In [None]:
# Append one row per predicted value, each dated one day (one_day seconds) after the previous one.
# next_unix is derived from last_date.timestamp(); assuming the index holds naive pandas
# Timestamps, pandas computes that value as if the date were UTC. datetime.fromtimestamp()
# would convert it back using the machine's local timezone and shift every forecast date by
# the UTC offset, so the epoch seconds are read back as UTC with pd.Timestamp(..., unit='s').
# NOTE(review): stepping by whole calendar days also produces weekend dates — confirm that
# is acceptable for the plot.
for predicted_value in forecast_set:
    next_date = pd.Timestamp(next_unix, unit='s')
    next_unix += one_day
    # Fill only the last column with the prediction (assumes 'Forecast' is the last
    # column); every other column of the new row is NaN.
    df.loc[next_date] = [np.nan] * (len(df.columns) - 1) + [predicted_value]

In [None]:
df[['Adj. Close', 'Forecast']].plot(figsize=(12, 8))

**Pickling and Scaling**

Pickle is Python's serialization of objects such as a dict or a trained model. We use it to save the trained model to disk so that we can skip the training step next time.

In [None]:
# Open the saved model file for binary reading. The handle is not closed here,
# so the code that consumes it is responsible for closing it.
pickle_in = open('linearregression.pickle','rb')

In [None]:
# Load the saved model and close the already-open file handle as soon as loading
# finishes, so it does not stay open for the rest of the kernel session.
# NOTE: pickle.load can execute arbitrary code from the file — only load pickles
# that you created yourself.
with pickle_in:
    clf = pickle.load(pickle_in)

In [None]:
clf.score(X_test, y_test) # still works

**Writing linear regression algorithm**

Linear regression needs a relationship between the x and y values. If there is no clear relationship between them, it is hard to find a best-fit line, so linear regression will not be very useful. For example, if the y values get higher as the x values get higher from one data point to the next, there is clearly a relationship between them.

y = mx + b. We usually have the x values, so we can plug them right in, but we do not have the m or b values. What are they?

m = slope of the line, b = y intercept                           

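$$m = \frac{\operatorname{mean}(x)\,\operatorname{mean}(y) - \operatorname{mean}(xy)}{\operatorname{mean}(x)^2 - \operatorname{mean}(x^2)}$$

$$b = \operatorname{mean}(y) - m\,\operatorname{mean}(x)$$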

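$$y = \underbrace{\frac{\operatorname{mean}(x)\,\operatorname{mean}(y) - \operatorname{mean}(xy)}{\operatorname{mean}(x)^2 - \operatorname{mean}(x^2)}}_{m}\,x + \underbrace{\operatorname{mean}(y) - m\,\operatorname{mean}(x)}_{b}$$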

It works on 2d data.