## Predicting Stock Prices - K-nearest neighbour

### Import Key libraries

In [6]:
import numpy as np
import pandas as pd
from datetime import datetime 
from sklearn.linear_model import LogisticRegression
import sklearn
import pandas_datareader.data as web
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import confusion_matrix

### Function - Create dataset

In [7]:
def create_dataset(stock_symbol, start_date, end_date, lags=5):
    # Fetch the stock data from Yahoo Finance
    df = web.DataReader(stock_symbol, "yahoo", start_date, end_date)
#     print(df.head())
    
    # create a new dataframe 
    # we want to use additional features: lagged returns... today's returns, yesterday's returns etc
    tslag = pd.DataFrame(index=df.index)
    tslag["Today"] = df["Adj Close"]
    tslag["Volume"] = df["Volume"]
#     print(tslag.head())
    
    # Create the shifted lag series of prior trading close values 
    for i in range(0, lags):
        tslag["Lags%s" %str(i+1)] = df["Adj Close"].shift(i+1)
    
#     print(tslag.head())
    
    # create the returns dataframe
    dfret = pd.DataFrame(index=tslag.index)
    dfret["Volume"] = tslag["Volume"]
    dfret["Today"] = tslag["Today"].pct_change()*100.0
#     print(dfret.head())
    
    # Create the lagged returns columns
    for i in range(0, lags):
        dfret["Lag%s" %str(i+1)] = tslag["Lags%s" %str(i+1)].pct_change()*100.0
#     print(dfret.head())
        
    # "Direction" column (+1 or -1) indicating an up/down day
    dfret["Direction"] = np.sign(dfret["Today"])
#     print(dfret.head())
    
    # Because of the shifts there are NaN values... we want to get rid of those NaNs
    dfret.drop(dfret.index[:6], inplace=True)
#     print(dfret.head())
    
    return dfret

In [8]:
# create_dataset(stock_symbol = ['AAPL'], start_date=datetime(2012,1,1), end_date=datetime(2017,5,31), lags=5)

### Run - knn model for stock price prediction

In [10]:
if __name__ == '__main__':
    # Create a lagged series of the S&P US stock market index
    data = create_dataset("AAPL", datetime(2012,1,1), datetime(2017,5,31), lags=5)
    
    # Use the prior days of returns as predictor
    # values, with direction as the response 
    X = data[['Lag1','Lag2','Lag3','Lag4']]
    y = data["Direction"]
    
    # The test data is split into two parts: before and after 1st Jan 2005
    start_test = datetime(2017, 1,1)
    
    # Create training and test sets
    X_train = X[X.index < start_test]
    X_test = X[X.index >= start_test]
    y_train = y[y.index < start_test]
    y_test = y[y.index >= start_test]
    
    # we use K as the machine learning model
#     model = KNeighborsClassifier(300)
#     model = LinearSVC()
    model = SVC(C=1000000.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0001, kernel='rbf', 
                max_iter=1, probability=False)
    
    #train the model on the training set
    model.fit(X_train, y_train)
    
    # Make an array of predictions on the test set
    pred = model.predict(X_test)
    
    # Output the hit-rate and the confusion matric for the model
    print("Accuracy of model: %0.3f" % model.score(X_test,y_test))
    print("Confusion matric: \n%s" % confusion_matrix(pred, y_test))
    
    

Accuracy of model: 0.466
Confusion matric: 
[[31  0 40]
 [ 1  0  2]
 [12  0 17]]


