## Predicting Stock Prices - K-nearest neighbour

### Import Key libraries

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime 
from sklearn.linear_model import LogisticRegression
import sklearn
import pandas_datareader.data as web
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

  return f(*args, **kwds)
  return f(*args, **kwds)


### Function - Create dataset

In [12]:
def create_dataset(stock_symbol, start_date, end_date, lags=5):
    # Fetch the stock data from Yahoo Finance
    df = web.DataReader(stock_symbol, "yahoo", start_date, end_date)
#     print(df.head())
    
    # create a new dataframe 
    # we want to use additional features: lagged returns... today's returns, yesterday's returns etc
    tslag = pd.DataFrame(index=df.index)
    tslag["Today"] = df["Adj Close"]
    tslag["Volume"] = df["Volume"]
#     print(tslag.head())
    
    # Create the shifted lag series of prior trading close values 
    for i in range(0, lags):
        tslag["Lags%s" %str(i+1)] = df["Adj Close"].shift(i+1)
    
#     print(tslag.head())
    
    # create the returns dataframe
    dfret = pd.DataFrame(index=tslag.index)
    dfret["Volume"] = tslag["Volume"]
    dfret["Today"] = tslag["Today"].pct_change()*100.0
#     print(dfret.head())
    
    # Create the lagged returns columns
    for i in range(0, lags):
        dfret["Lag%s" %str(i+1)] = tslag["Lags%s" %str(i+1)].pct_change()*100.0
#     print(dfret.head())
        
    # "Direction" column (+1 or -1) indicating an up/down day
    dfret["Direction"] = np.sign(dfret["Today"])
#     print(dfret.head())
    
    # Because of the shifts there are NaN values... we want to get rid of those NaNs
    dfret.drop(dfret.index[:6], inplace=True)
#     print(dfret.head())
    
    return dfret

In [13]:
create_dataset(stock_symbol = ['AAPL'], start_date=datetime(2012,1,1), end_date=datetime(2017,5,31), lags=5)

Unnamed: 0_level_0,Volume,Today,Lag1,Lag2,Lag3,Lag4,Lag5,Direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-01-11,53771200.0,-0.163055,0.358038,-0.158595,1.045351,1.110218,0.537424,-1.0
2012-01-12,53146800.0,-0.274494,-0.163055,0.358038,-0.158595,1.045351,1.110218,-1.0
2012-01-13,56505400.0,-0.374966,-0.274494,-0.163055,0.358038,-0.158595,1.045351,-1.0
2012-01-17,60724300.0,1.164815,-0.374966,-0.274494,-0.163055,0.358038,-0.158595,1.0
2012-01-18,69197800.0,1.038393,1.164815,-0.374966,-0.274494,-0.163055,0.358038,1.0
2012-01-19,65434600.0,-0.316925,1.038393,1.164815,-0.374966,-0.274494,-0.163055,-1.0
2012-01-20,103493600.0,-1.741697,-0.316925,1.038393,1.164815,-0.374966,-0.274494,-1.0
2012-01-23,76515600.0,1.691666,-1.741697,-0.316925,1.038393,1.164815,-0.374966,1.0
2012-01-24,136909500.0,-1.637779,1.691666,-1.741697,-0.316925,1.038393,1.164815,-1.0
2012-01-25,239578500.0,6.243908,-1.637779,1.691666,-1.741697,-0.316925,1.038393,1.0


### Run - knn model for stock price prediction

In [18]:
if __name__ == '__main__':
    # Create a lagged series of the S&P US stock market index
    data = create_dataset("AAPL", datetime(2012,1,1), datetime(2017,5,31), lags=5)
    
    # Use the prior days of returns as predictor
    # values, with direction as the response 
    X = data[['Lag1','Lag2','Lag3','Lag4']]
    y = data["Direction"]
    
    # The test data is split into two parts: before and after 1st Jan 2005
    start_test = datetime(2017, 1,1)
    
    # Create training and test sets
    X_train = X[X.index < start_test]
    X_test = X[X.index >= start_test]
    y_train = y[y.index < start_test]
    y_test = y[y.index >= start_test]
    
    # we use K as the machine learning model
    model = KNeighborsClassifier(300)
    
    #train the model on the training set
    model.fit(X_train, y_train)
    
    # Make an array of predictions on the test set
    pred = model.predict(X_test)
    
    # Output the hit-rate and the confusion matric for the model
    print("Accuracy of kNN model: %0.3f" % model.score(X_test,y_test))
    print("Confusion matric: \n%s" % confusion_matrix(pred, y_test))
    
    

Accuracy of kNN model: 0.621
Confusion matric: 
[[18 13]
 [26 46]]
