In [3]:
'''This is an application of some of the basic ML algorithms on financial markets. Helper function creates a dataframe
that stores prices of ETF on Dow Jones index along with several lagged values. The purpoe of the analysis is that
determining the direction of movement in a manner that significantly exceeds a 50% hit rate with the help of
classifiers to predict the direction of the closing price at day t based solely on price information known at day t − 1
Logistic Regression,Support Vector Machine(SVM),Linear/Quadratic Discriminant Analysers (LDA/QDA), Linear Support Vector Classifier(LSVC),
Random Forest Classifier,Radial Support Vector Machine (RSVM) have been considered'''

from __future__ import print_function

import datetime
import numpy as np
import pandas as pd
import sklearn
import warnings
warnings.filterwarnings('ignore') 


from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.lda import LDA
from sklearn.metrics import confusion_matrix
from sklearn.qda import QDA
from sklearn.svm import LinearSVC, SVC
import os
path='/Users/aybarsatalay/Desktop/Python'
os.chdir(path)




In [5]:
DIA = pd.read_csv('DIA.csv',index_col=['Date'],parse_dates=True)

DIA.tail()

def create_lagged_series(ts, start_date, end_date, lags=5):
    """
    This creates a pandas DataFrame that stores the 
    percentage returns of the adjusted closing value of 
    a stock obtained from Yahoo Finance, along with a 
    number of lagged returns from the prior trading days 
    (lags defaults to 5 days). Trading volume, as well as 
    the Direction from the previous day, are also included.
    """

    # Create the new lagged DataFrame
    tslag = pd.DataFrame(index=ts.index)
    tslag["Today"] = ts["Adj Close"]
    tslag["Volume"] = ts["Volume"]

    # Create the shifted lag series of prior trading period close values
    for i in range(0, lags):
        tslag["Lag%s" % str(i+1)] = ts["Adj Close"].shift(i+1)

    # Create the returns DataFrame
    tsret = pd.DataFrame(index=tslag.index)
    tsret["Volume"] = tslag["Volume"]
    tsret["Today"] = tslag["Today"].pct_change()*100.0

    # If any of the values of percentage returns equal zero, set them to
    # a small number (stops issues with QDA model in scikit-learn)
    for i,x in enumerate(tsret["Today"]):
        if (abs(x) < 0.0001):
            tsret["Today"][i] = 0.0001

    # Create the lagged percentage returns columns
    for i in range(0, lags):
        tsret["Lag%s" % str(i+1)] = \
        tslag["Lag%s" % str(i+1)].pct_change()*100.0

    # Create the "Direction" column (+1 or -1) indicating an up/down day
    tsret["Direction"] = np.sign(tsret["Today"])
    tsret = tsret[tsret.index >= start_date]

    return tsret

diaret = create_lagged_series(DIA, datetime.datetime(2001,1,10),datetime.datetime(2017,12,31), lags=5)



In [6]:
if __name__ == "__main__":
    # Create a lagged series of the S&P500 US stock market index
    diaret = create_lagged_series(DIA, datetime.datetime(2001,1,10),datetime.datetime(2016,12,31), lags=5)

    # Use the prior two days of returns as predictor 
    # values, with direction as the response
    X = diaret[["Lag1","Lag2"]]
    y = diaret["Direction"]

    # The test data is split into two parts: Before and after 1st Jan 2005.
    start_test = datetime.datetime(2010,1,1)

    # Create training and test sets
    X_train = X[X.index < start_test]
    X_test = X[X.index >= start_test]
    y_train = y[y.index < start_test]
    y_test = y[y.index >= start_test]
   
    # Create the (parametrised) models
    print("Hit Rates/Confusion Matrices:\n")
    models = [("LR", LogisticRegression()), 
              ("LDA", LDA()), 
              ("QDA", QDA()),
              ("LSVC", LinearSVC()),
              ("RSVM", SVC(
                C=1000000.0, cache_size=200, class_weight=None,
                coef0=0.0, degree=3, gamma=0.0001, kernel='rbf',
                max_iter=-1, probability=False, random_state=None,
                shrinking=True, tol=0.001, verbose=False)
              ),
              ("RF", RandomForestClassifier(
                n_estimators=1000, criterion='gini', 
                max_depth=None, min_samples_split=2, 
                min_samples_leaf=1, max_features='auto', 
                bootstrap=True, oob_score=False, n_jobs=1, 
                random_state=None, verbose=0)
              )]

    # Iterate through the models
    for m in models:
        
        # Train each of the models on the training set
        m[1].fit(X_train, y_train)

        # Make an array of predictions on the test set
        pred = m[1].predict(X_test)

        # Output the hit-rate and the confusion matrix for each model
        print("%s:\n%0.3f" % (m[0], m[1].score(X_test, y_test)))
        print("%s\n" % confusion_matrix(pred, y_test))

Hit Rates/Confusion Matrices:

LR:
0.550
[[  22   37]
 [ 863 1079]]

LDA:
0.552
[[  22   34]
 [ 863 1082]]

QDA:
0.551
[[  39   52]
 [ 846 1064]]

LSVC:
0.553
[[  21   30]
 [ 864 1086]]

RSVM:
0.556
[[  22   26]
 [ 863 1090]]

RF:
0.512
[[375 467]
 [510 649]]



In [None]:
'''Given that all of the hit rates lie between 50% and 60%. 
we can conclude that the lagged variables, when used solely, are not hugely indicative of future direction'''