In [7]:
%matplotlib inline
import os
import inspect
import logging
from mysql import connector
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

class AQ:

    # Global constants
    WS_PATH = os.path.abspath("./") + os.sep
    DB_HOST = "127.0.0.1"
    DB_USR = "root"
    DB_PWD = "!QAZ2wsx#EDC"
    DB_NAME = "aq"
    
    def log(self, msg):
        print(msg)
        
class ML:
    def __init__(self, aq):
        self.aq = aq
    
    def get_data(self, code, type):
        mysql_connector = connector.connect(host=self.aq.DB_HOST, database=self.aq.DB_NAME,
                                            user=self.aq.DB_USR, password=self.aq.DB_PWD)
        data = pd.read_sql("""SELECT close, high-low as hl, close-open as oc FROM future_trade WHERE code='%s' AND type='%s' """ % (code, type), 
                              con=mysql_connector)
        data_1 = data.close.diff()
        data_1[0] = 0
        data_2 = data.hl
        data_3 = data.oc
        data = pd.DataFrame({"data_1":data_1, "data_2":data_2, "data_3":data_3})
        data = pd.DataFrame(preprocessing.normalize(data), columns=["data_1", "data_2", "data_3"])
        mysql_connector.close()
        return data
    
    def get_Y(self, data, lag_1, lag_2, lag_3):
        Y = data.data_1[max(lag_1, lag_2, lag_3):]
        Y.index = range(len(Y))
        Y = Y.apply(lambda x: x >= 0)
        return Y

    def get_X(self, data, lag_1, lag_2, lag_3):
        X = pd.DataFrame(columns=range(lag_1 + lag_2 + lag_3))
        for idx, row in data.iterrows():
            if idx >= lag_1 and idx >= lag_2 and idx >= lag_3:
                data_1 = data.iloc[(idx - lag_1):idx, 0]
                data_2 = data.iloc[(idx - lag_2):idx, 1]
                data_3 = data.iloc[(idx - lag_3):idx, 2]
                xrow = pd.concat([data_1, data_2, data_3], ignore_index=True)
                X = X.append(xrow, ignore_index=True)
        return X

    def cross_check(self, model, k_fold, X, Y):
        hit_rate_sum = 0
        stride = round(len(Y) / k_fold)
        for i in range(0, k_fold):
            start = i * stride
            stop = i * stride + stride - 1
            if (i == k_fold - 1):
                stop = len(Y)            
            X_test = X[start:stop]
            Y_test = Y[start:stop]
            if (i == 0):
                X_train = X[stop:]
                Y_train = Y[stop:]
            else:
                X_train = pd.concat([X[0:start], X[stop:]])
                Y_train = pd.concat([Y[0:start], Y[stop:]])
            model.fit(X_train, Y_train)
            hit_rate = np.sum(model.predict(X_test) == Y_test) / (stop-start)
            hit_rate_sum += hit_rate
            #self.aq.log("  k_fold=%s, %.2f%s, start=%d, stop=%d, test_len=%d, train_len=%d" %
            #               (i+1, hit_rate*100 , "%", start, stop, len(Y_test), len(Y_train)))
        return hit_rate_sum/k_fold

    def ml(self, code, type, lags, k_fold):
        self.aq.log("Code=%s, Type=%s, lags=%s, k_fold=%s" % (code,type,lags,k_fold))
        
        mysql_connector = connector.connect(host=self.aq.DB_HOST, database=self.aq.DB_NAME,
                                            user=self.aq.DB_USR, password=self.aq.DB_PWD)
        
        data = self.get_data(code, type)

        lag_1 = lags[0]
        lag_2 = lags[1]
        lag_3 = lags[2]
        
        X = self.get_X(data, lag_1, lag_2, lag_3)
        Y = self.get_Y(data, lag_1, lag_2, lag_3)
        
        
        model = ExtraTreesClassifier()
        model.fit(X, Y)
        self.aq.log(model.feature_importances_)
        self.aq.log("")
        
        self.aq.log("Logistic Regression")
        model = LogisticRegression()
        hit_rate = self.cross_check(model, k_fold, X, Y)
        self.aq.log("Average Hit Rate = %g%s" % (hit_rate*100, "%"))
        self.aq.log(" ")
        
        self.aq.log("Naive Bayes")
        model = GaussianNB()
        hit_rate = self.cross_check(model, k_fold, X, Y)
        self.aq.log("Average Hit Rate = %g%s" % (hit_rate * 100, "%"))
        self.aq.log("")
        
        self.aq.log("K Neighbors")
        model = KNeighborsClassifier()
        hit_rate = self.cross_check(model, k_fold, X, Y)
        self.aq.log("Average Hit Rate = %g%s" % (hit_rate * 100, "%"))
        self.aq.log("")
        
        self.aq.log("Decision Tree")
        model = DecisionTreeClassifier()
        hit_rate = self.cross_check(model, k_fold, X, Y)
        self.aq.log("Average Hit Rate = %g%s" % (hit_rate * 100, "%"))
        self.aq.log("")
        
        self.aq.log("Support Vector Machine")
        model = SVC()
        hit_rate = self.cross_check(model, k_fold, X, Y)
        self.aq.log("Average Hit Rate = %g%s" % (hit_rate * 100, "%"))
        self.aq.log("")
        
        mysql_connector.close()

In [8]:
ml = ML(AQ())
ml.ml("I", "d", [1, 1, 1], 10)
ml.ml("I", "d", [5, 5, 5], 10)
ml.ml("I", "d", [10, 10, 10], 10)
ml.ml("I", "d", [20, 20, 20], 10)
ml.ml("I", "d", [50, 50, 50], 10)
ml.ml("I", "d", [100, 100, 100], 10)

Code=I, Type=d, lags=[1, 1, 1], k_fold=10
[ 0.32783942  0.3368371   0.33532348]

Logistic Regression
Average Hit Rate = 52.2833%
 
Naive Bayes
Average Hit Rate = 51.9556%

K Neighbors
Average Hit Rate = 48.876%

Decision Tree
Average Hit Rate = 51.1593%

Support Vector Machine
Average Hit Rate = 50.0403%

Code=I, Type=d, lags=[5, 5, 5], k_fold=10
[ 0.06025486  0.06645201  0.06586262  0.07436459  0.06250744  0.06256207
  0.06641691  0.06521054  0.06527944  0.06575098  0.06452364  0.07180528
  0.07221069  0.06749643  0.0693025 ]

Logistic Regression
Average Hit Rate = 53.5323%
 
Naive Bayes
Average Hit Rate = 52.2688%

K Neighbors
Average Hit Rate = 51.4516%

Decision Tree
Average Hit Rate = 51.4624%

Support Vector Machine
Average Hit Rate = 54.3602%

Code=I, Type=d, lags=[10, 10, 10], k_fold=10
[ 0.03764778  0.03937292  0.03034663  0.034795    0.02369454  0.03600655
  0.03493189  0.0379902   0.0392281   0.0270575   0.02834396  0.03574585
  0.04245804  0.03221459  0.03404298  0.03152246