<h1>Data</h1>

In [9]:
%matplotlib inline
from mysql import connector
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

class ML:
    
    def get_y(self, data, rtn_lag, vol_lag):
        y = data.chg[max(rtn_lag, vol_lag):]
        y.index = range(len(y))
        y = y.apply(lambda x: x > 0)
        return y

    def get_X(self, data, rtn_lag, vol_lag):
        X = pd.DataFrame(columns=range(rtn_lag + vol_lag))
        for idx, row in data.iterrows():
            if idx >= rtn_lag and idx >= vol_lag:
                vol = data.iloc[(idx - vol_lag):idx, 1]
                rtn = data.iloc[(idx - rtn_lag):idx, 2]
                xrow = pd.concat([rtn, vol], ignore_index=True)
                X = X.append(xrow, ignore_index=True)
        return X

    def cross_check(self, model, k_fold, X, y):
        hit_rate_sum = 0
        stride = round(len(y) / k_fold)
        for i in range(0, k_fold):
            start = i * stride
            stop = i * stride + stride - 1
            if (i == k_fold - 1):
                stop = len(y)
            X_test = X[start:stop]
            y_test = y[start:stop]
            X_train = pd.concat([X[0:start - 1], X[stop:]])
            y_train = pd.concat([y[0:start - 1], y[stop:]])
            model.fit(X_train, y_train)
            hit_rate = np.sum(model.predict(X_test) == y_test) / (stop-start)
            hit_rate_sum += hit_rate
            print("  k_fold=%s, %.2f%s" % (i+1, hit_rate*100 , "%"))
        return hit_rate_sum/k_fold

    def ml(self, code, type):
        mysql_connector = connector.connect(host="127.0.0.1", database="aq", user="root", password="!QAZ2wsx#EDC")

        data = pd.read_sql("""SELECT close, volume FROM future_trade WHERE code='%s' AND type='%s' """ % (code, type), 
                              con=mysql_connector)
        data["chg"] = data.close.diff()
        data.iloc[0, 2] = 0

        rtn_lag = 10
        vol_lag = 10
        k_fold = 20

        print("################################################################################")

        model = LogisticRegression()
        print("Code = %s" % code)
        print("Logistic Regression")
        hit_rate = self.cross_check(model, k_fold, self.get_X(data, rtn_lag, vol_lag), self.get_y(data, rtn_lag, vol_lag))
        print("Average Hit Rate = %g%s" % (hit_rate*100, "%"))

        print("################################################################################")

        model = GaussianNB()
        print("Code = %s" % code)
        print("Naive Bayes")
        hit_rate = self.cross_check(model, k_fold, self.get_X(data, rtn_lag, vol_lag), self.get_y(data, rtn_lag, vol_lag))
        print("Average Hit Rate = %g%s" % (hit_rate * 100, "%"))

        print("################################################################################")

        model = KNeighborsClassifier()
        print("Code = %s" % code)
        print("K Neighbors")
        hit_rate = self.cross_check(model, k_fold, self.get_X(data, rtn_lag, vol_lag), self.get_y(data, rtn_lag, vol_lag))
        print("Average Hit Rate = %g%s" % (hit_rate * 100, "%"))

        print("################################################################################")

        model = DecisionTreeClassifier()
        print("Code = %s" % code)
        print("Decision Tree")
        hit_rate = self.cross_check(model, k_fold, self.get_X(data, rtn_lag, vol_lag), self.get_y(data, rtn_lag, vol_lag))
        print("Average Hit Rate = %g%s" % (hit_rate * 100, "%"))

        print("################################################################################")

        model = SVC()
        print("Code = %s" % code)
        print("Support Vector Machine")
        hit_rate = self.cross_check(model, k_fold, self.get_X(data, rtn_lag, vol_lag), self.get_y(data, rtn_lag, vol_lag))
        print("Average Hit Rate = %g%s" % (hit_rate * 100, "%"))

        print("################################################################################")                

        mysql_connector.close()

In [10]:
ml = ML()
ml.ml("P", "d")

################################################################################
Code = P
Logistic Regression
  k_fold=1, 51.72%
  k_fold=2, 65.52%
  k_fold=3, 44.83%
  k_fold=4, 58.62%
  k_fold=5, 37.93%
  k_fold=6, 41.38%
  k_fold=7, 48.28%
  k_fold=8, 48.28%
  k_fold=9, 34.48%
  k_fold=10, 62.07%
  k_fold=11, 44.83%
  k_fold=12, 65.52%
  k_fold=13, 44.83%
  k_fold=14, 44.83%
  k_fold=15, 68.97%
  k_fold=16, 48.28%
  k_fold=17, 51.72%
  k_fold=18, 41.38%
  k_fold=19, 51.72%
  k_fold=20, 47.83%
Average Hit Rate = 50.1499%
################################################################################
Code = P
Naive Bayes
  k_fold=1, 62.07%
  k_fold=2, 55.17%
  k_fold=3, 55.17%
  k_fold=4, 51.72%
  k_fold=5, 44.83%
  k_fold=6, 62.07%
  k_fold=7, 48.28%
  k_fold=8, 55.17%
  k_fold=9, 24.14%
  k_fold=10, 55.17%
  k_fold=11, 51.72%
  k_fold=12, 58.62%
  k_fold=13, 41.38%
  k_fold=14, 62.07%
  k_fold=15, 58.62%
  k_fold=16, 65.52%
  k_fold=17, 44.83%
  k_fold=18, 41.38%
  k_fold=19, 48.28