In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load the raw daily price tables; both CSVs are expected in the working directory.
train_raw, test_raw = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Print the raw training frame as a quick sanity check of the columns.
print(train_raw)



             Date  Open Price  Close Price  High Price  Low Price      Volume
0     02-Jan-2009      902.99       931.80      934.73     899.35  4048270080
1     05-Jan-2009      929.17       927.45      936.63     919.53  5413910016
2     06-Jan-2009      931.17       934.70      943.85     927.28  5392620032
3     07-Jan-2009      927.45       906.65      927.45     902.37  4704940032
4     08-Jan-2009      905.73       909.73      910.00     896.81  4991549952
...           ...         ...          ...         ...        ...         ...
2259  22-Dec-2017     2684.22      2683.34     2685.35    2678.13  1383888512
2260  26-Dec-2017     2679.09      2680.50     2682.74    2677.96  1103808384
2261  27-Dec-2017     2682.10      2682.62     2685.64    2678.91  1149108352
2262  28-Dec-2017     2686.10      2687.54     2687.66    2682.69  1126089856
2263  29-Dec-2017     2689.15      2673.61     2692.12    2673.61  1332374016

[2264 rows x 6 columns]


## Preprocess

function preProcess  
> input: Raw dataset, pandas dataframe  
> output: pandas dataframe, 加上了新的欄位  

##### add columns:

|Column|Description|Values|
|------|-----------|------|
|day_grow|今日收盤價是否不低於今日開盤價|1: 上升或不動, 0: 下降|
|day_diff|今日收盤價-今日開盤價|浮點數, 可能為負|
|diff_high|今日最高價-今日開盤價|浮點數, 應該非負|
|diff_low|今日開盤價-今日最低價|浮點數, 應該非負|
|high_low|今日最高價-今日最低價|浮點數, 應該非負|
|Open_diff|今日開盤價-前日開盤價, 首日為0|浮點數, 可能為負|
|Volume_M|交易量, 百萬單位|浮點數, 應該非負|


### 讀取train set與test set, 並完成preprocess

In [3]:
def preProcess(inset):
    """Return a copy of ``inset`` with derived price/volume feature columns.

    The input frame is not modified. It must provide the columns
    'Open Price', 'Close Price', 'High Price', 'Low Price' and 'Volume'.

    Added columns (in this order):
        day_grow  -- 1 if Close Price >= Open Price, else 0
        day_diff  -- Close Price - Open Price
        diff_high -- High Price - Open Price
        diff_low  -- Open Price - Low Price
        high_low  -- High Price - Low Price
        Open_diff -- Open Price minus the previous row's Open Price; 0 for the first row
        Volume_M  -- Volume divided by 1,000,000

    Parameters:
        inset: pandas DataFrame holding the raw price rows.

    Returns:
        A new pandas DataFrame with the original columns plus the seven above.
    """
    outset = inset.copy()
    open_price = outset['Open Price']
    close_price = outset['Close Price']
    high_price = outset['High Price']
    low_price = outset['Low Price']

    # Vectorised comparison instead of a Python-level map over every row.
    outset['day_grow'] = (close_price >= open_price).astype('int64')
    outset['day_diff'] = close_price - open_price
    outset['diff_high'] = high_price - open_price
    outset['diff_low'] = open_price - low_price
    outset['high_low'] = high_price - low_price

    # Row-to-row difference taken by position, not by index label, so frames
    # whose index does not start at 0 (or that are empty) are handled without
    # appending a spurious row labelled 0.
    open_diff = open_price.diff()
    if len(open_diff) > 0:
        # The first row has no predecessor; define its difference as 0.
        open_diff.iloc[0] = 0.0
    outset['Open_diff'] = open_diff

    outset['Volume_M'] = outset['Volume'] / 1000000
    return outset

# Run the same feature engineering over the training and the test tables.
train, testset = (preProcess(raw_frame) for raw_frame in (train_raw, test_raw))

# Print the processed training frame to inspect the added columns.
print(train)

             Date  Open Price  Close Price  High Price  Low Price      Volume  \
0     02-Jan-2009      902.99       931.80      934.73     899.35  4048270080   
1     05-Jan-2009      929.17       927.45      936.63     919.53  5413910016   
2     06-Jan-2009      931.17       934.70      943.85     927.28  5392620032   
3     07-Jan-2009      927.45       906.65      927.45     902.37  4704940032   
4     08-Jan-2009      905.73       909.73      910.00     896.81  4991549952   
...           ...         ...          ...         ...        ...         ...   
2259  22-Dec-2017     2684.22      2683.34     2685.35    2678.13  1383888512   
2260  26-Dec-2017     2679.09      2680.50     2682.74    2677.96  1103808384   
2261  27-Dec-2017     2682.10      2682.62     2685.64    2678.91  1149108352   
2262  28-Dec-2017     2686.10      2687.54     2687.66    2682.69  1126089856   
2263  29-Dec-2017     2689.15      2673.61     2692.12    2673.61  1332374016   

      day_grow  day_diff  d

#### 決定model使用的input  columns
> 每個model都有他們的use_cols  
>> use_cols1, use_cols2, ...      
>  
> use_cols_global當作default值  

In [4]:
# Default input columns; a model cell uses these unless it assigns its own list.
use_cols_global = [
    'diff_high',
    'diff_low',
    'Open_diff',
]

## Logistic Regression

一開始用日期以外的所有raw data輸入，準確度大約0.5，所有的g(x)都是1，吐血  
改用diff_high跟diff_low到了82.9%  
但是加入了Open_diff之後準確度下降了所以拿掉

In [5]:
# Logistic regression on the two intraday range features.
# Feature sets kept for reference (not used here):
#   use_cols_global
#   ['Open Price', 'High Price', 'Low Price', 'Volume']
#   ['diff_high', 'diff_low', 'high_low']
use_cols1 = ['diff_high', 'diff_low']

model1 = LogisticRegression(random_state=9487, max_iter=800)
model1 = model1.fit(train.loc[:, use_cols1], train.loc[:, 'day_grow'])
test_ans1 = model1.predict(testset.loc[:, use_cols1])

# Accuracy = fraction of test rows whose predicted label equals day_grow.
test_acc1 = (test_ans1 == testset.loc[:, 'day_grow']).mean()
print(test_acc1)

0.8293650793650794


## Neural Network Classifier

model2:  
> hidden layers: 10, 8, 4, 2  
> input columns: default: diff_high, diff_low, Open_diff  

把learning rate從default的0.001提高到0.003之後準確度上升了一點  
現在是83.33%

In [6]:
# Features: the notebook-wide default set.
# Alternative kept for reference: ['Open Price', 'diff_high', 'diff_low']
use_cols2 = use_cols_global

# MLP classifier with hidden layers (10, 8, 4, 2).
# An earlier variant used learning_rate_init=0.001 with otherwise identical settings.
model2 = MLPClassifier(
    hidden_layer_sizes=(10, 8, 4, 2),
    batch_size=30,
    learning_rate_init=0.003,
    max_iter=200,
    shuffle=True,
    random_state=9487,
)
model2 = model2.fit(train[use_cols2], train['day_grow'])
test_ans2 = model2.predict(testset[use_cols2])

# Accuracy = number of matching labels divided by the number of test rows.
test_acc2 = (test_ans2 == testset['day_grow']).sum() / len(testset)
print(test_acc2)

0.8333333333333334


model2_1:  
  
hidden layer: (10,8,4,2) --> (6,6,6,6,6,6,6)  
accuracy: no change :(

In [7]:
# Features: the notebook-wide default set.
# Alternative kept for reference: ['Open Price', 'diff_high', 'diff_low', 'Volume']
use_cols2_1 = use_cols_global

# Deeper MLP classifier: seven hidden layers of six units each.
# An earlier variant used hidden_layer_sizes=(8, 4, 4, 4, 3, 2).
model2_1 = MLPClassifier(
    hidden_layer_sizes=(6, 6, 6, 6, 6, 6, 6),
    batch_size=30,
    learning_rate_init=0.003,
    max_iter=200,
    shuffle=True,
    random_state=9487,
)
model2_1 = model2_1.fit(train[use_cols2_1], train['day_grow'])
test_ans2_1 = model2_1.predict(testset[use_cols2_1])

# Accuracy = number of matching labels divided by the number of test rows.
test_acc2_1 = (test_ans2_1 == testset['day_grow']).sum() / len(testset)
print(test_acc2_1)

0.8333333333333334


## Naive Bayes

model3:  
> Gaussian naive bayes  
> input columns: default: diff_high, diff_low, Open_diff  
> acc: 82.9365%  


In [8]:
# Features: the notebook-wide default set.
# Alternatives kept for reference:
#   ['Open Price', 'diff_high', 'diff_low', 'Volume']
#   ['Open Price', 'High Price', 'Low Price', 'Volume']
use_cols3 = use_cols_global

# Gaussian naive Bayes classifier on the day_grow label.
model3 = GaussianNB().fit(train[use_cols3], train['day_grow'])
test_ans3 = model3.predict(testset[use_cols3])

# Accuracy = number of matching labels divided by the number of test rows.
test_acc3 = (test_ans3 == testset['day_grow']).sum() / len(testset)
print(test_acc3)

0.8293650793650794


## Neural Network Regressor

想說不是以1跟0為target, 而是數值會不會比較準  

model4:  
> input columns: diff_high, diff_low
>> 因為default的準度比較低, 所以拿掉了Open_diff  
>
> acc: 83.73%

In [9]:
# MLP regressor trained on the signed move (day_diff) rather than the 0/1 label;
# a prediction >= 0 is then read as "up or flat" (1), anything else as "down" (0).
# Feature sets kept for reference (not used here):
#   use_cols_global
#   ['Open Price', 'diff_high', 'diff_low']
use_cols4 = ['diff_high', 'diff_low']

model4 = MLPRegressor(hidden_layer_sizes=(10,8,4,2), batch_size=30, learning_rate_init=0.003, max_iter=500, shuffle=True, random_state=9487)
model4 = model4.fit(train.loc[:, use_cols4], train.loc[:, 'day_diff'])

# Threshold the regression output in one vectorised step; .tolist() keeps
# test_ans4 a plain list of Python ints.
test_ans4 = (model4.predict(testset.loc[:, use_cols4]) >= 0).astype(int).tolist()

# Accuracy = fraction of test rows whose thresholded prediction equals day_grow.
test_acc4 = (testset.loc[:, 'day_grow'] == test_ans4).mean()
print(test_acc4)

0.8373015873015873


model4_1:
> hidden layer size: (10,8,4,2) --> (6,6,6,6,6,6,6)  
> use_cols: default: diff_high, diff_low, Open_diff  
> initial learning rate: 0.003 --> 0.005  
> acc: 83.73% --> 84.52%  

提升learning rate後acc上升了  
應該就湊巧給我碰到跟random_state=9487相性好一點的地方

In [10]:
# Features: the notebook-wide default set.
# Alternatives kept for reference:
#   ['Open Price', 'diff_high', 'diff_low']
#   ['diff_high', 'diff_low']
use_cols4_1 = use_cols_global

# Deeper MLP regressor on day_diff with a higher initial learning rate.
model4_1 = MLPRegressor(
    hidden_layer_sizes=(6, 6, 6, 6, 6, 6, 6),
    batch_size=30,
    learning_rate_init=0.005,
    max_iter=500,
    shuffle=True,
    random_state=9487,
)
model4_1 = model4_1.fit(train[use_cols4_1], train['day_diff'])

# Turn each predicted move into a label: 1 when the prediction is >= 0, else 0.
test_ans4_1 = [
    1 if predicted_move >= 0 else 0
    for predicted_move in model4_1.predict(testset[use_cols4_1])
]

# Accuracy = number of matching labels divided by the number of test rows.
test_acc4_1 = (test_ans4_1 == testset['day_grow']).sum() / len(testset)
print(test_acc4_1)

0.8452380952380952


# 結論

結果還是Neural Network最準...  
然後Regression比Classification準  
就準一點, 可能只在這個dataset上比較好  
換個dataset可能就不是這樣了