In [1]:
import pandas as pd
import numpy as np
from talib import BBANDS, SAR, RSI, STOCH, EMA, WILLR
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier

# Load the training and testing price tables and give both frames the same
# open/high/low/close column labels that the rest of the notebook relies on.
# NOTE(review): read_csv treats the first row of each file as a header by
# default and that header is then overwritten here -- confirm neither CSV
# starts directly with a data row, otherwise that row is silently lost.
ohlc_columns = ("open", "high", "low", "close")

train_df = pd.read_csv("./training.csv")
train_df.columns = ohlc_columns

test_df = pd.read_csv("./testing.csv")
test_df.columns = ohlc_columns

In [2]:
# Min-max normalisation with one global range: the minimum and maximum are
# taken over every cell of the training frame (all four price columns), and
# the test frame is scaled with those same training statistics.
maxValue = train_df.values.max()
minValue = train_df.values.min()
diff = maxValue - minValue

train = (train_df - minValue) / diff
test = (test_df - minValue) / diff

In [3]:
# Add the technical-indicator columns to the normalised training frame.
# The three price arrays are extracted once and shared by every TA-Lib call.
high_px = train["high"].to_numpy()
low_px = train["low"].to_numpy()
close_px = train["close"].to_numpy()

# Bollinger bands on the close (TA-Lib default parameters).
upper_band, middle_band, lower_band = BBANDS(close_px)
train["upperband"] = upper_band
train["middleband"] = middle_band
train["lowerband"] = lower_band

train["sar"] = SAR(high_px, low_px)
train["rsi"] = RSI(close_px, timeperiod=5)

# Stochastic oscillator: slow %K and slow %D.
slow_k, slow_d = STOCH(high_px, low_px, close_px)
train["slowk"] = slow_k
train["slowd"] = slow_d

train["ema"] = EMA(close_px, timeperiod=5)
train["willr"] = WILLR(high_px, low_px, close_px, timeperiod=9)

# Show how many rows each column leaves undefined.
train.isna().sum()

open          0
high          0
low           0
close         0
upperband     4
middleband    4
lowerband     4
sar           1
rsi           5
slowk         8
slowd         8
ema           4
willr         8
dtype: int64

In [4]:
# Keep only the rows where every indicator is defined and renumber them from
# zero, so later cells can address rows by their integer label.
train_data = train.dropna().reset_index(drop=True)
train_data

Unnamed: 0,open,high,low,close,upperband,middleband,lowerband,sar,rsi,slowk,slowd,ema,willr
0,0.641010,0.657576,0.630505,0.637374,0.666021,0.644121,0.622222,0.676326,23.529738,35.196490,34.271958,0.644867,-70.489978
1,0.635657,0.653535,0.632323,0.648182,0.665042,0.643636,0.622231,0.669754,37.924725,46.049189,38.976942,0.645972,-51.562500
2,0.655455,0.661212,0.640909,0.642626,0.653047,0.640000,0.626953,0.663839,33.832650,57.668859,46.304846,0.644857,-55.539972
3,0.695657,0.727980,0.685354,0.723434,0.724624,0.656101,0.587578,0.610606,77.659880,75.844229,59.854092,0.671049,-3.872633
4,0.717273,0.743636,0.714848,0.738182,0.765032,0.677960,0.590887,0.612954,80.592423,84.859797,72.790962,0.693427,-4.100228
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1474,0.354040,0.356566,0.347778,0.352929,0.367668,0.344525,0.321382,0.302071,67.065605,87.132992,83.501390,0.345742,-9.775641
1475,0.355152,0.359495,0.350707,0.354343,0.361821,0.350606,0.339391,0.304352,68.454615,83.240226,84.920261,0.348609,-8.121019
1476,0.358081,0.363939,0.351515,0.359293,0.359533,0.354566,0.349598,0.307660,73.368544,78.956152,83.109790,0.352170,-6.845238
1477,0.362727,0.370808,0.356263,0.370202,0.371149,0.357798,0.344447,0.312163,81.365806,83.695902,81.964093,0.358181,-0.981997


In [5]:
# Label every row of train_data by counting how many of six indicator
# conditions are bullish on that row:
#   2 -> more than four conditions hold
#   0 -> fewer than two conditions hold
#   1 -> anything in between
# The conditions are evaluated column-wise for the whole frame at once instead
# of indexing the frame cell by cell inside a Python loop.
bullish_signals = np.column_stack([
    train_data["open"] > train_data["sar"],
    train_data["open"] >= train_data["middleband"],
    train_data["rsi"] > 50,
    train_data["slowk"] >= train_data["slowd"],
    train_data["open"] >= train_data["ema"],
    train_data["willr"] > -50,
])
bullish_count = bullish_signals.sum(axis=1)

# np.int was only an alias of the builtin int and was removed in NumPy 1.24,
# where it raises AttributeError; the builtin yields the same dtype.
y = np.select([bullish_count > 4, bullish_count < 2], [2, 0], default=1).astype(int)

In [6]:
# One sample per row label `end` >= 20: the 20 rows that precede it
# (labels end-20 .. end-1, .loc slices include both ends) with all columns,
# stacked into a single array.
X = np.array([
    train_data.loc[end - 20:end - 1, :].to_numpy()
    for end in range(20, len(train_data))
])

In [7]:
# Discard the first 39 labels; the remaining labels are the ones paired with
# the rows of new_X further down.
# NOTE: this rebinding is not idempotent -- running the cell a second time
# trims another 39 labels, so re-run the labelling cell first.
y = y[39:]
y.shape[0]

1440

In [8]:
# Set aside the final 20 windows of X as the inputs to predict on.
# NOTE(review): this rebinds `test`, which until now held the normalised test
# DataFrame from the normalisation cell.
test = X[-20:]
test.shape[0]

20

In [9]:
# Drop the last 19 windows and flatten every remaining window into a single
# feature row, giving exactly one row per label in y.
new_X = X[:-19].reshape(len(y), -1)

In [10]:
# Sanity check: (number of samples, flattened features per sample).
np.shape(new_X)

(1440, 260)

In [11]:
# Order-preserving 80/20 split: with shuffle=False the validation part is the
# final 20% of the samples rather than a random draw.
X_train, X_val, y_train, y_val = train_test_split(
    new_X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [None]:
# Base three-class estimator; the grid-search cell that follows takes it as
# its starting point.
xgb = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=1,
    min_child_weight=2,
    use_label_encoder=False,
)
# Direct fit of the base estimator, left disabled:
# model = xgb.fit(X_train, y_train,
#                eval_set=[(X_val, y_val)],
#               eval_metric="auc",
#                verbose=True)

In [None]:
# Exhaustive search over tree depth, minimum child weight and number of trees
# (9 x 9 x 10 candidates, two folds each).
parameters = {
    "max_depth": list(range(1, 10)),
    "min_child_weight": list(range(1, 10)),
    "n_estimators": list(range(100, 1001, 100)),
}

# The target has three classes. Plain "f1" uses binary averaging and cannot
# score a multiclass target, so every candidate would come back failed/NaN;
# macro-averaged F1 scores each class and averages them.
gsearch = GridSearchCV(xgb, param_grid=parameters, scoring="f1_macro", cv=2)

# The evaluation metric only affects what is reported for eval_set; mlogloss
# matches the metric used when the final model is fitted. Per-round logging is
# switched off because it would be repeated for every candidate and fold.
gsearch.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="mlogloss",
    verbose=False,
)
best_parameters = gsearch.best_estimator_.get_params()

In [None]:
# Print the full parameter dict of the best estimator found by the search.
print(f"{best_parameters}")

In [12]:
# Final three-class classifier with hand-picked hyper-parameters, fitted on
# the training split while reporting multiclass log-loss on the validation
# split after every boosting round.
xgb = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    learning_rate=0.1,
    n_estimators=30,
    max_depth=3,
    min_child_weight=10,
    use_label_encoder=False,
)
validation_sets = [(X_val, y_val)]
model = xgb.fit(
    X_train,
    y_train,
    eval_set=validation_sets,
    eval_metric="mlogloss",
    verbose=True,
)

[0]	validation_0-mlogloss:1.09854
[1]	validation_0-mlogloss:1.10165
[2]	validation_0-mlogloss:1.10289
[3]	validation_0-mlogloss:1.10274
[4]	validation_0-mlogloss:1.10441
[5]	validation_0-mlogloss:1.10436
[6]	validation_0-mlogloss:1.10632
[7]	validation_0-mlogloss:1.10813
[8]	validation_0-mlogloss:1.10730
[9]	validation_0-mlogloss:1.10766
[10]	validation_0-mlogloss:1.10838
[11]	validation_0-mlogloss:1.11075
[12]	validation_0-mlogloss:1.11190
[13]	validation_0-mlogloss:1.11379
[14]	validation_0-mlogloss:1.11490
[15]	validation_0-mlogloss:1.11403
[16]	validation_0-mlogloss:1.11722
[17]	validation_0-mlogloss:1.11736
[18]	validation_0-mlogloss:1.11997
[19]	validation_0-mlogloss:1.12234
[20]	validation_0-mlogloss:1.12322
[21]	validation_0-mlogloss:1.12569
[22]	validation_0-mlogloss:1.12771
[23]	validation_0-mlogloss:1.12862
[24]	validation_0-mlogloss:1.13231
[25]	validation_0-mlogloss:1.13351
[26]	validation_0-mlogloss:1.13357
[27]	validation_0-mlogloss:1.13474
[28]	validation_0-mlogloss:1.1

In [13]:
# Flatten each of the 20 held-out windows into one feature row and classify it.
model.predict(test.reshape((20, -1)))

array([1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0],
      dtype=int64)

In [None]:
# By this point `test` has been rebound to the ndarray of 20-row windows, so it
# no longer has price columns and `test.close` would raise AttributeError.
# Rebuild the normalised test frame from test_df (using the training min/max
# from the normalisation cell) under its own name, leaving `test` untouched
# for the prediction cells.
test_features = (test_df - minValue) / diff

test_high = test_features["high"].to_numpy()
test_low = test_features["low"].to_numpy()
test_close = test_features["close"].to_numpy()

# Same indicator set and parameters as the training frame, so both frames end
# up with identical columns.
(test_features["upperband"],
 test_features["middleband"],
 test_features["lowerband"]) = BBANDS(test_close)
test_features["sar"] = SAR(test_high, test_low)
test_features["rsi"] = RSI(test_close, timeperiod=5)
test_features["slowk"], test_features["slowd"] = STOCH(test_high, test_low, test_close)
test_features["ema"] = EMA(test_close, timeperiod=5)
test_features["willr"] = WILLR(test_high, test_low, test_close, timeperiod=9)

In [None]:
# `test` holds the held-out windows as a 3-D ndarray, which has no `.values`
# attribute. Flatten each window into a single feature row -- the layout the
# model was trained on -- before predicting.
predictions = model.predict(test.reshape(len(test), -1))
len(predictions)

In [None]:
# Turn consecutive prediction pairs into a running value kept within [-1, 1]:
#   pair sum == 2 -> step up by one (capped at 1)
#   pair sum == 1 -> leave unchanged
#   anything else -> step down by one (floored at -1)
# One value is recorded per pair, i.e. len(predictions) - 1 values in total.
# NOTE(review): the classifier above emits 0, 1 or 2, so pair sums of 3 and 4
# also fall into the "step down" branch -- confirm that is intended.
ans = []
val = 0
for prev_pred, curr_pred in zip(predictions[:-1], predictions[1:]):
    _sum = prev_pred + curr_pred
    if _sum == 2:
        val = min(val + 1, 1)
    elif _sum != 1:
        val = max(val - 1, -1)
    ans.append(val)

print(ans, len(ans), sep='\n\n')

# Write one value per line.
with open("./output.csv", "w") as fp:
    for value in ans:
        print(value, file=fp)