In [None]:
# libraries
import csv
import os
from functools import partial

import numpy as np
import pandas as pd
import sklearn.feature_selection as fselection
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

In [None]:
# global attributes: locations of the input CSV files
# Path of the training data file.
training_dataroot = "train.csv"
# Path of the testing data file.
testing_dataroot = "test.csv"

In [None]:
# read data: load both CSV files into DataFrames
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (training_dataroot, testing_dataroot)
)

# Preview the first five rows of each table, then report the row counts.
display(df_train.head(5), df_test.head(5))
print("Number of training data: ", len(df_train))
print("Number of testing data: ", len(df_test))

Unnamed: 0,Date,Capacity,Turnover,Open,High,Low,Close,Change,Transaction,MA5,...,RSI,DJI,NASDAQ,SOX,SPX,ADR,twclose,twopen,Movement,rate
0,2010-01-04,39511138,2557720928,65.0,65.0,64.0,64.9,0.4,8255,64.5,...,23.98714,10583.96,2308.42,366.1,1133.0,11.58,8207.85,8277.71,0,0.154083
1,2010-01-05,38394084,2464115096,65.0,65.1,63.9,64.5,-0.4,9205,64.5,...,23.98714,10572.02,2308.71,366.4,1136.5,11.53,8211.4,8237.1,0,0.775194
2,2010-01-06,52734385,3390698544,64.5,64.9,63.7,64.9,0.4,12597,64.5,...,23.98714,10573.68,2301.09,366.3,1137.1,11.49,8327.62,8344.56,1,-0.616333
3,2010-01-07,53294614,3437221996,64.9,65.0,64.2,64.2,-0.7,11195,64.5,...,23.98714,10606.86,2300.05,362.3,1141.7,11.11,8237.42,8266.87,0,1.090343
4,2010-01-08,48047497,3068341466,63.5,64.3,63.5,64.0,-0.2,9804,64.5,...,23.98714,10618.19,2317.17,367.7,1145.0,11.1,8280.9,8291.52,0,-0.78125


Unnamed: 0,Date,Capacity,Turnover,Open,High,Low,Close,Change,Transaction,MA5,...,RSI,DJI,NASDAQ,SOX,SPX,ADR,twclose,twopen,Movement,rate
0,2025-01-02,45045125,47883206644,1070.0,1075.0,1055.0,1065.0,-10.0,74997,1081.0,...,50.001872,42392.27,19280.79,5021.5,5868.55,201.58,22832.06,23018.56,0,0.469484
1,2025-01-03,31244211,33728652860,1080.0,1085.0,1075.0,1075.0,10.0,28227,1079.0,...,52.9128,42732.13,19621.68,5163.6,5942.47,208.61,22908.3,23131.79,1,0.465116
2,2025-01-06,77874801,86585128825,1100.0,1125.0,1095.0,1125.0,50.0,130021,1086.0,...,64.151249,42706.56,19864.98,5310.1,5975.38,220.01,23547.71,23824.72,1,-2.222222
3,2025-01-07,54691485,62664127156,1150.0,1160.0,1130.0,1130.0,5.0,70657,1094.0,...,65.049589,42528.36,19489.68,5212.2,5909.03,211.42,23651.27,23522.29,1,1.769912
4,2025-01-08,49181518,54760296689,1110.0,1130.0,1105.0,1105.0,-25.0,71736,1100.0,...,57.315749,42635.2,19478.88,5162.3,5918.25,207.12,23407.33,23406.72,0,0.452489


Number of training data:  3681
Number of testing data:  206


In [None]:
# define X and create labels y
# X keeps every column except the date and the target-related columns;
# 'twopen' is removed as well whenever the training table contains it.
X = df_train.drop(
    columns=(
        ['Date', 'twopen', 'Movement', 'rate']
        if 'twopen' in df_train.columns
        else ['Date', 'Movement', 'rate']
    )
).copy()

# Label is 1 when the following row's Open exceeds this row's Close, else 0.
labels = df_train['Open'].shift(-1).gt(df_train['Close']).astype(int)

# The final row has no following row to compare against, so it is discarded
# from both the features and the labels.
X = X.head(-1).reset_index(drop=True)
y = labels.iloc[:-1].reset_index(drop=True).rename('label').to_frame()

# quick check of shapes, types and the first few rows
print(
    "X.shape:", X.shape, "X.type: ", type(X), "\n",
    "y.shape:", y.shape, "y.type: ", type(y),
)
display(X.head(5))
display(y.head(5))

# count missing values in features and labels
print("Number of NaN in X:", X.isna().sum().sum())
print("Number of NaN in y:", y.isna().sum().sum())

X.shape: (3680, 32) X.type:  <class 'pandas.core.frame.DataFrame'> 
 y.shape: (3680, 1) y.type:  <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Capacity,Turnover,Open,High,Low,Close,Change,Transaction,MA5,MA10,...,RSI14,STD20,ADX,RSI,DJI,NASDAQ,SOX,SPX,ADR,twclose
0,39511138,2557720928,65.0,65.0,64.0,64.9,0.4,8255,64.5,64.01,...,0.0,1.700433,55.4694,23.98714,10583.96,2308.42,366.1,1133.0,11.58,8207.85
1,38394084,2464115096,65.0,65.1,63.9,64.5,-0.4,9205,64.5,64.01,...,0.0,1.700433,55.4694,23.98714,10572.02,2308.71,366.4,1136.5,11.53,8211.4
2,52734385,3390698544,64.5,64.9,63.7,64.9,0.4,12597,64.5,64.01,...,7.142857,1.700433,55.4694,23.98714,10573.68,2301.09,366.3,1137.1,11.49,8327.62
3,53294614,3437221996,64.9,65.0,64.2,64.2,-0.7,11195,64.5,64.01,...,6.2954,1.700433,55.4694,23.98714,10606.86,2300.05,362.3,1141.7,11.11,8237.42
4,48047497,3068341466,63.5,64.3,63.5,64.0,-0.2,9804,64.5,64.01,...,6.073675,1.700433,55.4694,23.98714,10618.19,2317.17,367.7,1145.0,11.1,8280.9


Unnamed: 0,label
0,1
1,0
2,0
3,0
4,0


Number of NaN in X: 0
Number of NaN in y: 0


In [None]:
# train/validation split: 20% of the rows are held out, stratified on the label
# NOTE(review): rows are shuffled before splitting although the table is
# date-ordered -- confirm that a random (non-chronological) split is intended.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

# Baseline model: keep all features, then fit a Gaussian naive Bayes classifier.
pipeline = Pipeline(steps=[
    ('fselect', fselection.SelectKBest(k='all')),
    ('gnb', GaussianNB()),
])
pipeline.fit(X_train, y_train['label'].to_numpy())

# Score the baseline on the validation split.
y_pred = pipeline.predict(X_val)
acc = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
display(acc, f1)

0.5394021739130435

0.2802547770700637

In [None]:
# fine tuning with cross-validation
# The label is a binary class, so only classification scoring functions are
# searched. The regression scorers (f_regression, mutual_info_regression) are
# intended for continuous targets and only enlarged the grid.
# mutual_info_classif adds random noise to continuous features internally;
# pinning random_state makes its scores, and therefore the selected model,
# reproducible from run to run.
seeded_mutual_info_classif = partial(fselection.mutual_info_classif, random_state=0)

param_grid = {
    'fselect__score_func': [fselection.f_classif, seeded_mutual_info_classif],
    # Try every possible number of selected features, from 1 up to all of them.
    'fselect__k': range(1, X_train.shape[1] + 1),
}

# Shuffled, seeded 5-fold stratified CV so fold assignment is repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    refit=True,   # refit the best configuration on all of X_train
    verbose=0
)

grid_search.fit(X_train, y_train.values.ravel())

0,1,2
,estimator,Pipeline(step...aussianNB())])
,param_grid,"{'fselect__k': range(1, 33), 'fselect__score_func': [<function f_c...002CE889967A0>, <function mut...002CE88994CC0>, ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,score_func,<function mut...002CE88994CC0>
,k,31

0,1,2
,priors,
,var_smoothing,1e-09


In [None]:
# evaluate best model: score the refit grid-search winner on the validation split
best_model_C = grid_search.best_estimator_
y_val_pred = best_model_C.predict(X_val)

acc, f1 = accuracy_score(y_val, y_val_pred), f1_score(y_val, y_val_pred)
display(acc, f1)

0.5271739130434783

0.24347826086956523

In [None]:
# check the feature selected: inspect the fitted selector step of the best pipeline
selector = best_model_C.named_steps['fselect']

# boolean mask with one entry per input feature (True = kept)
mask = selector.get_support()
print("Selected mask:", mask)

# integer positions of the kept features
indices = selector.get_support(indices=True)
print("Selected indices:", indices)

# Translate positions into names, preferring the names recorded by the
# selector at fit time and falling back to the training frame's columns.
selected_names = list(
    (
        selector.feature_names_in_
        if hasattr(selector, "feature_names_in_")
        else X_train.columns
    )[indices]
)

print("Selected feature names:", selected_names)
print("used score function:", selector.score_func)

Selected mask: [False  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True]
Selected indices: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31]
Selected feature names: ['Turnover', 'Open', 'High', 'Low', 'Close', 'Change', 'Transaction', 'MA5', 'MA10', 'MA5_Capacity', 'BR5', 'BR10', 'High-Low', 'Open-Close', 'EMA5', 'EMA10', 'K', 'D', 'MACD', 'MACD_signal', 'MACD_hist', 'RSI14', 'STD20', 'ADX', 'RSI', 'DJI', 'NASDAQ', 'SOX', 'SPX', 'ADR', 'twclose']
used score function: <function mutual_info_classif at 0x000002CE88994CC0>


In [None]:
# prepare test data
# Remove the date and target-related columns. The presence of 'twopen' is
# checked on the *test* table itself (the table being dropped from), so a
# schema difference between train and test cannot raise a KeyError or leave
# 'twopen' in the test features.
if 'twopen' in df_test.columns:
    X_test = df_test.drop(['Date', 'twopen', 'Movement', 'rate'], axis=1).copy()
else:
    X_test = df_test.drop(['Date', 'Movement', 'rate'], axis=1).copy()

# Label is 1 when the following row's Open exceeds this row's Close, else 0.
labels_test = (df_test['Open'].shift(-1) > df_test['Close']).astype(int)

# The final row has no following row to compare against, so it is discarded
# from both the features and the labels.
X_test = X_test.iloc[:-1].reset_index(drop=True)
y_test = labels_test[:-1].reset_index(drop=True).to_frame(name='label')

# quick check of shapes, types and the first few rows
print("X_test.shape:", X_test.shape, "X_test.type: ", type(X_test), "\n",  "y_test.shape:", y_test.shape, "y_test.type: ", type(y_test))
display(X_test.head(5))
display(y_test.head(5))

# count missing values in features and labels
print("Number of NaN in X:", X_test.isna().sum().sum())
print("Number of NaN in y:", y_test.isna().sum().sum())

X_test.shape: (205, 32) X_test.type:  <class 'pandas.core.frame.DataFrame'> 
 y_test.shape: (205, 1) y_test.type:  <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Capacity,Turnover,Open,High,Low,Close,Change,Transaction,MA5,MA10,...,RSI14,STD20,ADX,RSI,DJI,NASDAQ,SOX,SPX,ADR,twclose
0,45045125,47883206644,1070.0,1075.0,1055.0,1065.0,-10.0,74997,1081.0,1075.5,...,50.001872,14.372855,12.693554,50.001872,42392.27,19280.79,5021.5,5868.55,201.58,22832.06
1,31244211,33728652860,1080.0,1085.0,1075.0,1075.0,10.0,28227,1079.0,1076.0,...,52.9128,14.372855,11.889804,52.9128,42732.13,19621.68,5163.6,5942.47,208.61,22908.3
2,77874801,86585128825,1100.0,1125.0,1095.0,1125.0,50.0,130021,1086.0,1085.0,...,64.151249,18.417669,12.643839,64.151249,42706.56,19864.98,5310.1,5975.38,220.01,23547.71
3,54691485,62664127156,1150.0,1160.0,1130.0,1130.0,5.0,70657,1094.0,1090.0,...,65.049589,22.080892,14.349989,65.049589,42528.36,19489.68,5212.2,5909.03,211.42,23651.27
4,49181518,54760296689,1110.0,1130.0,1105.0,1105.0,-25.0,71736,1100.0,1092.5,...,57.315749,22.622474,14.738958,57.315749,42635.2,19478.88,5162.3,5918.25,207.12,23407.33


Unnamed: 0,label
0,1
1,1
2,1
3,0
4,0


Number of NaN in X: 0
Number of NaN in y: 0


In [None]:
# Score the tuned pipeline on the test set.
y_test_pred = best_model_C.predict(X_test)
acc_test = accuracy_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)
display(acc_test)
display(f1_test)

# predict() returns a NumPy array, which has no .to_csv() method. Wrap the
# predictions in a pandas Series (sharing y_test's index) before writing them
# to disk so the export no longer raises AttributeError.
pd.Series(y_test_pred, index=y_test.index, name='label').to_csv("NB_predict.csv")

0.5560975609756098

0.6255144032921811