# 가장 좋은 결과를 낼 수 있는 feature 항목 추출

## 모든 feature를 사용한 결과와, 선택 추출된 feature만 사용한 결과 정확도에 차이가 남
#### logistic 회귀를 이용하여 coef_ 항목에서 영향력이 높은 feature를 선택하고, 최적의 개수를 선택

### 데이터 준비하기

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate, train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score

In [3]:
import pickle
def list_to_pickle(filename, listname):
    """Serialize ``listname`` to ``filename`` using pickle.

    Args:
        filename: Path of the file to write. It is opened in binary write
            mode, so an existing file is overwritten.
        listname: The object to pickle.

    Raises:
        OSError: If the file cannot be opened for writing.
        Exception: Whatever ``pickle.dump`` raises for an unpicklable object.
    """
    # The context manager closes the file even when pickle.dump raises.
    with open(filename, "wb") as open_file:
        pickle.dump(listname, open_file)

def list_from_pickle(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened for reading.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced by a trusted source.
    # The context manager closes the file even when pickle.load raises.
    with open(filename, "rb") as open_file:
        return pickle.load(open_file)

In [4]:
# Load the prepared dataset for the stock being analysed.
stock_name = 'naver'
directory_for_ml = '../data/data_for_ml/'
fname = f'df_{stock_name}_sel.pkl'
f_name = directory_for_ml + fname
df = pd.read_pickle(f_name)
# Title for the plots. The previous fixed slice fname[3:7] kept only four
# characters of the stock name ('nave' for 'naver'); use the full name instead.
plt_title = stock_name

In [5]:
# Remove, in place, the columns that are not needed for this experiment.
_dropped_columns = ['value', 'r_open_high_5', 'close_cr_1', 'r_open_high_1']
df.drop(columns=_dropped_columns, inplace=True)

In [6]:
# Columns kept for modelling. Order matters: the cells below slice the last
# column off as the target and use all preceding columns as inputs.
new_columns = [
    'kospi_cr',
    'kosdaq_cr',
    'dxy_cr',
    'spx_cr',
    'krw_cr',
    'financeetc',
    'bank',
    'bond_usa_10_cr',
    'spx_f_cr',
    'bond_kor_10_cr',
    'close_cr_5',
]

In [7]:
# Restrict the frame to the columns listed in new_columns, in that order.
df_new = df[new_columns]

In [8]:
# The first num_data rows are used for training/validation; the remaining
# rows are kept aside as a hold-out set. The last column is the target and
# every other column is an input feature.
num_data = 220
data, data_test = df_new.iloc[:num_data, :-1], df_new.iloc[num_data:, :-1]
target = df_new.iloc[:num_data, -1]

In [9]:
# Stratified 80/20 split of the first num_data rows with a fixed seed.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    stratify=target,
    random_state=42,
)

In [10]:
# Fit the scaler on the training inputs only, then apply the same
# transformation to the training, validation and hold-out inputs.
ss = StandardScaler()
train_scaled = ss.fit(train_input).transform(train_input)
test_scaled, data_test_scaled = (
    ss.transform(_inputs) for _inputs in (test_input, data_test)
)

In [11]:
def model_fn(inp_num, a_layer=None):
    """Build the (uncompiled) dense network used for binary classification.

    The network is 12 (relu) -> 6 (sigmoid) -> 3 (sigmoid) -> 1 (sigmoid).

    Args:
        inp_num: Number of input features; sets the input shape of the
            first layer.
        a_layer: Accepted but not used by this function.

    Returns:
        The constructed ``Sequential`` model.
    """
    layers = [
        Dense(12, activation='relu', input_shape=(inp_num,)),
        Dense(6, activation='sigmoid'),
        Dense(3, activation='sigmoid'),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)

In [12]:
# Adam optimizer with a very small learning rate; the original notes mark the
# beta/epsilon/amsgrad values as the library defaults.
d_rate = 1e-7  # epsilon (noted as the default value)
adam_custom = tf.keras.optimizers.Adam(
    learning_rate=5e-6,  # default is 0.001; 0.0001 was the best fit for skhinix
    beta_1=0.9,
    beta_2=0.999,
    epsilon=d_rate,
    amsgrad=False,
    weight_decay=None,
    clipnorm=None,
    clipvalue=None,
    global_clipnorm=None,
    use_ema=False,
    ema_momentum=0.99,
    ema_overwrite_frequency=None,
    jit_compile=True,
    name='Adam',
)

In [13]:
# Drop the reference to any model left over from a previous run of this cell.
# (A plain assignment cannot raise, so the former try/except with a bare
# `except: pass` around it was removed.)
model = None

# Build the network with one input per feature column and compile it for
# binary classification with the custom Adam optimizer.
model = model_fn(len(data.columns))
model.compile(optimizer=adam_custom, loss='binary_crossentropy',
              metrics=['accuracy'])

In [14]:
# Saves a model file whose name embeds epoch, val_loss and val_accuracy,
# keeping only improvements (save_best_only). Alternatives tried earlier:
# a single fixed file name, and monitoring val_accuracy with mode='max'.
checkpoint_cb = ModelCheckpoint(
    filepath='best_model/best_model_{epoch:02d}-{val_loss:.2f}-{val_accuracy:.2f}.h5',
    save_best_only=True,
)

# Stop when val_loss has not improved for 200 epochs and restore the best
# weights. (Earlier variants used patience=100 and/or val_accuracy.)
earlystopping_cb = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=200,
    restore_best_weights=True,
)

# Multiply the learning rate by 0.9 after 100 epochs without val_loss improvement.
reducelr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.9,
    patience=100,
    verbose=0,
    mode='auto',
    min_delta=0.0001,
    cooldown=0,
    min_lr=0,
)

def scheduler(epoch, lr):
    """Learning-rate schedule intended for ``LearningRateScheduler``.

    The rate is left unchanged for epochs below 1000; from epoch 1000 on it
    is multiplied by ``exp(-0.01)`` each time the function is called.

    Args:
        epoch: Epoch index passed in by the callback.
        lr: Current learning rate.

    Returns:
        The learning rate to use: ``lr`` itself before epoch 1000, otherwise
        the decayed rate as a Python float.
    """
    # The previous version also printed an undefined name `m`, which raised
    # NameError on the first call; only epoch and lr are reported now.
    print("epoch, lr", epoch, lr)
    if epoch < 1000:
        return lr
    # Return a plain float (not a tensor), which is what the callback expects.
    return float(lr * np.exp(-0.01))

# Callback wrapping `scheduler`; it is not in the callbacks list passed to
# model.fit below.
lrscheduler = LearningRateScheduler(schedule=scheduler, verbose=0)

In [15]:
# Train with early stopping and learning-rate reduction on plateau, using
# (test_scaled, test_target) as validation data. checkpoint_cb can be added
# to the callbacks list to also save the best models to disk.
history = model.fit(
    train_scaled,
    train_target,
    validation_data=(test_scaled, test_target),
    callbacks=[earlystopping_cb, reducelr],
    batch_size=20,
    epochs=30000,
    verbose=0,
)

# Tuning notes:
# - If the loss does not reach its minimum or stays high, try lowering the
#   learning rate of the Adam optimizer and check whether the loss decreases.
# - Adjust the batch size as well.
# - Adding layers helped a lot (applied to the mobis case) -> but it led to
#   overfitting.
# - Increasing or decreasing batch_size moves the region where val_loss bends
#   up or down, so keep lr fixed and adjust the batch size little by little
#   to find the best size.
# - After that, lower lr in very small steps (0.0001 -> 0.00009 -> 0.00008).

In [24]:
# Training vs. validation loss per epoch.
for _curve in ('loss', 'val_loss'):
    plt.plot(history.history[_curve])
plt.title(plt_title)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'val'])
plt.show()

In [17]:
# Training vs. validation accuracy per epoch.
for _curve in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[_curve])
plt.title(plt_title)
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['accuracy', 'val_accuracy'])
plt.show()

In [25]:
# Metrics on the validation split (test_scaled / test_target).
# Precision: of the samples predicted positive (TP + FP), the share that is
# truly positive (TP).
model.evaluate(test_scaled, test_target)
y_predict = model.predict(np.array(test_scaled))
y_score = y_predict[:, 0]  # first (only) output column, used as the score
y_predict_list = [1 if i > 0.5 else 0 for i in y_score]  # threshold at 0.5
print("정밀도", precision_score(test_target, y_predict_list))
print("재현율", recall_score(test_target, y_predict_list))
print("f1_score", f1_score(test_target, y_predict_list))
# ROC AUC is a ranking metric: pass the continuous scores, not the
# thresholded 0/1 labels (which collapse the curve to a single point).
print("roc_auc_score", roc_auc_score(test_target, y_score))

print("confusion matrix", confusion_matrix(test_target, y_predict_list))

In [19]:
# Metrics on the hold-out rows (those after the first num_data rows).
data_test_target = df_new.iloc[num_data:, -1]
model.evaluate(data_test_scaled, data_test_target)
y_predict = model.predict(np.array(data_test_scaled))
y_score = y_predict[:, 0]  # first (only) output column, used as the score
y_predict_list = [1 if i > 0.5 else 0 for i in y_score]  # threshold at 0.5
print("정밀도", precision_score(data_test_target, y_predict_list))
print("재현율", recall_score(data_test_target, y_predict_list))
print("f1_score", f1_score(data_test_target, y_predict_list))
# ROC AUC is a ranking metric: pass the continuous scores, not the
# thresholded 0/1 labels (which collapse the curve to a single point).
print("roc_auc_score", roc_auc_score(data_test_target, y_score))

print("confusion matrix", confusion_matrix(data_test_target, y_predict_list))

In [26]:
# Pair every hold-out target with the model output for the same row and
# display the result: [actual value, predicted value].
compare = [list(_pair) for _pair in zip(data_test_target, y_predict)]
compare

In [21]:
# Display the hold-out targets as the notebook cell output.
data_test_target

In [22]:
# Print pandas' summary of the selected-columns frame.
df_new.info()

In [23]:
# Display the raw model outputs for the scaled hold-out inputs.
model.predict(np.array(data_test_scaled))