In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from datetime import datetime
from sklearn import preprocessing
import os
from sklearn.metrics import *
%matplotlib inline

In [None]:
from keras.models import Sequential,load_model
from keras.layers import Dense, LSTM, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam 
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.regularizers import l2
from time import time
from sklearn.preprocessing import MinMaxScaler  # MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.callbacks import Callback,ModelCheckpoint
from keras.wrappers.scikit_learn import KerasClassifier
import keras.backend as K
from imblearn.over_sampling import SMOTE
import os

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

In [None]:
# Raw data -> Sequence
def make_dataset(data, label, window_size):
    """Turn row-wise data into sliding windows paired with the following label.

    Window ``k`` holds rows ``k .. k + window_size - 1`` of ``data`` and is
    paired with ``label.iloc[k + window_size]``, i.e. the label of the row
    right after the window.

    Args:
        data: pandas object supporting positional ``.iloc`` slicing.
        label: pandas object supporting positional ``.iloc`` indexing.
        window_size: number of consecutive rows per window.

    Returns:
        ``(features, labels)`` as NumPy arrays holding
        ``len(data) - window_size`` entries each (empty when that is <= 0).
    """
    n_windows = len(data) - window_size
    starts = range(n_windows)
    feature_list = [np.array(data.iloc[s:s + window_size]) for s in starts]
    label_list = [np.array(label.iloc[s + window_size]) for s in starts]
    return np.array(feature_list), np.array(label_list)

In [None]:
# F1-Score Metric
def get_f1(y_true, y_pred):
    """F1 score of ``y_pred`` against ``y_true`` built from Keras backend ops.

    Values are clipped to [0, 1] and rounded before counting, so predictions
    are effectively thresholded at 0.5. ``K.epsilon()`` is added to every
    denominator to avoid division by zero.
    """
    eps = K.epsilon()

    def _count(tensor):
        # Number of entries that round to 1 after clipping into [0, 1].
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _count(y_true * y_pred)
    actual = _count(y_true)
    predicted = _count(y_pred)
    precision = hits / (predicted + eps)
    recall = hits / (actual + eps)
    return 2 * (precision * recall) / (precision + recall + eps)

In [None]:
# Expose only CUDA device "0" to this process.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [None]:
# Load the melting-tank CSV with STD_DT parsed as a datetime index.
# `infer_datetime_format` is omitted: it is deprecated in pandas 2.x, where
# format inference is already the default behaviour.
df = pd.read_csv(
    './melting_tank.csv',
    parse_dates=['STD_DT'],
    index_col='STD_DT',
)
# Drop the NUM column; reassigning instead of inplace=True keeps the cell chainable.
df = df.drop(columns=['NUM'])
print(f'data shape = {df.shape}')

In [None]:
# Replace the categorical TAG column with its integer class codes, stored as float32.
encoder = preprocessing.LabelEncoder()
df['TAG'] = encoder.fit_transform(df['TAG']).astype('float32')

In [None]:
# Positional 70/30 split: the leading 70% of rows train the model, the rest is held out.
# Despite its name, split_date is a row count, not a timestamp.
split_date = int(0.7 * len(df))
train = df.iloc[:split_date]
test = df.iloc[split_date:]

In [None]:
# Min-max scale every column (TAG included). The scaler is fitted on the
# training rows only and then applied unchanged to the test rows.
scaler = MinMaxScaler()  # MinMaxScaler(feature_range=(-1,1))
scaler.fit(train)
train_sc = pd.DataFrame(scaler.transform(train), columns=train.columns, index=train.index)
test_sc = pd.DataFrame(scaler.transform(test), columns=test.columns, index=test.index)

# Oversampling

## Random Oversampling

In [None]:
# Separate the scaled training frame into the TAG target and the predictors.
y_train = train_sc['TAG']
X_train = train_sc.drop(columns=['TAG'])

In [None]:
# Re-wrap predictors and target as DataFrames with explicit column names.
# NOTE(review): assumes train_sc carries exactly these four feature columns;
# a name missing from the frame would come back as an all-NaN column — confirm against the CSV.
X_train = pd.DataFrame(X_train, columns=['MELT_TEMP', 'MOTORSPEED', 'MELT_WEIGHT', 'INSP'])
y_train = pd.DataFrame(y_train, columns=['TAG'])

In [None]:
#Random OverSampling
# random_state is fixed so the duplicated minority rows are the same on every run.
# NOTE(review): rows are resampled individually before windowing, so the
# sequences built afterwards may mix non-adjacent time steps — confirm intended.
ros = RandomOverSampler(sampling_strategy='minority', random_state=42) # Equal to the number of major class
ros_feature, ros_label = ros.fit_resample(X_train, y_train)

In [None]:
# Build 10-step windows (and next-step labels) from the oversampled rows.
train_feature, train_label = make_dataset(data=ros_feature, label=ros_label, window_size=10)

In [None]:
# The original indexed train_feature with `idx`, which is never defined in
# this notebook and raised NameError on a fresh kernel. Keep every window in
# its existing order instead; the reshape only normalises to
# (samples, timesteps, features), which the model cell reads the input shape from.
ros_feature = np.asarray(train_feature).reshape(-1, *train_feature.shape[1:])

In [None]:
# Window the scaled hold-out rows the same way as the training data (no resampling).
y_test = test_sc['TAG']
X_test = test_sc.drop(columns=['TAG'])
test_feature, test_label = make_dataset(data=X_test, label=y_test, window_size=10)

In [None]:
# Hold out 30% of the windows for validation; the seed is fixed so the shuffle
# (and therefore the reported scores) is reproducible.
# NOTE(review): neighbouring windows overlap, so a shuffled split can place
# near-duplicates on both sides — confirm this is acceptable.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_feature, train_label, test_size=0.3, random_state=42
)

In [None]:
# Single-layer LSTM (50 units) followed by a sigmoid unit for the binary TAG.
# The input shape comes from train_feature, the array that is actually split
# and fed to fit(), as in the other experiments in this notebook.
model_basic = Sequential()
model_basic.add(LSTM(50, input_shape=(train_feature.shape[1], train_feature.shape[2]),
                    activation='tanh', return_sequences=False))
model_basic.add(Dense(1, activation='sigmoid'))

In [None]:
# Binary cross-entropy with Adam; the custom F1 metric is reported alongside the loss.
model_basic.compile(optimizer='adam',
                    loss='binary_crossentropy',
                    metrics=[get_f1])
# Stop once validation loss has not improved for 5 consecutive epochs.
early_stop = EarlyStopping(patience=5, monitor='val_loss')
model_basic.summary()

In [None]:
# Train for at most 30 epochs, validating each epoch and stopping early if needed.
history = model_basic.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_valid, y_valid),
    batch_size=32,
    epochs=30,
    callbacks=[early_stop],
)

In [None]:
# Score every test window with the trained model (one sigmoid output per window).
pred = model_basic.predict(test_feature)

In [None]:
# Wrap the raw model outputs in a one-column frame named TAG so they can be thresholded.
pred_df = pd.DataFrame(pred, columns=['TAG'])

In [None]:
# Binarise the sigmoid outputs at 0.5 with a vectorised comparison instead of a
# per-element Python lambda; the result is the same 0/1 integer column.
pred_df['TAG'] = (pred_df['TAG'] >= 0.5).astype('int64')

In [None]:
# Evaluate the thresholded predictions against the hold-out labels.
p, r, f1, acc = (
    metric(test_label, pred_df)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

print("Precision: %0.4f" % p,
      "Recall: %0.4f" % r,
      "F1-score: %0.4f" % f1,
      "Accuracy: %0.4f" % acc,
      sep="\n")

## SMOTE

In [None]:
# Start again from the scaled training frame: TAG is the target, the rest are predictors.
y_train = train_sc['TAG']
X_train = train_sc.drop(columns=['TAG'])

In [None]:
#SMOTE
# random_state is fixed so the synthetic minority samples are identical on every run.
# NOTE(review): synthetic rows are generated per row before windowing, so the
# sequences built afterwards may not be contiguous in time — confirm intended.
smote = SMOTE(sampling_strategy='minority', random_state=42)
x_sm, y_sm = smote.fit_resample(X_train.values, y_train.values)

In [None]:
# make_dataset uses .iloc, so the resampled NumPy arrays are wrapped in DataFrames first.
train_feature, train_label = make_dataset(
    data=pd.DataFrame(x_sm), label=pd.DataFrame(y_sm), window_size=10
)

In [None]:
# Hold-out windows come straight from the scaled test rows (no resampling).
y_test = test_sc['TAG']
X_test = test_sc.drop(columns=['TAG'])
test_feature, test_label = make_dataset(data=X_test, label=y_test, window_size=10)

In [None]:
# 70/30 train/validation split of the windows, seeded so the shuffle is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_feature, train_label, test_size=0.3, random_state=42
)

In [None]:
# Fresh LSTM(50) -> sigmoid classifier, sized from the windowed training array.
model_basic = Sequential([
    LSTM(50,
         activation='tanh',
         return_sequences=False,
         input_shape=(train_feature.shape[1], train_feature.shape[2])),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Adam + binary cross-entropy, tracking the custom F1 metric.
model_basic.compile(optimizer='adam',
                    loss='binary_crossentropy',
                    metrics=[get_f1])
# Give up after 5 epochs without a validation-loss improvement.
early_stop = EarlyStopping(patience=5, monitor='val_loss')
model_basic.summary()

In [None]:
# Up to 30 epochs in mini-batches of 32, with early stopping on validation loss.
history = model_basic.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_valid, y_valid),
    batch_size=32,
    epochs=30,
    callbacks=[early_stop],
)

In [None]:
# Sigmoid outputs of the SMOTE-trained model for each test window.
pred = model_basic.predict(test_feature)

In [None]:
# One-column frame (TAG) holding the raw outputs, ready for thresholding.
pred_df = pd.DataFrame(pred, columns=['TAG'])

In [None]:
# Vectorised 0.5 threshold (replaces the per-element lambda); yields a 0/1 integer column.
pred_df['TAG'] = (pred_df['TAG'] >= 0.5).astype('int64')

In [None]:
# Hold-out precision / recall / F1 / accuracy for the SMOTE experiment.
p, r, f1, acc = (
    metric(test_label, pred_df)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

print("Precision: %0.4f" % p,
      "Recall: %0.4f" % r,
      "F1-score: %0.4f" % f1,
      "Accuracy: %0.4f" % acc,
      sep="\n")

# Undersampling

## Random Undersampling

In [None]:
# Reset to the scaled training frame: TAG target vs. the remaining predictor columns.
y_train = train_sc['TAG']
X_train = train_sc.drop(columns=['TAG'])

In [None]:
# Re-wrap predictors and target as DataFrames with explicit column names.
# NOTE(review): assumes train_sc carries exactly these four feature columns — confirm against the CSV.
X_train = pd.DataFrame(X_train, columns=['MELT_TEMP', 'MOTORSPEED', 'MELT_WEIGHT', 'INSP'])
y_train = pd.DataFrame(y_train, columns=['TAG'])

In [None]:
# Random UnderSampling
# The original instantiated RandomOverSampler here, so this section silently
# repeated the oversampling experiment. RandomUnderSampler with
# sampling_strategy='majority' shrinks the majority class instead, and the
# seed fixes which majority rows are kept.
rus = RandomUnderSampler(sampling_strategy='majority', random_state=42)
rus_feature, rus_label = rus.fit_resample(X_train, y_train) # Equal to the number of minor class

In [None]:
# Make Sequence: 10-step windows over the resampled rows, each paired with the next label.
train_feature, train_label = make_dataset(data=rus_feature, label=rus_label, window_size=10)

In [None]:
# Hold-out windows must use the same window length as the training windows
# (10). The original used 30, which does not match the model's
# (10, n_features) input shape at predict time.
X_test = test_sc.drop(['TAG'],axis=1)
y_test = test_sc['TAG']
test_feature, test_label = make_dataset(X_test, y_test, 10)

In [None]:
# Sanity check: class counts in the resampled training labels.
pd.DataFrame(rus_label).value_counts()

In [None]:
# 70/30 train/validation split of the windows, seeded so the shuffle is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_feature, train_label, test_size=0.3, random_state=42
)

In [None]:
# Fresh LSTM(50) -> sigmoid classifier, sized from the windowed training array.
model_basic = Sequential([
    LSTM(50,
         activation='tanh',
         return_sequences=False,
         input_shape=(train_feature.shape[1], train_feature.shape[2])),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Adam + binary cross-entropy, tracking the custom F1 metric.
model_basic.compile(optimizer='adam',
                    loss='binary_crossentropy',
                    metrics=[get_f1])
# Give up after 5 epochs without a validation-loss improvement.
early_stop = EarlyStopping(patience=5, monitor='val_loss')
model_basic.summary()

In [None]:
# Up to 30 epochs in mini-batches of 32, with early stopping on validation loss.
history = model_basic.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_valid, y_valid),
    batch_size=32,
    epochs=30,
    callbacks=[early_stop],
)

In [None]:
# Model outputs for each test window in the undersampling experiment.
pred = model_basic.predict(test_feature)

In [None]:
# One-column frame (TAG) holding the raw outputs, ready for thresholding.
pred_df = pd.DataFrame(pred, columns=['TAG'])

In [None]:
# Vectorised 0.5 threshold (replaces the per-element lambda); yields a 0/1 integer column.
pred_df['TAG'] = (pred_df['TAG'] >= 0.5).astype('int64')

In [None]:
# Hold-out precision / recall / F1 / accuracy for the undersampling experiment.
p, r, f1, acc = (
    metric(test_label, pred_df)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

print("Precision: %0.4f" % p,
      "Recall: %0.4f" % r,
      "F1-score: %0.4f" % f1,
      "Accuracy: %0.4f" % acc,
      sep="\n")

## Tomek Links

In [None]:
# Reset to the scaled training frame: TAG target vs. the remaining predictor columns.
y_train = train_sc['TAG']
X_train = train_sc.drop(columns=['TAG'])

In [None]:
# Tomek Links
# Clean the training rows with TomekLinks, targeting the majority class only;
# the sampler is given plain NumPy arrays.
tl = TomekLinks(sampling_strategy='majority')
tl_feature, tl_label = tl.fit_resample(X_train.to_numpy(), y_train.to_numpy())

In [None]:
# make_dataset uses .iloc, so the resampled NumPy arrays are wrapped in DataFrames first.
train_feature, train_label = make_dataset(
    data=pd.DataFrame(tl_feature), label=pd.DataFrame(tl_label), window_size=10
)

In [None]:
# Hold-out windows come straight from the scaled test rows (no resampling).
y_test = test_sc['TAG']
X_test = test_sc.drop(columns=['TAG'])
test_feature, test_label = make_dataset(data=X_test, label=y_test, window_size=10)

In [None]:
# 70/30 train/validation split of the windows, seeded so the shuffle is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_feature, train_label, test_size=0.3, random_state=42
)

In [None]:
# Fresh LSTM(50) -> sigmoid classifier, sized from the windowed training array.
model_basic = Sequential([
    LSTM(50,
         activation='tanh',
         return_sequences=False,
         input_shape=(train_feature.shape[1], train_feature.shape[2])),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Adam + binary cross-entropy, tracking the custom F1 metric.
model_basic.compile(optimizer='adam',
                    loss='binary_crossentropy',
                    metrics=[get_f1])
# Give up after 5 epochs without a validation-loss improvement.
early_stop = EarlyStopping(patience=5, monitor='val_loss')
model_basic.summary()

In [None]:
# Up to 30 epochs in mini-batches of 32, with early stopping on validation loss.
history = model_basic.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_valid, y_valid),
    batch_size=32,
    epochs=30,
    callbacks=[early_stop],
)

In [None]:
# Model outputs for each test window in the Tomek-links experiment.
pred = model_basic.predict(test_feature)

In [None]:
# One-column frame (TAG) holding the raw outputs, ready for thresholding.
pred_df = pd.DataFrame(pred, columns=['TAG'])

In [None]:
# Vectorised 0.5 threshold (replaces the per-element lambda); yields a 0/1 integer column.
pred_df['TAG'] = (pred_df['TAG'] >= 0.5).astype('int64')

In [None]:
# Hold-out precision / recall / F1 / accuracy for the Tomek-links experiment.
p, r, f1, acc = (
    metric(test_label, pred_df)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

print("Precision: %0.4f" % p,
      "Recall: %0.4f" % r,
      "F1-score: %0.4f" % f1,
      "Accuracy: %0.4f" % acc,
      sep="\n")

# Hybrid

## SMOTETomek

In [None]:
# Reset to the scaled training frame: TAG target vs. the remaining predictor columns.
y_train = train_sc['TAG']
X_train = train_sc.drop(columns=['TAG'])

In [None]:
#SMOTETomek
# Combined SMOTE oversampling + Tomek-link cleaning. The seed is fixed so the
# synthetic samples (and hence the scores) are reproducible across runs.
smtt = SMOTETomek(random_state=42)
x_smtt, y_smtt = smtt.fit_resample(X_train, y_train)

In [None]:
# Wrap the resampled data in DataFrames and cut it into 10-step windows.
train_feature, train_label = make_dataset(
    data=pd.DataFrame(x_smtt), label=pd.DataFrame(y_smtt), window_size=10
)

In [None]:
# Hold-out windows come straight from the scaled test rows (no resampling).
y_test = test_sc['TAG']
X_test = test_sc.drop(columns=['TAG'])
test_feature, test_label = make_dataset(data=X_test, label=y_test, window_size=10)

In [None]:
# 70/30 train/validation split of the windows, seeded so the shuffle is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_feature, train_label, test_size=0.3, random_state=42
)

In [None]:
# Fresh LSTM(50) -> sigmoid classifier, sized from the windowed training array.
model_basic = Sequential([
    LSTM(50,
         activation='tanh',
         return_sequences=False,
         input_shape=(train_feature.shape[1], train_feature.shape[2])),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Adam + binary cross-entropy, tracking the custom F1 metric.
model_basic.compile(optimizer='adam',
                    loss='binary_crossentropy',
                    metrics=[get_f1])
# Give up after 5 epochs without a validation-loss improvement.
early_stop = EarlyStopping(patience=5, monitor='val_loss')
model_basic.summary()

In [None]:
# Up to 30 epochs in mini-batches of 32, with early stopping on validation loss.
history = model_basic.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_valid, y_valid),
    batch_size=32,
    epochs=30,
    callbacks=[early_stop],
)

In [None]:
# Model outputs for each test window in the SMOTETomek experiment.
pred = model_basic.predict(test_feature)

In [None]:
# One-column frame (TAG) holding the raw outputs, ready for thresholding.
pred_df = pd.DataFrame(pred, columns=['TAG'])

In [None]:
# Vectorised 0.5 threshold (replaces the per-element lambda); yields a 0/1 integer column.
pred_df['TAG'] = (pred_df['TAG'] >= 0.5).astype('int64')

In [None]:
# Hold-out precision / recall / F1 / accuracy for the SMOTETomek experiment.
p, r, f1, acc = (
    metric(test_label, pred_df)
    for metric in (precision_score, recall_score, f1_score, accuracy_score)
)

In [None]:
# Flattened confusion matrix first, then the four summary scores.
print("tn, fp, fn, tp =", confusion_matrix(test_label, pred_df).ravel())
print("\n")
print("Precision: %0.4f" % p,
      "Recall: %0.4f" % r,
      "F1-score: %0.4f" % f1,
      "Accuracy: %0.4f" % acc,
      sep="\n")