In [30]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from datetime import datetime
from sklearn import preprocessing
import os
from sklearn.metrics import *
%matplotlib inline

In [31]:
from keras.models import Sequential,load_model
from keras.layers import Dense, LSTM, BatchNormalization, Dropout, Input, TimeDistributed, RepeatVector
from tensorflow.keras.optimizers import Adam 
from tensorflow.keras import optimizers
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.regularizers import l2
from time import time
from sklearn.preprocessing import MinMaxScaler  # MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.callbacks import Callback,ModelCheckpoint
from keras.wrappers.scikit_learn import KerasClassifier
import keras.backend as K
from imblearn.over_sampling import SMOTE
import os
from keras import regularizers
from keras.models import Model

In [32]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [33]:
df = pd.read_csv('./melting_tank.csv',
                 parse_dates=['STD_DT'],
                 infer_datetime_format=True,
                 index_col='STD_DT'
                )
df.drop(['NUM'],axis=1,inplace=True) # NUM column 제거
print(f'data shape = {df.shape}')
df.head(20)

data shape = (835200, 5)


Unnamed: 0_level_0,MELT_TEMP,MOTORSPEED,MELT_WEIGHT,INSP,TAG
STD_DT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-03-04 00:00:00,489,116,631,3.19,OK
2020-03-04 00:00:00,433,78,609,3.19,OK
2020-03-04 00:00:00,464,154,608,3.19,OK
2020-03-04 00:00:00,379,212,606,3.19,OK
2020-03-04 00:00:00,798,1736,604,3.21,OK
2020-03-04 00:00:00,743,1722,603,3.21,OK
2020-03-04 00:00:00,390,212,602,3.19,OK
2020-03-04 00:00:00,493,152,600,3.19,OK
2020-03-04 00:00:00,427,0,599,3.19,OK
2020-03-04 00:00:00,489,148,598,3.2,OK


In [34]:
encoder = preprocessing.LabelEncoder()
df['TAG'] = encoder.fit_transform(df['TAG']) # categorical variable to numerical variable
df['TAG'] = df['TAG'].astype(int)

In [35]:
split_date = int(df.shape[0]*0.7)
train = df[:split_date]
test = df[split_date:]

In [36]:
scaler = MinMaxScaler()  # MinMaxScaler(feature_range=(-1,1))
train_sc = pd.DataFrame(scaler.fit_transform(train), index=train.index, columns=train.columns)
test_sc = pd.DataFrame(scaler.transform(test), index=test.index, columns=test.columns)

In [37]:
X_train = train_sc.drop(['TAG'],axis=1)
y_train = train_sc['TAG']

In [41]:
X_train.values.shape

(584640, 4)

In [9]:
# Class Imbalance 처리
#smote = SMOTE(random_state=0)
#X_train, y_train = smote.fit_resample(X_train.values,y_train.values)

In [10]:
# Raw data -> Sequence로 만들기
def make_dataset(data, label, window_size):
    feature_list = []
    label_list = []
    for i in range(len(data)-window_size):
        feature_list.append(np.array(data.iloc[i:i+window_size]))
        label_list.append(np.array(label.iloc[i+window_size]))
    return np.array(feature_list), np.array(label_list)

In [11]:
X_train = pd.DataFrame(X_train, columns=['MELT_TEMP', 'MOTORSPEED', 'MELT_WEIGHT', 'INSP'])
y_train = pd.DataFrame(y_train, columns=['TAG'])
train_feature, train_label = make_dataset(X_train, y_train, 10)

In [12]:
X_test = test_sc.drop(['TAG'],axis=1)
y_test = test_sc['TAG']
test_feature, test_label = make_dataset(X_test, y_test, 10)

In [13]:
X_train, X_valid, y_train, y_valid = train_test_split(train_feature, train_label, test_size = 0.3)

In [14]:
X_train.shape, y_train.shape

((409241, 10, 4), (409241, 1))

In [15]:
# F1-Score Metric으로 사용하기위한 함수
def get_f1(y_true, y_pred): 
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [16]:
model_basic = Sequential()
model_basic.add(LSTM(50, input_shape=(train_feature.shape[1], train_feature.shape[2]),
                    activation='tanh', return_sequences=False))
model_basic.add(Dense(1, activation='sigmoid'))

In [20]:
epochs = 30
batch = 32
lr = 0.0001

In [21]:
model_basic.compile(loss='binary_crossentropy',
              metrics=[get_f1],
              optimizer = optimizers.Adam(lr))
early_stop = EarlyStopping(monitor='val_loss', patience=5)
model_basic.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 50)                11000     
_________________________________________________________________
dense (Dense)                (None, 1)                 51        
Total params: 11,051
Trainable params: 11,051
Non-trainable params: 0
_________________________________________________________________


In [22]:
history = model_basic.fit(X_train, y_train, 
                          epochs = epochs, 
                          batch_size = batch, 
                          validation_data=(X_valid, y_valid), 
                          callbacks=[early_stop])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [23]:
pred = model_basic.predict(test_feature)

In [24]:
pred_df = pd.DataFrame(pred, columns=['TAG'])

In [25]:
pred_df['TAG'] = pred_df['TAG'].apply(lambda x:1 if x>= 0.5 else 0)

In [26]:
p = precision_score(test_label, pred_df)
r = recall_score(test_label, pred_df)
f1 = f1_score(test_label, pred_df)
acc = accuracy_score(test_label, pred_df)

In [27]:
confusion_matrix(test_label, pred_df)

array([[  2143,    797],
       [ 66643, 180967]])

In [28]:
print("Precision: %0.4f" %p)
print("Recall: %0.4f" %r)
print("F1-score: %0.4f" %f1)
print("Accuracy: %0.4f" %acc)

Precision: 0.9956
Recall: 0.7309
F1-score: 0.8429
Accuracy: 0.7308
