### Model


#### Importing libraries

In [None]:
#Importing libraries
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
from keras.layers import Dense, LSTM, BatchNormalization
from keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras


#### ETL

In [None]:
#Run configuration for the whole notebook

choice = 0 #1 runs the ETL on the raw CSV files; 0 loads the previously exported train/test CSV files

save_model = 1 #1 builds, trains and saves a new model in .h5 format; 0 loads the saved model instead

prediction_file = "" #File on which prediction will occur

#If choice is 1, SIZE sets how many rows are taken from each file

SIZE = 250000 #Rows taken from each 'normal' file for training purposes

if choice == 1:
  #The ETL step takes rows [0, SIZE) of each normal file for training and rows
  #[SIZE, int(SIZE+SIZE*0.25)) for testing, so report exactly those two counts.
  print("Setup for train size is:", SIZE)
  print("Setup for test size is:", int(SIZE+SIZE*0.25) - SIZE)
else:
  print("Skipping the ETL on next step...")
  

In [None]:
if choice == 1:
  #Builds the train/test CSV files: rows from the 'normal' files come first
  #(label 0), followed by rows from every 'imbalance' sub-folder (label 1).

  files_no = sorted(glob.glob('normal/*.csv'))
  folders_im = sorted(glob.glob('imbalance/*'))

  #Chunks are collected in lists and concatenated once at the end; np.append
  #inside the loops would re-copy the whole accumulated array for every file.
  #The empty (0,8) seed keeps the 8-column result even when no file is found.
  train_chunks = [np.empty((0,8), float)]
  test_chunks = [np.empty((0,8), float)]

  for f_on in files_no:
    source_data = np.loadtxt(f_on, delimiter=",")
    train_chunks.append(source_data[0:SIZE,:])
    test_chunks.append(source_data[SIZE:int(SIZE+SIZE*0.25),:])

  #Rows gathered so far all come from 'normal' files and will be labelled 0
  n_train_labels = sum(len(chunk) for chunk in train_chunks)
  n_test_labels = sum(len(chunk) for chunk in test_chunks)

  #Each imbalance file contributes SIZE // 6 training rows. The quotient goes to
  #its own name so the SIZE setting is not overwritten: re-running this cell
  #must not shrink the split any further.
  size_im, MOD = divmod(SIZE,6)

  for folder in folders_im:
    files_im = sorted(glob.glob( folder +'/*.csv'))
    for f_im in files_im:
      source_data = np.loadtxt(f_im, delimiter=",")
      train_chunks.append(source_data[0:size_im,:])
      test_chunks.append(source_data[size_im:int(size_im+size_im*0.25),:])

  train_data = np.concatenate(train_chunks, axis=0)
  test_data = np.concatenate(test_chunks, axis=0)

  np.savetxt("train_data.csv", train_data, delimiter=",")
  np.savetxt("test_data.csv", test_data, delimiter=",")

  #Labels follow the row order: zeros for the normal rows, ones for the rest
  train_label = np.concatenate((np.zeros(n_train_labels), np.ones(len(train_data)-n_train_labels)), axis=0)
  test_label = np.concatenate((np.zeros(n_test_labels), np.ones(len(test_data)-n_test_labels)), axis=0)

  np.savetxt("train_label.csv", train_label, delimiter=",")
  np.savetxt("test_label.csv", test_label, delimiter=",")
  print("Finished parsing the files")

else:
  print("Skipping this step...")


In [None]:
#Read the previously exported train/test arrays and labels from disk
#(only when the ETL step was not selected)
if choice != 0:
  print("Skipped loading files...")

else:
  train_data = np.loadtxt(fname='train_data.csv', delimiter=",")
  test_data = np.loadtxt(fname='test_data.csv', delimiter=",")
  train_label = np.loadtxt(fname='train_label.csv', delimiter=",")
  test_label = np.loadtxt(fname='test_label.csv', delimiter=",")
  print("Loaded data files...")


In [None]:
#Report the dimensions of the 'train' and 'test' arrays, one per line
print(train_data.shape, test_data.shape, sep="\n")


In [None]:
#Applying MinMaxScaler scaler
#The scaler is fitted on the training data only and that same fitted mapping is
#applied to the test data. Refitting on the test set would give the two splits
#different per-feature scalings. Test values outside the training range may
#therefore fall slightly outside [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)


In [None]:
#Restore the saved network from 'mafaulda.h5' when no new model will be trained

if save_model != 0:
  print("Skipping loading model...")
else:
  model = keras.models.load_model('mafaulda.h5')
  print("Model loaded...")


In [None]:
#Unpack the 2-D shape of the training array: input_size is the first dimension
#(rows), input_len the second (columns). The row count is shown as cell output.
input_size, input_len = np.shape(train_data)
input_size


In [None]:
#LSTM Model
if save_model != 1:
  print("Skipped model creation...")
else:
  #Two stacked LSTM layers, each followed by batch normalisation, then three
  #dense layers ending in a single sigmoid unit for the binary label.
  model = Sequential([
    LSTM(12, input_shape=(1, 8), activation='tanh', recurrent_activation='sigmoid', return_sequences=True, return_state=False),
    BatchNormalization(),
    LSTM(8, activation='tanh', recurrent_activation='sigmoid'),
    BatchNormalization(),
    Dense(64),
    Dense(12),
    Dense(1, activation='sigmoid'),
  ])
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  model.summary()
  print("Model created...")


In [None]:
#Reshaping the data: insert a length-1 axis between the row and column axes,
#turning (rows, columns) into (rows, 1, columns)
train_LSTM = np.expand_dims(train_data, axis=1)
test_LSTM = np.expand_dims(test_data, axis=1)


In [None]:
#Show the shapes of the reshaped 'train' and 'test' arrays
display("Train shape")
display(train_LSTM.shape)
print(end="\n\n")
display("Test shape")
display(test_LSTM.shape)


In [None]:
#Train the model (it was already compiled when it was created)
if save_model == 1:
  #train_label as written by the ETL step is a block of zeros followed by a
  #block of ones. With shuffle=False every batch of 5 would hold a single class
  #and the classes would be presented one after the other in each epoch, so the
  #samples are shuffled before every epoch. The LSTM sees one time step per
  #sample, so no ordering between samples needs to be preserved.
  history = model.fit(train_LSTM, train_label, epochs=10, batch_size=5, validation_data=(test_LSTM, test_label), verbose=2, shuffle=True)
else:
  print("Skipping model compilation because it was loaded from a saved one.")


In [None]:
#Plotting Training Accuracy and Validation Accuracy
#'history' is only created when the model is trained in this session
#(save_model == 1). When a saved model was loaded instead there is nothing to
#plot, and touching 'history' would raise a NameError on a fresh kernel.
if save_model == 1:
  fig, ax = plt.subplots()
  ax.plot(history.history['accuracy'], label='train')
  ax.plot(history.history['val_accuracy'], label='val')
  ax.set_title('Model Accuracy')
  ax.set_ylabel('accuracy')
  ax.set_xlabel('epoch')
  ax.legend(loc='upper right')
  plt.show()
else:
  print("Skipping accuracy plot because the model was loaded, not trained.")


In [None]:
#Saving the model
if save_model == 1:
  model.save('mafaulda.h5')
  !cp mafaulda.h5 drive/MyDrive/MAFAULDA/ #Copying it to Drive
  print("Model saved as mafaulda.h5")
else:
  print("Skipping model saving...")


In [None]:
#Evaluate the model on the training set. Despite its name, 'accuracy' holds
#everything model.evaluate returns, not only the accuracy value.
accuracy = model.evaluate(x=train_LSTM, y=train_label)
display(accuracy)


In [None]:
#Evaluate the model on the test set. 'test_accuracy' holds everything
#model.evaluate returns, not only the accuracy value.
test_accuracy = model.evaluate(x=test_LSTM, y=test_label)
display(test_accuracy)


### Predictions

In [None]:
#Loading a new data to predict
#Uses the file named in 'prediction_file' when one is configured; an empty value
#falls back to the exported test set, which is the file this cell loaded before.
source_data = np.loadtxt(prediction_file or "test_data.csv", delimiter=",")
display(source_data.shape)
display(source_data)


In [None]:
#Applying the MinMaxScaler to the data to predict
#The scaler fitted earlier in the notebook is reused rather than fitting a new
#one on the prediction data: a fresh fit would map every new file onto its own
#[0, 1] range instead of the scaling already established for the model inputs.
source_data = scaler.transform(source_data)
display(source_data.shape)
display(source_data)


In [None]:
#Insert a length-1 axis after the row axis: (rows, columns) -> (rows, 1, columns)
pred_data = np.expand_dims(source_data, axis=1)
pred_data.shape


In [None]:
#Predicting data
#Sequential.predict_classes was removed in TensorFlow 2.6. For a model ending in
#a single sigmoid unit the equivalent is thresholding predict() at 0.5, which
#yields the same (rows, 1) array of 0/1 integers.
y = (model.predict(pred_data, batch_size=5, verbose=1) > 0.5).astype("int32")


In [None]:
#Show, in order: number of rows, number of rows predicted as class 1, and their ratio
display(len(source_data), y.sum(), y.sum()/len(source_data))


In [None]:
#Loading a new data to predict (Normal data)
#For every file under 'normal/', predict the first 2500 rows and record the
#fraction of rows classified as 1, keyed by file name.
dictio = {}

for path in glob.glob('normal/*'):
  file = os.path.basename(path)

  source_data = np.loadtxt(path, delimiter=",")
  source_data = source_data[0:2500,:]
  display(source_data.shape)
  #The model was trained on MinMax-scaled features, so the raw rows must go
  #through the already fitted scaler before being fed to it.
  source_data = scaler.transform(source_data)
  pred_data = source_data.reshape((source_data.shape[0], 1, source_data.shape[1]))
  #predict_classes was removed in TensorFlow 2.6; threshold the sigmoid output instead
  y = (model.predict(pred_data, batch_size=5, verbose=1) > 0.5).astype("int32")
  ratio = y.sum()/len(source_data)
  print(file,ratio)
  dictio[file] = ratio

dictio


In [None]:
#Loading a new data to predict (Imbalance data)
#For every file under 'imbalance/25g/', predict the first 2500 rows and record
#the fraction of rows classified as 1, keyed by file name.
dictio_imbalance = {}

for path in glob.glob('imbalance/25g/*'):
  file = os.path.basename(path)

  source_data = np.loadtxt(path, delimiter=",")
  source_data = source_data[0:2500,:]
  display(source_data.shape)
  #The model was trained on MinMax-scaled features, so the raw rows must go
  #through the already fitted scaler before being fed to it.
  source_data = scaler.transform(source_data)
  pred_data = source_data.reshape((source_data.shape[0], 1, source_data.shape[1]))
  #predict_classes was removed in TensorFlow 2.6; threshold the sigmoid output instead
  y = (model.predict(pred_data, batch_size=5, verbose=1) > 0.5).astype("int32")
  ratio = y.sum()/len(source_data)
  print(file,ratio)
  dictio_imbalance[file] = ratio

dictio_imbalance
