# Load libraries & parameters

In [None]:
%matplotlib inline
import itertools as it
from sklearn.impute import SimpleImputer, KNNImputer
import numpy as np 
import pandas as pd 
import math
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.utils import to_categorical
from keras.utils import np_utils
import seaborn as sns
import scipy.cluster.hierarchy as spc
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from jmetal.algorithm.singleobjective.genetic_algorithm import GeneticAlgorithm
from jmetal.operator import BinaryTournamentSelection
from jmetal.operator.crossover import PMXCrossover
from jmetal.operator.mutation import PermutationSwapMutation
from jmetal.util.termination_criterion import StoppingByEvaluations
from jmetal.util.observer import PrintObjectivesObserver
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_fscore_support, recall_score
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize
import datetime
import pickle
from pyod.models.auto_encoder import AutoEncoder
from sklearn.metrics import confusion_matrix
import dill

# Notebook-wide settings

# Data preparation
null_percentage = 0.1  # columns are kept only if their NaN fraction is below this
test_ratio = 0.2
random_state = 1  # seed handed to the splitter and the autoencoders

# Autoencoder training
epochs = 200
neurons = [32, 18, 18, 32]  # passed as hidden_neurons to the autoencoder

# Feature selection: the k best features are split evenly into m subsets
k_best = 120
m_subsets = 3
assert not k_best % m_subsets
n_features = k_best // m_subsets

# Feature correlation selection

In [None]:
from jmetal.core.problem import PermutationProblem
from jmetal.core.solution import PermutationSolution
import random

# Feature correlation selection
class FCS(PermutationProblem):
    """Feature correlation selection as a single-objective permutation problem.

    A solution is a permutation of the feature indices ``0..variables-1``.
    The permutation is read as ``subsets`` consecutive groups of
    ``n_features`` indices each. The fitness is the sum of the pairwise
    correlations inside every group, which the algorithm minimizes so that
    the features within one group are as uncorrelated as possible.

    Args:
        correlation: square table indexed as ``correlation[a][b]`` giving the
            correlation between feature ``a`` and feature ``b``.
        variables: total number of features in the permutation.
        subsets: number of groups the permutation is divided into.
    """

    def __init__(self, correlation, variables, subsets):
        super(FCS, self).__init__()

        self.correlation = correlation
        self.subsets = subsets
        # size of each group; callers are expected to pass a divisible pair
        self.n_features = int(variables/subsets)

        # Minimize: a lower sum means more independent features per subset
        self.obj_directions = [self.MINIMIZE]
        self.number_of_variables = variables
        self.number_of_objectives = 1
        self.number_of_constraints = 0

    def evaluate(self, solution: PermutationSolution) -> PermutationSolution:
        """Store the summed intra-subset pairwise correlation in ``solution``.

        Every one of the ``n_features`` entries of a subset takes part in the
        pairwise sum (the slice is ``[start:end]``, end-exclusive).
        """
        fitness = 0

        for i in range(self.subsets):
            start = self.n_features * i
            end = start + self.n_features

            for first, second in it.combinations(solution.variables[start:end], 2):
                fitness += self.correlation[first][second]

        # written once, after all subsets have been accumulated
        solution.objectives[0] = fitness

        return solution

    def create_solution(self) -> PermutationSolution:
        """Return a new solution holding a random permutation of all indices."""
        new_solution = PermutationSolution(number_of_variables=self.number_of_variables,
                                           number_of_objectives=self.number_of_objectives)
        new_solution.variables = random.sample(range(self.number_of_variables), k=self.number_of_variables)
        return new_solution

    def get_name(self):
        """Return the display name of the problem."""
        return 'Feature correlation selection'

# Load dataset

In [None]:
# Read the SECOM csv and discard the timestamp column
data = pd.read_csv('shared/uci-secom.csv').drop(columns=['Time'])

# Recode the label: -1 becomes 0, every other value is left untouched
data.loc[data['Fault'].eq(-1), 'Fault'] = 0

# Show the dimensions and the first rows
print(data.shape)
data.head()

In [None]:
# Count the rows per label and derive the ratio of label 1 to label 0
output_labels = data['Fault'].value_counts()
n_fault = output_labels[1]
n_pass = float(output_labels[0])
fault_fraction = n_fault / n_pass
print('fault_fraction', fault_fraction)
print(output_labels)
data.describe()

# Preprocessing

## Data cleaning

In [None]:
def drop_constant_column(dataframe):
    """Remove, in place, every column that holds exactly one distinct value.

    Missing values are ignored when counting distinct values, so a column
    made of one value plus NaNs is treated as constant. A column that is
    entirely NaN has zero distinct values and is kept. Non-numeric columns
    are handled as well.

    Args:
        dataframe: the pandas DataFrame to clean; it is modified in place.

    Returns:
        The same DataFrame object, without its constant columns.
    """
    constant_columns = [
        column for column in dataframe.columns
        if dataframe[column].nunique(dropna=True) == 1
    ]
    # one drop for all columns instead of one reallocation per column
    dataframe.drop(columns=constant_columns, inplace=True)
    return dataframe

# Remove columns that carry a single value
data = drop_constant_column(data)

# Keep a column only when its share of missing values is under the threshold
null_share = data.isna().mean()
data = data.loc[:, null_share < null_percentage]
data.head()

## Data imputation

In [None]:
# Fill every remaining missing value with 0
data = data.fillna(0)
# imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# imputer.fit(data)
# data = pd.DataFrame(imputer.transform(data), columns=data.columns)
print("Is there any null?", data.isnull().any().any())
data.head()

## Data normalization

In [None]:
from sklearn import preprocessing

# Min-max scale all columns (fitted on the complete frame, label column included)
min_max_scaler = preprocessing.MinMaxScaler()
x = data.values  # numpy array view of the frame
x_scaled = min_max_scaler.fit_transform(x)

# Wrap the scaled array back into a frame with the original labels
data = pd.DataFrame(data=x_scaled, index=data.index, columns=data.columns)

data.to_pickle('shared/data_preprocessed.pkl')
data.head()

## Dataset splitting

In [None]:
# Split data by class: all fault rows go to the test set
fail_test_set = data[data['Fault'] == 1]
pass_set = data[data['Fault'] == 0]

# number of pass rows moved from the training pool into the test set
n_pass_test_set = 210

# Randomly hold out pass rows; the remaining pass rows form the training set
train_set, pass_test_set = train_test_split(pass_set, test_size = n_pass_test_set, random_state = random_state)
test_set = pd.concat([fail_test_set, pass_test_set])

# Shuffle the test set with the shared seed so the saved split is reproducible
test_set = test_set.sample(frac=1, random_state=random_state)

# NOTE(review): assumes the label ('Fault') is the last column — confirm
x_train = train_set.iloc[:, :-1]
y_train = train_set.iloc[:, -1]

x_test = test_set.iloc[:, :-1]
y_test = test_set.iloc[:, -1]

print('Train set', x_train.shape)
print(y_train.value_counts())
print('Test set', x_test.shape)
print(y_test.value_counts())
print("Is there any null?", data.isnull().any().any())

# Persist the split for the other notebooks/scripts
x_train.to_pickle('shared/x_train.pkl')
y_train.to_pickle('shared/y_train.pkl')
x_test.to_pickle('shared/x_test.pkl')
y_test.to_pickle('shared/y_test.pkl')

# Edge deployment

## Select k features (most correlated to output)

In [None]:
# Feature scoring uses all rows (train and test together)
# NOTE(review): assumes the label ('Fault') is the last column — confirm
x_all = data.iloc[:, :-1]
y_all = data.iloc[:, -1]

# Keep the k_best features ranked by the ANOVA F-score against the label
selector = SelectKBest(f_classif, k=k_best)
k_features = selector.fit_transform(x_all, y_all)
support_mask = selector.get_support()  # boolean mask of the selected columns
k_best_index = x_all.columns[support_mask]

# Store the selected column names as a plain list
# (tolist must be called; pickling the bound method would not save the names)
with open('shared/k_best_index.pkl', 'wb') as f:
    pickle.dump(k_best_index.tolist(), f)

# Bar chart of the F-scores of the selected features, best first
k_best_names = x_all.columns.values[support_mask]
k_best_scores = selector.scores_[support_mask]
k_best_names_scores = list(zip(k_best_names, k_best_scores))
ns_df = pd.DataFrame(data = k_best_names_scores, columns=['Feature', 'F_Scores'])
ns_df_sorted = ns_df.sort_values(['F_Scores', 'Feature'], ascending = [False, True])
ns_df_sorted.plot(kind='barh', figsize=(20, 20))
plt.savefig('shared/F_Scores.png')
plt.show()

In [None]:
# Restrict the dataset to the selected k best feature columns
data_importance = data[k_best_index]
data_importance.head()

## Generate m subsets of n features (least correlated to each other)

In [None]:
# Absolute pairwise correlation between the selected features
corrmat = data_importance.corr().abs()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
# Plot the heat map from the matrix computed above (no second corr() pass)
g=sns.heatmap(corrmat,annot=False,cmap="RdYlGn")
plt.savefig('shared/P_Scores.png')

In [None]:
# Search for a permutation of the k_best features whose consecutive groups
# have the lowest summed absolute correlation
problem = FCS(np.abs(corrmat.values), k_best, m_subsets)

algorithm = GeneticAlgorithm(
    problem=problem,
    population_size=100,
    offspring_population_size=100,
    mutation=PermutationSwapMutation(1.0 / k_best),
    crossover=PMXCrossover(0.8),
    selection=BinaryTournamentSelection(),
    termination_criterion=StoppingByEvaluations(max=200000),
)

# Report the objective value periodically while the search runs
algorithm.observable.register(observer=PrintObjectivesObserver(10000))

algorithm.run()
result = algorithm.get_result()

# Summary of the best solution found
print(f'Algorithm: {algorithm.get_name()}')
print(f'Problem: {problem.get_name()}')
print(f'Solution: {result.variables}')
print(f'Fitness: {result.objectives[0]}')
print(f'Computing time: {algorithm.total_computing_time}')

In [None]:
# The GA permutation indexes the correlation matrix, which was built from
# data_importance, so the indices must be mapped through its columns
# (not through the columns of the full dataset).
features_allsubset = data_importance.columns[result.variables]

# Cut the ordered feature names into consecutive groups of n_features names
feature_subsets = [
    [str(name) for name in features_allsubset[start:start + n_features]]
    for start in range(0, n_features * m_subsets, n_features)
]

print(feature_subsets)
with open('shared/feature_subsets.pkl', 'wb') as f:
    pickle.dump(feature_subsets, f)

In [None]:
# Draw one absolute-correlation heat map per feature subset
for subset_i in range(m_subsets):
    temp = data.loc[:, feature_subsets[subset_i]]
    # a separate name keeps the full-matrix `corrmat` intact, so the
    # genetic-algorithm cell can still be re-run after this one
    subset_corr = temp.corr().abs()
    plt.figure(figsize=(20,20))
    # plot heat map from the matrix computed above (no second corr() pass)
    g=sns.heatmap(subset_corr,annot=False,cmap="RdYlGn")
    plt.savefig('edge-m{}/P_Scores.png'.format(subset_i+1))

# Training each subset

In [None]:
# Fit one autoencoder per edge node, each restricted to its own feature subset
for subset_i in range(m_subsets):
    subset_columns = feature_subsets[subset_i]
    print(subset_columns)
    x_train_subset = x_train[subset_columns]
    x_test_subset = x_test[subset_columns]

    clf_name = 'AutoEncoder'
    clf = AutoEncoder(
        epochs=epochs,
        hidden_neurons=neurons,
        contamination=fault_fraction,
        validation_size=0,
        random_state=random_state,
    )
    clf.fit(x_train_subset)

    # the fitted detector is serialized with dill, despite the .h5 file name
    title = 'edge-m{}/autoencoder.h5'.format(subset_i+1)
    with open(title, 'wb') as f:
        dill.dump(clf, f, dill.HIGHEST_PROTOCOL)

# Train on Cloud-k
# x_train_subset = x_train.loc[:, k_best_index]
# x_test_subset = x_test.loc[:, k_best_index]
# clf_name = 'AutoEncoder'
# clf = AutoEncoder(epochs=epochs, hidden_neurons=neurons, contamination=fault_fraction, validation_size=0, random_state=random_state)
# clf.fit(x_train_subset)
# title = 'cloud-k/autoencoder.h5'
# with open(title, 'wb') as f:
#     dill.dump(clf, f, dill.HIGHEST_PROTOCOL)

# Train on Cloud-all
# x_train_subset = x_train
# x_test_subset = x_test
# clf_name = 'AutoEncoder'
# clf = AutoEncoder(epochs=epochs, hidden_neurons=neurons, contamination=fault_fraction, validation_size=0, random_state=random_state)
# clf.fit(x_train_subset)
# title = 'cloud-all/autoencoder.h5'
# with open(title, 'wb') as f:
#     dill.dump(clf, f, dill.HIGHEST_PROTOCOL)

# Prediction

In [None]:
# Server
import socket
import json
import numpy as np
import dill


# Load the detector trained for edge node 1.
# NOTE: dill.load can execute arbitrary code; only load trusted model files.
title = 'edge-m1/autoencoder.h5'
with open(title, 'rb') as pickle_file:
    clf = dill.load(pickle_file)

host = '192.168.10.3'  # Server ip
port = 4000
# Largest possible UDP payload; a smaller buffer would truncate bigger
# JSON requests and make them undecodable.
max_datagram_size = 65535

# The context manager releases the socket when the loop is interrupted.
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
    s.bind((host, port))

    print("Server Started")
    while True:
        # `datagram`/`request` are used so the notebook's `data` frame is not overwritten
        datagram, addr = s.recvfrom(max_datagram_size)
        print("Message from: " + str(addr))
        try:
            request = json.loads(datagram.decode())
            x_test_subset = np.array(request["msg"])
            print("From connected user: ", len(x_test_subset))
            # a request carries one sample: reshape it to a single row
            x_test_subset = x_test_subset.reshape(1, -1)
            y_test_pred = clf.predict(x_test_subset)
        except (ValueError, KeyError, TypeError) as err:
            # a malformed request must not take the server down
            print("Ignoring malformed request: ", err)
            continue
        message = json.dumps({"msg": np.array(y_test_pred).tolist()})
        s.sendto(message.encode(), addr)

In [None]:
# Client
# import datetime
# import numpy as np

# m_subsets = 3
# x_test = np.load('x_test.npy')
# feature_subsets = np.load('feature_subsets.npy')
# k_best_index = np.load('k_best_index.npy')

# # Test on each edge
# subset_i = 0
# x_test_subset = x_test.loc[:, feature_subsets[subset_i]]
# start = datetime.datetime.now()
# #     send
# end = datetime.datetime.now()
# interval = end - start
    
# # Test on Cloud-k
# x_test_subset = x_test.loc[:, k_best_index]
# start = datetime.datetime.now()
# #     send
# end = datetime.datetime.now()
# interval = end - start

# # Test on Cloud-all
# x_test_subset = x_test
# start = datetime.datetime.now()
# #     send
# end = datetime.datetime.now()
# interval = end - start

In [None]:

# # get the prediction on the test data
# start = datetime.datetime.now()
# y_test_pred = clf.predict(x_test_subset)  # outlier labels (0 or 1)
# end = datetime.datetime.now()
# interval_test = end - start

# # Confusion matrix and classification report

# matrix = confusion_matrix(y_test, y_test_pred)
# # matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]
# sns.heatmap(matrix,annot=True,cbar=False, fmt='g')
# plt.ylabel('True Label')
# plt.xlabel('Predicted Label')
# plt.title('Confusion matrix of subset {}'.format(subset_i+1))
# plt.savefig('subset{}-confusionmatrix.png'.format(subset_i))

# y_test_scores = clf.decision_function(x_test_subset)  # outlier scores
# print('Subset {}'.format(subset_i+1))
# print(y_test.value_counts())
# print('roc_auc_score', roc_auc_score(y_test, y_test_scores))
# print('interval test', interval_test)
# # evaluate and print the results
# print("\nOn Test Data:")
# evaluate_print(clf_name, y_test, y_test_scores)
# print(classification_report(y_test, y_test_pred))

## ROC graph

In [None]:
# subset0_y_test_pred = np.load('edge-m1/y_test_pred.npy')
# subset1_y_test_pred = np.load('edge-m2/y_test_pred.npy')
# subset2_y_test_pred = np.load('edge-m3/y_test_pred.npy')
# subset80_y_test_pred = np.load('subset80/y_test_pred.npy')
# subset90_y_test_pred = np.load('subset90/y_test_pred.npy')

# subset0_y_test_scores = np.load('edge-m1/y_test_scores.npy')
# subset1_y_test_scores = np.load('edge-m2/y_test_scores.npy')
# subset2_y_test_scores = np.load('edge-m3/y_test_scores.npy')
# subset80_y_test_scores = np.load('cloud-k/y_test_scores.npy')
# subset90_y_test_scores = np.load('cloud-all/y_test_scores.npy')

# fpr0, tpr0, thresholds0 = roc_curve(y_test, subset0_y_test_scores)
# fpr1, tpr1, thresholds1 = roc_curve(y_test, subset1_y_test_scores)
# fpr2, tpr2, thresholds2 = roc_curve(y_test, subset2_y_test_scores)
# fpr80, tpr80, thresholds80 = roc_curve(y_test, subset80_y_test_scores)
# fpr90, tpr90, thresholds90 = roc_curve(y_test, subset90_y_test_scores)

# plt.plot(fpr0, tpr0, label='ROC curve Edge-m1')
# plt.plot(fpr1, tpr1, label='ROC curve Edge-m2')
# plt.plot(fpr2, tpr2, label='ROC curve Edge-m3')
# plt.plot(fpr80, tpr80, label='ROC curve Cloud-k')
# plt.plot(fpr90, tpr90, label='ROC curve Cloud-all')
# plt.plot([0, 1], [0, 1], 'k--', label='Baseline')
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curve')
# plt.xlim([-0.02, 1])
# plt.ylim([0, 1.02])
# plt.legend(loc="lower right")
# plt.savefig('shared/roccurves.png')

# plt.plot(fpr80, tpr80, label='ROC curve Cloud-k')
# plt.plot(fpr90, tpr90, label='ROC curve Cloud-all')
# plt.plot([0, 1], [0, 1], 'k--', label='Baseline')
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curve')
# plt.xlim([-0.02, 1])
# plt.ylim([0, 1.02])
# plt.legend(loc="lower right")
# plt.savefig('shared/roccurves-cloud.png')

# plt.plot(fpr0, tpr0, label='ROC curve Edge-m1')
# plt.plot(fpr1, tpr1, label='ROC curve Edge-m2')
# plt.plot(fpr2, tpr2, label='ROC curve Edge-m3')
# plt.plot([0, 1], [0, 1], 'k--', label='Baseline')
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curve')
# plt.xlim([-0.02, 1])
# plt.ylim([0, 1.02])
# plt.legend(loc="lower right")
# plt.savefig('shared/roccurves-edge.png')