In [1]:
#import important library
import numpy as np
import pandas as pd

import io #Importing input module

from sklearn.model_selection import train_test_split
from sklearn import linear_model  # Importing linear model

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches


#import model library
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
#import dataset

from google.colab import files
uploaded = files.upload()

import io
data = pd.read_csv(io.BytesIO(uploaded['biodeg.csv']), sep=';')
data

Unnamed: 0,SpMax_L,J_Dz(e),nHM,F01[N-N],F04[C-N],NssssC,nCb-,C%,nCp,nO,...,C-026,F02[C-N],nHDon,SpMax_B(m),Psi_i_A,nN,SM6_B(m),nArCOOR,nX,experimental_class
0,3.919,2.6909,0,0,0,0,0,31.4,2,0,...,0,0,0,2.949,1.591,0,7.253,0,0,RB
1,4.170,2.1144,0,0,0,0,0,30.8,1,1,...,0,0,0,3.315,1.967,0,7.257,0,0,RB
2,3.932,3.2512,0,0,0,0,0,26.7,2,4,...,0,0,1,3.076,2.417,0,7.601,0,0,RB
3,3.000,2.7098,0,0,0,0,0,20.0,0,2,...,0,0,1,3.046,5.000,0,6.690,0,0,RB
4,4.236,3.3944,0,0,0,0,0,29.4,2,4,...,0,0,0,3.351,2.405,0,8.003,0,0,RB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050,5.431,2.8955,0,0,0,2,0,32.1,4,1,...,0,6,1,3.573,2.242,1,8.088,0,0,NRB
1051,5.287,3.3732,0,0,9,0,0,35.3,0,9,...,0,3,0,3.787,3.083,3,9.278,0,0,NRB
1052,4.869,1.7670,0,1,9,0,5,44.4,0,4,...,4,13,0,3.848,2.576,5,9.537,1,0,NRB
1053,5.158,1.6914,2,0,36,0,9,56.1,0,0,...,1,16,0,5.808,2.055,8,11.055,0,1,NRB


**Data processing**

In [3]:
#Get binary classification of the response 
response = np.unique(data['experimental_class'])
print('Target variables  : ', response)

Target variables  :  ['NRB' 'RB']


In [4]:
#convert the binary classification clasess into numerical values 
#so that it can be plotted 
#'RB' is equal to value 1 (true)
#'NRB' is equal to value 0 (false)

def y_to_numeric(y):
  y_test_list = []
  for i, value in enumerate(y):
    if (value == response[1]):
      y_test_list.append(1)
    else:
      y_test_list.append(0)

  return np.array(y_test_list)

In [5]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

#Extract features and response into respective y and X variable
y = data['experimental_class'].values
y = y_to_numeric(y)
X = data[data.keys()[:-1]]

#Normalize data
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

**Features Selection**

In [6]:
# Import the feature selection library
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

#apply SelectKBest class to extract best features from the dataset
#ANOVA is used as the feature selection method
#the lower p values acquired, the stronger the relationship with the response
#the higher score acquired, the stronger the relationship with the response
bestfeatures = SelectKBest(score_func=f_classif)
fit = bestfeatures.fit(X,y)

dfpvalues = pd.DataFrame(fit.pvalues_)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(data.keys()[:-1])

featureScores = pd.concat([dfcolumns,dfpvalues,dfscores],axis=1)
featureScores.columns = ['Features','P Value', 'Score']  #naming the dataframe columns

pvalueTop = featureScores.nsmallest(featureScores.shape[0],'P Value')

#print the p value for each features in ascending order
#the first on the list has the strongest correlation with the target response
print ("Features selection using ANOVA method:")
print ("--------------------------------------")
print(pvalueTop) 

Features selection using ANOVA method:
--------------------------------------
       Features       P Value       Score
0       SpMax_L  5.741990e-41  195.999690
26      SpMax_A  1.205494e-39  188.835426
21  SpPosA_B(p)  5.126240e-36  169.390245
38     SM6_B(m)  6.093413e-35  163.690168
12    HyWi_B(m)  1.238373e-30  141.126544
14        SM6_L  1.462180e-30  140.752426
6          nCb-  1.769975e-29  135.151782
32        C-026  2.619938e-26  118.916644
2           nHM  3.022542e-23  103.463027
35   SpMax_B(m)  7.808043e-22   96.411675
13          LOC  8.342953e-20   86.365580
33     F02[C-N]  6.274198e-19   82.057578
37           nN  5.480339e-18   77.450978
24    B03[C-Cl]  9.297153e-17   71.466604
10     F03[C-N]  1.453584e-15   65.691318
4      F04[C-N]  1.167523e-14   61.339528
40           nX  1.920032e-12   50.773587
7            C%  3.884058e-11   44.611199
9            nO  6.867642e-09   34.129262
30        TI2_L  1.393576e-08   32.709270
5        NssssC  2.538854e-08   31.50832

In [7]:
# A feature is deemed significant if the p value acquired is less than 0.05
# The bottom 7 features consists of p value that is higher than 0.5
# So they are considered to have weak relationship with the target response
weak_features = 7

# Only select features with p value that is lower than 0.05
# To ensure only significant features used for model training
# Store best features in array for later us
featuresBest = pvalueTop['Features'][:-weak_features or None] #Assign

In [8]:
# Splitting the dataset into train and test set 
X = data[featuresBest].values
seed_num = 10
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed_num)

print(X_train.shape)
print(X_test.shape)

(844, 34)
(211, 34)


**Logistic Regression**

In [9]:
# Importing libraries for logistic regression
import numpy as np
from sklearn import linear_model, decomposition
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [10]:
# Creating objects required for pipelining
std_slc = StandardScaler()
pca = decomposition.PCA()
logistic_Reg = linear_model.LogisticRegression(max_iter=2000, solver='liblinear')

In [11]:
# Assigning all three objects to pipe
pipe = Pipeline(steps=[('std_slc', std_slc),
                        ('pca', pca),
                        ('logistic_Reg', logistic_Reg)])
C = np.logspace(-4, 4, 50)
penalty = ['l1', 'l2']
parameters = dict(logistic_Reg__C=C,
                  logistic_Reg__penalty=penalty)

In [13]:
# Displaying flow of GridSearchCV
clf_GS = GridSearchCV(pipe, parameters)
clf_GS.fit(X_train, y_train)

In [14]:
# Displaying best penalty, best C and other parameters of logistic regression
Best_Penalty = clf_GS.best_estimator_.get_params()['logistic_Reg__penalty']
Best_C = clf_GS.best_estimator_.get_params()['logistic_Reg__C']
print('Best Penalty:', Best_Penalty )
print('Best C:', Best_C )
print(); print(clf_GS.best_estimator_.get_params()['logistic_Reg'])

Best Penalty: l1
Best C: 2.559547922699533

LogisticRegression(C=2.559547922699533, max_iter=2000, penalty='l1',
                   solver='liblinear')


In [15]:
# Assigning linear_model.LogisticRegression to model
model = linear_model.LogisticRegression(penalty=Best_Penalty, C=Best_C, max_iter=2000, solver='liblinear' )

In [16]:
# Displaying the model of logistic regression
model.fit(X_train, y_train)

In [17]:
# Accuracy, confusion matrix and classification report of logistic regression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
y_pred = model.predict(X_test)
print("Accuracy:")
print(accuracy_score(y_test, y_pred))
print(" ")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print(" ")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy:
0.8767772511848341
 
Confusion Matrix:
[[135  12]
 [ 14  50]]
 
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.92      0.91       147
           1       0.81      0.78      0.79        64

    accuracy                           0.88       211
   macro avg       0.86      0.85      0.85       211
weighted avg       0.88      0.88      0.88       211



**Neural Network**

In [20]:
!pip install scikeras



In [27]:
# import library for NN model
import tensorflow as tf
import keras

from numpy import loadtxt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier

ModuleNotFoundError: No module named 'tensorflow'

In [23]:
# set default values
input_size = featuresBest.shape[0]
def_optimizer = 'adam'
def_init = 'uniform'

# Create a function that creates the model (required for KerasClassifier) 
# while accepting the hyperparameters we want to tune 
# we also pass some default values such as optimizer='adam'
def create_model (optimizer=def_optimizer, init_mode=def_init):
    model = Sequential()
    model.add(Dense(12, input_shape=(input_size,), kernel_initializer=init_mode, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
     # compile model
    model.compile(loss='binary_crossentropy', 
                  optimizer=optimizer, 
                  metrics=['accuracy'])
    return model

**Hyperparameters Optimization**

In [None]:
# Neural network model is influenced by a few hyperparameters
# that would affect the the overall performance of the model
# Optimization by this part of coding will focus on the following hyperparameters:
# batch sizes, epochs, optimizer algorithm and initialization mode

# Optimization is done using grid search method

In [None]:
%%time
# Optimize batch size and epoch
print ("The optimization process has been executed.")
print ("Please wait for a while ...")
# create model
model = KerasClassifier(model=create_model, verbose=0)
# define the grid search parameters
batch_size = [32, 64, 128]
epochs = [20, 50, 100, 200]
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=5)
grid_result = grid.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

best_batch, best_epoch = grid_result.best_params_['batch_size'], grid_result.best_params_['epochs']

In [25]:
%%time
#Optimize optimisation algorithm
print ("The optimization process has been executed.")
print ("Please wait for a while ...")
# create model
model = KerasClassifier(model=create_model, epochs=best_epoch, batch_size=best_batch, verbose=0)
# define the grid search parameters
optimizer = ['sgd', 'rmsprop', 'adam', 'adamax']
param_grid = dict(model__optimizer=optimizer)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=5)
grid_result = grid.fit(X_train, y_train)
# summarize results
print("\nBest: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

best_optimizer = grid_result.best_params_['model__optimizer']

The optimization process has been executed.
Please wait for a while ...


NameError: name 'KerasClassifier' is not defined

In [26]:
%%time
#Optimize init mode
print ("The optimization process has been executed.")
print ("Please wait for a while ...")
# create model
model = KerasClassifier(model=create_model, epochs=best_epoch, batch_size=best_batch, optimizer=best_optimizer, verbose=0)
# define the grid search parameters
init_mode = ['uniform', 'lecun_uniform', 'glorot_uniform', 'he_uniform']
param_grid = dict(model__init_mode=init_mode)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=5)
grid_result = grid.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

best_init = grid_result.best_params_['model__init_mode']

The optimization process has been executed.
Please wait for a while ...


NameError: name 'KerasClassifier' is not defined

In [None]:
# Display
print ("Optimized parameters:")
print ("---------------------")
print ("Init mode: ", best_init)
print ("Optimizer: ", best_optimizer)
print ("Batch sizes: ", best_batch)
print ("Epochs: ", best_epoch)


**Model Training**

In [None]:
# Function to further optimize the value of epoch 
# Best epoch is chosen so that it minimize the validation loss
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [24]:
# Train model using the optimal hyperparameters generated
model = create_model(optimizer=best_optimizer, init_mode=best_init)
model_history = model.fit(X_train, y_train, epochs=best_epoch, batch_size=best_batch, callbacks=[stop_early], validation_split=0.25)

optimize_epoch = len(model_history.history['val_loss'])
print('Optimize epoch:', optimize_epoch)

NameError: name 'best_optimizer' is not defined

In [24]:
# Display history of training, validation accuracy and loss for each epoch
# Best epoch is chosen so that it minimize the validation loss
score = model.evaluate(X_train, y_train, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

fig, ax = plt.subplots(1, 1, figsize=(10,6))
ax.plot(np.sqrt(model_history.history['accuracy']), 'r', label='train_acc')
ax.plot(np.sqrt(model_history.history['val_accuracy']), 'b' ,label='validation')
ax.set_xlabel(r'Epoch', fontsize=20)
ax.set_ylabel(r'Accuracy', fontsize=20)
ax.legend()
ax.tick_params(labelsize=20)

fig, ax = plt.subplots(1, 1, figsize=(10,6))
ax.plot(np.sqrt(model_history.history['loss']), 'r', label='train')
ax.plot(np.sqrt(model_history.history['val_loss']), 'b' ,label='validation')
ax.set_xlabel(r'Epoch', fontsize=20)
ax.set_ylabel(r'Loss', fontsize=20)
ax.legend()
ax.tick_params(labelsize=20)

AttributeError: 'LogisticRegression' object has no attribute 'evaluate'

In [25]:
# retrain model based on the best epoch 
model_history = model.fit(X_train, y_train, epochs=optimize_epoch, batch_size=best_batch, callbacks=[stop_early], validation_split=0.25)


NameError: name 'optimize_epoch' is not defined

In [None]:
# Evaluate the keras model for training dataset
_, accuracy = model.evaluate(X_train, y_train)
accuracy = round((accuracy*100),2)
print('Accuracy:', accuracy)

Accuracy: 82.58


In [None]:
# Evaluate the keras model for test dataset
_, accuracy = model.evaluate(X_test, y_test)
accuracy = round((accuracy*100),2)
print('Accuracy:', accuracy)

Accuracy: 87.68


In [None]:
# Dictionary to translate binary into original categorical class
convert_to_binary  = {'1': 'RB', '0': 'NRB'}
  

In [None]:
# make class predictions with the model
# display predictions in original categorical class
y_pred = (model.predict(X_test) > 0.5).astype(int)

print ("Value prediction:")
print("------------------")
for i in range(X_test.shape[0]):
	print('Record',[i+1],'=>', convert_to_binary[str(y_pred[i][0])], '(expected ', convert_to_binary[str(y_test[i])],')')

Value prediction:
------------------
Record [1] => NRB (expected  RB )
Record [2] => NRB (expected  NRB )
Record [3] => NRB (expected  NRB )
Record [4] => NRB (expected  NRB )
Record [5] => NRB (expected  NRB )
Record [6] => NRB (expected  NRB )
Record [7] => NRB (expected  NRB )
Record [8] => NRB (expected  NRB )
Record [9] => NRB (expected  NRB )
Record [10] => RB (expected  RB )
Record [11] => RB (expected  RB )
Record [12] => NRB (expected  NRB )
Record [13] => RB (expected  NRB )
Record [14] => RB (expected  RB )
Record [15] => RB (expected  RB )
Record [16] => NRB (expected  NRB )
Record [17] => RB (expected  RB )
Record [18] => NRB (expected  NRB )
Record [19] => NRB (expected  RB )
Record [20] => NRB (expected  NRB )
Record [21] => RB (expected  NRB )
Record [22] => RB (expected  NRB )
Record [23] => NRB (expected  NRB )
Record [24] => NRB (expected  NRB )
Record [25] => RB (expected  NRB )
Record [26] => RB (expected  RB )
Record [27] => RB (expected  RB )
Record [28] => RB (e

In [None]:
# print("Analysis Neural Network:")
# print("------------------------------------------------")

#Neural Network Analysis
print("Accuracy: ") 
print("---------")
print(accuracy_score(y_test, y_pred))

print("\n\nConfusion Matric:")
print("-------------------")
print(confusion_matrix(y_test, y_pred))

print("\n\nClassification Report: ")
print("----------------------")
print(classification_report(y_test, y_pred))


Accuracy: 
---------
0.8767772511848341


Confusion Matric:
-------------------
[[132  15]
 [ 11  53]]


Classification Report: 
----------------------
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       147
           1       0.78      0.83      0.80        64

    accuracy                           0.88       211
   macro avg       0.85      0.86      0.86       211
weighted avg       0.88      0.88      0.88       211

