# Utility

In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras import backend as K

Using TensorFlow backend.


In [2]:
class utility:
    """Stateless helpers shared by the notebook cells.

    Groups CSV I/O, text tokenisation/padding, GloVe-style embedding loading,
    scikit-learn evaluation metrics and Keras-backend metric functions.
    """

    def read_CSV(self, filename):
        """Read ``filename`` with ``pandas.read_csv`` and return the DataFrame."""
        return pd.read_csv(filename)

    def get_text_label(self, df):
        """Return ``(texts, labels)`` collected row by row from ``df``.

        ``texts`` is taken from the ``text_cleaned`` column and ``labels`` from
        the ``target`` column.  A cell of ``text_cleaned`` that is a float is
        converted with ``str`` (presumably to cope with NaN left by empty
        cells -- confirm against the dataset); every other value is kept as is.
        """
        texts = []   # text samples
        labels = []  # label ids
        for _, row in df.iterrows():
            text = row['text_cleaned']
            texts.append(str(text) if isinstance(text, float) else text)
            labels.append(row['target'])

        return texts, labels

    def tokenize_texts(self, texts):
        """Fit a Keras ``Tokenizer`` (``num_words=50000``) on ``texts`` and return it."""
        tokenizer = Tokenizer(num_words=50000)
        tokenizer.fit_on_texts(texts)

        return tokenizer

    def padding_texts(self, texts, maxlen):
        """Pad the sequences in ``texts`` to ``maxlen`` with ``padding='post'``."""
        return pad_sequences(texts, padding='post', maxlen=maxlen)

    def get_metric(self, y_true, y_pred):
        """Return ``(accuracy, precision, recall, f1)`` computed by scikit-learn."""
        accuracyScore = accuracy_score(y_true, y_pred)
        precisionScore = precision_score(y_true, y_pred)
        recallScore = recall_score(y_true, y_pred)
        # Inside a method the bare name resolves to the sklearn function
        # imported at module level, not to the ``f1_score`` method below.
        f1Score = f1_score(y_true, y_pred)

        return accuracyScore, precisionScore, recallScore, f1Score

    def print_metric(self, accuracyScore, precisionScore, recallScore, f1Score):
        """Print the four scores, one labelled line each, then one comma-separated line."""
        print("Accuracy: {}".format(str(accuracyScore)))
        print("Precision: {}".format(str(precisionScore)))
        print("Recall: {}".format(str(recallScore)))
        print("F1-Score: {}".format(str(f1Score)))
        print("{},{},{},{}".format(str(accuracyScore), str(precisionScore), str(recallScore), str(f1Score)))

    def get_testing_metric(self, y_test, y_pred):
        """Return the same 4-tuple as :meth:`get_metric` for ``y_test`` / ``y_pred``."""
        accuracyScore, precisionScore, recallScore, f1Score = self.get_metric(y_test, y_pred)

        return accuracyScore, precisionScore, recallScore, f1Score

    def write_df_csv(self, df, out_path):
        """Write ``df`` to ``out_path`` as CSV without the index column."""
        df.to_csv(out_path, index=False)

    def create_embedding_matrix(self, filepath, word_index, embedding_dim):
        """Build a ``(len(word_index) + 1, embedding_dim)`` matrix from a vector file.

        Each non-blank line of ``filepath`` is split on whitespace: the first
        field is the word, the remaining fields are its vector.  For words
        present in ``word_index`` the first ``embedding_dim`` components are
        stored in the row ``word_index[word]``; all other rows stay zero.
        """
        vocab_size = len(word_index) + 1  # +1 because index 0 is reserved
        embedding_matrix = np.zeros((vocab_size, embedding_dim))

        with open(filepath, encoding="utf8") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Blank line (e.g. a trailing newline): nothing to unpack.
                    continue
                word, vector = fields[0], fields[1:]
                if word in word_index:
                    embedding_matrix[word_index[word]] = np.array(
                        vector, dtype=np.float32)[:embedding_dim]

        return embedding_matrix

    def get_max_length_of_sentences(self, texts):
        """Return the largest whitespace-token count among ``texts`` (0 if empty)."""
        return max((len(text.split()) for text in texts), default=0)

    def get_training_trial_data(self, textsTraining, labelsTraining, textsTrial, labelsTrial, glovePath):
        """Turn raw texts/labels into padded sequences plus embedding matrices.

        The tokenizer and ``maxlen`` are derived from the training texts only.
        ``glovePath[0..3]`` are read as the 50-, 100-, 200- and 300-dimensional
        embedding files, in that order.

        Returns ``(X_train, X_val, y_train, y_val, vocab_size, maxlen,
        embedding_matrix)`` where ``embedding_matrix`` is a list of four arrays.
        """
        textsTraining, textsTesting = np.asarray(textsTraining), np.asarray(textsTrial)
        y_train, y_val = np.asarray(labelsTraining), np.asarray(labelsTrial)

        # Tokenize words
        tokenizer = self.tokenize_texts(textsTraining)
        X_train = tokenizer.texts_to_sequences(textsTraining)
        X_val = tokenizer.texts_to_sequences(textsTesting)

        # Adding 1 because of reserved 0 index
        vocab_size = len(tokenizer.word_index) + 1

        maxlen = self.get_max_length_of_sentences(textsTraining)

        # Pad sequences with zeros
        X_train = self.padding_texts(X_train, maxlen)
        X_val = self.padding_texts(X_val, maxlen)

        # One matrix per supported embedding width, in the order crossfold() expects.
        embedding_matrix = [
            self.create_embedding_matrix(glovePath[position], tokenizer.word_index, dim)
            for position, dim in enumerate((50, 100, 200, 300))
        ]

        return X_train, X_val, y_train, y_val, vocab_size, maxlen, embedding_matrix

    def Average(self, list):
        """Return the arithmetic mean of ``list``.

        The parameter name shadows the builtin; it is kept for compatibility
        with keyword callers.
        """
        return sum(list) / len(list)

    def recall(self, y_true, y_pred):
        """Recall computed with Keras backend ops on rounded, clipped tensors."""
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        # K.epsilon() keeps the division defined when there are no positives.
        return true_positives / (possible_positives + K.epsilon())

    def precision(self, y_true, y_pred):
        """Precision computed with Keras backend ops on rounded, clipped tensors."""
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        return true_positives / (predicted_positives + K.epsilon())

    def f1_score(self, y_true, y_pred):
        """Harmonic mean of :meth:`precision` and :meth:`recall` (usable as a Keras metric)."""
        precision = self.precision(y_true, y_pred)
        recall = self.recall(y_true, y_pred)
        return 2*((precision*recall)/(precision+recall+K.epsilon()))

# Finite State Machine

In [3]:
import random

def FSM():
    """Return the transition table used to sample layer sequences.

    The result maps a transition id (0..8) to a dict with the keys ``src``,
    ``dst``, ``layer`` (the layer name emitted by the transition) and
    ``next_path`` (ids of the transitions that may follow; empty for the
    output layer).
    """
    # (src, dst, layer, next_path) -- the position in the tuple is the transition id.
    transitions = (
        (0, 1, 'embedding_layer', [1]),
        (1, 2, 'convolutional_layer', [2, 4]),
        (2, 3, 'maxpooling_layer', [3]),
        (3, 2, 'convolutional_layer', [2, 4]),
        (2, 4, 'global_maxpooling_layer', [5]),
        (4, 5, 'dense_layer', [6, 7]),
        (5, 5, 'dense_layer', [6, 7]),
        (5, 6, 'dropout_layer', [8]),
        (6, 7, 'output_layer', []),
    )

    return {
        transition_id: {'src': src, 'dst': dst, 'layer': layer, 'next_path': next_path}
        for transition_id, (src, dst, layer, next_path) in enumerate(transitions)
    }

def addConvLayer(idx, toolbox, toolboxes, defaultVal, layerparameters):
    """Register the five generators of convolutional layer number ``idx``.

    For each convolution hyperparameter an alias ``<name><idx>`` is registered
    on ``toolbox`` from the leading entries of ``layerparameters[<name>]``, the
    registered attribute is appended to ``toolboxes``, and afterwards a default
    value is stored in ``defaultVal`` under the same alias.  The order
    (num_filters, kernel_size, activation, init mode, weight constraint) is the
    same for ``toolboxes`` and ``defaultVal``.
    """
    # (hyperparameter name, entries read from layerparameters[name], default value)
    conv_genes = (
        ('num_filters', 3, 64),
        ('kernel_size', 3, 3),
        ('conv_activation_func', 2, "relu"),
        ('conv_init_mode', 2, "glorot_uniform"),
        ('conv_weight_constraint', 3, 3),
    )
    suffix = str(idx)

    for name, entries, _ in conv_genes:
        alias = '{}{}'.format(name, suffix)
        spec = layerparameters[name]
        toolbox.register(alias, *[spec[position] for position in range(entries)])
        toolboxes.append(getattr(toolbox, alias))

    # Defaults are written only after every generator has been registered.
    for name, _, default in conv_genes:
        defaultVal['{}{}'.format(name, suffix)] = default


def addDenseLayer(idx, toolbox, toolboxes, defaultVal, layerparameters):
    """Register the four generators of dense layer number ``idx``.

    For each dense hyperparameter an alias ``<name><idx>`` is registered on
    ``toolbox`` from the leading entries of ``layerparameters[<name>]``, the
    registered attribute is appended to ``toolboxes``, and afterwards a default
    value is stored in ``defaultVal`` under the same alias.
    """
    # (hyperparameter name, entries read from layerparameters[name], default value)
    dense_genes = (
        ('neurons', 3, 1),
        ('dense_activation_func', 2, "relu"),
        ('dense_init_mode', 2, "glorot_uniform"),
        ('dense_weight_constraint', 3, 3),
    )
    suffix = str(idx)

    for name, entries, _ in dense_genes:
        alias = '{}{}'.format(name, suffix)
        spec = layerparameters[name]
        toolbox.register(alias, *[spec[position] for position in range(entries)])
        toolboxes.append(getattr(toolbox, alias))

    # Defaults are written only after every generator has been registered.
    for name, _, default in dense_genes:
        defaultVal['{}{}'.format(name, suffix)] = default


def addMaxPoolingLayer(idx, toolbox, toolboxes, defaultVal, layerparameters):
    """Register the ``pool_size<idx>`` generator and its default value (5).

    The alias is registered on ``toolbox`` from the first two entries of
    ``layerparameters['pool_size']``; the registered attribute is appended to
    ``toolboxes`` and the default is stored in ``defaultVal``.
    """
    alias = 'pool_size{}'.format(str(idx))
    function, argument = layerparameters['pool_size'][0], layerparameters['pool_size'][1]

    toolbox.register(alias, function, argument)
    toolboxes.append(getattr(toolbox, alias))

    defaultVal[alias] = 5


def addDropoutLayer(idx, toolbox, toolboxes, defaultVal, layerparameters):
    """Register the ``dropout_rate<idx>`` generator and its default value (0.2).

    The alias is registered on ``toolbox`` from the first two entries of
    ``layerparameters['dropout_rate']``; the registered attribute is appended
    to ``toolboxes`` and the default is stored in ``defaultVal``.
    """
    alias = 'dropout_rate{}'.format(str(idx))
    function, argument = layerparameters['dropout_rate'][0], layerparameters['dropout_rate'][1]

    toolbox.register(alias, function, argument)
    toolboxes.append(getattr(toolbox, alias))

    defaultVal[alias] = 0.2

def getLayerSize(layer, conv_idx, dense_idx, dropout_idx, maxpooling_idx):
    """Return the four layer counters with the one matching ``layer`` incremented.

    Only the exact names ``'convolutional_layer'``, ``'dense_layer'``,
    ``'dropout_layer'`` and ``'maxpooling_layer'`` are counted; for any other
    layer name the counters are returned unchanged.
    """
    if layer == 'convolutional_layer':
        return conv_idx + 1, dense_idx, dropout_idx, maxpooling_idx
    if layer == 'dense_layer':
        return conv_idx, dense_idx + 1, dropout_idx, maxpooling_idx
    if layer == 'dropout_layer':
        return conv_idx, dense_idx, dropout_idx + 1, maxpooling_idx
    if layer == 'maxpooling_layer':
        return conv_idx, dense_idx, dropout_idx, maxpooling_idx + 1
    return conv_idx, dense_idx, dropout_idx, maxpooling_idx


def getMaxLayerSize(conv_idx, dense_idx, dropout_idx, maxpooling_idx, max_conv_idx, max_dense_idx, max_dropout_idx,
                    max_maxpooling_idx):
    """Return the running maxima updated with one individual's layer counts.

    Each ``max_*`` value is replaced by the matching count when that count is
    strictly larger.  The result is ordered
    ``(max_conv_idx, max_dense_idx, max_dropout_idx, max_maxpooling_idx)``.
    """
    # max() keeps its first argument on ties, like the strict ">" comparison did.
    return (
        max(max_conv_idx, conv_idx),
        max(max_dense_idx, dense_idx),
        max(max_dropout_idx, dropout_idx),
        max(max_maxpooling_idx, maxpooling_idx),
    )


def addLayerToolboxes(max_conv_idx, max_dense_idx, max_dropout_idx, max_maxpooling_idx, toolbox, toolboxes, defaultVal,
                      layerparameters):
    """Register the generators for every layer slot up to the given maxima.

    Layers are numbered from 1.  Registration order is: all convolutional
    layers, then all dense layers, then all max-pooling layers, then all
    dropout layers.  That order fixes the order of ``toolboxes`` and of the
    keys added to ``defaultVal``.
    """
    registration_plan = (
        (max_conv_idx, addConvLayer),
        (max_dense_idx, addDenseLayer),
        (max_maxpooling_idx, addMaxPoolingLayer),
        (max_dropout_idx, addDropoutLayer),
    )

    for layer_count, add_layer in registration_plan:
        for layer_number in range(1, layer_count + 1):
            add_layer(layer_number, toolbox, toolboxes, defaultVal, layerparameters)


def generateFSM(n_pop, layerparameters, toolbox, toolboxes, defaultVal):
    """Sample ``n_pop`` random layer sequences and register their generators.

    Every sequence starts with the layer of transition 0 of ``FSM()`` and is
    extended by choosing a random entry of ``next_path`` until a transition
    with an empty ``next_path`` is reached.  The largest number of
    convolutional, dense, dropout and max-pooling layers seen in any sequence
    is then passed to ``addLayerToolboxes``.

    Returns ``(path_ind, max_conv_idx, max_maxpooling_idx, max_dense_idx,
    max_dropout_idx)`` where ``path_ind`` maps the individual number to its
    list of layer names.
    """
    fsm = FSM()
    path_ind = {}
    max_conv_idx = max_dense_idx = max_dropout_idx = max_maxpooling_idx = 0

    for ind in range(n_pop):
        state = 0
        path = [fsm[state]['layer']]
        while fsm[state]['next_path']:
            state = random.choice(fsm[state]['next_path'])
            path.append(fsm[state]['layer'])

        # list.count matches exact names only, so 'global_maxpooling_layer'
        # is not counted as 'maxpooling_layer'.
        max_conv_idx = max(max_conv_idx, path.count('convolutional_layer'))
        max_dense_idx = max(max_dense_idx, path.count('dense_layer'))
        max_dropout_idx = max(max_dropout_idx, path.count('dropout_layer'))
        max_maxpooling_idx = max(max_maxpooling_idx, path.count('maxpooling_layer'))

        path_ind[ind] = path

    addLayerToolboxes(max_conv_idx, max_dense_idx, max_dropout_idx, max_maxpooling_idx, toolbox, toolboxes, defaultVal,
                      layerparameters)

    return path_ind, max_conv_idx, max_maxpooling_idx, max_dense_idx, max_dropout_idx


def openFSM(df, layerparameters, toolbox, toolboxes, defaultVal):
    """Rebuild layer sequences and fitnesses from a saved population frame.

    The number of layer slots per type is the number of column names (ignoring
    those containing ``'Unnamed'``) that contain ``num_filters``, ``neurons``,
    ``dropout_rate`` and ``pool_size`` respectively.  For every row, the cells
    whose text contains ``'layer'`` form the path, and the first cell that
    looks like a non-negative number is taken as the fitness.

    Returns ``(path_ind, fitnesses, max_conv_idx, max_maxpooling_idx,
    max_dense_idx, max_dropout_idx)``; ``fitnesses`` is a list of 1-tuples.
    """
    hyperparams = [column for column in df.columns if 'Unnamed' not in column]

    def count_columns(marker):
        # Number of hyperparameter columns whose name contains ``marker``.
        return sum(marker in column for column in hyperparams)

    max_conv_idx = count_columns('num_filters')
    max_dense_idx = count_columns('neurons')
    max_dropout_idx = count_columns('dropout_rate')
    max_maxpooling_idx = count_columns('pool_size')

    path_ind = {}
    fitnesses = []
    for index, row in df.iterrows():
        layer_cells = [cell for cell in row if 'layer' in str(cell)]
        # Digits with at most one '.' removed -> treated as a number.
        numeric_cells = [cell for cell in row if str(cell).replace('.', '', 1).isdigit()]
        fitnesses.append((float(numeric_cells[0]),))
        path_ind[index] = layer_cells

    addLayerToolboxes(max_conv_idx, max_dense_idx, max_dropout_idx, max_maxpooling_idx, toolbox, toolboxes, defaultVal,
                      layerparameters)

    return path_ind, fitnesses, max_conv_idx, max_maxpooling_idx, max_dense_idx, max_dropout_idx

# CNN

In [4]:
import tensorflow as tf

class CNN:
    """Builds and compiles the Keras model described by one GA individual."""

    # Values of indiv['optimizer'] that cnn_model can turn into an optimizer.
    _SUPPORTED_OPTIMIZERS = ('sgd', 'rmsprop', 'adagrad', 'adadelta', 'adam', 'adamax', 'nadam')

    def cnn_model(self, vocab_size, maxlen, embedding_matrix, indiv, path):
        """Return a compiled ``tf.keras`` Sequential model.

        Parameters
        ----------
        vocab_size, maxlen, embedding_matrix
            Passed to the embedding layer as ``input_dim``, ``input_length``
            and initial ``weights``.
        indiv
            Mapping of hyperparameter name to value.  Per-layer entries are
            looked up as ``<name><n>`` where ``n`` counts layers of that type
            from 1 in the order they appear in ``path``.
        path
            Sequence of layer names; names that are not recognised are skipped.

        Raises
        ------
        ValueError
            If ``indiv['optimizer']`` is not one of the supported optimizer
            names.  The check runs before any layer is created.
        """
        if indiv['optimizer'] not in self._SUPPORTED_OPTIMIZERS:
            raise ValueError(
                "Unsupported optimizer {!r}; expected one of: {}".format(
                    indiv['optimizer'], ', '.join(self._SUPPORTED_OPTIMIZERS)))

        model = tf.keras.models.Sequential()
        conv_idx = dense_idx = dropout_idx = maxpooling_idx = 0
        for layer in path:
            if layer == 'embedding_layer':
                model.add(
                    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=indiv['output_dim'],
                                              weights=[embedding_matrix], input_length=maxlen, trainable=True))
            elif layer == 'convolutional_layer':
                conv_idx += 1
                n = str(conv_idx)
                # NOTE(review): the Embedding output is (batch, steps, features);
                # 'channels_first' makes Conv1D treat the steps axis as channels.
                # Kept as in the original -- confirm this is intended.
                model.add(tf.keras.layers.Conv1D(
                    indiv['num_filters{}'.format(n)],
                    indiv['kernel_size{}'.format(n)],
                    kernel_initializer=indiv['conv_init_mode{}'.format(n)],
                    activation=indiv['conv_activation_func{}'.format(n)],
                    kernel_constraint=tf.keras.constraints.max_norm(indiv['conv_weight_constraint{}'.format(n)]),
                    data_format='channels_first'))
            elif layer == 'dense_layer':
                dense_idx += 1
                n = str(dense_idx)
                model.add(tf.keras.layers.Dense(
                    indiv['neurons{}'.format(n)],
                    kernel_initializer=indiv['dense_init_mode{}'.format(n)],
                    activation=indiv['dense_activation_func{}'.format(n)],
                    kernel_constraint=tf.keras.constraints.max_norm(indiv['dense_weight_constraint{}'.format(n)])))
            elif layer == 'dropout_layer':
                dropout_idx += 1
                model.add(tf.keras.layers.Dropout(indiv['dropout_rate{}'.format(str(dropout_idx))]))
            elif layer == 'maxpooling_layer':
                maxpooling_idx += 1
                model.add(tf.keras.layers.MaxPooling1D(indiv['pool_size{}'.format(str(maxpooling_idx))]))
            elif layer == 'global_maxpooling_layer':
                model.add(tf.keras.layers.GlobalMaxPooling1D())
            elif layer == 'output_layer':
                model.add(tf.keras.layers.Dense(1, kernel_initializer=indiv['output_init_mode'], activation='sigmoid'))

        opt = self._build_optimizer(indiv)

        util = utility()
        # The metric is reported under the function name, i.e. 'f1_score'.
        model.compile(optimizer=opt, loss='binary_crossentropy', metrics=[util.f1_score])

        return model

    def _build_optimizer(self, indiv):
        """Create the optimizer named by ``indiv['optimizer']`` with ``indiv['learning_rate']``."""
        name = indiv['optimizer']
        learning_rate = indiv['learning_rate']

        if name == 'sgd':
            return tf.keras.optimizers.SGD(lr=learning_rate, momentum=indiv['momentum'], decay=0.0,
                                           nesterov=False)
        if name == 'rmsprop':
            return tf.keras.optimizers.RMSprop(lr=learning_rate, rho=0.9, epsilon=None, decay=0.0)
        if name == 'adagrad':
            return tf.keras.optimizers.Adagrad(lr=learning_rate, epsilon=None, decay=0.0)
        if name == 'adadelta':
            return tf.keras.optimizers.Adadelta(lr=learning_rate, rho=0.95, epsilon=None, decay=0.0)
        if name == 'adam':
            return tf.keras.optimizers.Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=None,
                                            decay=0.0, amsgrad=False)
        if name == 'adamax':
            return tf.keras.optimizers.Adamax(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=None,
                                              decay=0.0)
        if name == 'nadam':
            return tf.keras.optimizers.Nadam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=None,
                                             schedule_decay=0.004)
        raise ValueError("Unsupported optimizer {!r}".format(name))

# Fitness Calculation

In [5]:
import collections
import os
from time import sleep
import gc

# Shared helper instances read as module-level names by crossfold():
# `cnn` builds the model, `util` supplies the f1 metric and the final scores.
util = utility()
cnn = CNN()

def FitnessCalculation(individual, cfold, defaultVal, resultsPath, testing_name):
    """Decode ``individual`` and return the value computed by ``crossfold``.

    The first ``len(defaultVal)`` genes are paired, in order, with the keys of
    ``defaultVal`` to form the hyperparameter mapping; the remaining genes are
    the layer path.
    """
    indiv = collections.OrderedDict(
        (name, individual[position]) for position, name in enumerate(defaultVal.keys())
    )
    path = individual[len(defaultVal):]

    return crossfold(indiv, path, cfold, resultsPath, testing_name)


def crossfold(indiv, path, fold, resultsPath, testing_name):
    """Train the model described by ``indiv`` / ``path`` and return its F1 score.

    The model is fitted on ``fold['X_train']`` / ``fold['y_train']`` with early
    stopping on ``val_f1_score``.  The best checkpoint is written to
    ``<resultsPath><testing_name>.h5``, reloaded, used to predict
    ``fold['X_val']`` and then deleted.

    Raises
    ------
    ValueError
        If ``indiv['output_dim']`` is not 50, 100, 200 or 300.
    RuntimeError
        If the checkpoint cannot be loaded after all attempts; the last
        loading error is attached as the cause.
    """
    # Position of each supported embedding width inside fold['embedding_matrix'].
    embedding_slot = {50: 0, 100: 1, 200: 2, 300: 3}
    output_dim = indiv['output_dim']
    if output_dim not in embedding_slot:
        raise ValueError(
            "Unsupported output_dim {!r}; expected one of {}".format(output_dim, sorted(embedding_slot)))
    embedding_mtx = fold['embedding_matrix'][embedding_slot[output_dim]]

    model = cnn.cnn_model(fold['vocab_size'], fold['maxlen'], embedding_mtx,
                          indiv, path)

    checkpoint_path = "{}{}.h5".format(resultsPath, testing_name)

    # Early stopping, and keep only the best model seen so far on disk.
    callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_f1_score', mode='max', verbose=False, patience=10),
                 tf.keras.callbacks.ModelCheckpoint(checkpoint_path, monitor='val_f1_score', mode='max', verbose=False,
                                                    save_best_only=True)]

    model.fit(fold['X_train'], fold['y_train'], epochs=indiv['epochs'], verbose=False,
              validation_data=(fold['X_val'], fold['y_val']), use_multiprocessing=False,
              batch_size=indiv['batch_size'], callbacks=callbacks)

    # The custom metric must be supplied again when the model is deserialised.
    dependencies = {
        'f1_score': util.f1_score
    }

    # Load the saved model: up to 10 attempts, 5 seconds apart.
    max_attempts = 10
    saved_model = None
    last_error = None
    for attempt in range(max_attempts):
        try:
            saved_model = tf.keras.models.load_model(checkpoint_path, custom_objects=dependencies)
            break
        except Exception as e:
            print('An error occurs when loading saved model.')
            last_error = e
            if attempt < max_attempts - 1:
                sleep(5)

    if saved_model is None:
        # Surface the real loading problem instead of failing later on an unbound name.
        raise RuntimeError(
            "Could not load saved model {!r} after {} attempts".format(checkpoint_path, max_attempts)
        ) from last_error

    y_pred = saved_model.predict_classes(fold['X_val'])

    os.remove(checkpoint_path)
    del embedding_mtx
    gc.collect()

    # Only the F1 score is used as fitness.
    _, _, _, f1Score = util.get_testing_metric(fold['y_val'], y_pred)
    return f1Score


# Genetic Algorithm

In [6]:
import random
from operator import attrgetter
from deap import base
from deap import creator
from deap import tools
import time
import datetime
import math
from scipy.spatial import distance
import itertools


class GeneticAlgorithm:
    """Genetic algorithm over CNN hyperparameters and layer sequences.

    An individual is a flat list: the hyperparameter genes come first (one per
    entry of ``toolboxes`` / ``defaultVal``, in that order), followed by the
    layer names of its architecture.  The code locates the boundary with
    ``index('embedding_layer')``.
    """

    __slots__ = (
        "toolbox", "toolboxes", "cross_rate", "mut_rate", "n_pop", "n_gen", "resultsPath", "testing_name", "cfold",
        "globalparameters", "layerparameters", "defaultVal", "path_ind", "max_conv_idx", "max_maxpooling_idx",
        "max_dense_idx", "max_dropout_idx")

    def __init__(self, toolbox, toolboxes, cross_rate, mut_rate, n_pop, n_gen, resultsPath, testing_name,
                 cfold, globalparameters, layerparameters, defaultVal, path_ind, max_conv_idx, max_maxpooling_idx,
                 max_dense_idx, max_dropout_idx):
        """Store the configuration; no work is done until :meth:`runGA` is called."""
        self.toolbox = toolbox
        self.toolboxes = toolboxes
        self.cross_rate = cross_rate
        self.mut_rate = mut_rate
        self.n_pop = n_pop
        self.n_gen = n_gen
        self.resultsPath = resultsPath
        self.testing_name = testing_name
        self.cfold = cfold
        self.globalparameters = globalparameters
        self.layerparameters = layerparameters
        self.defaultVal = defaultVal
        self.path_ind = path_ind
        self.max_conv_idx = max_conv_idx
        self.max_maxpooling_idx = max_maxpooling_idx
        self.max_dense_idx = max_dense_idx
        self.max_dropout_idx = max_dropout_idx

    @staticmethod
    def _append_rows(path, rows):
        """Append each sequence in ``rows`` to ``path`` as one comma-separated line."""
        with open(path, 'a') as handle:
            for fields in rows:
                handle.write(",".join("{}".format(field) for field in fields) + "\n")

    def fitnessCalc(self, individual):
        """Return the fitness of ``individual`` as a 1-tuple.

        An individual that already carries fitness values is not re-evaluated.
        Otherwise hyperparameter genes equal to ``0``, ``''``, ``'False'`` or
        ``None`` are first replaced by the matching ``defaultVal`` entry, and
        the individual is evaluated with ``FitnessCalculation``.
        """
        if len(individual.fitness.values) == 0:
            for position, param in enumerate(self.defaultVal):
                gene = individual[position]
                if gene == 0 or gene == '' or gene == 'False' or gene is None:
                    individual[position] = self.defaultVal[param]

            fc = FitnessCalculation(individual, self.cfold, self.defaultVal, self.resultsPath, self.testing_name)
        else:
            fc = individual.fitness.values[0]
        print('{} {}'.format(datetime.datetime.now(), fc))
        return fc,

    def write_result(self):
        """Append the header lines of the two result CSV files.

        ``<resultsPath><testing_name>.csv`` receives the per-generation
        statistics header and ``<resultsPath><testing_name>lastpop.csv`` the
        population header; both end with the keys of ``defaultVal``.
        """
        params = list(self.defaultVal)

        # Testing results
        self._append_rows("{}{}.csv".format(self.resultsPath, self.testing_name),
                          [["i,min,max,mean,std,avgdistance,time,CR,MR"] + params])

        # Last population
        self._append_rows("{}{}lastpop.csv".format(self.resultsPath, self.testing_name),
                          [["i,f1score"] + params])

    def std_calc(self, fits, length):
        """Return ``(mean, std)`` of ``fits``; ``length`` is used as the divisor."""
        mean = sum(fits) / length
        sum2 = sum(x * x for x in fits)
        # abs() guards against a slightly negative variance from rounding.
        std = abs(sum2 / length - mean ** 2) ** 0.5

        return mean, std

    def distance_calc(self, pop):
        """Return the mean pairwise Hamming distance of the hyperparameter genes.

        Only the genes before ``'embedding_layer'`` are compared.  With fewer
        than two individuals there is no pair to compare and ``0.0`` is returned.
        """
        distances = [
            distance.hamming(first[0:first.index('embedding_layer')],
                             second[0:second.index('embedding_layer')])
            for first, second in itertools.combinations(pop, 2)
        ]
        if not distances:
            return 0.0

        return sum(distances) / len(distances)

    def invalid_fitness_calc(self, pop):
        """Evaluate every individual of ``pop`` whose fitness is not valid."""
        invalid_ind = [ind for ind in pop if not ind.fitness.valid]
        for ind in invalid_ind:
            ind.fitness.values = self.toolbox.evaluate(ind)

    def mutHyperparam(self, individual, indpb):
        """Mutate ``individual`` in place and return it as a 1-tuple.

        Each hyperparameter gene is redrawn from its generator with
        probability ``indpb``.  Then each position of the architecture slice
        (from the first convolutional layer up to, but excluding, the output
        layer; position 0 is never touched) is considered with probability
        ``indpb``: pooling and dropout entries are left alone, otherwise a
        coin flip decides between removing and adding a layer.
        """
        # Mutation for the hyperparameter chromosome
        for i, generator in enumerate(self.toolboxes):
            if random.random() < indpb:
                if len(generator.args) == 1:
                    individual[i] = generator.func(generator.args[0])
                else:
                    individual[i] = generator.func(generator.args[0], generator.args[1])

        # Mutation for the architecture chromosome
        archChrom = individual[individual.index('convolutional_layer'):individual.index('output_layer')]
        size = len(archChrom)

        for i in range(1, size):
            if random.random() < indpb:
                # Earlier removals may have shortened the slice.
                if i >= size:
                    break

                layer = archChrom[i]
                if layer in ('global_maxpooling_layer', 'maxpooling_layer', 'dropout_layer'):
                    continue

                if random.randint(0, 1) == 0:
                    # Remove: a convolution goes together with the pooling
                    # layer that follows it; a dense layer goes alone.
                    if layer == 'convolutional_layer' and archChrom[i + 1] == 'maxpooling_layer':
                        del archChrom[i:i + 2]
                        size -= 2
                    elif layer == 'dense_layer':
                        del archChrom[i]
                        size -= 1
                else:
                    # Add: only while the individual stays within the number
                    # of layer slots that have registered generators.
                    if (layer == 'convolutional_layer' and
                            individual.count('convolutional_layer') < self.max_conv_idx):
                        archChrom.insert(i, 'convolutional_layer')
                        archChrom.insert(i + 1, 'maxpooling_layer')
                    elif (layer == 'dense_layer' and
                            individual.count('dense_layer') < self.max_dense_idx):
                        archChrom.insert(i, 'dense_layer')

                # Write the slice back so the counts above see the change.
                individual[individual.index('convolutional_layer'):individual.index('output_layer')] = archChrom
        return individual,

    def cxTwoPoint(self, ind1, ind2, pop, offspring):
        """Recombine ``ind1`` and ``ind2`` in place and return them.

        One of three operators (one-point, two-point, uniform) is picked at
        random for the hyperparameter genes.  The architectures are then
        exchanged from ``'global_maxpooling_layer'`` onwards.  If a child ends
        up with more dropout layers than generators exist, new
        ``dropout_rate`` generators are registered and a random gene is
        inserted into every individual of ``pop + offspring``.
        """
        # Crossover for hyperparameter chromosomes
        size = ind1.index('embedding_layer')
        selectCxType = random.randint(0, 2)
        if selectCxType == 0:
            # One-point crossover.
            # NOTE(review): the slices run to the end of the lists, so the
            # architecture genes are swapped here as well -- confirm intended.
            cxpoint = random.randint(1, size - 1)
            ind1[cxpoint:], ind2[cxpoint:] = ind2[cxpoint:], ind1[cxpoint:]
        elif selectCxType == 1:
            # Two-point crossover
            cxpoint1 = random.randint(1, size - 1)
            cxpoint2 = random.randint(1, size - 1)
            if cxpoint2 >= cxpoint1:
                cxpoint2 += 1
            else:  # Swap the two cx points
                cxpoint1, cxpoint2 = cxpoint2, cxpoint1

            ind1[cxpoint1:cxpoint2], ind2[cxpoint1:cxpoint2] \
                = ind2[cxpoint1:cxpoint2], ind1[cxpoint1:cxpoint2]
        elif selectCxType == 2:
            # Uniform crossover; cross_rate doubles as the per-gene swap probability.
            for i in range(size):
                if random.random() < self.cross_rate:
                    ind1[i], ind2[i] = ind2[i], ind1[i]

        # Crossover for architecture chromosomes:
        # one cut point at the global max-pooling layer of each parent.
        cxpoint1 = ind1.index('global_maxpooling_layer')
        cxpoint2 = ind2.index('global_maxpooling_layer')
        ind1[cxpoint1:], ind2[cxpoint2:] = ind2[cxpoint2:], ind1[cxpoint1:]

        max_drop_layer = max(ind1.count('dropout_layer'), ind2.count('dropout_layer'))
        if max_drop_layer > self.max_dropout_idx:
            idx = self.max_dropout_idx
            self.max_dropout_idx = max_drop_layer

            while idx < self.max_dropout_idx:
                idx += 1
                addDropoutLayer(idx, self.toolbox, self.toolboxes, self.defaultVal, self.layerparameters)

                # Keep every individual aligned with the enlarged gene layout.
                for ind in pop + offspring:
                    ind.insert(size, random.uniform(0, 1))
                size += 1

                # The column set changed, so the headers are appended again.
                self.write_result()

        return ind1, ind2

    def runGA(self, lastPop=[], lastFitnesses=[]):
        """Run ``n_gen`` generations and append statistics to the result files.

        ``lastPop`` (hyperparameter genes per individual) and ``lastFitnesses``
        (1-tuples) allow resuming from a stored population; when they are
        empty a fresh population is generated and evaluated.  The default
        lists are only read, never modified.
        """
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMax)

        self.toolbox.register("individual", tools.initCycle, creator.Individual,
                              self.toolboxes, n=1)
        self.toolbox.register("population", tools.initRepeat, list, self.toolbox.individual)
        self.toolbox.register("evaluate", self.fitnessCalc)
        self.toolbox.register("mate", self.cxTwoPoint)
        self.toolbox.register("mutate", self.mutHyperparam, indpb=self.mut_rate)
        self.toolbox.register("select", tools.selBest)

        pop = self.toolbox.population(n=self.n_pop)

        # Attach the architecture path to each individual's hyperparameter genes.
        for idx, ind in enumerate(pop):
            if lastPop:
                ind[:] = lastPop[idx]
            ind.extend(self.path_ind[idx])

        if lastFitnesses:
            # Fitnesses from previous population
            fitnesses = lastFitnesses
        else:
            # Evaluate the entire population
            fitnesses = list(map(self.toolbox.evaluate, pop))

        for ind, fit in zip(pop, fitnesses):
            ind.fitness.values = fit

        self.write_result()

        results_file = "{}{}.csv".format(self.resultsPath, self.testing_name)
        lastpop_file = "{}{}lastpop.csv".format(self.resultsPath, self.testing_name)

        g = 0
        while g < self.n_gen:
            then = time.time()
            g = g + 1
            print('{} {}'.format(datetime.datetime.now(), "-- Generation %i --" % g))

            # Select the next generation individuals
            offspring = self.toolbox.select(pop, len(pop))
            # Clone the selected individuals
            offspring = list(map(self.toolbox.clone, offspring))

            # Apply crossover and mutation on the offspring
            for child1, child2 in zip(offspring[::2], offspring[1::2]):
                if random.random() < self.cross_rate:
                    self.toolbox.mate(child1, child2, pop, offspring)
                    del child1.fitness.values
                    del child2.fitness.values

            for mutant in offspring:
                if random.random() < self.mut_rate:
                    self.toolbox.mutate(mutant)
                    del mutant.fitness.values

            # Evaluate the individuals with an invalid fitness
            self.invalid_fitness_calc(offspring)

            pop[:] = self.toolbox.select(pop + offspring, self.n_pop)

            # Gather all the fitnesses in one list and print the stats
            fits = [ind.fitness.values[0] for ind in pop]

            length = len(pop)
            mean, std = self.std_calc(fits, length)
            avgDistance = self.distance_calc(pop)
            best = max(pop, key=attrgetter("fitness"))
            print('{} {}'.format(datetime.datetime.now(), "  Min %s" % min(fits)))
            print('{} {}'.format(datetime.datetime.now(), "  Max %s" % max(fits)))
            print('{} {}'.format(datetime.datetime.now(), "  Avg %s" % mean))
            print('{} {}'.format(datetime.datetime.now(), "  Std %s" % std))
            print('{} {}'.format(datetime.datetime.now(), "  AvgDistance %s" % avgDistance))
            print('{} {}'.format(datetime.datetime.now(), best))

            now = time.time()
            diff = now - then

            # Save testing data: generation statistics followed by the best individual.
            self._append_rows(results_file, [
                [g, min(fits), max(fits), mean, std, avgDistance, diff, self.cross_rate, self.mut_rate] + list(best)
            ])

            # Save last population data: one line per individual.
            self._append_rows(lastpop_file, ([g, ind.fitness.values[0]] + list(ind) for ind in pop))

# Project path

In [7]:
# File names of the preprocessed training set and of a stored population
# (presumably resolved relative to datasetPath / resultsPath by later cells -- confirm).
training_path = 'trainPreprocessed.csv'
population_path = 'NewPop.csv'
# NOTE(review): absolute, machine-specific root; every path below is derived from it.
root_path = '/lab/dbms/fatyanosa'
datasetPath = '{}/Dataset/Disaster Tweets/'.format(root_path)
# Used as a plain string prefix for output files, hence the trailing slash.
resultsPath = '{}/Server1/Disaster Tweets/Results/'.format(root_path)
# Base name of the result CSV files and of the temporary model checkpoint.
testing_name = "Experiment5_GA_CNN"
# Embedding files in the order 50d, 100d, 200d, 300d; the order matters because
# the files are read by position.
glovePath = ['{}/Glove/glove.6B.50d.txt'.format(root_path),
             '{}/Glove/glove.6B.100d.txt'.format(root_path),
             '{}/Glove/glove.6B.200d.txt'.format(root_path),
             '{}/Glove/glove.6B.300d.txt'.format(root_path)]

# Parameters

In [8]:
# crossover rate: probability that a pair of offspring is recombined
# (also used as the per-gene swap probability of the uniform crossover)
cross_rate = 0.8

# mutation rate: probability that an offspring is mutated
# (also passed to the mutation operator as its per-gene probability)
mut_rate = 0.2

# population size (number of individuals)
n_pop = 30

# number of generations to run
n_gen = 30

# Main Program

In [9]:
import collections
import random
import warnings

from deap import base
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

warnings.simplefilter('ignore')

if __name__ == '__main__':
    # Entry point of the experiment: declare the hyperparameter search space,
    # build the initial individuals, load the tweets and run the GA.
    # NOTE(review): `random` is never seeded in this cell, so two runs will
    # explore different populations; seed it if reproducibility is required.

    # Candidate values shared by the convolutional and the dense layer genes.
    # Each gene receives its own copy so the lists stay independent objects.
    activation_choices = ['relu', 'softmax', 'elu', 'selu',
                          'softplus', 'softsign', 'tanh',
                          'sigmoid', 'hard_sigmoid', 'linear']
    init_mode_choices = ['zeros',
                         'ones',
                         'uniform',
                         'normal',
                         'glorot_normal',
                         'glorot_uniform',
                         'he_normal',
                         'he_uniform',
                         'lecun_normal',
                         'lecun_uniform']

    # Network-wide hyperparameters: (name, generator, *generator arguments).
    globalparameters = [
        ("epochs", random.randint, 1, 100),
        ("batch_size", random.randint, 32, 256),
        ("optimizer", random.choice, ['sgd', 'rmsprop',
                                      'adagrad', 'adadelta',
                                      'adam', 'adamax',
                                      'nadam']),
        ("learning_rate", random.uniform, 1e-4, 1e-2),
        ("momentum", random.choice, [0.9]),
        ("output_init_mode", random.choice, ['glorot_uniform']),
        ("output_dim", random.choice, [100]),
    ]

    # Per-layer hyperparameters: name -> [generator, *generator arguments].
    # Insertion order is kept identical to the original assignments.
    layerparameters = {
        "num_filters": [random.randint, 32, 512],
        "kernel_size": [random.randint, 1, 5],
        "conv_activation_func": [random.choice, list(activation_choices)],
        "conv_init_mode": [random.choice, list(init_mode_choices)],
        "conv_weight_constraint": [random.randint, 1, 5],
        "neurons": [random.randint, 1, 30],
        "dense_activation_func": [random.choice, list(activation_choices)],
        "dense_init_mode": [random.choice, list(init_mode_choices)],
        "dense_weight_constraint": [random.randint, 1, 5],
        "pool_size": [random.choice, [5]],
        "dropout_rate": [random.choice, [0.2]],
    }

    # Default value for every global hyperparameter, in the same order as
    # `globalparameters`.
    defaultVal = collections.OrderedDict([
        ("epochs", 10),
        ("batch_size", 32),
        ("optimizer", "adam"),
        ("learning_rate", 1e-4),
        ("momentum", 0.9),
        ("output_init_mode", "glorot_uniform"),
        ("output_dim", 100)]
    )

    # object class
    util = utility()
    toolbox = base.Toolbox()

    # Attribute generator: register one generator per global hyperparameter.
    # Unpacking the tail handles both the (name, func, arg) and the
    # (name, func, low, high) shapes without branching on the tuple length.
    for hyper in globalparameters:
        toolbox.register(hyper[0], hyper[1], *hyper[2:])

    # Registered generators, in the order the hyperparameters were declared;
    # derived from `globalparameters` so the two can never drift apart.
    toolboxes = [getattr(toolbox, hyper[0]) for hyper in globalparameters]

    path_ind, max_conv_idx, max_maxpooling_idx, max_dense_idx, max_dropout_idx = generateFSM(
        n_pop, layerparameters, toolbox, toolboxes, defaultVal)

    # Alternative start: continue from a population stored in `population_path`
    # instead of generating a fresh one (kept disabled, as in the original).
#     # Read population data
#     dfPopulation = util.read_CSV("{}{}".format(resultsPath, population_path))

#     path_ind, fitnesses, max_conv_idx, max_maxpooling_idx, max_dense_idx, max_dropout_idx = openFSM(dfPopulation, layerparameters,
#                                                                                          toolbox, toolboxes, defaultVal)
#     dfPopulation = dfPopulation.drop(columns=[col for col in dfPopulation if col not in defaultVal])

#     population = dfPopulation.loc[:, ~dfPopulation.columns.str.match('Unnamed')].values.tolist()

    # Read data
    df = util.read_CSV("{}{}".format(datasetPath, training_path))

    # 70/30 split with a fixed seed so the split itself is repeatable.
    dfTraining, dfTrial = train_test_split(df, test_size=0.3, random_state=42)

    textsTraining, labelsTraining = util.get_text_label(dfTraining)
    textsTrial, labelsTrial = util.get_text_label(dfTrial)

    X_train, X_val, y_train, y_val, vocab_size, maxlen, embedding_matrix = util.get_training_trial_data(
        textsTraining, labelsTraining, textsTrial, labelsTrial, glovePath)

    # Everything the GA needs about the data, bundled in a single dict.
    cfold = {'X_train': X_train, 'X_val': X_val, 'y_train': y_train, 'y_val': y_val, 'vocab_size': vocab_size,
             'maxlen': maxlen, 'embedding_matrix': embedding_matrix}

    ga = GeneticAlgorithm(toolbox, toolboxes, cross_rate, mut_rate, n_pop, n_gen, resultsPath, testing_name,
                          cfold, globalparameters, layerparameters, defaultVal, path_ind, max_conv_idx,
                          max_maxpooling_idx, max_dense_idx, max_dropout_idx)
    ga.runGA()
#     ga.runGA(population, fitnesses)


2020-04-02 22:13:45.787215 -- Generation 12 --
2020-04-02 22:13:57.716065 0.5944615384615385
2020-04-02 22:14:11.861483 0.7844782380702674
2020-04-02 22:14:27.250902 0.7799586776859504
2020-04-02 22:14:39.250476 0.7675040085515769
2020-04-02 22:14:58.023233 0.7759446514103248
2020-04-02 22:15:16.929880 0.7816938453445555
2020-04-02 22:15:30.404475 0.778816199376947
2020-04-02 22:15:47.505629 0.7740203972088029
2020-04-02 22:16:04.466591 0.7767810026385223
2020-04-02 22:16:18.307330 0.7768421052631579
2020-04-02 22:16:32.745929 0.7814713896457766
2020-04-02 22:17:11.381464 0.124883504193849
2020-04-02 22:17:26.522848 0.7872807017543859
2020-04-02 22:17:49.302338 0.7735951991271139
2020-04-02 22:18:16.408377 0.7729211087420041
2020-04-02 22:18:36.233954 0.7619589977220956
2020-04-02 22:19:00.035287 0.7697954902988988
2020-04-02 22:19:30.438166 0.763873775843308
2020-04-02 22:19:47.733551 0.7789358200767964
2020-04-02 22:20:06.069195 0.7729124236252546
2020-04-02 22:20:23.359734 0.7781316

2020-04-02 22:42:54.314163 0.7764102564102564
2020-04-02 22:43:09.631342 0.7855297157622739
2020-04-02 22:43:24.555410 0.7832020997375327
2020-04-02 22:43:49.132077 0.7800212539851222
2020-04-02 22:44:17.585631 0.7777777777777778
2020-04-02 22:44:40.908652 0.780278670953912
2020-04-02 22:45:00.769958 0.7718550106609808
2020-04-02 22:45:15.945694 0.7749737118822293
2020-04-02 22:45:31.973933 0.7643805309734513
2020-04-02 22:45:48.432136 0.7587268993839836
2020-04-02 22:46:10.214658 0.7662125340599455
2020-04-02 22:46:42.331398 0.7544224765868887
2020-04-02 22:46:58.504563 0.7647647647647647
2020-04-02 22:47:12.538869 0.7721847431240271
2020-04-02 22:47:28.905806 0.7750677506775068
2020-04-02 22:47:44.761921 0.7654320987654322
2020-04-02 22:48:02.470420 0.7736670293797606
2020-04-02 22:48:27.457489 0.7764830508474575
2020-04-02 22:48:45.719902 0.777118186724231
2020-04-02 22:49:04.289369 0.7806691449814127
2020-04-02 22:49:24.412121 0.7831196581196581
2020-04-02 22:49:42.547003 0.7812499

2020-04-02 23:19:39.662437 0.775169535732916
2020-04-02 23:19:59.188171 0.7870419543281996
2020-04-02 23:19:59.216712   Min 0.7881355932203391
2020-04-02 23:19:59.216753   Max 0.7943336831059811
2020-04-02 23:19:59.216777   Avg 0.7893622080997692
2020-04-02 23:19:59.216797   Std 0.0020369341859720167
2020-04-02 23:19:59.216814   AvgDistance 0.45631045863207065
2020-04-02 23:19:59.216830 [57, 125, 'adamax', 0.006792884510893001, 0.9, 'glorot_uniform', 100, 300, 2, 'softplus', 'uniform', 1, 237, 5, 'selu', 'lecun_uniform', 1, 426, 5, 'softmax', 'zeros', 2, 54, 1, 'hard_sigmoid', 'glorot_uniform', 4, 228, 5, 'softplus', 'ones', 3, 335, 3, 'softmax', 'glorot_normal', 2, 442, 2, 'softmax', 'ones', 2, 230, 2, 'selu', 'glorot_normal', 3, 95, 3, 'softplus', 'lecun_uniform', 5, 106, 2, 'selu', 'ones', 4, 388, 4, 'softmax', 'uniform', 3, 50, 1, 'softmax', 'glorot_uniform', 5, 293, 1, 'selu', 'ones', 2, 4, 'sigmoid', 'lecun_normal', 2, 4, 'selu', 'glorot_normal', 2, 30, 'softsign', 'ones', 1, 9, 

2020-04-02 23:50:42.931621 0.7809425168306577
2020-04-02 23:51:05.583097 0.772256728778468
2020-04-02 23:51:33.276716 0.7785016286644952
2020-04-02 23:52:07.555803 0.7863340563991322
2020-04-02 23:52:40.070375 0.7747252747252747
2020-04-02 23:53:07.871279 0.7873015873015873
2020-04-02 23:53:41.419528 0.7833859095688748
2020-04-02 23:54:12.123155 0.7801268498942917
2020-04-02 23:54:42.346044 0.7750393287886733
2020-04-02 23:55:09.945809 0.7711632987438557
2020-04-02 23:55:38.983750 0.7703218767048554
2020-04-02 23:56:10.609146 0.7753113156469952
2020-04-02 23:56:38.991449 0.7771797631862216
2020-04-02 23:57:12.988015 0.7772304324028462
2020-04-02 23:57:33.277713 0.7759791122715405
2020-04-02 23:57:53.521179 0.7823750671681892
2020-04-02 23:58:19.154637 0.7896995708154505
2020-04-02 23:58:44.725025 0.7842720510095642
2020-04-02 23:59:11.482546 0.7805139186295503
2020-04-02 23:59:41.000804 0.784796573875803
2020-04-02 23:59:41.028585   Min 0.788971367974549
2020-04-02 23:59:41.028630   Ma

2020-04-03 00:36:29.118368 0.7735951991271139
2020-04-03 00:36:47.913447 0.5944615384615385
2020-04-03 00:37:14.643460 0.7857528332433891
2020-04-03 00:37:44.029761 0.7805676855895196
2020-04-03 00:38:13.327511 0.780278670953912
2020-04-03 00:38:46.001998 0.7725752508361203
2020-04-03 00:39:10.701852 0.7899343544857768
2020-04-03 00:39:37.016989 0.7840971838763113
2020-04-03 00:40:02.066807 0.7914046121593291
2020-04-03 00:40:29.716979 0.7827015483182063
2020-04-03 00:41:08.665305 0.7901487942534633
2020-04-03 00:41:35.633851 0.7867450561197221
2020-04-03 00:41:53.340918 0.0
2020-04-03 00:42:20.900673 0.7933796049119062
2020-04-03 00:42:45.859011 0.7774855339295108
2020-04-03 00:43:11.764332 0.7763226820324778
2020-04-03 00:43:38.490415 0.7780149413020278
2020-04-03 00:44:10.598911 0.7732902530963921
2020-04-03 00:44:36.653360 0.7801418439716311
2020-04-03 00:45:04.898065 0.7901234567901233
2020-04-03 00:45:37.706320 0.7825171142706687
2020-04-03 00:46:16.725329 0.7882736156351792
2020