# Utility

In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras import backend as K

Using TensorFlow backend.


In [2]:
class utility:
    """Helper collection for CSV I/O, text preprocessing, metrics and embeddings.

    None of the methods reads or writes instance state; the class only groups
    the helpers under one name.
    """

    def read_CSV(self, filename):
        """Read ``filename`` with ``pd.read_csv`` and return the DataFrame."""
        return pd.read_csv(filename)

    def get_text_label(self, df):
        """Return the ``text_cleaned`` and ``target`` columns of ``df`` as two lists.

        Cells of ``text_cleaned`` that are floats are converted with ``str`` so
        that every returned text is a string (presumably these are NaN values
        from empty cells - verify against the dataset).
        """
        texts = []  # list of text samples
        labels = []  # list of label ids
        for _, row in df.iterrows():
            text = row['text_cleaned']
            texts.append(str(text) if isinstance(text, float) else text)
            labels.append(row['target'])

        return texts, labels

    def tokenize_texts(self, texts):
        """Fit a Keras ``Tokenizer`` (``num_words=50000``) on ``texts`` and return it."""
        tokenizer = Tokenizer(num_words=50000)
        tokenizer.fit_on_texts(texts)

        return tokenizer

    def padding_texts(self, texts, maxlen):
        """Pad the integer sequences in ``texts`` to ``maxlen`` with ``padding='post'``."""
        return pad_sequences(texts, padding='post', maxlen=maxlen)

    def get_metric(self, y_true, y_pred):
        """Return ``(accuracy, precision, recall, f1)`` computed with scikit-learn."""
        accuracyScore = accuracy_score(y_true, y_pred)
        precisionScore = precision_score(y_true, y_pred)
        recallScore = recall_score(y_true, y_pred)
        f1Score = f1_score(y_true, y_pred)

        return accuracyScore, precisionScore, recallScore, f1Score

    def print_metric(self, accuracyScore, precisionScore, recallScore, f1Score):
        """Print the four metrics, one per line, followed by a single CSV line."""
        print("Accuracy: {}".format(str(accuracyScore)))
        print("Precision: {}".format(str(precisionScore)))
        print("Recall: {}".format(str(recallScore)))
        print("F1-Score: {}".format(str(f1Score)))
        print("{},{},{},{}".format(str(accuracyScore), str(precisionScore), str(recallScore), str(f1Score)))

    def get_testing_metric(self, y_test, y_pred):
        """Return ``get_metric(y_test, y_pred)`` unchanged."""
        return self.get_metric(y_test, y_pred)

    def write_df_csv(self, df, out_path):
        """Write ``df`` to ``out_path`` as CSV without the index column."""
        df.to_csv(out_path, index=False)

    def create_embedding_matrix(self, filepath, word_index, embedding_dim):
        """Build an embedding matrix from a whitespace-separated vector file.

        Each non-empty line of ``filepath`` is read as ``word v1 v2 ...``.

        Args:
            filepath: path of the UTF-8 encoded embedding file.
            word_index: mapping from word to its row index.
            embedding_dim: number of vector components to keep per word.

        Returns:
            Array of shape ``(len(word_index) + 1, embedding_dim)``. Rows of
            words that do not occur in the file stay zero.
        """
        vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
        embedding_matrix = np.zeros((vocab_size, embedding_dim))

        with open(filepath, encoding="utf8") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # A blank line (e.g. a trailing newline) carries no word;
                    # unpacking it used to raise ValueError.
                    continue
                word, vector = fields[0], fields[1:]
                if word in word_index:
                    embedding_matrix[word_index[word]] = np.array(
                        vector, dtype=np.float32)[:embedding_dim]

        return embedding_matrix

    def get_max_length_of_sentences(self, texts):
        """Return the largest whitespace-token count among ``texts`` (0 if empty)."""
        # Each text is split once.
        return max((len(text.split()) for text in texts), default=0)

    def get_training_trial_data(self, textsTraining, labelsTraining, textsTrial, labelsTrial, glovePath):
        """Turn raw texts and labels into padded sequences plus embedding matrices.

        The tokenizer and the padding length are derived from the training
        texts only. ``glovePath`` must hold four embedding files whose vector
        sizes are at least 50, 100, 200 and 300, in that order.

        Returns:
            ``(X_train, X_val, y_train, y_val, vocab_size, maxlen,
            embedding_matrix)`` where ``embedding_matrix`` is a list of four
            arrays (50, 100, 200 and 300 dimensions).
        """
        textsTraining, textsTesting = np.asarray(textsTraining), np.asarray(textsTrial)
        y_train, y_val = np.asarray(labelsTraining), np.asarray(labelsTrial)

        # Tokenize words
        tokenizer = self.tokenize_texts(textsTraining)
        X_train = tokenizer.texts_to_sequences(textsTraining)
        X_val = tokenizer.texts_to_sequences(textsTesting)

        # Adding 1 because of reserved 0 index
        vocab_size = len(tokenizer.word_index) + 1

        # Longest training sentence, counted in whitespace-separated tokens
        maxlen = self.get_max_length_of_sentences(textsTraining)

        # Pad sequences with zeros
        X_train = self.padding_texts(X_train, maxlen)
        X_val = self.padding_texts(X_val, maxlen)

        embedding_matrix = [
            self.create_embedding_matrix(glovePath[slot], tokenizer.word_index, dim)
            for slot, dim in enumerate((50, 100, 200, 300))
        ]

        return X_train, X_val, y_train, y_val, vocab_size, maxlen, embedding_matrix

    def Average(self, list):
        """Return the arithmetic mean of ``list``; an empty input raises ZeroDivisionError."""
        return sum(list) / len(list)

    def recall(self, y_true, y_pred):
        """Keras-backend recall: rounded true positives / rounded actual positives."""
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        # K.epsilon() keeps the division defined when there are no positives.
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(self, y_true, y_pred):
        """Keras-backend precision: rounded true positives / rounded predicted positives."""
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        # K.epsilon() keeps the division defined when nothing is predicted positive.
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

    def f1_score(self, y_true, y_pred):
        """Keras-backend F1: harmonic mean of ``precision`` and ``recall``."""
        precision = self.precision(y_true, y_pred)
        recall = self.recall(y_true, y_pred)
        return 2*((precision*recall)/(precision+recall+K.epsilon()))

# Finite State Machine

In [3]:
import random

def FSM():
    """Return the finite state machine that describes valid CNN layer orders.

    The result maps a transition id (0-8) to a dict with the keys ``src`` and
    ``dst`` (state numbers), ``layer`` (name of the layer emitted by the
    transition) and ``next_path`` (ids of the transitions that may follow; an
    empty list marks the final transition).
    """
    # (src, dst, layer, next_path) - the position in the tuple is the transition id.
    transitions = (
        (0, 1, 'embedding_layer', [1]),
        (1, 2, 'convolutional_layer', [2, 4]),
        (2, 3, 'maxpooling_layer', [3]),
        (3, 2, 'convolutional_layer', [2, 4]),
        (2, 4, 'global_maxpooling_layer', [5]),
        (4, 5, 'dense_layer', [6, 7]),
        (5, 5, 'dense_layer', [6, 7]),
        (5, 6, 'dropout_layer', [8]),
        (6, 7, 'output_layer', []),
    )

    return {
        transition_id: {'src': src, 'dst': dst, 'layer': layer, 'next_path': successors}
        for transition_id, (src, dst, layer, successors) in enumerate(transitions)
    }

def getLayerSize(layer, conv_idx, dense_idx, dropout_idx, maxpooling_idx):
    """Increment the counter that belongs to ``layer`` and return all four.

    Only convolutional, dense, dropout and max-pooling layers are counted; any
    other layer name leaves every counter unchanged. The return order is
    ``(conv_idx, dense_idx, dropout_idx, maxpooling_idx)``.
    """
    def bump(layer_name):
        # 1 when the given layer is the one being counted, otherwise 0.
        return 1 if layer == layer_name else 0

    return (
        conv_idx + bump('convolutional_layer'),
        dense_idx + bump('dense_layer'),
        dropout_idx + bump('dropout_layer'),
        maxpooling_idx + bump('maxpooling_layer'),
    )


def getMaxLayerSize(conv_idx, dense_idx, dropout_idx, maxpooling_idx, max_conv_idx, max_dense_idx, max_dropout_idx,
                    max_maxpooling_idx):
    """Update the running maxima of the four layer counters.

    Each ``max_*`` value is replaced by the matching counter when the counter
    is strictly larger. Returns ``(max_conv_idx, max_dense_idx,
    max_dropout_idx, max_maxpooling_idx)``.
    """
    # max() returns its first argument on ties, so an equal counter never
    # replaces the current maximum.
    return (
        max(max_conv_idx, conv_idx),
        max(max_dense_idx, dense_idx),
        max(max_dropout_idx, dropout_idx),
        max(max_maxpooling_idx, maxpooling_idx),
    )

def generateFSM(n_pop):
    """Generate ``n_pop`` random layer paths by walking the FSM.

    Every walk starts at transition 0 and follows a randomly chosen entry of
    ``next_path`` until a transition without successors is reached.

    Returns:
        ``(path_ind, max_conv_idx, max_maxpooling_idx, max_dense_idx,
        max_dropout_idx)`` where ``path_ind`` maps the individual number to its
        list of layer names and the ``max_*`` values are the largest per-path
        layer counts seen over all generated paths.
    """
    machine = FSM()
    paths = {}
    top_conv = top_dense = top_dropout = top_pool = 0

    for individual_id in range(n_pop):
        state = 0
        n_conv = n_dense = n_dropout = n_pool = 0
        walk = [machine[state]['layer']]

        # Follow random successors until the final transition is reached.
        while machine[state]['next_path']:
            state = random.choice(machine[state]['next_path'])
            emitted = machine[state]['layer']
            walk.append(emitted)
            n_conv, n_dense, n_dropout, n_pool = getLayerSize(
                emitted, n_conv, n_dense, n_dropout, n_pool)

        top_conv, top_dense, top_dropout, top_pool = getMaxLayerSize(
            n_conv, n_dense, n_dropout, n_pool,
            top_conv, top_dense, top_dropout, top_pool)

        paths[individual_id] = walk

    # Note the return order: max-pooling comes before dense and dropout.
    return paths, top_conv, top_pool, top_dense, top_dropout


def openFSM(df):
    """Rebuild layer paths and fitness values from a saved population table.

    Column names that do not contain ``'Unnamed'`` are treated as
    hyperparameter names; the ``max_*`` values count how many of them contain
    ``'num_filters'``, ``'neurons'``, ``'dropout_rate'`` and ``'pool_size'``.
    For every row, the path is the list of cells whose string form contains
    ``'layer'`` and the fitness is the first cell that looks like a
    non-negative number.

    Returns:
        ``(path_ind, fitnesses, max_conv_idx, max_maxpooling_idx,
        max_dense_idx, max_dropout_idx)`` with ``path_ind`` keyed by row index
        and ``fitnesses`` a list of one-element tuples.
    """
    named_columns = [column for column in df.columns if 'Unnamed' not in column]

    def count_columns(fragment):
        # Number of hyperparameter columns whose name contains ``fragment``.
        return sum(fragment in column for column in named_columns)

    max_conv_idx = count_columns('num_filters')
    max_dense_idx = count_columns('neurons')
    max_dropout_idx = count_columns('dropout_rate')
    max_maxpooling_idx = count_columns('pool_size')

    path_ind = {}
    fitnesses = []
    for row_label, row in df.iterrows():
        layer_cells = [cell for cell in row if 'layer' in str(cell)]
        # Digits with at most one decimal point count as numeric.
        numeric_cells = [cell for cell in row if str(cell).replace('.', '', 1).isdigit()]
        fitnesses.append((float(numeric_cells[0]),))
        path_ind[row_label] = layer_cells

    return path_ind, fitnesses, max_conv_idx, max_maxpooling_idx, max_dense_idx, max_dropout_idx

# CNN

In [4]:
import tensorflow as tf

class CNN:
    """Builds and compiles a Keras ``Sequential`` model from a layer path."""

    def cnn_model(self, vocab_size, maxlen, embedding_matrix, indiv, path):
        """Assemble and compile the network described by ``path``.

        Args:
            vocab_size: input dimension of the embedding layer.
            maxlen: input length of the embedding layer.
            embedding_matrix: initial weights of the embedding layer.
            indiv: mapping of hyperparameter name to value (``output_dim``,
                ``num_filters``, ``kernel_size``, ``optimizer``, ...).
            path: iterable of layer names; names that are not recognised are
                skipped.

        Returns:
            The model, compiled with binary cross-entropy loss and the
            ``utility.f1_score`` metric.

        Raises:
            ValueError: if ``indiv['optimizer']`` is not one of the supported
                optimizer names.
        """
        model = tf.keras.models.Sequential()
        for layer in path:
            if layer == 'embedding_layer':
                model.add(
                    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=indiv['output_dim'],
                                     weights=[embedding_matrix], input_length=maxlen, trainable=True))
            elif layer == 'convolutional_layer':
                # NOTE(review): data_format='channels_first' is applied to the
                # output of the embedding layer - confirm this axis order is intended.
                model.add(tf.keras.layers.Conv1D(indiv['num_filters'],
                                                 indiv['kernel_size'],
                                        kernel_initializer=indiv['conv_init_mode'],
                                        activation=indiv['conv_activation_func'],
                                        kernel_constraint=tf.keras.constraints.max_norm(indiv['conv_weight_constraint']),
                                        data_format='channels_first'))
            elif layer == 'dense_layer':
                model.add(tf.keras.layers.Dense(indiv['neurons'],
                                       kernel_initializer=indiv['dense_init_mode'],
                                       activation=indiv['dense_activation_func'],
                                       kernel_constraint=tf.keras.constraints.max_norm(indiv['dense_weight_constraint'])))
            elif layer == 'dropout_layer':
                model.add(tf.keras.layers.Dropout(indiv['dropout_rate']))
            elif layer == 'maxpooling_layer':
                model.add(tf.keras.layers.MaxPooling1D(indiv['pool_size']))
            elif layer == 'global_maxpooling_layer':
                model.add(tf.keras.layers.GlobalMaxPooling1D())
            elif layer == 'output_layer':
                model.add(tf.keras.layers.Dense(1, kernel_initializer=indiv['output_init_mode'], activation='sigmoid'))

        if indiv['optimizer'] == 'sgd':
            opt = tf.keras.optimizers.SGD(lr=indiv['learning_rate'], momentum=indiv['momentum'], decay=0.0,
                                 nesterov=False)
        elif indiv['optimizer'] == 'rmsprop':
            opt = tf.keras.optimizers.RMSprop(lr=indiv['learning_rate'], rho=0.9, epsilon=None, decay=0.0)
        elif indiv['optimizer'] == 'adagrad':
            opt = tf.keras.optimizers.Adagrad(lr=indiv['learning_rate'], epsilon=None, decay=0.0)
        elif indiv['optimizer'] == 'adadelta':
            opt = tf.keras.optimizers.Adadelta(lr=indiv['learning_rate'], rho=0.95, epsilon=None, decay=0.0)
        elif indiv['optimizer'] == 'adam':
            opt = tf.keras.optimizers.Adam(lr=indiv['learning_rate'], beta_1=0.9, beta_2=0.999, epsilon=None,
                                  decay=0.0, amsgrad=False)
        elif indiv['optimizer'] == 'adamax':
            opt = tf.keras.optimizers.Adamax(lr=indiv['learning_rate'], beta_1=0.9, beta_2=0.999, epsilon=None,
                                    decay=0.0)
        elif indiv['optimizer'] == 'nadam':
            opt = tf.keras.optimizers.Nadam(lr=indiv['learning_rate'], beta_1=0.9, beta_2=0.999, epsilon=None,
                                   schedule_decay=0.004)
        else:
            # Without this branch an unknown name left ``opt`` unbound and the
            # failure only surfaced as an UnboundLocalError in compile().
            raise ValueError("Unsupported optimizer: {!r}".format(indiv['optimizer']))

        util = utility()
        model.compile(optimizer=opt, loss='binary_crossentropy', metrics=[util.f1_score])

        return model

# Fitness Calculation

In [5]:
import collections
import os
from time import sleep
import gc

# Module-level helper instances used by crossfold(): ``util`` supplies the
# f1_score metric and the test metrics, ``cnn`` builds the model.
util = utility()
cnn = CNN()

def FitnessCalculation(individual, cfold, defaultVal, resultsPath, testing_name):
    """Split ``individual`` into hyperparameters and layer path, then evaluate it.

    The first ``len(defaultVal)`` genes are paired, in order, with the keys of
    ``defaultVal``; the remaining genes form the layer path. Returns whatever
    ``crossfold`` returns for that configuration.
    """
    n_hyperparams = len(defaultVal)

    indiv = collections.OrderedDict()
    for position, name in enumerate(defaultVal.keys()):
        indiv[name] = individual[position]

    layer_path = individual[n_hyperparams:]

    return crossfold(indiv, layer_path, cfold, resultsPath, testing_name)


def crossfold(indiv, path, fold, resultsPath, testing_name):
    """Train one candidate model on ``fold`` and return its validation F1 score.

    The model is trained with early stopping on ``val_f1_score``; the best
    checkpoint is written to ``<resultsPath><testing_name>.h5``, reloaded,
    used to predict the validation set and then deleted.

    Args:
        indiv: mapping of hyperparameter name to value.
        path: list of layer names describing the architecture.
        fold: mapping with the keys ``X_train``, ``y_train``, ``X_val``,
            ``y_val``, ``vocab_size``, ``maxlen`` and ``embedding_matrix``
            (a sequence of four matrices for 50/100/200/300 dimensions).
        resultsPath: prefix of the checkpoint file path.
        testing_name: name of the experiment, used as checkpoint file name.

    Returns:
        The F1 score of the reloaded best model on the validation data.

    Raises:
        ValueError: if ``indiv['output_dim']`` is not 50, 100, 200 or 300.
        RuntimeError: if the checkpoint cannot be loaded after four attempts.
    """
    # Position of each supported embedding size inside fold['embedding_matrix'].
    embedding_slots = {50: 0, 100: 1, 200: 2, 300: 3}
    if indiv['output_dim'] not in embedding_slots:
        # Previously this left ``embedding_mtx`` unbound and failed later
        # with an UnboundLocalError.
        raise ValueError("Unsupported output_dim: {!r}".format(indiv['output_dim']))
    embedding_mtx = fold['embedding_matrix'][embedding_slots[indiv['output_dim']]]

    model = cnn.cnn_model(fold['vocab_size'], fold['maxlen'], embedding_mtx,
                          indiv, path)

    model_path = "{}{}.h5".format(resultsPath, testing_name)

    # Early stopping, and save the best model seen so far.
    callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_f1_score', mode='max', verbose=False, patience=10),
                 tf.keras.callbacks.ModelCheckpoint(model_path, monitor='val_f1_score', mode='max', verbose=False,
                                  save_best_only=True)]

    model.fit(fold['X_train'], fold['y_train'], epochs=indiv['epochs'], verbose=False,
              validation_data=(fold['X_val'], fold['y_val']), use_multiprocessing=False,
              batch_size=indiv['batch_size'], callbacks=callbacks)

    # The custom metric must be supplied again when the model is deserialised.
    dependencies = {
    'f1_score': util.f1_score
    }

    # Load the saved model; try up to 4 times, waiting 5 seconds after a failure.
    saved_model = None
    load_error = None
    for _ in range(0, 4):
        try:
            saved_model = tf.keras.models.load_model(model_path, custom_objects=dependencies)
            break
        except Exception as e:
            print('An error occurs when loading saved model.')
            load_error = e
            sleep(5)

    if saved_model is None:
        # Without this check the code below failed with a NameError that hid
        # the real cause of the problem.
        raise RuntimeError("Could not load saved model {}".format(model_path)) from load_error

    # NOTE(review): predict_classes may not exist on newer TensorFlow
    # releases - confirm against the installed version.
    y_pred = saved_model.predict_classes(fold['X_val'])

    os.remove(model_path)
    del embedding_mtx
    gc.collect()

    # CNN metrics; only the F1 score is used as fitness.
    _, _, _, f1Score = util.get_testing_metric(fold['y_val'], y_pred)
    return f1Score


# Genetic Algorithm

In [6]:
import random
from operator import attrgetter
from deap import base
from deap import creator
from deap import tools
import time
import datetime
import math
from scipy.spatial import distance
import itertools


class GeneticAlgorithm:
    """Genetic algorithm over CNN hyperparameters and layer architecture.

    An individual is one flat list: the hyperparameter genes come first (one
    per entry of ``toolboxes`` / ``defaultVal``), followed by the layer names
    of its architecture path. ``'embedding_layer'`` marks the start of the
    architecture part.
    """

    __slots__ = (
        "toolbox", "toolboxes", "cross_rate", "mut_rate", "n_pop", "n_gen", "resultsPath", "testing_name", "cfold",
        "globalparameters", "defaultVal", "path_ind", "max_conv_idx", "max_maxpooling_idx",
        "max_dense_idx", "max_dropout_idx")

    def __init__(self, toolbox, toolboxes, cross_rate, mut_rate, n_pop, n_gen, resultsPath, testing_name,
                 cfold, globalparameters, defaultVal, path_ind, max_conv_idx, max_maxpooling_idx,
                 max_dense_idx, max_dropout_idx):
        """Store the configuration; no computation happens here."""
        self.toolbox = toolbox
        self.toolboxes = toolboxes
        self.cross_rate = cross_rate
        self.mut_rate = mut_rate
        self.n_pop = n_pop
        self.n_gen = n_gen
        self.resultsPath = resultsPath
        self.testing_name = testing_name
        self.cfold = cfold
        self.globalparameters = globalparameters
        self.defaultVal = defaultVal
        self.path_ind = path_ind
        self.max_conv_idx = max_conv_idx
        self.max_maxpooling_idx = max_maxpooling_idx
        self.max_dense_idx = max_dense_idx
        self.max_dropout_idx = max_dropout_idx

    def fitnessCalc(self, individual):
        """Return the fitness of ``individual`` as a one-element tuple.

        An individual that already carries fitness values is not re-evaluated.
        Otherwise hyperparameter genes equal to ``0``, ``''``, ``'False'`` or
        ``None`` are first replaced in place by the matching value from
        ``defaultVal`` and the individual is evaluated with
        ``FitnessCalculation``. The fitness is also printed with a timestamp.
        """
        if len(individual.fitness.values) == 0:
            if (0 in individual or '' in individual or 'False' in individual or None in individual):
                for i, param in enumerate(self.defaultVal):
                    if individual[i] == 0 or individual[i] == '' or individual[i] == 'False' or individual[i] is None:
                        individual[i] = self.defaultVal[param]

            fc = FitnessCalculation(individual, self.cfold, self.defaultVal, self.resultsPath, self.testing_name)
        else:
            fc = individual.fitness.values[0]
        print('{} {}'.format(datetime.datetime.now(), fc))
        return fc,

    def write_result(self):
        """Append the CSV header lines to the result file and the last-population file."""
        param_columns = "".join(",{}".format(param) for param in self.defaultVal)

        # Header of the per-generation statistics file
        with open("{}{}.csv".format(self.resultsPath, self.testing_name), "a+") as f:
            f.write("i,min,max,mean,std,avgdistance,time,CR,MR{}\n".format(param_columns))

        # Header of the last-population file
        with open("{}{}lastpop.csv".format(self.resultsPath, self.testing_name), 'a+') as f:
            f.write("i,f1score{}\n".format(param_columns))

    def std_calc(self, fits, length):
        """Return ``(mean, std)`` of ``fits``, where ``length`` is the number of values."""
        mean = sum(fits) / length
        sum2 = sum(x * x for x in fits)
        # abs() guards against a slightly negative variance caused by rounding.
        std = abs(sum2 / length - mean ** 2) ** 0.5

        return mean, std

    def distance_calc(self, pop):
        """Return the mean pairwise Hamming distance between hyperparameter genes.

        Only the genes before ``'embedding_layer'`` are compared. A population
        with fewer than two individuals has no pairs and yields ``0.0``.
        """
        distances = [
            distance.hamming(first[0:first.index('embedding_layer')],
                             second[0:second.index('embedding_layer')])
            for first, second in itertools.combinations(pop, 2)
        ]
        if not distances:
            # No pairs to compare; avoids a division by zero.
            return 0.0

        return sum(distances) / len(distances)

    def invalid_fitness_calc(self, pop):
        """Evaluate every individual of ``pop`` whose fitness is not valid."""
        invalid_ind = [ind for ind in pop if not ind.fitness.valid]
        fitnesses = map(self.toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit

    def mutHyperparam(self, individual, indpb):
        """Mutate ``individual`` in place and return it as a one-element tuple.

        Each hyperparameter gene is regenerated with probability ``indpb`` by
        calling the matching entry of ``toolboxes``. In the architecture part
        (from the first ``'convolutional_layer'`` up to, but excluding,
        ``'output_layer'``) each position after the first is visited with
        probability ``indpb``: a convolutional layer (together with the
        max-pooling layer that follows it) or a dense layer is then either
        removed or duplicated, chosen at random. Pooling and dropout
        positions are left untouched.
        """
        # Mutation for the Hyperparameter Chromosomes
        for i in range(len(self.toolboxes)):
            if random.random() < indpb:
                if len(self.toolboxes[i].args) == 1:
                    individual[i] = self.toolboxes[i].func(self.toolboxes[i].args[0])
                else:
                    individual[i] = self.toolboxes[i].func(self.toolboxes[i].args[0], self.toolboxes[i].args[1])

        # Mutation for the Architecture Chromosomes
        archChrom = individual[individual.index('convolutional_layer'):individual.index('output_layer')]
        size = len(archChrom)

        for i in range(1, size):
            if random.random() < indpb:
                # ``size`` shrinks when layers are removed while the range
                # above was fixed up front.
                if i >= size:
                    break

                if archChrom[i] in ('global_maxpooling_layer', 'maxpooling_layer', 'dropout_layer'):
                    continue

                selectMutType = random.randint(0, 1)
                # Remove the layer
                if selectMutType == 0:
                    # Delete by position: list.remove() would delete the first
                    # equal layer name instead of the element at position i.
                    if (archChrom[i] == 'convolutional_layer' and i + 1 < len(archChrom)
                            and archChrom[i + 1] == 'maxpooling_layer'):
                        del archChrom[i:i + 2]
                        size -= 2
                    elif archChrom[i] == 'dense_layer':
                        del archChrom[i]
                        size -= 1

                # Add a layer
                elif selectMutType == 1:
                    if archChrom[i] == 'convolutional_layer':
                        archChrom.insert(i, 'convolutional_layer')
                        archChrom.insert(i + 1, 'maxpooling_layer')
                    elif archChrom[i] == 'dense_layer':
                        archChrom.insert(i, 'dense_layer')

                individual[individual.index('convolutional_layer'):individual.index('output_layer')] = archChrom
        return individual,

    def cxTwoPoint(self, ind1, ind2, pop, offspring):
        """Recombine ``ind1`` and ``ind2`` in place and return both.

        The hyperparameter genes are crossed with a randomly chosen operator
        (one-point, two-point or uniform). Afterwards the tails starting at
        ``'global_maxpooling_layer'`` are exchanged. ``pop`` and ``offspring``
        are accepted but not used.
        """
        # Crossover for hyperparameter chromosomes
        size = ind1.index('embedding_layer')
        selectCxType = random.randint(0, 2)
        # One point crossover
        if selectCxType == 0:
            cxpoint = random.randint(1, size - 1)
            # NOTE(review): this slice runs to the end of the list, so the
            # architecture genes are exchanged as well - confirm this is intended.
            ind1[cxpoint:], ind2[cxpoint:] = ind2[cxpoint:], ind1[cxpoint:]
        # Two-point crossover
        elif selectCxType == 1:
            cxpoint1 = random.randint(1, size - 1)
            cxpoint2 = random.randint(1, size - 1)
            if cxpoint2 >= cxpoint1:
                cxpoint2 += 1
            else:  # Swap the two cx points
                cxpoint1, cxpoint2 = cxpoint2, cxpoint1

            ind1[cxpoint1:cxpoint2], ind2[cxpoint1:cxpoint2] \
                = ind2[cxpoint1:cxpoint2], ind1[cxpoint1:cxpoint2]
        # Uniform crossover; cross_rate doubles as the per-gene swap probability
        elif selectCxType == 2:
            for i in range(size):
                if random.random() < self.cross_rate:
                    ind1[i], ind2[i] = ind2[i], ind1[i]

        # Crossover for architecture chromosomes
        # One-cut point crossover from the Global MaxPooling layer
        cxpoint1 = ind1.index('global_maxpooling_layer')
        cxpoint2 = ind2.index('global_maxpooling_layer')
        ind1[cxpoint1:], ind2[cxpoint2:] = ind2[cxpoint2:], ind1[cxpoint1:]

        return ind1, ind2

    def runGA(self, lastPop=None, lastFitnesses=None):
        """Run the genetic algorithm for ``n_gen`` generations.

        Args:
            lastPop: optional sequence of hyperparameter gene lists, one per
                individual, used instead of randomly generated genes.
            lastFitnesses: optional sequence of fitness tuples for the initial
                population; when given, the initial population is not
                evaluated.

        After every generation the statistics and the best individual are
        appended to ``<resultsPath><testing_name>.csv`` and the whole
        population to ``<resultsPath><testing_name>lastpop.csv``. Nothing is
        returned.
        """
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMax)

        self.toolbox.register("individual", tools.initCycle, creator.Individual,
                              self.toolboxes, n=1)
        self.toolbox.register("population", tools.initRepeat, list, self.toolbox.individual)
        self.toolbox.register("evaluate", self.fitnessCalc)
        self.toolbox.register("mate", self.cxTwoPoint)
        self.toolbox.register("mutate", self.mutHyperparam, indpb=self.mut_rate)
        self.toolbox.register("select", tools.selBest)

        pop = self.toolbox.population(n=self.n_pop)

        # Append the architecture path to the hyperparameter genes.
        for idx, ind in enumerate(pop):
            if lastPop:
                ind[:] = lastPop[idx]
            ind.extend(self.path_ind[idx])

        if lastFitnesses:
            # Fitnesses from previous population
            fitnesses = lastFitnesses
        else:
            # Evaluate the entire population
            fitnesses = list(map(self.toolbox.evaluate, pop))

        for ind, fit in zip(pop, fitnesses):
            ind.fitness.values = fit

        self.write_result()

        g = 0
        while g < self.n_gen:
            then = time.time()
            g = g + 1
            print('{} {}'.format(datetime.datetime.now(), "-- Generation %i --" % g))

            # Select the next generation individuals
            offspring = self.toolbox.select(pop, len(pop))
            # Clone the selected individuals
            offspring = list(map(self.toolbox.clone, offspring))

            # Apply crossover and mutation on the offspring
            for child1, child2 in zip(offspring[::2], offspring[1::2]):
                if random.random() < self.cross_rate:
                    self.toolbox.mate(child1, child2, pop, offspring)
                    del child1.fitness.values
                    del child2.fitness.values

            for mutant in offspring:
                if random.random() < self.mut_rate:
                    self.toolbox.mutate(mutant)
                    del mutant.fitness.values

            # Evaluate the individuals with an invalid fitness
            self.invalid_fitness_calc(offspring)

            # Keep the best n_pop individuals of parents and offspring.
            pop[:] = self.toolbox.select(pop + offspring, self.n_pop)

            # Gather all the fitnesses in one list and print the stats
            fits = [ind.fitness.values[0] for ind in pop]

            length = len(pop)
            mean, std = self.std_calc(fits, length)
            avgDistance = self.distance_calc(pop)
            best = max(pop, key=attrgetter("fitness"))
            print('{} {}'.format(datetime.datetime.now(), "  Min %s" % min(fits)))
            print('{} {}'.format(datetime.datetime.now(), "  Max %s" % max(fits)))
            print('{} {}'.format(datetime.datetime.now(), "  Avg %s" % mean))
            print('{} {}'.format(datetime.datetime.now(), "  Std %s" % std))
            print('{} {}'.format(datetime.datetime.now(), "  AvgDistance %s" % avgDistance))
            print('{} {}'.format(datetime.datetime.now(), best))

            now = time.time()
            diff = now - then

            # save testing data
            with open("{}{}.csv".format(self.resultsPath, self.testing_name), 'a') as f:
                text = "{0},{1},{2},{3},{4},{5},{6},{7},{8}".format(g, min(fits), max(fits), mean, std, avgDistance, diff, self.cross_rate, self.mut_rate)
                for param in best:
                    text = "{},{}".format(text, param)
                text = "{}\n".format(text)
                f.write(text)

            # save last population data
            with open("{}{}lastpop.csv".format(self.resultsPath, self.testing_name), 'a') as f:
                for ind in pop:
                    text = "{0},{1}".format(g, ind.fitness.values[0])
                    for param in ind:
                        text = "{},{}".format(text, param)
                    text = "{}\n".format(text)
                    f.write(text)

# Project path

In [7]:
import os 
# Locations of the input files, the result directory and the GloVe vectors.

# File names of the preprocessed training set and the saved population
# (presumably resolved against datasetPath / resultsPath by the main program).
training_path = 'trainPreprocessed.csv'
population_path = 'NewPop.csv'

# Name of this experiment; used as the stem of the result file names.
testing_name = "Experiment2_GA-CNN"

# All directories hang off root_path; both keep their trailing slash.
root_path = '/lab/dbms/fatyanosa'
datasetPath = root_path + '/Dataset/Disaster Tweets/'
resultsPath = root_path + '/Server2/Disaster Tweets/Results/'

# One GloVe 6B file per embedding size, in ascending order of dimension.
glovePath = ['{}/Glove/glove.6B.{}d.txt'.format(root_path, dim)
             for dim in (50, 100, 200, 300)]

# Parameters

In [8]:
# Crossover rate: the probability with which two individuals are mated.
# GeneticAlgorithm also uses its cross_rate as the per-gene swap probability
# of the uniform crossover (presumably this value is passed in - the
# constructor call is not visible here).
cross_rate = 0.8

# Mutation rate: the probability of mutating an individual. GeneticAlgorithm
# also registers its mut_rate as the per-gene mutation probability (indpb).
mut_rate = 0.2

# Population size (number of individuals)
n_pop = 30

# Number of generations
n_gen = 30

# Main Program

In [9]:
import random
from sklearn.model_selection import StratifiedKFold
from deap import base
import warnings; warnings.simplefilter('ignore')

if __name__ == '__main__':
    # Local imports so this cell does not rely on an earlier cell having
    # brought these names into the kernel namespace.
    import collections
    from sklearn.model_selection import train_test_split

    # Search space. Each entry is (name, sampler, *sampler_args); the sampler
    # is registered on the DEAP toolbox under `name`. Entries that use
    # random.choice over a single-element list always yield that one value.
    globalparameters = [
        ("epochs", random.randint, 1, 100),
        ("batch_size", random.randint, 32, 256),
        ("optimizer", random.choice, ['sgd', 'rmsprop',
                                      'adagrad', 'adadelta',
                                      'adam', 'adamax',
                                      'nadam']),
        ("learning_rate", random.uniform, 1e-4, 1e-2),
        ("momentum", random.choice, [0.9]),
        ("output_init_mode", random.choice, ['glorot_uniform']),
        ("output_dim", random.choice, [100]),
        ("num_filters", random.randint, 32, 512),
        ("kernel_size", random.randint, 1, 5),
        ("conv_activation_func", random.choice,
         ['relu', 'softmax', 'elu', 'selu',
          'softplus', 'softsign', 'tanh',
          'sigmoid', 'hard_sigmoid', 'linear']),
        ("conv_init_mode", random.choice, ['glorot_uniform']),
        ("conv_weight_constraint", random.choice, [3]),
        ("neurons", random.randint, 1, 30),
        ("dense_activation_func", random.choice, ['relu']),
        ("dense_init_mode", random.choice, ['glorot_uniform']),
        ("dense_weight_constraint", random.choice, [3]),
        ("pool_size", random.choice, [5]),
        ("dropout_rate", random.choice, [0.2]),
    ]

    # Default value per hyperparameter, listed in the same order as
    # `globalparameters`.
    defaultVal = collections.OrderedDict([
        ("epochs", 10),
        ("batch_size", 32),
        ("optimizer", "adam"),
        ("learning_rate", 1e-4),
        ("momentum", 0.9),
        ("output_init_mode", "glorot_uniform"),
        ("output_dim", 100),
        ('num_filters', 64),
        ('kernel_size', 3),
        ('conv_activation_func', "relu"),
        ('conv_init_mode', "glorot_uniform"),
        ('conv_weight_constraint', 3),
        ('neurons', 1),
        ('dense_activation_func', "relu"),
        ('dense_init_mode', "glorot_uniform"),
        ('dense_weight_constraint', 3),
        ('pool_size', 5),
        ('dropout_rate', 0.2)]
    )

    # object class
    util = utility()
    toolbox = base.Toolbox()

    # Attribute generators: register every sampler under its hyperparameter
    # name, forwarding however many sampler arguments the entry carries.
    for hyper in globalparameters:
        toolbox.register(hyper[0], hyper[1], *hyper[2:])

    # Registered generators in search-space order. Deriving the list from
    # `globalparameters` keeps the two from drifting apart when a
    # hyperparameter is added, removed or reordered.
    toolboxes = [getattr(toolbox, spec[0]) for spec in globalparameters]

    path_ind, max_conv_idx, max_maxpooling_idx, max_dense_idx, max_dropout_idx = generateFSM(n_pop)

    # Disabled alternative: load a previously saved population from
    # resultsPath/population_path and hand it to runGA (see the commented
    # call at the end of this block).
#     # Read population data
#     dfPopulation = util.read_CSV("{}{}".format(resultsPath, population_path))

#     path_ind, fitnesses, max_conv_idx, max_maxpooling_idx, max_dense_idx, max_dropout_idx = openFSM(dfPopulation)
#     dfPopulation = dfPopulation.drop(columns=[col for col in dfPopulation if col not in defaultVal])

#     population = dfPopulation.loc[:, ~dfPopulation.columns.str.match('Unnamed')].values.tolist()

    # Read data
    df = util.read_CSV("{}{}".format(datasetPath, training_path))

    # 70/30 split with a fixed seed so the same rows land in each part on
    # every run.
    dfTraining, dfTrial = train_test_split(df, test_size=0.3, random_state=42)

    textsTraining, labelsTraining = util.get_text_label(dfTraining)
    textsTrial, labelsTrial = util.get_text_label(dfTrial)

    X_train, X_val, y_train, y_val, vocab_size, maxlen, embedding_matrix = util.get_training_trial_data(
        textsTraining, labelsTraining, textsTrial, labelsTrial, glovePath)

    # Bundle of prepared data handed to the GA as a single argument.
    cfold = {'X_train': X_train, 'X_val': X_val, 'y_train': y_train, 'y_val': y_val, 'vocab_size': vocab_size,
             'maxlen': maxlen, 'embedding_matrix': embedding_matrix}

    # NOTE(review): the `random` module is never seeded in this block, so the
    # sampled hyperparameters differ between runs — confirm this is intended.
    ga = GeneticAlgorithm(toolbox, toolboxes, cross_rate, mut_rate, n_pop, n_gen, resultsPath, testing_name,
                          cfold, globalparameters, defaultVal, path_ind, max_conv_idx, max_maxpooling_idx, max_dense_idx, max_dropout_idx)
    ga.runGA()
#     ga.runGA(population, fitnesses)


2020-04-01 06:44:20.332071 -- Generation 31 --
2020-04-01 06:45:07.990509 0.7622724765581907
2020-04-01 06:45:45.690183 0.7755319148936172
2020-04-01 06:46:48.186173 0.7804090419806242
2020-04-01 06:47:21.359981 0.784375
2020-04-01 06:48:36.732500 0.7808591625883632
2020-04-01 06:49:40.985806 0.7747344885410843
2020-04-01 06:49:52.715056 0.7810335641981887
2020-04-01 06:50:02.174826 0.7747368421052631
2020-04-01 06:50:13.476687 0.7755319148936172
2020-04-01 06:50:23.642287 0.7743250127356088
2020-04-01 06:50:35.156381 0.7668613913967074
2020-04-01 06:50:45.085278 0.7764578833693304
2020-04-01 06:51:00.872744 0.778969957081545
2020-04-01 06:51:14.551988 0.7794037940379404
2020-04-01 06:51:25.650193 0.7663755458515283
2020-04-01 06:51:35.410567 0.7714129841789416
2020-04-01 06:51:57.297936 0.7624602332979852
2020-04-01 06:52:13.233751 0.782608695652174
2020-04-01 06:52:57.561787 0.7803278688524589
2020-04-01 06:53:37.362989 0.7755775577557756
2020-04-01 06:54:12.411891 0.7795698924731183

2020-04-01 07:40:52.530689 0.7747551686615886
2020-04-01 07:41:06.833502 0.7758444216990789
2020-04-01 07:41:19.960730 0.7743933918430562
2020-04-01 07:41:32.390998 0.7767416346681295
2020-04-01 07:41:45.503349 0.7751196172248803
2020-04-01 07:41:59.108916 0.7703180212014133
2020-04-01 07:42:11.590149 0.7707367336424523
2020-04-01 07:42:24.178581 0.7556968733439322
2020-04-01 07:43:02.487795 0.7810140237324704
2020-04-01 07:43:40.844210 0.7780104712041883
2020-04-01 07:44:13.625074 0.7773625200213561
2020-04-01 07:44:46.808409 0.772930648769575
2020-04-01 07:44:46.821789   Min 0.788793103448275
2020-04-01 07:44:46.821831   Max 0.790348525469168
2020-04-01 07:44:46.821847   Avg 0.7892721816184891
2020-04-01 07:44:46.821870   Std 0.00036364561839089725
2020-04-01 07:44:46.821892   AvgDistance 0.18403575989782894
2020-04-01 07:44:46.821907 [87, 204, 'adamax', 0.0038595172970700004, 0.9, 'glorot_uniform', 100, 315, 4, 'hard_sigmoid', 'glorot_uniform', 3, 15, 'relu', 'glorot_uniform', 3, 5,

2020-04-01 08:31:24.718816 0.7816593886462883
2020-04-01 08:33:03.839106 0.7827493261455526
2020-04-01 08:34:26.984427 0.775
2020-04-01 08:34:39.335239 0.0
2020-04-01 08:35:50.252053 0.7737775389575496
2020-04-01 08:36:44.225627 0.7643243243243244
2020-04-01 08:36:59.508730 0.7785016286644952
2020-04-01 08:37:16.088733 0.7670250896057348
2020-04-01 08:37:31.569104 0.7747458533975389
2020-04-01 08:37:47.332494 0.7816938453445555
2020-04-01 08:38:06.435944 0.7663650878126662
2020-04-01 08:38:21.332496 0.0
2020-04-01 08:38:38.166251 0.7681473456121344
2020-04-01 08:39:02.676005 0.7775354416575792
2020-04-01 08:39:23.638941 0.7840971838763113
2020-04-01 08:39:40.759434 0.7702407002188184
2020-04-01 08:39:57.918799 0.7717750826901874
2020-04-01 08:40:14.837472 0.7714437932871604
2020-04-01 08:40:33.095723 0.7644584647739222
2020-04-01 08:40:48.741643 0.7760217983651225
2020-04-01 08:40:48.755327   Min 0.789280084077772
2020-04-01 08:40:48.755379   Max 0.790348525469168
2020-04-01 08:40:48.7

2020-04-01 09:40:52.704194 0.7780149413020278
2020-04-01 09:41:41.419157 0.7753164556962026
2020-04-01 09:42:10.955791 0.7740916271721958
2020-04-01 09:42:51.791823 0.7832699619771862
2020-04-01 09:43:56.252858 0.7731456415809421
2020-04-01 09:45:14.298661 0.781657113079729
2020-04-01 09:46:04.137829 0.7605485232067511
2020-04-01 09:47:25.442392 0.7806731813246471
2020-04-01 09:48:12.079266 0.7793326157158236
2020-04-01 09:48:35.308328 0.7667410714285715
2020-04-01 09:49:01.535860 0.7780104712041883
2020-04-01 09:49:47.636457 0.7705352411234764
2020-04-01 09:50:51.984971 0.7828911748781807
2020-04-01 09:52:01.236153 0.7759914255091105
2020-04-01 09:53:27.501465 0.7780821917808219
2020-04-01 09:54:46.282114 0.7894453419493807
2020-04-01 09:55:49.320877 0.7754237288135593
2020-04-01 09:56:13.413037 0.6829880728185813
2020-04-01 09:56:32.912329 0.7754065040650406
2020-04-01 09:56:51.436220 0.78076525336091
2020-04-01 09:57:12.287407 0.7816711590296496
2020-04-01 09:57:33.894819 0.76914949

2020-04-01 11:33:39.159105 0.779826464208243
2020-04-01 11:34:24.221294 0.7862796833773087
2020-04-01 11:35:06.907346 0.7807775377969762
2020-04-01 11:35:45.547601 0.779826464208243
2020-04-01 11:36:30.917211 0.7827015483182063
2020-04-01 11:37:15.358930 0.777415852334419
2020-04-01 11:37:54.265724 0.773917691074292
2020-04-01 11:38:34.768460 0.7774122807017544
2020-04-01 11:39:17.007460 0.7800875273522976
2020-04-01 11:40:00.794368 0.7775377969762419
2020-04-01 11:40:47.369735 0.782051282051282
2020-04-01 11:41:23.564463 0.7774798927613941
2020-04-01 11:41:56.606116 0.7670171555063642
2020-04-01 11:42:38.864578 0.7765333333333334
2020-04-01 11:44:17.721764 0.7786752827140548
2020-04-01 11:45:53.533233 0.7837837837837837
2020-04-01 11:47:02.196757 0.7825613079019075
2020-04-01 11:48:37.426751 0.7866596082583378
2020-04-01 11:49:58.503781 0.7692307692307692
2020-04-01 11:51:32.201839 0.7834051724137931
2020-04-01 11:52:43.336632 0.7779596290234588
2020-04-01 11:54:01.543746 0.7763300760