# Utility

In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras import backend as K

Using TensorFlow backend.


In [2]:
class utility:
    """Helpers for loading data, preparing text, building embeddings and scoring.

    The class keeps no state; every method only works on its arguments.
    """

    def read_CSV(self, filename):
        """Read ``filename`` with ``pandas.read_csv`` and return the DataFrame."""
        return pd.read_csv(filename)

    def get_text_label(self, df):
        """Split a DataFrame into parallel ``texts`` and ``labels`` lists.

        Reads the ``'text_cleaned'`` and ``'target'`` columns row by row.
        A float in ``'text_cleaned'`` is converted with ``str`` so that every
        text is a string (presumably this covers NaN for missing text -
        TODO confirm against the dataset).

        Returns:
            tuple: ``(texts, labels)``.
        """
        texts = []  # list of text samples
        labels = []  # list of label ids
        for _, row in df.iterrows():
            text = row['text_cleaned']
            texts.append(str(text) if isinstance(text, float) else text)
            labels.append(row['target'])

        return texts, labels

    def tokenize_texts(self, texts):
        """Fit a Keras ``Tokenizer`` (``num_words=50000``) on ``texts`` and return it."""
        tokenizer = Tokenizer(num_words=50000)
        tokenizer.fit_on_texts(texts)

        return tokenizer

    def padding_texts(self, texts, maxlen):
        """Pad the integer sequences in ``texts`` at the end (``padding='post'``) to ``maxlen``."""
        return pad_sequences(texts, padding='post', maxlen=maxlen)

    def get_metric(self, y_true, y_pred):
        """Return ``(accuracy, precision, recall, f1)`` computed with scikit-learn."""
        accuracyScore = accuracy_score(y_true, y_pred)
        precisionScore = precision_score(y_true, y_pred)
        recallScore = recall_score(y_true, y_pred)
        f1Score = f1_score(y_true, y_pred)

        return accuracyScore, precisionScore, recallScore, f1Score

    def print_metric(self, accuracyScore, precisionScore, recallScore, f1Score):
        """Print the four scores, one per line, followed by a single comma-separated line."""
        print("Accuracy: {}".format(str(accuracyScore)))
        print("Precision: {}".format(str(precisionScore)))
        print("Recall: {}".format(str(recallScore)))
        print("F1-Score: {}".format(str(f1Score)))
        print("{},{},{},{}".format(str(accuracyScore), str(precisionScore), str(recallScore), str(f1Score)))

    def get_testing_metric(self, y_test, y_pred):
        """Return the same 4-tuple as :meth:`get_metric` for test predictions."""
        return self.get_metric(y_test, y_pred)

    def write_df_csv(self, df, out_path):
        """Write ``df`` to ``out_path`` as CSV without the index column."""
        df.to_csv(out_path, index=False)

    def create_embedding_matrix(self, filepath, word_index, embedding_dim):
        """Build an embedding matrix from a whitespace-separated vector file.

        Each non-blank line of ``filepath`` is expected to hold a word followed
        by its vector components. Rows for words missing from the file stay
        all-zero.

        Args:
            filepath: UTF-8 text file with one ``word v1 v2 ...`` entry per line.
            word_index: mapping ``word -> row index`` (index 0 is reserved).
            embedding_dim: number of leading vector components to keep.

        Returns:
            numpy.ndarray of shape ``(len(word_index) + 1, embedding_dim)``.
        """
        vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
        embedding_matrix = np.zeros((vocab_size, embedding_dim))

        with open(filepath, encoding="utf8") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Blank (e.g. trailing) lines carry no entry; unpacking them
                    # used to raise ValueError.
                    continue
                word, vector = fields[0], fields[1:]
                if word in word_index:
                    embedding_matrix[word_index[word]] = np.array(
                        vector, dtype=np.float32)[:embedding_dim]

        return embedding_matrix

    def get_max_length_of_sentences(self, texts):
        """Return the largest whitespace-token count among ``texts`` (0 if empty)."""
        return max((len(text.split()) for text in texts), default=0)

    def get_training_trial_data(self, textsTraining, labelsTraining, textsTrial, labelsTrial, glovePath):
        """Turn raw training/trial texts into padded sequences plus embeddings.

        The tokenizer and ``maxlen`` are derived from the training texts only
        and then applied to the trial texts.

        Args:
            glovePath: sequence whose items 0..3 are paths to the vector files
                read with 50, 100, 200 and 300 components respectively.

        Returns:
            tuple: ``(X_train, X_val, y_train, y_val, vocab_size, maxlen,
            embedding_matrix)`` where ``embedding_matrix`` is a list of four
            matrices in the 50/100/200/300 order.
        """
        textsTraining, textsTesting = np.asarray(textsTraining), np.asarray(textsTrial)
        y_train, y_val = np.asarray(labelsTraining), np.asarray(labelsTrial)

        # Tokenize words
        tokenizer = self.tokenize_texts(textsTraining)
        X_train = tokenizer.texts_to_sequences(textsTraining)
        X_val = tokenizer.texts_to_sequences(textsTesting)

        # Adding 1 because of reserved 0 index
        vocab_size = len(tokenizer.word_index) + 1

        # get maxlen
        maxlen = self.get_max_length_of_sentences(textsTraining)

        # Pad sequences with zeros
        X_train = self.padding_texts(X_train, maxlen)
        X_val = self.padding_texts(X_val, maxlen)

        # One matrix per supported embedding size, in a fixed order that
        # callers index positionally.
        embedding_matrix = [
            self.create_embedding_matrix(glovePath[position], tokenizer.word_index, dim)
            for position, dim in enumerate((50, 100, 200, 300))
        ]

        return X_train, X_val, y_train, y_val, vocab_size, maxlen, embedding_matrix

    def Average(self, list):
        """Return the arithmetic mean of ``list``.

        The parameter name shadows the builtin but is kept for compatibility
        with keyword callers. An empty input raises ``ZeroDivisionError``.
        """
        return sum(list) / len(list)

    def recall(self, y_true, y_pred):
        """Keras-backend recall: rounded true positives over rounded actual positives."""
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        # K.epsilon() keeps the division defined when there are no positives.
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(self, y_true, y_pred):
        """Keras-backend precision: rounded true positives over rounded predicted positives."""
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        # K.epsilon() keeps the division defined when nothing is predicted positive.
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

    def f1_score(self, y_true, y_pred):
        """Keras-backend F1 score built from :meth:`precision` and :meth:`recall`."""
        precision = self.precision(y_true, y_pred)
        recall = self.recall(y_true, y_pred)
        return 2*((precision*recall)/(precision+recall+K.epsilon()))

# Finite State Machine

In [3]:
import random

def FSM():
    """Return the transition table that describes valid layer sequences.

    The result maps an integer transition id (0..8) to a dict with the keys
    ``'src'``, ``'dst'``, ``'layer'`` (the layer emitted by the transition) and
    ``'next_path'`` (ids of the transitions that may follow). Transition 8
    (``'output_layer'``) has an empty ``'next_path'`` and ends a path.
    A fresh table is built on every call.
    """
    # (src, dst, layer, ids of the transitions allowed next)
    transitions = (
        (0, 1, 'embedding_layer', (1,)),
        (1, 2, 'convolutional_layer', (2, 4)),
        (2, 3, 'maxpooling_layer', (3,)),
        (3, 2, 'convolutional_layer', (2, 4)),
        (2, 4, 'global_maxpooling_layer', (5,)),
        (4, 5, 'dense_layer', (6, 7)),
        (5, 5, 'dense_layer', (6, 7)),
        (5, 6, 'dropout_layer', (8,)),
        (6, 7, 'output_layer', ()),
    )

    return {
        key: {'src': src, 'dst': dst, 'layer': layer, 'next_path': list(successors)}
        for key, (src, dst, layer, successors) in enumerate(transitions)
    }

def getLayerSize(layer, conv_idx, dense_idx, dropout_idx, maxpooling_idx):
    """Return the four layer counters with the one matching ``layer`` bumped by 1.

    Counters are returned in the order they are passed in
    (conv, dense, dropout, maxpooling). Any other layer name leaves all
    counters unchanged.
    """
    counters = [conv_idx, dense_idx, dropout_idx, maxpooling_idx]
    slot_of = {
        'convolutional_layer': 0,
        'dense_layer': 1,
        'dropout_layer': 2,
        'maxpooling_layer': 3,
    }

    slot = slot_of.get(layer)
    if slot is not None:
        counters[slot] += 1

    return tuple(counters)


def getMaxLayerSize(conv_idx, dense_idx, dropout_idx, maxpooling_idx, max_conv_idx, max_dense_idx, max_dropout_idx,
                    max_maxpooling_idx):
    """Return the running maxima updated with the given per-individual counts.

    Each ``max_*`` value is replaced only when the matching count is strictly
    larger. Result order: conv, dense, dropout, maxpooling.
    """
    # max() returns its first argument on ties, so an existing maximum is kept.
    return (
        max(max_conv_idx, conv_idx),
        max(max_dense_idx, dense_idx),
        max(max_dropout_idx, dropout_idx),
        max(max_maxpooling_idx, maxpooling_idx),
    )

def generateFSM(n_pop):
    """Draw ``n_pop`` random layer paths by walking the ``FSM()`` table.

    Every walk starts at transition 0 and repeatedly picks one of the allowed
    follow-up transitions with ``random.choice`` until a transition without
    successors is reached.

    Returns:
        tuple: ``(path_ind, max_conv_idx, max_maxpooling_idx, max_dense_idx,
        max_dropout_idx)`` where ``path_ind`` maps the individual number to its
        list of layer names and the ``max_*`` values are the largest per-path
        layer counts seen (note the order: conv, maxpooling, dense, dropout).
    """
    fsm = FSM()
    path_ind = {}
    # Running maxima in the order used by the helpers: conv, dense, dropout, maxpooling.
    peaks = (0, 0, 0, 0)

    for ind in range(n_pop):
        state = 0
        counts = (0, 0, 0, 0)
        path = [fsm[state]['layer']]

        while fsm[state]['next_path']:
            state = random.choice(fsm[state]['next_path'])
            layer = fsm[state]['layer']
            path.append(layer)
            counts = getLayerSize(layer, *counts)

        peaks = getMaxLayerSize(*counts, *peaks)
        path_ind[ind] = path

    max_conv_idx, max_dense_idx, max_dropout_idx, max_maxpooling_idx = peaks
    return path_ind, max_conv_idx, max_maxpooling_idx, max_dense_idx, max_dropout_idx


def openFSM(df):
    """Rebuild layer paths and fitness values from a saved population table.

    Columns whose name contains ``'Unnamed'`` are ignored when counting
    hyperparameter columns. For every row, the cells whose text contains
    ``'layer'`` form the path, and the first cell that looks like a
    non-negative number (digits with at most one ``'.'``) is taken as the
    fitness.

    Returns:
        tuple: ``(path_ind, fitnesses, max_conv_idx, max_maxpooling_idx,
        max_dense_idx, max_dropout_idx)``; ``fitnesses`` is a list of 1-tuples
        and the ``max_*`` values are counts of columns whose name contains
        ``'num_filters'``, ``'pool_size'``, ``'neurons'`` and
        ``'dropout_rate'`` respectively.
    """
    named_columns = [name for name in df.columns if 'Unnamed' not in name]

    def columns_containing(fragment):
        # Number of kept column names that include the given fragment.
        return sum(fragment in name for name in named_columns)

    max_conv_idx = columns_containing('num_filters')
    max_dense_idx = columns_containing('neurons')
    max_dropout_idx = columns_containing('dropout_rate')
    max_maxpooling_idx = columns_containing('pool_size')

    path_ind = {}
    fitnesses = []
    for index, row in df.iterrows():
        layers = []
        numeric_cells = []
        for cell in row:
            text = str(cell)
            if 'layer' in text:
                layers.append(cell)
            if text.replace('.', '', 1).isdigit():
                numeric_cells.append(cell)

        fitnesses.append((float(numeric_cells[0]),))
        path_ind[index] = layers

    return path_ind, fitnesses, max_conv_idx, max_maxpooling_idx, max_dense_idx, max_dropout_idx

# CNN

In [4]:
import tensorflow as tf

class CNN:
    """Builds a compiled ``tf.keras`` Sequential model from a layer path."""

    def _build_optimizer(self, indiv):
        """Create the optimizer selected by ``indiv['optimizer']``.

        Raises:
            ValueError: if the optimizer name is not one of the supported ones.
                Previously this case left the optimizer variable unbound and
                failed later with ``UnboundLocalError``.
        """
        name = indiv['optimizer']
        if name == 'sgd':
            return tf.keras.optimizers.SGD(lr=indiv['learning_rate'], momentum=indiv['momentum'], decay=0.0,
                                           nesterov=False)
        if name == 'rmsprop':
            return tf.keras.optimizers.RMSprop(lr=indiv['learning_rate'], rho=0.9, epsilon=None, decay=0.0)
        if name == 'adagrad':
            return tf.keras.optimizers.Adagrad(lr=indiv['learning_rate'], epsilon=None, decay=0.0)
        if name == 'adadelta':
            return tf.keras.optimizers.Adadelta(lr=indiv['learning_rate'], rho=0.95, epsilon=None, decay=0.0)
        if name == 'adam':
            return tf.keras.optimizers.Adam(lr=indiv['learning_rate'], beta_1=0.9, beta_2=0.999, epsilon=None,
                                            decay=0.0, amsgrad=False)
        if name == 'adamax':
            return tf.keras.optimizers.Adamax(lr=indiv['learning_rate'], beta_1=0.9, beta_2=0.999, epsilon=None,
                                              decay=0.0)
        if name == 'nadam':
            return tf.keras.optimizers.Nadam(lr=indiv['learning_rate'], beta_1=0.9, beta_2=0.999, epsilon=None,
                                             schedule_decay=0.004)

        raise ValueError("Unsupported optimizer: {!r}".format(name))

    def cnn_model(self, vocab_size, maxlen, embedding_matrix, indiv, path):
        """Assemble and compile the network described by ``path``.

        Args:
            vocab_size: ``input_dim`` of the embedding layer.
            maxlen: ``input_length`` of the embedding layer.
            embedding_matrix: initial embedding weights (the layer stays trainable).
            indiv: mapping of hyperparameter name to value. All layers of the
                same kind share the same hyperparameters.
            path: iterable of layer names; one layer is appended per entry, in
                order. Names that match none of the known layer kinds are
                skipped without error.

        Returns:
            The model compiled with binary cross-entropy loss and the
            ``utility.f1_score`` metric.

        Raises:
            ValueError: if ``indiv['optimizer']`` names an unsupported optimizer.
        """
        model = tf.keras.models.Sequential()
        for layer in path:
            if layer == 'embedding_layer':
                model.add(
                    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=indiv['output_dim'],
                                              weights=[embedding_matrix], input_length=maxlen, trainable=True))
            elif layer == 'convolutional_layer':
                model.add(tf.keras.layers.Conv1D(indiv['num_filters'],
                                                 indiv['kernel_size'],
                                                 kernel_initializer=indiv['conv_init_mode'],
                                                 activation=indiv['conv_activation_func'],
                                                 kernel_constraint=tf.keras.constraints.max_norm(
                                                     indiv['conv_weight_constraint']),
                                                 data_format='channels_first'))
            elif layer == 'dense_layer':
                model.add(tf.keras.layers.Dense(indiv['neurons'],
                                                kernel_initializer=indiv['dense_init_mode'],
                                                activation=indiv['dense_activation_func'],
                                                kernel_constraint=tf.keras.constraints.max_norm(
                                                    indiv['dense_weight_constraint'])))
            elif layer == 'dropout_layer':
                model.add(tf.keras.layers.Dropout(indiv['dropout_rate']))
            elif layer == 'maxpooling_layer':
                model.add(tf.keras.layers.MaxPooling1D(indiv['pool_size']))
            elif layer == 'global_maxpooling_layer':
                model.add(tf.keras.layers.GlobalMaxPooling1D())
            elif layer == 'output_layer':
                model.add(tf.keras.layers.Dense(1, kernel_initializer=indiv['output_init_mode'], activation='sigmoid'))

        opt = self._build_optimizer(indiv)

        util = utility()
        model.compile(optimizer=opt, loss='binary_crossentropy', metrics=[util.f1_score])

        return model

# Fitness Calculation

In [5]:
import collections
import os
from time import sleep
import gc

# Module-level helper instances used by crossfold(): `util` supplies the
# f1_score metric and the scoring helper, `cnn` builds the model.
util = utility()
cnn = CNN()

def FitnessCalculation(individual, cfold, defaultVal, resultsPath, testing_name):
    """Split an individual into hyperparameters and layer path, then score it.

    The first ``len(defaultVal)`` genes are paired, in order, with the keys of
    ``defaultVal``; the remaining genes are the layer path. Both parts are
    handed to ``crossfold`` and its result is returned.
    """
    gene_count = len(defaultVal)

    indiv = collections.OrderedDict()
    for position, key in enumerate(defaultVal.keys()):
        indiv[key] = individual[position]

    path = individual[gene_count:]

    return crossfold(indiv, path, cfold, resultsPath, testing_name)


def crossfold(indiv, path, fold, resultsPath, testing_name):
    """Train one candidate network on ``fold`` and return its validation F1.

    The model is trained with early stopping while the best checkpoint
    (by ``val_f1_score``) is written to ``<resultsPath><testing_name>.h5``.
    The checkpoint is then reloaded, used to predict the validation set and
    deleted again.

    Args:
        indiv: mapping of hyperparameter name to value; reads ``'output_dim'``,
            ``'epochs'`` and ``'batch_size'`` here and is passed on to the
            model builder.
        path: sequence of layer names for the model builder.
        fold: mapping with ``'embedding_matrix'`` (list indexed 0..3 for sizes
            50/100/200/300), ``'vocab_size'``, ``'maxlen'``, ``'X_train'``,
            ``'y_train'``, ``'X_val'`` and ``'y_val'``.
        resultsPath: prefix of the checkpoint file path.
        testing_name: checkpoint file stem.

    Returns:
        The F1 score of the reloaded model on the validation data.

    Raises:
        ValueError: if ``indiv['output_dim']`` is not 50, 100, 200 or 300.
        RuntimeError: if the checkpoint cannot be loaded after all attempts.
    """
    # Position of each supported embedding size inside fold['embedding_matrix'].
    matrix_slot = {50: 0, 100: 1, 200: 2, 300: 3}
    output_dim = indiv['output_dim']
    if output_dim not in matrix_slot:
        raise ValueError(
            "Unsupported output_dim {!r}; expected one of {}".format(output_dim, sorted(matrix_slot)))
    embedding_mtx = fold['embedding_matrix'][matrix_slot[output_dim]]

    model_path = "{}{}.h5".format(resultsPath, testing_name)

    model = cnn.cnn_model(fold['vocab_size'], fold['maxlen'], embedding_mtx,
                          indiv, path)

    # Stop early when validation F1 stalls and keep only the best model on disk.
    callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_f1_score', mode='max', verbose=False, patience=10),
                 tf.keras.callbacks.ModelCheckpoint(model_path, monitor='val_f1_score', mode='max', verbose=False,
                                                    save_best_only=True)]

    model.fit(fold['X_train'], fold['y_train'], epochs=indiv['epochs'], verbose=False,
              validation_data=(fold['X_val'], fold['y_val']), use_multiprocessing=False,
              batch_size=indiv['batch_size'], callbacks=callbacks)

    # The custom metric must be supplied again when deserialising the model.
    dependencies = {
        'f1_score': util.f1_score
    }

    # Load the saved model, retrying a few times before giving up.
    max_attempts = 4
    retry_delay_seconds = 5
    saved_model = None
    last_error = None
    for attempt in range(max_attempts):
        try:
            saved_model = tf.keras.models.load_model(model_path, custom_objects=dependencies)
            break
        except Exception as e:
            print('An error occurs when loading saved model.')
            last_error = e
            if attempt < max_attempts - 1:
                sleep(retry_delay_seconds)  # wait before trying to load again

    if saved_model is None:
        raise RuntimeError(
            "Could not load saved model {} after {} attempts".format(model_path, max_attempts)
        ) from last_error

    try:
        y_pred = saved_model.predict_classes(fold['X_val'])
    finally:
        # Never leave the temporary checkpoint behind, even if prediction fails.
        os.remove(model_path)

    gc.collect()

    # CNN metrics
    _, _, _, f1Score = util.get_testing_metric(fold['y_val'], y_pred)
    return f1Score


# Genetic Algorithm

In [6]:
import random
from operator import attrgetter
from deap import base
from deap import creator
from deap import tools
import time
import datetime
import math
from scipy.spatial import distance
import itertools


class GeneticAlgorithm:
    """Genetic algorithm over CNN hyperparameters and layer layouts (DEAP based).

    An individual is one flat list: hyperparameter genes first (one per entry
    of ``toolboxes``), followed by layer names. The methods locate the
    boundaries through the layer names ``'embedding_layer'``,
    ``'convolutional_layer'``, ``'global_maxpooling_layer'`` and
    ``'output_layer'``, so each individual must contain them.
    """

    __slots__ = (
        "toolbox", "toolboxes", "cross_rate", "mut_rate", "n_pop", "n_gen", "resultsPath", "testing_name", "cfold",
        "globalparameters", "defaultVal", "path_ind", "max_conv_idx", "max_maxpooling_idx",
        "max_dense_idx", "max_dropout_idx")

    def __init__(self, toolbox, toolboxes, cross_rate, mut_rate, n_pop, n_gen, resultsPath, testing_name,
                 cfold, globalparameters, defaultVal, path_ind, max_conv_idx, max_maxpooling_idx,
                 max_dense_idx, max_dropout_idx):
        """Store the configuration; no work is done until :meth:`runGA` is called."""
        self.toolbox = toolbox
        self.toolboxes = toolboxes
        self.cross_rate = cross_rate
        self.mut_rate = mut_rate
        self.n_pop = n_pop
        self.n_gen = n_gen
        self.resultsPath = resultsPath
        self.testing_name = testing_name
        self.cfold = cfold
        self.globalparameters = globalparameters
        self.defaultVal = defaultVal
        self.path_ind = path_ind
        self.max_conv_idx = max_conv_idx
        self.max_maxpooling_idx = max_maxpooling_idx
        self.max_dense_idx = max_dense_idx
        self.max_dropout_idx = max_dropout_idx

    @staticmethod
    def _is_unset(gene):
        """Return True for gene values treated as "missing": 0, '', 'False' or None."""
        return gene == 0 or gene == '' or gene == 'False' or gene is None

    def fitnessCalc(self, individual):
        """Return the fitness of ``individual`` as a 1-tuple.

        An individual that already carries fitness values is not re-evaluated.
        Otherwise, hyperparameter genes that are 0, '', 'False' or None are
        first replaced in place by the matching value from ``defaultVal``, and
        the individual is scored with ``FitnessCalculation``.
        """
        if len(individual.fitness.values) == 0:
            if any(self._is_unset(gene) for gene in individual):
                for i, param in enumerate(self.defaultVal):
                    if self._is_unset(individual[i]):
                        individual[i] = self.defaultVal[param]

            fc = FitnessCalculation(individual, self.cfold, self.defaultVal, self.resultsPath, self.testing_name)
        else:
            fc = individual.fitness.values[0]
        print('{} {}'.format(datetime.datetime.now(), fc))
        return fc,

    def write_result(self):
        """Append the CSV header lines to the statistics and last-population files."""
        param_names = [str(param) for param in self.defaultVal]

        # Create Testing Results
        with open("{}{}.csv".format(self.resultsPath, self.testing_name), "a+") as f:
            f.write(",".join(["i,min,max,mean,std,avgdistance,time,CR,MR"] + param_names) + "\n")

        # Create Last Population file
        with open("{}{}lastpop.csv".format(self.resultsPath, self.testing_name), 'a+') as f:
            f.write(",".join(["i,f1score"] + param_names) + "\n")

    def std_calc(self, fits, length):
        """Return ``(mean, std)`` of ``fits``, where ``length`` is the divisor.

        The standard deviation is the population form
        ``sqrt(|E[x^2] - mean^2|)``; the absolute value guards against small
        negative results from rounding.
        """
        mean = sum(fits) / length
        sum2 = sum(x * x for x in fits)
        std = abs(sum2 / length - mean ** 2) ** 0.5

        return mean, std

    def distance_calc(self, pop):
        """Return the mean pairwise Hamming distance between hyperparameter genes.

        Only the genes before ``'embedding_layer'`` are compared. A population
        with fewer than two individuals has no pairs and yields ``0.0``
        (this case used to raise ``ZeroDivisionError``).
        """
        distances = [
            distance.hamming(first[0:first.index('embedding_layer')],
                             second[0:second.index('embedding_layer')])
            for first, second in itertools.combinations(pop, 2)
        ]
        if not distances:
            return 0.0

        return sum(distances) / len(distances)

    def invalid_fitness_calc(self, pop):
        """Evaluate, in place, every individual of ``pop`` whose fitness is invalid."""
        invalid_ind = [ind for ind in pop if not ind.fitness.valid]
        fitnesses = map(self.toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit

    def mutHyperparam(self, individual, indpb):
        """Mutate ``individual`` in place and return it as a 1-tuple.

        Hyperparameter genes: each gene ``i`` is, with probability ``indpb``,
        replaced by calling ``toolboxes[i].func`` with its first one or two
        stored positional arguments.

        Layer genes (from the first ``'convolutional_layer'`` up to, but not
        including, ``'output_layer'``): every position except the first is,
        with probability ``indpb``, considered for a structural change.
        Pooling and dropout positions are skipped. For the others a coin flip
        decides between removing the layer and inserting a copy:

        * a convolutional layer is removed only when a max-pooling layer
          follows it, and the pair is removed together;
        * a convolutional layer is duplicated together with a new max-pooling
          layer;
        * a dense layer is removed or duplicated on its own.
        """
        # Mutation for the Hyperparameter Chromosomes
        for i in range(len(self.toolboxes)):
            if random.random() < indpb:
                gene_factory = self.toolboxes[i]
                if len(gene_factory.args) == 1:
                    individual[i] = gene_factory.func(gene_factory.args[0])
                else:
                    individual[i] = gene_factory.func(gene_factory.args[0], gene_factory.args[1])

        # Mutation for the Architecture Chromosomes
        archChrom = individual[individual.index('convolutional_layer'):individual.index('output_layer')]
        # `size` bounds the positions still eligible; it shrinks on removal but
        # deliberately does not grow on insertion, so new layers are not revisited.
        size = len(archChrom)

        for i in range(1, size):
            if random.random() < indpb:
                if i >= size:
                    break

                if archChrom[i] in ('global_maxpooling_layer', 'maxpooling_layer', 'dropout_layer'):
                    continue

                selectMutType = random.randint(0, 1)
                # Remove the layer
                if selectMutType == 0:
                    if (archChrom[i] == 'convolutional_layer' and i + 1 < len(archChrom)
                            and archChrom[i + 1] == 'maxpooling_layer'):
                        # Delete by position; list.remove() would drop the first
                        # equal entry instead of the one at index i.
                        del archChrom[i:i + 2]
                        size -= 2
                    elif archChrom[i] == 'dense_layer':
                        del archChrom[i]
                        size -= 1

                # Add a layer
                elif selectMutType == 1:
                    if archChrom[i] == 'convolutional_layer':
                        archChrom.insert(i, 'convolutional_layer')
                        archChrom.insert(i + 1, 'maxpooling_layer')
                    elif archChrom[i] == 'dense_layer':
                        archChrom.insert(i, 'dense_layer')

        # Write the (possibly resized) layer section back into the individual.
        individual[individual.index('convolutional_layer'):individual.index('output_layer')] = archChrom
        return individual,

    def cxTwoPoint(self, ind1, ind2, pop, offspring):
        """Recombine ``ind1`` and ``ind2`` in place and return them.

        One of three operators is picked at random for the genes before
        ``'embedding_layer'``: one-point, two-point or uniform crossover (the
        uniform variant swaps each gene with probability ``cross_rate``).
        Note that the one-point variant swaps the whole tail, layer genes
        included. Afterwards the tails starting at each individual's
        ``'global_maxpooling_layer'`` are exchanged.

        ``pop`` and ``offspring`` are accepted for interface compatibility and
        are not used.
        """
        # Crossover for hyperparameter chromosomes
        size = ind1.index('embedding_layer')
        selectCxType = random.randint(0, 2)
        # One point crossover
        if selectCxType == 0:
            cxpoint = random.randint(1, size - 1)
            ind1[cxpoint:], ind2[cxpoint:] = ind2[cxpoint:], ind1[cxpoint:]
        # Two-point crossover
        elif selectCxType == 1:
            cxpoint1 = random.randint(1, size - 1)
            cxpoint2 = random.randint(1, size - 1)
            if cxpoint2 >= cxpoint1:
                cxpoint2 += 1
            else:  # Swap the two cx points
                cxpoint1, cxpoint2 = cxpoint2, cxpoint1

            ind1[cxpoint1:cxpoint2], ind2[cxpoint1:cxpoint2] \
                = ind2[cxpoint1:cxpoint2], ind1[cxpoint1:cxpoint2]
        # Uniform crossover
        elif selectCxType == 2:
            for i in range(size):
                if random.random() < self.cross_rate:
                    ind1[i], ind2[i] = ind2[i], ind1[i]

        # Crossover for architecture chromosomes
        # One-cut point crossover from the Global MaxPooling layer
        cxpoint1 = ind1.index('global_maxpooling_layer')
        cxpoint2 = ind2.index('global_maxpooling_layer')
        ind1[cxpoint1:], ind2[cxpoint2:] = ind2[cxpoint2:], ind1[cxpoint1:]

        return ind1, ind2

    def runGA(self, lastPop=None, lastFitnesses=None):
        """Run the evolution for ``n_gen`` generations and log results to CSV.

        Args:
            lastPop: optional sequence of hyperparameter gene lists used to
                overwrite the freshly generated genes (resuming a run).
            lastFitnesses: optional fitness tuples matching the population; if
                given, the initial population is not evaluated.

        Each generation clones the current population as offspring, applies
        crossover to adjacent pairs and mutation to individuals, evaluates the
        changed ones and keeps the ``n_pop`` best of parents plus offspring.
        One statistics row is appended to ``<resultsPath><testing_name>.csv``
        and one row per individual to ``<resultsPath><testing_name>lastpop.csv``.
        """
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMax)

        self.toolbox.register("individual", tools.initCycle, creator.Individual,
                              self.toolboxes, n=1)
        self.toolbox.register("population", tools.initRepeat, list, self.toolbox.individual)
        self.toolbox.register("evaluate", self.fitnessCalc)
        self.toolbox.register("mate", self.cxTwoPoint)
        self.toolbox.register("mutate", self.mutHyperparam, indpb=self.mut_rate)
        self.toolbox.register("select", tools.selBest)

        pop = self.toolbox.population(n=self.n_pop)

        # Attach the layer path to every individual (after optionally restoring
        # its hyperparameter genes from a previous run).
        for idx, ind in enumerate(pop):
            if lastPop:
                ind[:] = lastPop[idx]
            ind.extend(self.path_ind[idx])

        if lastFitnesses:
            # Fitnesses from previous population
            fitnesses = lastFitnesses
        else:
            # Evaluate the entire population
            fitnesses = list(map(self.toolbox.evaluate, pop))

        for ind, fit in zip(pop, fitnesses):
            ind.fitness.values = fit

        self.write_result()

        stats_file = "{}{}.csv".format(self.resultsPath, self.testing_name)
        lastpop_file = "{}{}lastpop.csv".format(self.resultsPath, self.testing_name)

        for g in range(1, self.n_gen + 1):
            then = time.time()
            print('{} {}'.format(datetime.datetime.now(), "-- Generation %i --" % g))

            # Select the next generation individuals
            offspring = self.toolbox.select(pop, len(pop))
            # Clone the selected individuals
            offspring = list(map(self.toolbox.clone, offspring))

            # Apply crossover and mutation on the offspring
            for child1, child2 in zip(offspring[::2], offspring[1::2]):
                if random.random() < self.cross_rate:
                    self.toolbox.mate(child1, child2, pop, offspring)
                    del child1.fitness.values
                    del child2.fitness.values

            for mutant in offspring:
                if random.random() < self.mut_rate:
                    self.toolbox.mutate(mutant)
                    del mutant.fitness.values

            # Evaluate the individuals with an invalid fitness
            self.invalid_fitness_calc(offspring)

            pop[:] = self.toolbox.select(pop + offspring, self.n_pop)

            # Gather all the fitnesses in one list and print the stats
            fits = [ind.fitness.values[0] for ind in pop]

            length = len(pop)
            mean, std = self.std_calc(fits, length)
            avgDistance = self.distance_calc(pop)
            best = max(pop, key=attrgetter("fitness"))
            print('{} {}'.format(datetime.datetime.now(), "  Min %s" % min(fits)))
            print('{} {}'.format(datetime.datetime.now(), "  Max %s" % max(fits)))
            print('{} {}'.format(datetime.datetime.now(), "  Avg %s" % mean))
            print('{} {}'.format(datetime.datetime.now(), "  Std %s" % std))
            print('{} {}'.format(datetime.datetime.now(), "  AvgDistance %s" % avgDistance))
            print('{} {}'.format(datetime.datetime.now(), best))

            now = time.time()
            diff = now - then

            # save testing data
            with open(stats_file, 'a') as f:
                text = "{0},{1},{2},{3},{4},{5},{6},{7},{8}".format(g, min(fits), max(fits), mean, std, avgDistance,
                                                                    diff, self.cross_rate, self.mut_rate)
                for param in best:
                    text = "{},{}".format(text, param)
                f.write("{}\n".format(text))

            # save last population data
            with open(lastpop_file, 'a') as f:
                for ind in pop:
                    text = "{0},{1}".format(g, ind.fitness.values[0])
                    for param in ind:
                        text = "{},{}".format(text, param)
                    f.write("{}\n".format(text))

# Project path

In [7]:
# Base directory that every dataset, result and embedding path is derived from.
root_path = '/lab/dbms/fatyanosa'

# File names of the training data and of a previously saved population.
training_path = 'trainPreprocessed.csv'
population_path = 'NewPop.csv'

# Stem used for the files written during this experiment.
testing_name = "Experiment4_GA_CNN"

# Directories (with trailing slash) for input data and experiment output.
datasetPath = '{}/Dataset/Disaster Tweets/'.format(root_path)
resultsPath = '{}/Server2/Disaster Tweets/Results/'.format(root_path)

# GloVe vector files, ordered by dimensionality: 50, 100, 200, 300.
glovePath = [
    template.format(root_path)
    for template in (
        '{}/Glove/glove.6B.50d.txt',
        '{}/Glove/glove.6B.100d.txt',
        '{}/Glove/glove.6B.200d.txt',
        '{}/Glove/glove.6B.300d.txt',
    )
]

# Parameters

In [8]:
# cross_rate: probability with which two individuals are crossed
# mut_rate:   probability for mutating an individual
cross_rate, mut_rate = 0.8, 0.2

# n_pop: number of individuals in the population
# n_gen: number of generations to run
n_pop = n_gen = 30

# Main Program

In [None]:
import random
from sklearn.model_selection import StratifiedKFold
from deap import base
import warnings; warnings.simplefilter('ignore')

if __name__ == '__main__':
    # Driver for the GA hyperparameter search: declare the search space,
    # register one generator per hyperparameter, load and split the data,
    # then hand everything to GeneticAlgorithm and run it.

    # Kept local to this block so the block stays self-contained.
    from sklearn.model_selection import train_test_split

    # Option lists used by both the convolutional and the dense entries below.
    activation_funcs = ['relu', 'softmax', 'elu', 'selu',
                        'softplus', 'softsign', 'tanh',
                        'sigmoid', 'hard_sigmoid', 'linear']
    init_modes = ['zeros', 'ones', 'uniform', 'normal',
                  'glorot_normal', 'glorot_uniform',
                  'he_normal', 'he_uniform',
                  'lecun_normal', 'lecun_uniform']

    # Search space, one tuple per hyperparameter:
    #   (name, random.randint, low, high) -> integer with low <= N <= high
    #   (name, random.uniform, low, high) -> float between low and high
    #   (name, random.choice, options)    -> one element of options
    # A single-element options list pins that hyperparameter to one value.
    # Each conv/dense entry gets its own copy of the shared option lists so
    # that no two entries alias the same list object.
    globalparameters = [
        ("epochs", random.randint, 1, 100),
        ("batch_size", random.randint, 32, 256),
        ("optimizer", random.choice, ['sgd', 'rmsprop',
                                      'adagrad', 'adadelta',
                                      'adam', 'adamax',
                                      'nadam']),
        ("learning_rate", random.uniform, 1e-4, 1e-2),
        ("momentum", random.choice, [0.9]),
        ("output_init_mode", random.choice, ['glorot_uniform']),
        ("output_dim", random.choice, [100]),
        ("num_filters", random.randint, 32, 512),
        ("kernel_size", random.randint, 1, 5),
        ("conv_activation_func", random.choice, list(activation_funcs)),
        ("conv_init_mode", random.choice, list(init_modes)),
        ("conv_weight_constraint", random.randint, 1, 5),
        ("neurons", random.randint, 1, 30),
        ("dense_activation_func", random.choice, list(activation_funcs)),
        ("dense_init_mode", random.choice, list(init_modes)),
        ("dense_weight_constraint", random.randint, 1, 5),
        ("pool_size", random.choice, [5]),
        ("dropout_rate", random.choice, [0.2]),
    ]

    # One default per hyperparameter, using the same names and the same order
    # as globalparameters. How GeneticAlgorithm consumes these is not visible
    # in this cell.
    defaultVal = collections.OrderedDict([
        ("epochs", 10),
        ("batch_size", 32),
        ("optimizer", "adam"),
        ("learning_rate", 1e-4),
        ("momentum", 0.9),
        ("output_init_mode", "glorot_uniform"),
        ("output_dim", 100),
        ('num_filters', 64),
        ('kernel_size', 3),
        ('conv_activation_func', "relu"),
        ('conv_init_mode', "glorot_uniform"),
        ('conv_weight_constraint', 3),
        ('neurons', 1),
        ('dense_activation_func', "relu"),
        ('dense_init_mode', "glorot_uniform"),
        ('dense_weight_constraint', 3),
        ('pool_size', 5),
        ('dropout_rate', 0.2)]
    )

    # object class
    util = utility()
    toolbox = base.Toolbox()

    # Attribute generators: register each hyperparameter's sampler under its
    # own name, then collect the registered callables in the same order as
    # globalparameters. Deriving the list from globalparameters keeps the two
    # in step when a hyperparameter is added, removed or reordered.
    toolboxes = []
    for hyper in globalparameters:
        toolbox.register(hyper[0], hyper[1], *hyper[2:])
        toolboxes.append(getattr(toolbox, hyper[0]))

    path_ind, max_conv_idx, max_maxpooling_idx, max_dense_idx, max_dropout_idx = generateFSM(n_pop)

    # Alternative start, kept for reference: resume from a saved population
    # instead of generating a new one. When enabling it, also switch the final
    # call to the commented-out ga.runGA(population, fitnesses).
#     # Read population data
#     dfPopulation = util.read_CSV("{}{}".format(resultsPath, population_path))

#     path_ind, fitnesses, max_conv_idx, max_maxpooling_idx, max_dense_idx, max_dropout_idx = openFSM(dfPopulation)
#     dfPopulation = dfPopulation.drop(columns=[col for col in dfPopulation if col not in defaultVal])

#     population = dfPopulation.loc[:, ~dfPopulation.columns.str.match('Unnamed')].values.tolist()

    # Read data
    df = util.read_CSV("{}{}".format(datasetPath, training_path))

    # Hold out 30% of the rows as the trial set; the fixed random_state makes
    # the split repeatable.
    dfTraining, dfTrial = train_test_split(df, test_size = 0.3, random_state=42)

    textsTraining, labelsTraining = util.get_text_label(dfTraining)
    textsTrial, labelsTrial = util.get_text_label(dfTrial)

    X_train, X_val, y_train, y_val, vocab_size, maxlen, embedding_matrix = util.get_training_trial_data(
        textsTraining, labelsTraining, textsTrial, labelsTrial, glovePath)

    # Bundle of data handed to the GA.
    cfold = {'X_train': X_train, 'X_val': X_val, 'y_train': y_train, 'y_val': y_val, 'vocab_size': vocab_size,
             'maxlen': maxlen, 'embedding_matrix': embedding_matrix}

    ga = GeneticAlgorithm(toolbox, toolboxes, cross_rate, mut_rate, n_pop, n_gen, resultsPath, testing_name,
                          cfold, globalparameters, defaultVal, path_ind, max_conv_idx, max_maxpooling_idx, max_dense_idx, max_dropout_idx)
    ga.runGA()
#     ga.runGA(population, fitnesses)


2020-04-02 18:42:55.444801 -- Generation 14 --
2020-04-02 18:43:21.317979 0.7682191780821918
2020-04-02 18:43:34.069998 0.7782561894510225
2020-04-02 18:43:49.810592 0.7810140237324704
2020-04-02 18:44:06.839462 0.7736147757255936
2020-04-02 18:44:25.402440 0.7823275862068966
2020-04-02 18:44:47.436378 0.7801268498942917
2020-04-02 18:45:01.378564 0.776470588235294
2020-04-02 18:45:14.016359 0.7806970509383377
2020-04-02 18:45:34.767616 0.7821409359870899
2020-04-02 18:45:45.540955 0.7726775956284153
2020-04-02 18:46:07.510240 0.7828389830508475
2020-04-02 18:46:22.340169 0.783695652173913
2020-04-02 18:46:25.615127 0.0082389289392379
2020-04-02 18:46:47.316837 0.7881981032665965
2020-04-02 18:47:09.232486 0.7807999999999999
2020-04-02 18:47:16.314408 0.0
2020-04-02 18:47:22.803430 0.46724546172059983
2020-04-02 18:47:39.476215 0.7845244492208491
2020-04-02 18:47:48.574479 0.7744401966138722
2020-04-02 18:47:58.918151 0.7737909516380655
2020-04-02 18:47:58.940100   Min 0.785368478
2020

2020-04-02 19:18:14.152323 0.7776628748707342
2020-04-02 19:18:30.359974 0.7835703001579779
2020-04-02 19:18:47.916617 0.7792768483540204
2020-04-02 19:19:09.228322 0.7872913301023154
2020-04-02 19:19:30.910957 0.7837415320479416
2020-04-02 19:19:48.539988 0.7857528332433891
2020-04-02 19:20:05.539212 0.7792768483540204
2020-04-02 19:20:25.137631 0.7776584317937703
2020-04-02 19:20:49.485356 0.7776572668112799
2020-04-02 19:21:06.792987 0.7864184008762322
2020-04-02 19:21:31.432003 0.7798213347346296
2020-04-02 19:21:56.161988 0.7774261603375526
2020-04-02 19:22:17.746921 0.7797522886375875
2020-04-02 19:22:41.802626 0.7789363920750781
2020-04-02 19:23:07.317911 0.783625730994152
2020-04-02 19:23:29.814680 0.7836822329575953
2020-04-02 19:23:52.691594 0.7861333333333332
2020-04-02 19:24:14.680967 0.780539397144368
2020-04-02 19:24:33.902023 0.7789585547290117
2020-04-02 19:24:57.913857 0.7866738312735088
2020-04-02 19:25:15.604254 0.7771245323356495
2020-04-02 19:25:33.021901 0.7803108

2020-04-02 20:05:52.836937 0.7837837837837837
2020-04-02 20:06:16.976586 0.7836822329575953
2020-04-02 20:06:44.534540 0.7797913234486545
2020-04-02 20:07:05.221648 0.7793281653746771
2020-04-02 20:07:05.234424   Min 0.7873154729360305
2020-04-02 20:07:05.234477   Max 0.7902439024390244
2020-04-02 20:07:05.234496   Avg 0.7885244190095724
2020-04-02 20:07:05.234510   Std 0.000883605651637079
2020-04-02 20:07:05.234526   AvgDistance 0.05708812260536427
2020-04-02 20:07:05.234540 [41, 219, 'adamax', 0.0063873, 0.9, 'glorot_uniform', 100, 490, 2, 'hard_sigmoid', 'uniform', 3, 24, 'softmax', 'he_uniform', 1, 5, 0.2, 'embedding_layer', 'convolutional_layer', 'global_maxpooling_layer', 'dense_layer', 'dropout_layer', 'output_layer']
2020-04-02 20:07:05.237091 -- Generation 24 --
2020-04-02 20:07:31.484109 0.7791780821917808
2020-04-02 20:08:00.441065 0.7821782178217822
2020-04-02 20:08:21.653264 0.7732772225144662
2020-04-02 20:08:44.175782 0.7793791574279378
2020-04-02 20:09:09.442753 0.7787

2020-04-02 20:56:37.731618 0.7752021563342318
2020-04-02 20:56:37.744468   Min 0.788806758183738
2020-04-02 20:56:37.744520   Max 0.7947165657677491
2020-04-02 20:56:37.744815   Avg 0.7902627524704595
2020-04-02 20:56:37.744833   Std 0.001580260034273206
2020-04-02 20:56:37.744849   AvgDistance 0.07854406130268235
2020-04-02 20:56:37.744864 [41, 219, 'adamax', 0.0063873, 0.9, 'glorot_uniform', 100, 490, 2, 'tanh', 'uniform', 3, 24, 'softmax', 'he_uniform', 2, 5, 0.2, 'embedding_layer', 'convolutional_layer', 'global_maxpooling_layer', 'dense_layer', 'dropout_layer', 'output_layer']
2020-04-02 20:56:37.747114 -- Generation 29 --
2020-04-02 20:57:00.843924 0.7788259958071279
2020-04-02 20:57:25.865476 0.7822878228782287
2020-04-02 20:57:47.971183 0.7811340752517223
2020-04-02 20:58:18.855384 0.7765843179377016
2020-04-02 20:58:35.495810 0.0
2020-04-02 20:59:04.999320 0.7787056367432151
2020-04-02 20:59:35.922626 0.7826541274817137
2020-04-02 21:00:06.603178 0.788368336025848
2020-04-02 2