# Utility

In [8]:
class utility:
    """Helpers for loading labelled text, turning it into padded token
    sequences, and scoring predictions.

    The methods keep no instance state; the class is only a namespace.
    """

    def append_df_to_excel(self, df, excel_path):
        """Append the rows of ``df`` to the workbook at ``excel_path``.

        If the file does not exist yet it is created from ``df`` alone.
        The index is never written.
        """
        # NOTE(review): `path` is not defined in this cell; presumably
        # `from os import path` in an earlier cell - confirm.
        if path.isfile(excel_path):
            existing = pd.read_excel(excel_path)
            df = pd.concat([existing, df], ignore_index=True)
        df.to_excel(excel_path, index=False)

    def read_CSV(self, filename):
        """Read ``filename`` into a DataFrame using the 'unicode_escape' codec."""
        return pd.read_csv(filename, encoding='unicode_escape')

    def get_text_label(self, df):
        """Return ``(texts, labels)`` taken from the 'text' and 'sentiment' columns.

        Float entries in 'text' (e.g. NaN for a missing tweet) are converted
        with ``str`` so that every returned text is a string-like value.
        """
        # Column-wise iteration avoids building one Series per row the way
        # iterrows does, and keeps the label values in their column dtype.
        texts = [
            str(text) if isinstance(text, float) else text
            for text in df['text']
        ]
        labels = df['sentiment'].tolist()
        return texts, labels

    def tokenize_texts(self, texts):
        """Fit and return a Keras ``Tokenizer`` (``num_words=10000``) on ``texts``."""
        tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
        tokenizer.fit_on_texts(texts)
        return tokenizer

    def padding_texts(self, texts, maxlen):
        """Pad the integer sequences in ``texts`` at the end up to ``maxlen``."""
        return tf.keras.preprocessing.sequence.pad_sequences(
            texts, padding='post', maxlen=maxlen)

    def get_testing_metric(self, y_test, y_pred):
        """Return ``(accuracy, precision, recall, f1)`` for the predictions.

        The scikit-learn scorers are called with their default arguments.
        """
        accuracyScore = accuracy_score(y_test, y_pred)
        precisionScore = precision_score(y_test, y_pred)
        recallScore = recall_score(y_test, y_pred)
        f1Score = f1_score(y_test, y_pred)
        return accuracyScore, precisionScore, recallScore, f1Score

    def write_df_csv(self, df, out_path):
        """Write ``df`` to ``out_path`` as CSV without the index."""
        df.to_csv(out_path, index=False)

    def create_embedding_matrix(self, filepath, word_index, embedding_dim):
        """Build an embedding matrix from a whitespace-separated vectors file.

        Each non-blank line of ``filepath`` is ``word v1 v2 ...``. Row
        ``word_index[word]`` of the result receives the first
        ``embedding_dim`` components; rows for words absent from the file
        stay zero.

        Returns:
            Array of shape ``(len(word_index) + 1, embedding_dim)``.
        """
        vocab_size = len(word_index) + 1  # index 0 is reserved
        embedding_matrix = np.zeros((vocab_size, embedding_dim))

        with open(filepath, encoding="utf8") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # A blank line (e.g. a stray trailing newline) used to
                    # abort the whole load with an unpacking error.
                    continue
                word, *vector = fields
                if word in word_index:
                    embedding_matrix[word_index[word]] = np.array(
                        vector, dtype=np.float32)[:embedding_dim]

        return embedding_matrix

    def get_max_length_of_sentences(self, texts):
        """Return the largest whitespace-token count among ``texts`` (0 if empty)."""
        return max((len(text.split()) for text in texts), default=0)

    def _texts_to_padded(self, tokenizer, texts, maxlen):
        """Encode ``texts`` with ``tokenizer`` and pad the result to ``maxlen``."""
        return self.padding_texts(tokenizer.texts_to_sequences(texts), maxlen)

    def get_training_trial_data(self, textsTraining, textsTrial, labelsTraining, labelsTrial):
        """Encode a training and a held-out split with one shared tokenizer.

        The tokenizer and the padding length are derived from the training
        texts only and then applied to both splits.

        Returns:
            ``(X_train, X_val, y_train, y_val)``.
        """
        textsTraining = np.asarray(textsTraining)
        textsTrial = np.asarray(textsTrial)
        y_train = np.asarray(labelsTraining)
        y_val = np.asarray(labelsTrial)

        tokenizer = self.tokenize_texts(textsTraining)
        # NOTE(review): maxlen counts whitespace-separated words, which may
        # differ from the tokenizer's own token count - confirm this is intended.
        maxlen = self.get_max_length_of_sentences(textsTraining)

        X_train = self._texts_to_padded(tokenizer, textsTraining, maxlen)
        X_val = self._texts_to_padded(tokenizer, textsTrial, maxlen)

        return X_train, X_val, y_train, y_val

    def get_X_Y_data(self, textsTraining, labelsTraining):
        """Encode a single split; returns ``(X_train, y_train)``.

        Tokenizer and padding length are both derived from ``textsTraining``.
        """
        textsTraining = np.asarray(textsTraining)
        y_train = np.asarray(labelsTraining)

        tokenizer = self.tokenize_texts(textsTraining)
        maxlen = self.get_max_length_of_sentences(textsTraining)
        X_train = self._texts_to_padded(tokenizer, textsTraining, maxlen)

        return X_train, y_train

    def Average(self, list):
        """Return the arithmetic mean of ``list``.

        Raises:
            ZeroDivisionError: if ``list`` is empty.
        """
        # The parameter name shadows the builtin but is kept so keyword
        # callers keep working.
        return sum(list) / len(list)

    # The three metrics below operate on backend tensors through `K`
    # (presumably the Keras backend module imported in an earlier cell).

    def recall_m(self, y_true, y_pred):
        """Recall: rounded true positives over rounded actual positives."""
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        # epsilon keeps the ratio finite when there are no positives
        return true_positives / (possible_positives + K.epsilon())

    def precision_m(self, y_true, y_pred):
        """Precision: rounded true positives over rounded predicted positives."""
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        # epsilon keeps the ratio finite when nothing is predicted positive
        return true_positives / (predicted_positives + K.epsilon())

    def f1_m(self, y_true, y_pred):
        """F1: harmonic mean of ``precision_m`` and ``recall_m``."""
        precision = self.precision_m(y_true, y_pred)
        recall = self.recall_m(y_true, y_pred)
        return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [9]:
import tensorflow as tf


class CNN:
    """Builds a compiled Keras ``Sequential`` text classifier from a
    hyper-parameter dictionary and an ordered list of layer names."""

    def cnn_model(self, vocab_size, maxlen, embedding_matrix, indiv, path):
        """Assemble and compile the model described by ``path`` and ``indiv``.

        Args:
            vocab_size: ``input_dim`` of the embedding layer.
            maxlen: ``input_length`` of the embedding layer.
            embedding_matrix: initial weights of the embedding layer.
            indiv: mapping of hyper-parameters. Per-layer keys carry a
                1-based suffix counting layers of that kind in ``path``
                (e.g. ``'num_filters1'``, ``'neurons2'``).
            path: iterable of layer names, added in order. Names other than
                the ones handled below are ignored.

        Returns:
            The model compiled with binary cross-entropy loss and the
            ``utility.f1_m`` metric.

        Raises:
            ValueError: if ``indiv['optimizer']`` is not a supported name.
        """
        model = tf.keras.models.Sequential()
        conv_idx = dense_idx = dropout_idx = maxpooling_idx = 0
        for layer in path:
            if layer == 'embedding_layer':
                model.add(
                    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=indiv['output_dim'],
                                     weights=[embedding_matrix], input_length=maxlen, trainable=True))
            elif layer == 'convolutional_layer':
                conv_idx += 1
                # NOTE(review): 'channels_first' directly after an Embedding
                # layer looks unusual - confirm it is intended.
                model.add(tf.keras.layers.Conv1D(indiv['num_filters' + str(conv_idx)], indiv['kernel_size' + str(conv_idx)],
                                        kernel_initializer=indiv['conv_init_mode' + str(conv_idx)],
                                        activation=indiv['conv_activation_func' + str(conv_idx)],
                                        kernel_constraint=tf.keras.constraints.max_norm(indiv['conv_weight_constraint' + str(conv_idx)]),
                                        data_format='channels_first'))
            elif layer == 'dense_layer':
                dense_idx += 1
                model.add(tf.keras.layers.Dense(indiv['neurons' + str(dense_idx)],
                                       kernel_initializer=indiv['dense_init_mode' + str(dense_idx)],
                                       activation=indiv['dense_activation_func' + str(dense_idx)],
                                       kernel_constraint=tf.keras.constraints.max_norm(indiv['dense_weight_constraint' + str(dense_idx)])))
            elif layer == 'dropout_layer':
                dropout_idx += 1
                model.add(tf.keras.layers.Dropout(indiv['dropout_rate' + str(dropout_idx)]))
            elif layer == 'maxpooling_layer':
                maxpooling_idx += 1
                model.add(tf.keras.layers.MaxPooling1D(indiv['pool_size' + str(maxpooling_idx)]))
            elif layer == 'global_maxpooling_layer':
                model.add(tf.keras.layers.GlobalMaxPooling1D())
            elif layer == 'output_layer':
                # single sigmoid unit: binary classification head
                model.add(tf.keras.layers.Dense(1, kernel_initializer=indiv['output_init_mode'], activation='sigmoid'))

        optimizer_name = indiv['optimizer']
        if optimizer_name == 'sgd':
            opt = tf.keras.optimizers.SGD(lr=indiv['learning_rate'], momentum=indiv['momentum'], decay=0.0,
                                 nesterov=False)
        elif optimizer_name == 'rmsprop':
            opt = tf.keras.optimizers.RMSprop(lr=indiv['learning_rate'], rho=0.9, epsilon=None, decay=0.0)
        elif optimizer_name == 'adagrad':
            opt = tf.keras.optimizers.Adagrad(lr=indiv['learning_rate'], epsilon=None, decay=0.0)
        elif optimizer_name == 'adadelta':
            opt = tf.keras.optimizers.Adadelta(lr=indiv['learning_rate'], rho=0.95, epsilon=None, decay=0.0)
        elif optimizer_name == 'adam':
            opt = tf.keras.optimizers.Adam(lr=indiv['learning_rate'], beta_1=0.9, beta_2=0.999, epsilon=None,
                                  decay=0.0, amsgrad=False)
        elif optimizer_name == 'adamax':
            opt = tf.keras.optimizers.Adamax(lr=indiv['learning_rate'], beta_1=0.9, beta_2=0.999, epsilon=None,
                                    decay=0.0)
        elif optimizer_name == 'nadam':
            opt = tf.keras.optimizers.Nadam(lr=indiv['learning_rate'], beta_1=0.9, beta_2=0.999, epsilon=None,
                                   schedule_decay=0.004)
        else:
            # Without this branch `opt` stays unbound and compile() fails
            # with an error that does not mention the bad optimizer name.
            raise ValueError('Unsupported optimizer: {!r}'.format(optimizer_name))

        util = utility()
        model.compile(optimizer=opt, loss='binary_crossentropy', metrics=[util.f1_m])

        return model

In [10]:
# File names of the two dataset splits, relative to datasetPath.
training_path = 'trainval.csv'
testing_path = 'test.csv'

# Directory layout: everything lives below root_path.
root_path = '/lab/dbms/fatyanosa'
datasetPath = f'{root_path}/Dataset/Twitter US Airline Sentiment/'
resultsPath = f'{root_path}/Server1/Twitter US Airline Sentiment/Paper DGGA-CNN/Results/'

# Base name (without extension) of the results CSV.
testing_name = "TPOT_test"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [11]:
# import random
# from sklearn.model_selection import StratifiedKFold
# from deap import base
# import warnings; warnings.simplefilter('ignore')
# import timeit

# if __name__ == '__main__':
#     util = utility()
#     # Read data
#     dfTraining = util.read_CSV(datasetPath + training_path)
    
#     # Read trial data
#     dfTrial = util.read_CSV(datasetPath + trial_path)

#     textsTraining, labelsTraining = util.get_text_label(dfTraining)
#     textsTrial, labelsTrial = util.get_text_label(dfTrial)
#     cfold = {}

#     X_train, X_val, y_train, y_val, vocab_size, maxlen, embedding_matrix = util.get_training_trial_data(
#         textsTraining, labelsTraining, textsTrial, labelsTrial, glovePath)
                  
#     from tpot import TPOTClassifier
#     tpot = TPOTClassifier(generations=100, population_size=30, mutation_rate=0.2, crossover_rate=0.8, early_stop=10, scoring='f1')
#     start_time = timeit.default_timer()
#     tpot.fit(X_train, y_train)
#     elapsed = timeit.default_timer() - start_time
#     winning_pipe=tpot.fitted_pipeline_
#     score=tpot.score(X_val, y_val) 
#     tpot.export(resultsPath+'tpot_mnist_pipeline.py')
#     elapsed = elapsed/60
#     print('Time:', elapsed)
#     print('Score:', score)   
#     print('Winning pipeline:', winning_pipe)

In [12]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from copy import copy
import time
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
# tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
# features = tpot_data.drop('target', axis=1)
# training_features, testing_features, training_target, testing_target = \
#             train_test_split(features, tpot_data['target'], random_state=None)
# Evaluate the exported gradient-boosting pipeline n_run times on the test
# split and log one CSV row of metrics per run.
util = utility()
n_run = 30

# Read training data
dfTraining = util.read_CSV(datasetPath + training_path)

# Read test data
dfTest = util.read_CSV(datasetPath + testing_path)

textsTraining, labelsTraining = util.get_text_label(dfTraining)
textsTest, labelsTest = util.get_text_label(dfTest)

X_train, X_test, y_train, y_test = util.get_training_trial_data(
    textsTraining, textsTest, labelsTraining, labelsTest)

# Create the results file (truncating any previous one) and write its header.
results_file = resultsPath + testing_name + ".csv"
with open(results_file, "w") as f:
    f.write("i,accuracy,precision,recall,f1Score,time\n")

for i in range(n_run):
    then = time.time()
    exported_pipeline = make_pipeline(
        StandardScaler(),
        GradientBoostingClassifier(learning_rate=0.1, max_depth=8, max_features=0.6000000000000001,
                                   min_samples_leaf=2, min_samples_split=6, n_estimators=100,
                                   subsample=1.0))

    exported_pipeline.fit(X_train, y_train)

    y_pred = exported_pipeline.predict(X_test)

    # Pipeline metrics on the test split
    accuracyScore, precisionScore, recallScore, f1Score = util.get_testing_metric(y_test, y_pred)

    # The timed span covers fitting, prediction and scoring.
    now = time.time()
    diff = now - then
    print(diff)
    print(accuracyScore)

    # Append this run's row immediately so finished runs survive a later
    # failure; `with` closes the handle even if the write raises.
    row = (i + 1, accuracyScore, precisionScore, recallScore, f1Score, diff)
    with open(results_file, 'a') as f:
        f.write(','.join(str(value) for value in row) + '\n')


8.29472041130066
0.6351842241826673
8.304448127746582
0.6335078534031414
8.307457208633423
0.6257098606091895
8.287773370742798
0.6452282157676348
8.285102605819702
0.6378772112382933
8.30013656616211
0.6352819451629592
8.31404185295105
0.6315245478036176
8.350783109664917
0.6315240083507307
8.330127954483032
0.6303219106957424
8.293934345245361
0.6276762402088774
8.28831171989441
0.6285119667013528
8.251240491867065
0.6298802706923478
8.300720930099487
0.6246056782334386
8.347569465637207
0.6238821672803787
8.335811138153076
0.6340956340956341
8.32011365890503
0.6342472840144853
8.307449102401733
0.6378772112382933
8.32282018661499
0.6292834890965732
8.296455144882202
0.6416666666666666
8.334139823913574
0.6244106862231535
8.335028171539307
0.6327800829875518
8.345573663711548
0.6267423851316468
8.364209651947021
0.6384734399174833
8.313401460647583
0.631083202511774
8.327104091644287
0.6350858927641854
8.321603536605835
0.6436420722135009
8.299376010894775
0.6350515463917527
8.306139