In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import timeit
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
import numpy as np
from os import path
import csv



In [2]:
class utility:
    """Helpers for loading the tweet data, vectorising text and scoring models.

    The class holds no state; every method only uses its arguments. It relies
    on the notebook-level imports ``pd``, ``np``, ``tf``, ``path`` and the
    scikit-learn metric functions.
    """

    def append_df_to_excel(self, df, excel_path):
        """Append the rows of ``df`` to the workbook at ``excel_path``.

        If the file does not exist yet it is created from ``df`` alone;
        otherwise the existing sheet is read, concatenated with ``df`` and the
        file is rewritten. The index is never written.
        """
        if path.isfile(excel_path):
            existing = pd.read_excel(excel_path)
            combined = pd.concat([existing, df], ignore_index=True)
            combined.to_excel(excel_path, index=False)
        else:
            df.to_excel(excel_path, index=False)

    def read_CSV(self, filename):
        """Return the CSV file ``filename`` as a DataFrame."""
        return pd.read_csv(filename)

    def get_text_label(self, df):
        """Extract the ``text_cleaned`` and ``target`` columns as two lists.

        Float entries in ``text_cleaned`` (e.g. NaN for an empty cell) are
        converted with ``str`` so that every returned text is a string.

        Returns:
            (texts, labels): two lists with one entry per row of ``df``.
        """
        # Column-wise extraction avoids the per-row Series that iterrows() builds.
        texts = [
            str(text) if isinstance(text, float) else text
            for text in df['text_cleaned'].tolist()
        ]
        labels = df['target'].tolist()
        return texts, labels

    def tokenize_texts(self, texts):
        """Return a Keras ``Tokenizer`` (``num_words=10000``) fitted on ``texts``."""
        tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
        tokenizer.fit_on_texts(texts)
        return tokenizer

    def padding_texts(self, texts, maxlen):
        """Pad the integer sequences ``texts`` to ``maxlen`` with ``padding='post'``."""
        return tf.keras.preprocessing.sequence.pad_sequences(
            texts, padding='post', maxlen=maxlen)

    def get_testing_metric(self, y_test, y_pred):
        """Compute accuracy, precision, recall and F1 for the predictions.

        The scikit-learn metric functions are called with their default
        arguments (presumably suitable for binary labels only; confirm
        against the data).

        Returns:
            (accuracy, precision, recall, f1) as a tuple.
        """
        accuracyScore = accuracy_score(y_test, y_pred)
        precisionScore = precision_score(y_test, y_pred)
        recallScore = recall_score(y_test, y_pred)
        f1Score = f1_score(y_test, y_pred)
        return accuracyScore, precisionScore, recallScore, f1Score

    def write_df_csv(self, df, out_path):
        """Write ``df`` to ``out_path`` as CSV without the index column."""
        df.to_csv(out_path, index=False)

    def create_embedding_matrix(self, filepath, word_index, embedding_dim):
        """Build an embedding matrix from a whitespace-separated vector file.

        Each non-blank line of ``filepath`` is split on whitespace into a word
        followed by its vector components.

        Args:
            filepath: path to the UTF-8 encoded embedding file.
            word_index: mapping from word to its row index in the matrix.
            embedding_dim: number of vector components kept per word.

        Returns:
            Array of shape ``(len(word_index) + 1, embedding_dim)``; rows of
            words that do not appear in the file stay zero.
        """
        vocab_size = len(word_index) + 1  # row 0 is reserved, hence the +1
        embedding_matrix = np.zeros((vocab_size, embedding_dim))

        with open(filepath, encoding="utf8") as f:
            for line in f:
                parts = line.split()
                if not parts:
                    # A blank line has no word to unpack; skip it instead of crashing.
                    continue
                word, vector = parts[0], parts[1:]
                if word in word_index:
                    embedding_matrix[word_index[word]] = np.array(
                        vector, dtype=np.float32)[:embedding_dim]

        return embedding_matrix

    def get_max_length_of_sentences(self, texts):
        """Return the largest whitespace-separated word count in ``texts`` (0 if empty)."""
        return max((len(text.split()) for text in texts), default=0)

    def _texts_to_padded(self, tokenizer, texts, maxlen):
        """Convert ``texts`` to integer sequences with ``tokenizer`` and pad to ``maxlen``."""
        return self.padding_texts(tokenizer.texts_to_sequences(texts), maxlen)

    def get_training_test_data(self, textsTraining, textsTest, labelsTraining, labelsTest):
        """Vectorise separate training and test texts.

        The tokenizer is fitted on the training texts only, and the padding
        length is the longest training text measured in whitespace-separated
        words.

        Returns:
            (X_train, X_test, y_train, y_test) as arrays.
        """
        textsTraining, textsTesting = np.asarray(textsTraining), np.asarray(textsTest)
        y_train, y_test = np.asarray(labelsTraining), np.asarray(labelsTest)

        tokenizer = self.tokenize_texts(textsTraining)

        # NOTE(review): maxlen counts whitespace-separated words, which may differ
        # from the number of tokens the tokenizer emits - confirm no unintended
        # truncation happens in pad_sequences.
        maxlen = self.get_max_length_of_sentences(textsTraining)

        X_train = self._texts_to_padded(tokenizer, textsTraining, maxlen)
        X_test = self._texts_to_padded(tokenizer, textsTesting, maxlen)

        return X_train, X_test, y_train, y_test

    def get_X_Y_data(self, textsTraining, labelsTraining):
        """Vectorise a single set of texts.

        The tokenizer is fitted on ``textsTraining`` itself and sequences are
        padded to the longest text measured in whitespace-separated words.

        Returns:
            (X_train, y_train) as arrays.
        """
        textsTraining = np.asarray(textsTraining)
        y_train = np.asarray(labelsTraining)

        tokenizer = self.tokenize_texts(textsTraining)
        maxlen = self.get_max_length_of_sentences(textsTraining)
        X_train = self._texts_to_padded(tokenizer, textsTraining, maxlen)

        return X_train, y_train

    def Average(self, list):
        """Return the arithmetic mean of ``list``.

        The parameter name shadows the builtin but is kept so keyword callers
        keep working. An empty sequence raises ``ZeroDivisionError``.
        """
        return sum(list) / len(list)

    def recall_m(self, y_true, y_pred):
        """Keras-backend recall: rounded true positives / rounded actual positives."""
        # The backend was never imported as ``K`` in this notebook; bind it locally.
        K = tf.keras.backend
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        # epsilon keeps the division defined when there are no positives
        return true_positives / (possible_positives + K.epsilon())

    def precision_m(self, y_true, y_pred):
        """Keras-backend precision: rounded true positives / rounded predicted positives."""
        K = tf.keras.backend
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        return true_positives / (predicted_positives + K.epsilon())

    def f1_m(self, y_true, y_pred):
        """Keras-backend F1 score built from ``precision_m`` and ``recall_m``."""
        K = tf.keras.backend
        precision = self.precision_m(y_true, y_pred)
        recall = self.recall_m(y_true, y_pred)
        return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [3]:
# Run label; used as the stem of every result file written for this experiment.
testing_name = "TPOT_defaultsettings"
# File name of the preprocessed training set, located inside datasetPath.
training_path = 'trainPreprocessed.csv'

# All data and result locations hang off this root directory.
root_path = '/lab/dbms/fatyanosa'
datasetPath = root_path + '/Dataset/Disaster Tweets/'
resultsPath = root_path + '/Server2/Disaster Tweets/Paper GA-CNN/Results/'

In [4]:
if __name__ == '__main__':
    # Runs n_run independent TPOT searches on the tokenised tweets, using one
    # fixed 70/30 split as the only CV fold, and logs per-run results to a CSV
    # summary, an Excel sheet of all evaluated pipelines and an exported script.
    util = utility()
    n_run = 3

    # Read data
    df = util.read_CSV(datasetPath + training_path)
    texts, labels = util.get_text_label(df)
    # NOTE(review): get_X_Y_data fits the tokenizer on every row, i.e. before the
    # split below, so the validation rows influence the vocabulary - confirm
    # that this is acceptable for the reported scores.
    X_train, y_train = util.get_X_Y_data(texts, labels)
    indices = np.arange(df.shape[0])
    textsTraining, textsTest, labelsTraining, labelsTest, idx1, idx2 = train_test_split(
        X_train, y_train, indices, test_size = 0.3, random_state=42)

    results_csv = resultsPath + testing_name + ".csv"
    results_xlsx = r"{}{}.xlsx".format(resultsPath, testing_name)

    # Create the results file; the header is written only when the file is new
    # or empty so that re-running the cell does not insert duplicate header rows.
    if not path.isfile(results_csv) or path.getsize(results_csv) == 0:
        with open(results_csv, "a", newline="") as f:
            csv.writer(f, lineterminator="\n").writerow(
                ["i", "score", "time", "winning_pipe"])

    # A single (train indices, validation indices) pair used as TPOT's cv.
    custom_cv = [(idx1, idx2)]
    for i in range(0, n_run):
        tpot = TPOTClassifier(scoring='f1', verbosity=2, cv=custom_cv)
        start_time = timeit.default_timer()
        tpot.fit(X_train, y_train)
        # Stop the clock right after the search so the reported time does not
        # include the Excel/export I/O below.
        elapsed = timeit.default_timer() - start_time

        winning_pipe = tpot.fitted_pipeline_

        my_dict = list(tpot.evaluated_individuals_.items())

        # Collect one record per evaluated pipeline and build the frame once;
        # growing a DataFrame row by row is quadratic and DataFrame.append no
        # longer exists in pandas 2.x.
        records = []
        for model_name, model_info in my_dict:
            records.append({
                'generation': model_info['generation'],
                'model': model_name,
                'cv_score': model_info.get('internal_cv_score'),
                # stored as text: an Excel cell cannot hold a dict
                'model_info': str(model_info),
            })
        indiv = pd.DataFrame(records)
        model_scores = [record['cv_score'] for record in records]

        util.append_df_to_excel(indiv, results_xlsx)

        tpot.export(resultsPath + testing_name + 'tpot_pipeline'+ str(i + 1) +'.py')

        # save testing data; csv.writer quotes the pipeline repr, which contains
        # commas and line breaks that would otherwise corrupt the row.
        with open(results_csv, 'a', newline='') as f:
            csv.writer(f, lineterminator='\n').writerow(
                [i + 1, max(model_scores), elapsed, winning_pipe])

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=10100.0, style=ProgressStyle(…


Generation 1 - Current best internal CV score: 0.6203904555314533
Generation 2 - Current best internal CV score: 0.6203904555314533
Generation 3 - Current best internal CV score: 0.622568093385214
Generation 4 - Current best internal CV score: 0.6278505579815623
Generation 5 - Current best internal CV score: 0.6278505579815623
Generation 6 - Current best internal CV score: 0.6313465783664459
Generation 7 - Current best internal CV score: 0.6313465783664459
Generation 8 - Current best internal CV score: 0.6313465783664459
Generation 9 - Current best internal CV score: 0.6325741297808336
Generation 10 - Current best internal CV score: 0.6325741297808336
Generation 11 - Current best internal CV score: 0.6374077112387203
Generation 12 - Current best internal CV score: 0.6374077112387203
Generation 13 - Current best internal CV score: 0.6374077112387203
Generation 14 - Current best internal CV score: 0.6435084672166739
Generation 15 - Current best internal CV score: 0.6435084672166739
Gene

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=10100.0, style=ProgressStyle(…


Generation 1 - Current best internal CV score: 0.6016583747927032
Generation 2 - Current best internal CV score: 0.6139339968569931
Generation 3 - Current best internal CV score: 0.6141895390989124
Generation 4 - Current best internal CV score: 0.6256135653726015
Generation 5 - Current best internal CV score: 0.6256135653726015
Generation 6 - Current best internal CV score: 0.6256135653726015
Generation 7 - Current best internal CV score: 0.6256224535989136
Generation 8 - Current best internal CV score: 0.6256224535989136
Generation 9 - Current best internal CV score: 0.6256224535989136
Generation 10 - Current best internal CV score: 0.6285178236397749
Generation 11 - Current best internal CV score: 0.6285178236397749
Generation 12 - Current best internal CV score: 0.6285178236397749
Generation 13 - Current best internal CV score: 0.6285178236397749
Generation 14 - Current best internal CV score: 0.628948609146629
Generation 15 - Current best internal CV score: 0.628948609146629
Gener

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=10100.0, style=ProgressStyle(…


Generation 1 - Current best internal CV score: 0.6016471608149112
Generation 2 - Current best internal CV score: 0.6016583747927032
Generation 3 - Current best internal CV score: 0.6066584463625154
Generation 4 - Current best internal CV score: 0.6111350884764782
Generation 5 - Current best internal CV score: 0.6168898043254376
Generation 6 - Current best internal CV score: 0.6168898043254376
Generation 7 - Current best internal CV score: 0.6168898043254376
Generation 8 - Current best internal CV score: 0.6220281416787967
Generation 9 - Current best internal CV score: 0.6305147058823529
Generation 10 - Current best internal CV score: 0.6305147058823529
Generation 11 - Current best internal CV score: 0.6305147058823529
Generation 12 - Current best internal CV score: 0.6305147058823529
Generation 13 - Current best internal CV score: 0.6305147058823529
Generation 14 - Current best internal CV score: 0.6305147058823529
Generation 15 - Current best internal CV score: 0.6305147058823529
Gen