In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import timeit
from sklearn.model_selection import train_test_split
import numpy as np
from os import path
import csv
import autosklearn.classification
import pickle

  self.re = re.compile(self.reString)


In [2]:
class utility:
    """Helpers for loading labelled text, turning it into padded token
    sequences, and computing evaluation metrics."""

    def append_df_to_excel(self, df, excel_path):
        """Append ``df`` to the Excel file at ``excel_path``.

        If the file already exists its content is read, concatenated with
        ``df`` (with a fresh index) and written back; otherwise ``df`` alone
        is written. The index column is never written.
        """
        if path.isfile(excel_path):
            existing = pd.read_excel(excel_path)
            df = pd.concat([existing, df], ignore_index=True)
        df.to_excel(excel_path, index=False)

    def read_CSV(self, filename):
        """Read ``filename`` with ``pd.read_csv`` and return the DataFrame."""
        return pd.read_csv(filename)

    def get_text_label(self, df):
        """Return ``(texts, labels)`` taken from the ``'sentence'`` and
        ``'label'`` columns of ``df``.

        Sentences that are floats (e.g. NaN produced by an empty CSV cell)
        are converted with ``str`` so that every missing text becomes the
        string ``'nan'``; all other values are passed through unchanged.
        """
        # Read the columns directly instead of using ``iterrows``: it is much
        # faster and avoids the per-row dtype upcasting ``iterrows`` performs.
        texts = [
            str(sentence) if isinstance(sentence, float) else sentence
            for sentence in df['sentence']
        ]
        labels = df['label'].tolist()
        return texts, labels

    def tokenize_texts(self, texts, num_words=10000):
        """Fit a Keras ``Tokenizer`` on ``texts`` and return it.

        ``num_words`` is forwarded to the ``Tokenizer`` constructor; the
        default keeps the previously hard-coded value of 10000.
        """
        tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words)
        tokenizer.fit_on_texts(texts)
        return tokenizer

    def padding_texts(self, texts, maxlen):
        """Pad the integer sequences in ``texts`` at the end (``'post'``) to
        ``maxlen`` using Keras ``pad_sequences``."""
        return tf.keras.preprocessing.sequence.pad_sequences(
            texts, padding='post', maxlen=maxlen)

    def get_testing_metric(self, y_test, y_pred):
        """Return ``(accuracy, precision, recall, f1)`` computed with the
        scikit-learn metric functions using their default arguments."""
        accuracyScore = accuracy_score(y_test, y_pred)
        precisionScore = precision_score(y_test, y_pred)
        recallScore = recall_score(y_test, y_pred)
        f1Score = f1_score(y_test, y_pred)
        return accuracyScore, precisionScore, recallScore, f1Score

    def write_df_csv(self, df, out_path):
        """Write ``df`` to ``out_path`` as CSV without the index column."""
        df.to_csv(out_path, index=False)

    def create_embedding_matrix(self, filepath, word_index, embedding_dim):
        """Build an embedding matrix from a whitespace-separated vector file.

        Each non-blank line of ``filepath`` is split on whitespace; the first
        field is the word and the remaining fields are parsed as float32.
        For every word present in ``word_index`` the first ``embedding_dim``
        values are stored in the row ``word_index[word]``.

        Returns an array of shape ``(len(word_index) + 1, embedding_dim)``;
        rows for words not found in the file stay zero.
        """
        vocab_size = len(word_index) + 1  # +1 because index 0 is reserved
        embedding_matrix = np.zeros((vocab_size, embedding_dim))

        with open(filepath, encoding="utf8") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Blank line (e.g. trailing newline): nothing to unpack.
                    continue
                word, vector = fields[0], fields[1:]
                if word in word_index:
                    embedding_matrix[word_index[word]] = np.array(
                        vector, dtype=np.float32)[:embedding_dim]

        return embedding_matrix

    def get_max_length_of_sentences(self, texts):
        """Return the largest whitespace-token count among ``texts``
        (0 when ``texts`` is empty)."""
        return max((len(text.split()) for text in texts), default=0)

    def get_training_trial_data(self, textsTraining, textsTrial, labelsTraining, labelsTrial):
        """Convert training and trial texts into padded integer sequences.

        The tokenizer is fitted on the training texts only and then applied
        to both sets. Both sets are padded to the longest training sentence
        (measured in whitespace-separated tokens).

        Returns ``(X_train, X_val, y_train, y_val)``.
        """
        textsTraining, textsTesting = np.asarray(textsTraining), np.asarray(textsTrial)
        y_train, y_val = np.asarray(labelsTraining), np.asarray(labelsTrial)

        # Vocabulary comes from the training texts only.
        tokenizer = self.tokenize_texts(textsTraining)
        X_train = tokenizer.texts_to_sequences(textsTraining)
        X_val = tokenizer.texts_to_sequences(textsTesting)

        # Common sequence length, taken from the training set.
        maxlen = self.get_max_length_of_sentences(textsTraining)

        X_train = self.padding_texts(X_train, maxlen)
        X_val = self.padding_texts(X_val, maxlen)

        return X_train, X_val, y_train, y_val

    def get_X_Y_data(self, textsTraining, labelsTraining):
        """Convert a single set of texts into padded integer sequences.

        Returns ``(X_train, y_train)`` where ``X_train`` is padded to the
        longest sentence (in whitespace-separated tokens) of the input.
        """
        textsTraining = np.asarray(textsTraining)
        y_train = np.asarray(labelsTraining)

        tokenizer = self.tokenize_texts(textsTraining)
        X_train = tokenizer.texts_to_sequences(textsTraining)

        maxlen = self.get_max_length_of_sentences(textsTraining)
        X_train = self.padding_texts(X_train, maxlen)

        return X_train, y_train

    def Average(self, list):
        """Return the arithmetic mean of ``list``.

        Raises ``ZeroDivisionError`` when ``list`` is empty.
        """
        # The parameter name shadows the builtin but is kept so that
        # keyword callers keep working.
        return sum(list) / len(list)

    def recall_m(self, y_true, y_pred):
        """Recall over the given tensors: rounded true positives divided by
        rounded actual positives (plus epsilon to avoid division by zero)."""
        # Uses TensorFlow ops directly; the previous code referenced a
        # backend alias ``K`` that was never imported.
        true_positives = tf.reduce_sum(
            tf.round(tf.clip_by_value(y_true * y_pred, 0, 1)))
        possible_positives = tf.reduce_sum(
            tf.round(tf.clip_by_value(y_true, 0, 1)))
        return true_positives / (possible_positives + tf.keras.backend.epsilon())

    def precision_m(self, y_true, y_pred):
        """Precision over the given tensors: rounded true positives divided
        by rounded predicted positives (plus epsilon)."""
        true_positives = tf.reduce_sum(
            tf.round(tf.clip_by_value(y_true * y_pred, 0, 1)))
        predicted_positives = tf.reduce_sum(
            tf.round(tf.clip_by_value(y_pred, 0, 1)))
        return true_positives / (predicted_positives + tf.keras.backend.epsilon())

    def f1_m(self, y_true, y_pred):
        """F1 score: harmonic mean of ``precision_m`` and ``recall_m``
        (epsilon added to the denominator)."""
        precision = self.precision_m(y_true, y_pred)
        recall = self.recall_m(y_true, y_pred)
        return 2 * ((precision * recall)
                    / (precision + recall + tf.keras.backend.epsilon()))

In [3]:
# Base directory under which both the dataset and the results live.
root_path = '/lab/dbms/fatyanosa'

# Directories derived from the base directory (both end with a slash so a
# file name can be appended directly).
datasetPath = root_path + '/Dataset/Suggestion Mining/'
resultsPath = root_path + '/Server1/Suggestion Mining/Results/'

# File names of the dataset splits inside ``datasetPath``.
training_path = 'TrainingData.csv'
trial_path = 'TrialData.csv'
testing_path = 'EvaluationData.csv'

# Prefix used for the result CSV and the pickled models.
testing_name = "Auto-Sklearn_1hr"

In [4]:
if __name__ == '__main__':
    # Import the submodule explicitly: the notebook's import cell only pulls
    # in ``autosklearn.classification``.
    import autosklearn.metrics

    util = utility()
    n_run = 5  # number of independent Auto-sklearn runs

    # Load the training and trial splits.
    dfTraining = util.read_CSV(datasetPath + training_path)
    dfTrial = util.read_CSV(datasetPath + trial_path)

    textsTraining, labelsTraining = util.get_text_label(dfTraining)
    textsTrial, labelsTrial = util.get_text_label(dfTrial)

    X_train, X_val, y_train, y_val = util.get_training_trial_data(
        textsTraining, textsTrial, labelsTraining, labelsTrial)

    # Results are appended to one CSV. The header is written only when the
    # file is new or empty, so re-running this cell does not insert extra
    # header rows in the middle of the data.
    results_csv = resultsPath + testing_name + ".csv"
    if not path.isfile(results_csv) or path.getsize(results_csv) == 0:
        with open(results_csv, "a+") as f:
            f.write("i,score,time\n")

    # Use F1 as the optimisation / scoring metric.
    scorer = autosklearn.metrics.make_scorer(
        'f1_score',
        f1_score
    )

    for i in range(0, n_run):
        start_time = timeit.default_timer()

        # One-hour search budget per run.
        cls = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=3600)

        score = cls.fit(X_train, y_train, metric=scorer).score(X_val, y_val)

        # Persist the fitted model; the context manager closes the handle
        # even if pickling fails.
        with open(resultsPath + testing_name + str(i + 1) + '.pickle', 'wb') as model_file:
            pickle.dump(cls, model_file)
        # Elapsed time covers fit, scoring and pickling, as before.
        elapsed = timeit.default_timer() - start_time

        # Append this run's result row.
        with open(results_csv, 'a') as f:
            f.write(str(i + 1)
                    + ',' + str(score)
                    + ',' + str(elapsed)
                    + '\n')



In [5]:
# import numpy as np
# import pandas as pd
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import BernoulliNB
# from sklearn.pipeline import make_pipeline, make_union
# from sklearn.preprocessing import RobustScaler
# from sklearn.svm import LinearSVC
# from tpot.builtins import StackingEstimator
# from sklearn.preprocessing import FunctionTransformer
# from copy import copy
# import time

# # NOTE: Make sure that the outcome column is labeled 'target' in the data file
# # tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
# # features = tpot_data.drop('target', axis=1)
# # training_features, testing_features, training_target, testing_target = \
# #             train_test_split(features, tpot_data['target'], random_state=None)
# util = utility()
# n_run = 30

# # Read data
# dfTraining = util.read_CSV(datasetPath + training_path)

# # Read evaluation (test) data
# dfTest = util.read_CSV(datasetPath + testing_path)

# textsTraining, labelsTraining = util.get_text_label(dfTraining)
# textsTest, labelsTest = util.get_text_label(dfTest)

# X_train, X_test, y_train, y_test = util.get_training_test_data(
#     textsTraining, textsTest, labelsTraining, labelsTest)

# # Create Testing Results
# f = open(resultsPath + testing_name + ".csv", "w+")
# f.write("i,accuracy,precision,recall,f1Score,time\n")
# f.close()
# for i in range(0, n_run):
#     then = time.time()
#     # Average CV score on the training set was: 0.6534839924670433
#     exported_pipeline = make_pipeline(
#         make_union(
#             FunctionTransformer(copy),
#             RobustScaler()
#         ),
#         StackingEstimator(estimator=GradientBoostingClassifier(learning_rate=0.1, max_depth=2, max_features=0.6500000000000001, min_samples_leaf=17, min_samples_split=17, n_estimators=100, subsample=0.6000000000000001)),
#         StackingEstimator(estimator=LinearSVC(C=0.0001, dual=True, loss="hinge", penalty="l2", tol=0.1)),
#         StackingEstimator(estimator=GradientBoostingClassifier(learning_rate=0.1, max_depth=7, max_features=0.8500000000000001, min_samples_leaf=6, min_samples_split=17, n_estimators=100, subsample=0.9000000000000001)),
#         RobustScaler(),
#         BernoulliNB(alpha=10.0, fit_prior=False)
#     )

#     exported_pipeline.fit(X_train, y_train)
#     y_pred = exported_pipeline.predict(X_test)

#     # Evaluation metrics for the exported pipeline
#     accuracyScore, precisionScore, recallScore, f1Score = util.get_testing_metric(y_test, y_pred)

#     now = time.time()
#     diff = now - then
#     print(diff)
#     print(f1Score)

#     # save testing data
#     f = open(resultsPath + testing_name + ".csv", 'a')
#     f.write(str(i + 1)
#             + ',' + str(accuracyScore)
#             + ',' + str(precisionScore)
#             + ',' + str(recallScore)
#             + ',' + str(f1Score)
#             + ',' + str(diff) + '\n')
#     f.close()
