In [None]:
import tensorflow as tf
from tensorflow import keras
import os
import tempfile
import pprint
import math
import random

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn

'''Classifiers'''
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.feature_extraction.text import TfidfTransformer
# from sklearn.feature_extraction.text import CountVectorizer

from joblib import dump, load

from utils.preprocess_data import remove_time_stamp

#from utils.queryDB import queryDBForTestOutput
# from preprocessData import generateFeatureWordsList, generateLabelsIndex, \
#                             preprocessDBData, shuffleDataLabelPairs, \
#                             padding_processed_data, preprocessTFIDFData


In [None]:
# Load the GitHub issue texts used for training. Every '.txt' file in the
# directory contributes one sample, and the file name itself is kept as the label.
train_data_path = 'data/GitHubData'
train_data = []
train_label = []

# os.listdir() returns entries in arbitrary order. Sorting makes the sample
# order (and every id derived from first-appearance order) reproducible.
train_files = sorted(os.listdir(train_data_path))
for train_file in train_files:
    # Match the real extension; a bare 'txt' suffix would also accept names like 'notes_txt'.
    if train_file.endswith('.txt'):
        with open(os.path.join(train_data_path, train_file)) as f:
            train_data.append(f.read())
        # Record the label only after a successful read so both lists stay aligned.
        train_label.append(train_file)


In [None]:
# Temporary workaround for the small corpus: repeat every sample (and its
# label) a fixed number of times. Re-running this cell repeats the data again.
increase_times = 8
train_data = train_data * increase_times
train_label = train_label * increase_times

In [None]:
# One row per sample: the issue text next to its label.
df_git = pd.DataFrame({'git_issue_content': train_data, 'labels': train_label})
df_git.head()

In [None]:
# Pre-process the git data: pass every issue body through the project helper
# remove_time_stamp (presumably strips time stamps -- see utils.preprocess_data).
df_git['git_issue_content'] = df_git['git_issue_content'].map(remove_time_stamp)
df_git.head()

In [None]:
# Load the Jenkins texts used as the test set. Every '.txt' file in the
# directory contributes one sample, and the file name itself is kept as the label.
test_data_path = 'data/JenkinsData'
test_data = []
test_label = []

# os.listdir() returns entries in arbitrary order; sort so the row order of
# the test frame is the same on every run and machine.
test_files = sorted(os.listdir(test_data_path))
for test_file in test_files:
    # Match the real extension; a bare 'txt' suffix would also accept names like 'notes_txt'.
    if test_file.endswith('.txt'):
        with open(os.path.join(test_data_path, test_file)) as f:
            test_data.append(f.read())
        # Record the label only after a successful read so both lists stay aligned.
        test_label.append(test_file)

# One row per sample: the Jenkins text next to its label.
df_jenkins = pd.DataFrame({'jenkins_content': test_data, 'labels': test_label})
df_jenkins.head()


In [None]:
# Pre-process the jenkins data: pass every text through the project helper
# remove_time_stamp (presumably strips time stamps -- see utils.preprocess_data).
df_jenkins['jenkins_content'] = df_jenkins['jenkins_content'].map(remove_time_stamp)
df_jenkins.head()

In [None]:
# TODO: add train/test matching in df

In [None]:
# df_git['clean_text'] = df_git['git_issue_content'].apply(process_text)
# df_git.head()

In [None]:
# Work on a copy so columns can be popped from it without altering df_git.
clean_df_git = df_git.copy()

def create_label_dict(labels, label_dict_name2num, label_dict_num2name):
    """Assign a numeric id to every distinct label, updating both dicts in place.

    Labels are numbered in order of first appearance. Labels already present
    in ``label_dict_name2num`` keep their id; new labels continue after the
    highest id already stored, so the function can be called again on
    populated dicts without overwriting existing entries.

    Args:
        labels: iterable of hashable label names.
        label_dict_name2num: dict mapping label name -> id (mutated).
        label_dict_num2name: dict mapping id -> label name (mutated).

    Returns:
        None; the two dicts are updated in place.
    """
    # Resume numbering after the largest known id instead of restarting at 0,
    # which would clobber ids handed out by an earlier call.
    index = max(label_dict_num2name, default=-1) + 1
    for label in labels:
        if label not in label_dict_name2num:
            label_dict_num2name[index] = label
            label_dict_name2num[label] = index
            index += 1

# Split the label column off the working copy and build the name<->id lookups.
labels_git_name = clean_df_git.pop("labels")
label_dict_name2num, label_dict_num2name = {}, {}
create_label_dict(labels_git_name, label_dict_name2num, label_dict_num2name)
labels_git_num = [label_dict_name2num[name] for name in labels_git_name]

# Echo both mappings and the encoded labels for a quick visual check.
for heading, shown in (
    ("label_dict_name2num is \n", label_dict_name2num),
    ("label_dict_num2name is \n", label_dict_num2name),
    ("labels_git_num is \n", labels_git_num),
):
    print(heading, shown, '\n')



In [None]:
# Preview what is left in the working copy now that "labels" has been popped.
clean_df_git.head()

In [None]:
# Fit a TF-IDF vocabulary on the GitHub issue text (default settings for now;
# the vectorizer accepts more parameters if tuning is needed).
# pop() removes the text column from clean_df_git, so this cell cannot be
# re-run on its own without recreating the copy first.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer_train_vectors = tfidf_vectorizer.fit_transform(clean_df_git.pop('git_issue_content'))

# Recent scikit-learn releases dropped get_feature_names() in favour of
# get_feature_names_out(); prefer the new accessor, fall back on old installs.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

train_df = pd.DataFrame.sparse.from_spmatrix(tfidf_vectorizer_train_vectors, columns=tfidf_feature_names)
print("train data after tfidf: \n", train_df.head())


In [None]:
# No separate validation split exists yet (add one when more data is
# available); until then validation simply reuses the training frame.
val_df = train_df

# The test set comes from the Jenkins data.
clean_df_jenkins = df_jenkins.copy()
labels_jenkins_name = clean_df_jenkins.pop("labels")
# Encode with the mapping built from the training labels; a Jenkins label that
# is missing from that mapping raises KeyError here.
labels_jenkins_num = [label_dict_name2num[x] for x in labels_jenkins_name]
print("test labels list: ", labels_jenkins_num)
# Reuse the training frame's column names: they are the fitted vocabulary, so
# this keeps train/test columns aligned and avoids get_feature_names(), which
# recent scikit-learn releases no longer provide.
test_df = pd.DataFrame.sparse.from_spmatrix(
    tfidf_vectorizer.transform(clean_df_jenkins.pop('jenkins_content')),
    columns=train_df.columns,
)
print("test data after tfidf: \n", test_df.head())

In [None]:
# Convert labels and feature frames to NumPy arrays.
# Validation mirrors the training split until more data is available.
train_labels, val_labels, test_labels = (
    np.array(encoded) for encoded in (labels_git_num, labels_git_num, labels_jenkins_num)
)

train_features, val_features, test_features = (
    np.array(frame) for frame in (train_df, val_df, test_df)
)

In [None]:
# Standardize the input features with sklearn's StandardScaler (mean 0,
# standard deviation 1). The scaler is fitted on the training split only and
# then applied unchanged to the validation and test splits.
standardScaler = StandardScaler()
train_features = standardScaler.fit_transform(train_features)
val_features, test_features = (
    standardScaler.transform(split) for split in (val_features, test_features)
)

# Report the shape of every split: labels first, then features.
for split_name, split_labels in (('Training', train_labels),
                                 ('Validation', val_labels),
                                 ('Test', test_labels)):
    print(f'{split_name} labels shape:', split_labels.shape)

for split_name, split_features in (('Training', train_features),
                                   ('Validation', val_features),
                                   ('Test', test_features)):
    print(f'{split_name} features shape:', split_features.shape)

In [None]:
# Preliminary evaluation of several classifiers, all at their default parameters.

# Candidate models keyed by display name; insertion order is the evaluation order.
model_dict = dict([
    ('Dummy Model', DummyClassifier(random_state=3)),
    # NOTE(review): newer scikit-learn releases spell this loss 'log_loss' --
    # confirm against the installed version.
    ('Stochastic Gradient Descent Model', SGDClassifier(loss='log', random_state=3)),
    ('Random Forest Model', RandomForestClassifier(random_state=3)),
    ('Decsision Tree Model', DecisionTreeClassifier(random_state=3)),
    ('AdaBoost Model', AdaBoostClassifier(random_state=3)),
    ('Gaussian Naive Bayes Model', GaussianNB()),
    ('K Nearest Neighbor Model', KNeighborsClassifier()),
])

# Evaluation split: fit on the GitHub features, score on the Jenkins features.
# Alternatives kept for reference:
# X_train, X_test, y_train, y_test = train_features, val_features, train_labels, val_labels
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, shuffle=True,
#                                                     stratify=y, random_state=3)
X_train, y_train = train_features, train_labels
X_test, y_test = test_features, test_labels

#Function to get the scores for each model in a df
def model_scores_df(model_dict):
    """Fit every model and return a comparison table sorted by macro F1.

    Each estimator is fitted on the module-level ``X_train``/``y_train`` and
    scored on ``X_test``/``y_test``. The estimators in ``model_dict`` are
    fitted in place, so they can be reused for prediction afterwards.

    Args:
        model_dict: mapping of display name -> unfitted scikit-learn estimator.

    Returns:
        pandas.DataFrame with columns ``model_names``, ``accuracy_scores``,
        ``precision_scores``, ``recall_scores`` and ``f1_scores`` (precision,
        recall and F1 are macro-averaged), sorted by ``f1_scores`` descending.
        An empty ``model_dict`` yields an empty frame with the same columns.
    """
    score_columns = ['model_names', 'accuracy_scores', 'precision_scores', 'recall_scores', 'f1_scores']
    score_rows = []
    for cur_model_name, cur_model in model_dict.items():
        cur_model.fit(X_train, y_train)
        label_pred = cur_model.predict(X_test)
        score_rows.append({
            'model_names': cur_model_name,
            'accuracy_scores': accuracy_score(y_test, label_pred),
            'precision_scores': precision_score(y_test, label_pred, average='macro'),
            'recall_scores': recall_score(y_test, label_pred, average='macro'),
            'f1_scores': f1_score(y_test, label_pred, average='macro'),
        })
    # Build and sort the table once, after the loop, so it also exists when
    # no model was supplied.
    model_comparison_df = pd.DataFrame(score_rows, columns=score_columns)
    return model_comparison_df.sort_values(by='f1_scores', ascending=False)

# Fit and score every candidate model; the returned comparison table is the cell output.
model_scores_df(model_dict)

In [None]:
# save trained model in joblib format
# NOTE(review): only the random forest estimator is written here; the TF-IDF
# vectorizer, the StandardScaler and the label mapping are not saved by this
# cell, so the file alone cannot classify raw text.
dump(model_dict["Random Forest Model"], 'MLModel.joblib')


In [None]:
# load and predict with saved model
# Round-trip check: reload the file written by dump() and predict on the
# already-scaled test features.
savedModel = load('MLModel.joblib')
savedModel_predictions = savedModel.predict(test_features)
print("savedModel Sample 0 predict label: ", savedModel_predictions[0])

In [None]:
# Inspect individual test-set predictions of the random forest.
random_forest_model = model_dict["Random Forest Model"]
predictions = random_forest_model.predict(test_features)
predictions_proba = random_forest_model.predict_proba(test_features)

# TF and sklearn have different functions for probability/confidence, below is sklearn
# The sample number in each message is taken from the index being printed, so
# the text always matches the row shown; indices beyond the test set are skipped.
for sample_index in (0, 2):
    if sample_index >= len(predictions):
        continue
    print(f"Sample {sample_index} predict with probability: ", predictions_proba[sample_index])
    print(f"Sample {sample_index} predict label: ", predictions[sample_index])
