In [1]:
###########################################################################
# Analysis script to compare ml algorithms for issue record classification#
#       (Appendix A Evaluating Existing Issue Assignment Approaches)      #
###########################################################################
import pandas
import numpy
import re
from stop_words import get_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from time import gmtime, strftime, time
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from mlxtend.classifier import StackingClassifier

In [2]:
# Column names in the input data file; change them according to your file specifications.
# Note that in our case the year and month of creation of the issue report are available as separate columns.
CNAME_TEAMCODE = "TEAMCODE"  # team the issue is assigned to; used as the class label
CNAME_SUBJECT = "SUBJECT"
CNAME_DESCRIPTION = "DESCRIPTION"
CNAME_RECORD_TYPE = "RECORDTYPE"
CNAME_STATUS = "STATUS"
CNAME_YEAR_OPENED = "CREATIONYEAR" # year of creation of issue reports
CNAME_MONTH_OPENED = "CREATIONMONTH" # month of creation of issue reports
# Filtering specifications: the train/test selection functions keep only records
# whose record type and status are equal to these values.
FILTER_ISSUE_TYPE = 'ISSUE'
FILTER_ISSUE_STATUS = 'CLOSED'

In [3]:
def selectRecordsOpenedAtYearMonth(dataset, year, month):
    """
    Return the issue records created in the given year and month.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the CNAME_YEAR_OPENED and CNAME_MONTH_OPENED columns.
    year, month
        Values compared for equality against those two columns. No type
        conversion is done, so they must have the same type as the column
        contents.

    Returns
    -------
    pandas.DataFrame
        The matching rows, in their original order and with their original
        index; an empty frame when nothing matches.
    """
    # A boolean mask selects the rows in a single vectorised pass. Grouping the
    # whole frame and filtering the groups by key gives the same rows but has
    # to build and visit every group first.
    opened_in_period = ((dataset[CNAME_YEAR_OPENED] == year) &
                        (dataset[CNAME_MONTH_OPENED] == month))

    return dataset[opened_in_period]

In [4]:
CNAME_SUBJECT_DESCRIPTION = "SUBJECT_DESCRIPTION" # concatenation of subject and description

def load(filePath):
    '''
    Load the issue dataset from a ';'-delimited CSV file.

    ISO-8859-9 encoding is used for characters specific to the Turkish language.
    Column names are stripped of surrounding spaces, and a new column
    CNAME_SUBJECT_DESCRIPTION is added holding "<subject> <description>".

    Parameters
    ----------
    filePath : path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame with all columns of the file plus CNAME_SUBJECT_DESCRIPTION.
    '''
    dataset = pandas.read_csv(filePath, encoding="ISO-8859-9", delimiter=";")

    # remove the spaces from the start and end of column names
    dataset = dataset.rename(columns=lambda name: name.strip())

    # Empty subject/description cells are read as NaN. Replace them with ''
    # before converting to str, otherwise astype(str) turns them into the
    # literal word "nan", which would end up as a token in the text features.
    subject = dataset[CNAME_SUBJECT].fillna('').astype(str)
    description = dataset[CNAME_DESCRIPTION].fillna('').astype(str)

    # concatenate subject and description in one column
    dataset[CNAME_SUBJECT_DESCRIPTION] = subject + ' ' + description

    return dataset

In [5]:
def selectTrainingDatasetRecords(dataset, train_year, train_month_list):
    """
    Build the training set from the full dataset.

    Only records whose record type is FILTER_ISSUE_TYPE and whose status is
    FILTER_ISSUE_STATUS are kept (unresolved records are eliminated). From
    those, the records opened in train_year are collected month by month, in
    the order given by train_month_list, and concatenated into one frame.

    pandas.concat raises ValueError when train_month_list is empty.
    """
    is_issue = dataset[CNAME_RECORD_TYPE] == FILTER_ISSUE_TYPE
    is_closed = dataset[CNAME_STATUS] == FILTER_ISSUE_STATUS
    closed_issues = dataset[is_issue & is_closed]

    # one frame per requested month of the training year
    monthly_frames = [
        selectRecordsOpenedAtYearMonth(closed_issues, train_year, month)
        for month in train_month_list
    ]

    return pandas.concat(monthly_frames)

In [6]:
def selectTestDatasetRecords(dataset, test_year, test_month):
    """
    Build the test set from the full dataset.

    Keeps the records that
     * were opened in test_year / test_month, and
     * have record type FILTER_ISSUE_TYPE and status FILTER_ISSUE_STATUS
       (unresolved records are eliminated).

    Returns
    -------
    pandas.DataFrame
        An independent copy of the selected rows, in their original order.
    """
    # select year and month
    opened_in_period = selectRecordsOpenedAtYearMonth(dataset, test_year, test_month)

    is_closed_issue = ((opened_in_period[CNAME_RECORD_TYPE] == FILTER_ISSUE_TYPE) &
                       (opened_in_period[CNAME_STATUS] == FILTER_ISSUE_STATUS))

    # Return an explicit copy: the caller overwrites a column of the result,
    # and doing that on a boolean-indexed slice triggers pandas'
    # SettingWithCopyWarning.
    return opened_in_period[is_closed_issue].copy()

In [7]:
def selectRecordsHavingAtLeastNValuesInColumn(dataset, columnName, min_number_of_distinct_values):
    """
    Keep only the rows whose value in columnName occurs often enough.

    Rows are grouped by their columnName value; every group that has at least
    min_number_of_distinct_values rows is kept in full, all other groups are
    dropped. The surviving rows keep their original order.
    """
    def occurs_often_enough(group):
        # group holds all rows sharing one columnName value
        return len(group) >= min_number_of_distinct_values

    grouped = dataset.groupby(columnName)
    return grouped.filter(occurs_often_enough)

In [8]:
# list of the stop words to be eliminated from the issue dataset
stop_word_list = get_stop_words('turkish')
# the same words as a set, so that the per-token membership test in filterNoise
# is a hash lookup instead of a scan of the whole list
stop_word_set = frozenset(stop_word_list)

# Turkish upper-case characters are lower-cased separately so as to be sure of them:
# str.lower() maps 'I' to 'i' (Turkish needs dotless 'ı') and maps 'İ' to 'i'
# followed by a combining dot, so both are translated explicitly first.
lower_map_turkish = {
    ord(u'I'): u'ı',
    ord(u'İ'): u'i',
    ord(u'Ç'): u'ç',
    ord(u'Ş'): u'ş',
    ord(u'Ö'): u'ö',
    ord(u'Ü'): u'ü',
    ord(u'Ğ'): u'ğ'
}

# Runs of non-word characters. Written as a raw string: in a normal string
# literal '\W' is an invalid escape sequence and makes Python emit a warning.
# Compiled once here instead of on every filterNoise call.
NON_WORD_PATTERN = re.compile(r'[\W]+')

def filterNoise(text):
    """
    Normalise one issue text for vectorisation.

    Steps, in order:
      1. lower-case the text (Turkish upper-case letters via lower_map_turkish,
         everything else via str.lower()),
      2. replace every run of non-word characters (regex \\W, so letters,
         digits and the underscore are kept) with a single space,
      3. drop the words found in the Turkish stop-word list.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        The remaining words joined by single spaces ('' if none remain).
    """
    text_tr = text.translate(lower_map_turkish)
    lowerText = NON_WORD_PATTERN.sub(' ', text_tr.lower())

    # remove stopwords
    keptWords = [word for word in lowerText.split() if word not in stop_word_set]

    return ' '.join(keptWords)

In [9]:
# ask for the path of the ';'-delimited CSV file that holds the issue records (read by load())
inputFileName = input("Please enter the name of the csv dataset to read:")

Please enter the name of the csv dataset to read:issues.csv


In [10]:
# load the dataset from the CSV file named above and report how many records it holds
entireDataset = load(inputFileName)
print("Entire dataset length: " + str(len(entireDataset)))

Entire dataset length: 122750


In [11]:
# Ask once for the training year, then keep asking for training months until
# the sentinel EXIT is entered; the sentinel itself is not stored.
train_year = int(input("Please enter the issue report year to be included in train dataset: "))
train_month_list = []
while True:
    train_month = str(input("Please enter the issue report month to be included in train dataset (EXIT to stop): "))
    if train_month == "EXIT":
        break
    train_month_list.append(train_month)

Please enter the issue report year to be included in train dataset: 2017
Please enter the issue report month to be included in train dataset (EXIT to stop): OCTOBER
Please enter the issue report month to be included in train dataset (EXIT to stop): NOVEMBER
Please enter the issue report month to be included in train dataset (EXIT to stop): EXIT


In [12]:
# a team should occur at least min_number_of_distinct_values times in the training
# records; records of teams that occur less often are removed from the train dataset
min_number_of_distinct_values = int(input("Please enter the minimum number of times a team should occur in the train dataset: "))

Please enter the minimum number of times a team should occur in the train dataset: 10


In [13]:
# filter training issue records: closed issues opened in the chosen year and months,
# then keep only the teams that occur at least min_number_of_distinct_values times
trainDataset = selectTrainingDatasetRecords(entireDataset, train_year, train_month_list)
trainDataset = selectRecordsHavingAtLeastNValuesInColumn(trainDataset, CNAME_TEAMCODE, min_number_of_distinct_values)

In [14]:
# text preprocessing: apply filterNoise to every subject+description text of the
# training records; the column is overwritten with the cleaned text
trainDataset[CNAME_SUBJECT_DESCRIPTION] = trainDataset[CNAME_SUBJECT_DESCRIPTION].apply(filterNoise)

In [15]:
# print the number of training records as a sanity check
print("Train dataset length : " + str(len(trainDataset)))
# uncomment to inspect the first preprocessed training texts
#print(trainDataset[CNAME_SUBJECT_DESCRIPTION].head(3))

Train dataset length : 13569


In [16]:
# ask for the single year/month whose issue reports form the test dataset;
# the year is converted to int, the month is kept as text
test_year = int(input("Please enter the issue report year to be included in test dataset: "))
test_month = str(input("Please enter the issue report month to be included in test dataset: "))

Please enter the issue report year to be included in test dataset: 2017
Please enter the issue report month to be included in test dataset: DECEMBER


In [17]:
# filter test issue records: closed issues opened in the chosen test year and month
testDataset = selectTestDatasetRecords(entireDataset, test_year, test_month)

In [18]:
# text preprocessing: apply filterNoise to every subject+description text of the
# test records; the column is overwritten with the cleaned text
testDataset[CNAME_SUBJECT_DESCRIPTION] = testDataset[CNAME_SUBJECT_DESCRIPTION].apply(filterNoise)

In [19]:
# print the number of test records as a sanity check
print("Test length: " + str(len(testDataset)))
# uncomment to inspect the first preprocessed test texts
#print(testDataset[CNAME_SUBJECT_DESCRIPTION].head(3))

Test length: 7243


In [20]:
# training inputs: the preprocessed subject+description texts (X_train)
# and the team code of each record as its class label (Y_train)
X_train = trainDataset[CNAME_SUBJECT_DESCRIPTION].values
Y_train = trainDataset[CNAME_TEAMCODE].values

In [21]:
# Tf-idf conversion for training dataset, using unigrams and bigrams (ngram_range=(1, 2))
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf_train = vectorizer.fit_transform(X_train)
# keep the vocabulary (term -> feature index mapping) learned from the training texts
voc = vectorizer.vocabulary_

In [22]:
# test inputs: the preprocessed subject+description texts (X_test)
# and the team code of each record as its true class label (Y_test)
X_test = testDataset[CNAME_SUBJECT_DESCRIPTION].values
Y_test = testDataset[CNAME_TEAMCODE].values

In [23]:
# Tf-idf conversion for test dataset.
# The vectorizer fitted on the training texts is reused and only transform() is
# called: the test texts are mapped onto the training vocabulary and weighted
# with the IDF values learned from the training set. Fitting a vectorizer on the
# test texts instead would compute IDF weights from the test set, so the test
# features would be scaled differently from the ones the classifiers were
# trained on (and test-set statistics would leak into the evaluation).
X_tfidf_test = vectorizer.transform(X_test)

In [24]:
# Specify the algorithms (constructor defaults are used unless arguments are given)
# Multinomial naive Bayes
MultNB = MultinomialNB()
# Decision tree
DT = DecisionTreeClassifier()
# k-nearest neighbours: 12 neighbours, brute-force search, cosine metric
Knn = KNeighborsClassifier(n_neighbors=12, algorithm='brute', metric='cosine')
# Logistic regression; also passed as the meta-classifier of every stacking ensemble below
LR = LogisticRegression()
# Random forest
RF = RandomForestClassifier()
# Linear SVM, and the same estimator wrapped in a probability-calibration wrapper.
# NOTE(review): the wrapper is presumably needed because the stacking ensembles
# are configured with use_probas=True and LinearSVC offers no predict_proba - confirm.
# NOTE(review): no random_state is passed to DT, RF or LinSvc, so results may
# differ between runs - confirm whether that is intended.
LinSvc = LinearSVC()
CLinSvc = CalibratedClassifierCV(LinSvc)

# Stacking ensembles: each combines the listed base classifiers and uses LR as
# meta-classifier (use_probas=True in all four).
# NOTE(review): "Best" / "Select" presumably refer to how the base classifiers
# were chosen; the criterion is not visible here - confirm.
# three base classifiers: calibrated linear SVC, logistic regression, KNN
SclfBest_3 = StackingClassifier(classifiers=[CLinSvc, LR, Knn],
                                use_probas=True,
                                meta_classifier=LR)

# three base classifiers: calibrated linear SVC, KNN, multinomial NB
SclfSelect_3 = StackingClassifier(classifiers=[CLinSvc, Knn, MultNB],
                                  use_probas=True,
                                  meta_classifier=LR)

# five base classifiers: calibrated linear SVC, logistic regression, KNN, random forest, decision tree
SclfBest_5 = StackingClassifier(classifiers=[CLinSvc, LR, Knn, RF, DT],
                                use_probas=True,
                                meta_classifier=LR)

# five base classifiers: calibrated linear SVC, logistic regression, KNN, random forest, multinomial NB
SclfSelect_5 = StackingClassifier(classifiers=[CLinSvc, LR, Knn, RF, MultNB],
                                  use_probas=True, 
                                  meta_classifier=LR)

In [25]:
# Evaluate every classifier in turn: 10-fold cross-validated accuracy on the
# training set, then a fit on the whole training set and a report on the test set.
labelled_classifiers = [
    ('Multinomial NB', MultNB),
    ('KNN', Knn),
    ('Logistic Regression', LR),
    ('Linear SVC', LinSvc),
    ('Linear SVC - Calibrated', CLinSvc),
    ('Decision Tree', DT),
    ('Random Forest', RF),
    ('Best3', SclfBest_3),
    ('Selected3', SclfSelect_3),
    ('Best5', SclfBest_5),
    ('Selected5', SclfSelect_5),
]

for label, clf in labelled_classifiers:
    print(label + ":Training starts:" + strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    start_time = time()

    # note: the timed span covers the cross-validation as well as the final fit
    scores = model_selection.cross_val_score(clf, X_tfidf_train, Y_train, cv=10, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
    clf.fit(X_tfidf_train, Y_train)

    end_time = time()
    print(label + ":Training ends:" + strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # split the elapsed seconds into hours / minutes / seconds
    total_seconds = end_time - start_time
    hours, rest = divmod(total_seconds, 3600)
    minutes, seconds = divmod(rest, 60)
    print("Training time: ", hours, minutes, seconds)

    # test-set evaluation: accuracy, confusion matrix, per-class report
    predictions = clf.predict(X_tfidf_test)
    for report in (accuracy_score, confusion_matrix, classification_report):
        print(report(Y_test, predictions))