In [1]:
###########################################################################
# Analysis script to create explanation for a specific input issue record.#
#       (Section 5 Explaining Team Assignments)                           #
##########################################################################
import pandas
import numpy
import re
from stop_words import get_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from lime import lime_text
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer

In [2]:
# Column names in the input data file; change them to match your file's layout.
# Note: in our data the year and month of issue creation are stored in separate columns.
CNAME_TEAMCODE = "UYGULAMAKODU"  # team code; used as the class label for training/testing
CNAME_SUBJECT = "OZETBASLIK"  # issue subject; concatenated with the description on load
CNAME_DESCRIPTION = "ACIKLAMA"  # issue description; concatenated with the subject on load
CNAME_RECORD_TYPE = "KAYITTIPI"  # compared against FILTER_ISSUE_TYPE
CNAME_STATUS = "COZUM"  # compared against FILTER_ISSUE_STATUS
CNAME_YEAR_OPENED = "OLUSTURULDUYIL" # year of creation of issue reports
CNAME_MONTH_OPENED = "OLUSTURULDUAY" # month of creation of issue reports
# Filtering specifications: the select*DatasetRecords helpers keep only rows whose
# record-type column equals FILTER_ISSUE_TYPE and whose status column equals FILTER_ISSUE_STATUS.
FILTER_ISSUE_TYPE = 'Olay'
FILTER_ISSUE_STATUS = 'Tamamlandı'

In [3]:
def selectRecordsOpenedAtYearMonth(dataset, year, month):
    """
    Return the issue records that were opened in the given year and month.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Issue records; must contain the CNAME_YEAR_OPENED and
        CNAME_MONTH_OPENED columns.
    year
        Value compared for equality against the CNAME_YEAR_OPENED column.
    month
        Value compared for equality against the CNAME_MONTH_OPENED column.

    Returns
    -------
    pandas.DataFrame
        The matching rows, in their original order and with their original
        index. Rows with a missing year or month never match.
    """
    # A single vectorized mask replaces two groupby/filter passes that invoked
    # a Python lambda once per group only to test the group key for equality.
    opened_in_period = (dataset[CNAME_YEAR_OPENED] == year) & (dataset[CNAME_MONTH_OPENED] == month)

    return dataset[opened_in_period]

In [4]:
CNAME_SUBJECT_DESCRIPTION = "OZETBASLIK_ACIKLAMA" # concatenation of subject and description

def load(filePath):
    '''
    Load the issue dataset from a ';'-delimited CSV file.

    ISO-8859-9 encoding is used for characters specific to the Turkish language.

    Parameters
    ----------
    filePath
        Path (or any other source accepted by pandas.read_csv) of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The file contents with whitespace stripped from the column names and an
        extra CNAME_SUBJECT_DESCRIPTION column holding "<subject> <description>".
    '''
    dataset = pandas.read_csv(filePath, encoding="ISO-8859-9", delimiter=";")

    # remove the spaces from the start and end of column names
    dataset = dataset.rename(columns=lambda x: x.strip())

    # Concatenate subject and description in one column. Missing cells are
    # replaced by an empty string first: astype(str) alone would turn them into
    # the literal word "nan", which would then be learned as a text feature.
    subject = dataset[CNAME_SUBJECT].fillna('').astype(str)
    description = dataset[CNAME_DESCRIPTION].fillna('').astype(str)
    dataset[CNAME_SUBJECT_DESCRIPTION] = subject + ' ' + description

    return dataset

In [5]:
def selectTrainingDatasetRecords(dataset, train_year, train_month_list):
    """
    Build the training subset of the issue records.

    Keeps only rows whose record type equals FILTER_ISSUE_TYPE and whose status
    equals FILTER_ISSUE_STATUS, then collects, month by month, the rows opened in
    train_year during each entry of train_month_list. The per-month selections are
    concatenated in the order the months are listed.
    """
    is_wanted_type = dataset[CNAME_RECORD_TYPE] == FILTER_ISSUE_TYPE
    is_wanted_status = dataset[CNAME_STATUS] == FILTER_ISSUE_STATUS
    resolved_issues = dataset[is_wanted_type & is_wanted_status]

    # one frame per requested month, stacked in list order
    monthly_frames = [
        selectRecordsOpenedAtYearMonth(resolved_issues, train_year, month_name)
        for month_name in train_month_list
    ]

    return pandas.concat(monthly_frames)

In [6]:
def selectTestDatasetRecords(dataset, test_year, test_month):
    """
    Build the test subset of the issue records.

    Selects the rows opened in test_year / test_month and, of those, keeps only
    the rows whose record type equals FILTER_ISSUE_TYPE and whose status equals
    FILTER_ISSUE_STATUS.
    """
    # restrict to the requested period first
    opened_in_period = selectRecordsOpenedAtYearMonth(dataset, test_year, test_month)

    # then apply the record-type and status filters
    is_wanted_type = opened_in_period[CNAME_RECORD_TYPE] == FILTER_ISSUE_TYPE
    is_wanted_status = opened_in_period[CNAME_STATUS] == FILTER_ISSUE_STATUS
    return opened_in_period[is_wanted_type & is_wanted_status]

In [7]:
def selectRecordsHavingAtLeastNValuesInColumn(dataset, columnName, min_number_of_distinct_values):
    """
    Keep the records whose columnName value occurs often enough in the dataset.

    A record is returned only if the value it holds in columnName appears in at
    least min_number_of_distinct_values records overall. Row order and index are
    preserved; records with a missing value in columnName are dropped.
    """
    # how many records carry each value (missing values are not counted)
    occurrences = dataset[columnName].value_counts()
    frequent_values = occurrences[occurrences >= min_number_of_distinct_values].index

    return dataset[dataset[columnName].isin(frequent_values)]

In [8]:
# List of the Turkish stop words to be eliminated from the issue texts (see filterNoise).
stop_word_list = get_stop_words('turkish')
#print(stop_word_list)

# Turkish upper-case characters are lower-cased separately, before str.lower() is applied:
# str.lower() is locale-independent and would, for example, turn 'I' into 'i' instead of
# the Turkish dotless 'ı'. Keys are code points, as required by str.translate().
lower_map_turkish = {
    ord(u'I'): u'ı',
    ord(u'İ'): u'i',
    ord(u'Ç'): u'ç',
    ord(u'Ş'): u'ş',
    ord(u'Ö'): u'ö',
    ord(u'Ü'): u'ü',
    ord(u'Ğ'): u'ğ'
}

def filterNoise(text):
    """
    Normalise an issue text for vectorization.

    Steps, in order:
      1. Turkish upper-case letters are mapped to lower case via lower_map_turkish.
      2. The remaining characters are lower-cased with str.lower().
      3. Every run of characters matched by the regex class \\W (anything that is
         not a letter, digit or underscore) is replaced by a single space.
      4. Words contained in stop_word_list are removed.

    Parameters
    ----------
    text : str
        The raw text.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Turkish-specific mapping first, then generic lower-casing
    text_tr = text.translate(lower_map_turkish)

    # Raw string literal: in a normal string '\W' is an invalid escape sequence,
    # which Python reports as a DeprecationWarning / SyntaxWarning.
    lowerText = re.sub(r'[\W]+', ' ', text_tr.lower())

    # remove stopwords
    noStopWordsText = [word for word in lowerText.split() if word not in stop_word_list]

    return ' '.join(noStopWordsText)

In [9]:
# Ask for the path of the CSV file holding the issue records; it is passed to load() below.
inputFileName = input("Please enter the name of the csv dataset to read:")

Please enter the name of the csv dataset to read:issues.csv


In [10]:
# Load the whole CSV into a DataFrame; load() also adds the combined
# subject + description column that is later used as the text input.
entireDataset = load(inputFileName)
# Number of records read, before any filtering is applied.
print("Entire dataset length: " + str(len(entireDataset)))

Entire dataset length: 122750


In [11]:
# Training period: one year, plus month values read one per prompt until the
# sentinel "EXIT" is typed. The months are kept in the order they were entered.
train_year = int(input("Please enter the issue report year to be included in train dataset: "))
train_month_list = []
while True:
    train_month = str(input("Please enter the issue report month to be included in train dataset (EXIT to stop): "))
    if train_month == "EXIT":
        break
    train_month_list.append(train_month)

Please enter the issue report year to be included in train dataset: 2017
Please enter the issue report month to be included in train dataset (EXIT to stop): HAZİRAN
Please enter the issue report month to be included in train dataset (EXIT to stop): TEMMUZ
Please enter the issue report month to be included in train dataset (EXIT to stop): AĞUSTOS
Please enter the issue report month to be included in train dataset (EXIT to stop): EYLÜL
Please enter the issue report month to be included in train dataset (EXIT to stop): EKİM
Please enter the issue report month to be included in train dataset (EXIT to stop): KASIM
Please enter the issue report month to be included in train dataset (EXIT to stop): EXIT


In [12]:
# A team must occur in at least min_number_of_distinct_values training records;
# records of teams that occur fewer times are dropped from the training dataset.
min_number_of_distinct_values = int(input("Please enter the minimum number of times a team should occur in the train dataset: "))

Please enter the minimum number of times a team should occur in the train dataset: 10


In [13]:
# Training records: issues of the requested type/status opened in the chosen
# year and months, restricted to teams that occur often enough.
trainDataset = selectRecordsHavingAtLeastNValuesInColumn(
    selectTrainingDatasetRecords(entireDataset, train_year, train_month_list),
    CNAME_TEAMCODE,
    min_number_of_distinct_values,
)

In [14]:
# Text preprocessing: run filterNoise (lower-casing, non-word character removal,
# stop-word removal) on every combined subject + description text.
# Note that the column is overwritten with the cleaned text.
trainDataset[CNAME_SUBJECT_DESCRIPTION] = trainDataset[CNAME_SUBJECT_DESCRIPTION].apply(filterNoise)

In [15]:
# Sanity check: number of training records left after filtering.
print("Train dataset length : " + str(len(trainDataset)))
#print(trainDataset[CNAME_SUBJECT_DESCRIPTION].head(3))

Train dataset length : 36088


In [16]:
# Test period: a single year and a single month. The month is compared as typed
# against the month column, so it must be spelled exactly as it appears in the data.
test_year = int(input("Please enter the issue report year to be included in test dataset: "))
test_month = str(input("Please enter the issue report month to be included in test dataset: "))

Please enter the issue report year to be included in test dataset: 2017
Please enter the issue report month to be included in test dataset: ARALIK


In [17]:
# Test records: issues of the requested type/status opened in the chosen year and month.
# Unlike the training set, no minimum number of records per team is enforced here.
testDataset = selectTestDatasetRecords(entireDataset, test_year, test_month)

In [18]:
# Text preprocessing: apply the same filterNoise cleaning that was applied to the
# training texts. The column is overwritten with the cleaned text.
testDataset[CNAME_SUBJECT_DESCRIPTION] = testDataset[CNAME_SUBJECT_DESCRIPTION].apply(filterNoise)

In [19]:
# Sanity check: number of test records left after filtering.
print("Test length: " + str(len(testDataset)))
#print(testDataset[CNAME_SUBJECT_DESCRIPTION].head(3))

Test length: 7243


In [20]:
# Training inputs and targets:
#   X_train - the preprocessed subject + description texts
#   Y_train - the team code of each record (the class to predict)
X_train = trainDataset[CNAME_SUBJECT_DESCRIPTION].values
Y_train = trainDataset[CNAME_TEAMCODE].values

In [21]:
# Tf-idf conversion for the training dataset, using unigrams and bigrams (ngram_range=(1, 2)).
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
# fit_transform learns the vocabulary and IDF weights from the training texts
# and returns the training texts as a tf-idf matrix.
X_tfidf_train = vectorizer.fit_transform(X_train)
# Keep the learned term -> column-index mapping.
voc = vectorizer.vocabulary_

In [22]:
# Test inputs and targets:
#   X_test - the preprocessed subject + description texts
#   Y_test - the team code actually assigned to each record
X_test = testDataset[CNAME_SUBJECT_DESCRIPTION].values
Y_test = testDataset[CNAME_TEAMCODE].values

In [23]:
# Tf-idf conversion for the test dataset.
# Reuse the vectorizer that was fitted on the training texts and only *transform*
# the test texts. Fitting a second vectorizer on X_test (even with the training
# vocabulary) would re-estimate the IDF weights from the test documents, so the
# test vectors - and any pipeline built from `vectorizer` afterwards - would be
# weighted differently from the vectors the classifier is trained on.
X_tfidf_test = vectorizer.transform(X_test)

In [24]:
# Specify the algorithms: a linear SVM wrapped in a probability calibrator, so that
# the fitted model offers the predict_proba method that is used for the explanations.
# NOTE(review): no random_state is set on LinearSVC - results may vary between runs; confirm.
LinSvc = LinearSVC()
CLinSvc = CalibratedClassifierCV(LinSvc)

In [25]:
# Fit the calibrated classifier on the tf-idf matrix of the training texts,
# with the team codes as target classes.
CLinSvc.fit(X_tfidf_train, Y_train)



CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0, class_weight=None,
                                                dual=True, fit_intercept=True,
                                                intercept_scaling=1,
                                                loss='squared_hinge',
                                                max_iter=1000,
                                                multi_class='ovr', penalty='l2',
                                                random_state=None, tol=0.0001,
                                                verbose=0),
                       cv='warn', method='sigmoid')

In [26]:
# Explain the instances
# Chain the vectorizer and the calibrated classifier so that LIME can call
# predict_proba directly on raw (preprocessed) text strings.
c = make_pipeline(vectorizer, CLinSvc)
class_names = c.classes_
explainer = LimeTextExplainer(class_names=class_names)
top_n_labels = 3 # Top n recommendations to explain

# Position of the record within X_test (0-based), not an issue identifier.
testIssueNumber = int(input("Please enter the row number of the issue to be explained: "))
exp = explainer.explain_instance(X_test[testIssueNumber], c.predict_proba, num_features=6, top_labels=top_n_labels)

# Starting from the best prediction, plot one explanation per explained label.
# Iterating over the labels LIME actually returned (instead of indexing
# 0..top_n_labels-1) avoids an IndexError when the classifier knows fewer
# than top_n_labels classes.
for label in exp.available_labels():
    # The following line will plot the explanation
    exp.as_pyplot_figure(label=label)

Please enter the row number of the issue to be explained: 0
