In [1]:
########################################################################################
# Analysis script that uses single textual source only for issue report classification #
# Section 5.2 Single-Source Approaches using Textual Information Only                  #
########################################################################################
import re
import pandas
import numpy as np
from time import gmtime, strftime, time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn import model_selection
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from stop_words import get_stop_words

In [2]:
# The issues have been downloaded and saved as a csv file. 
# The following are the column names in the input data file, change them according to your file specifications 
CNAME_TEAMCODE = "TEAMCODE"
CNAME_SUBJECT = "SUBJECT"
CNAME_DESCRIPTION = "DESCRIPTION"
CNAME_RECORD_TYPE = "RECORDTYPE"
CNAME_STATUS = "STATUS"
CNAME_YEAR_OPENED = "CREATIONYEAR" # year of creation of issue reports
CNAME_MONTH_OPENED = "CREATIONMONTH" # month of creation of issue reports
# filtering specifications 
FILTER_ISSUE_TYPE = 'ISSUE'
FILTER_ISSUE_STATUS = 'CLOSED'

In [3]:
def selectRecordsOpenedAtYearMonth(dataset, year, month):
    """
    returns the issue records belonging to the input year and month
    """
    d1 = dataset.groupby(CNAME_YEAR_OPENED).filter(lambda x: x.name == year)

    return d1.groupby(CNAME_MONTH_OPENED).filter(lambda x: x.name == month)

In [4]:
CNAME_SUBJECT_DESCRIPTION = "SUBJECT_DESCRIPTION" # concatenation of subject and description

def load(filePath):
    '''
    load the dataset, ISO-8859-9 encoding is used for characters specific to Turkish language.
    '''
    dataset = pandas.read_csv(filePath, encoding="ISO-8859-9", delimiter=";") 

    # remove the spaces from the start and end of column names
    dataset.rename(columns=lambda x: x.strip(), inplace=True)

    # concatenate subject and description in one column
    dataset[CNAME_SUBJECT_DESCRIPTION] = dataset[CNAME_SUBJECT].astype(str) + ' ' + dataset[CNAME_DESCRIPTION].astype(str)

    return dataset

In [5]:
def selectTrainingDatasetRecords(dataset, train_year, train_month_list):
    """
    filter issue records from the training dataset such that 
     * unresolved are eliminated and 
     * they are opened at specific time intervals

    """
    dataset = dataset[(dataset[CNAME_RECORD_TYPE] == FILTER_ISSUE_TYPE) &
                      (dataset[CNAME_STATUS] == FILTER_ISSUE_STATUS)]
    # # select year and month
    frames = []
    for train_month in train_month_list:
        frames.append(selectRecordsOpenedAtYearMonth(dataset, train_year, train_month))

    dataset = pandas.concat(frames)
    return dataset

In [6]:
def selectTestDatasetRecords(dataset, test_year, test_month):
    """
    filter issue records from the test dataset such that 
     * unresolved are eliminated and 
     * they are opened at specific time intervals

    """
    # select year and month
    frames = [selectRecordsOpenedAtYearMonth(dataset, test_year, test_month)]
    dataset = pandas.concat(frames)
    return dataset[(dataset[CNAME_RECORD_TYPE] == FILTER_ISSUE_TYPE) & 
                   (dataset[CNAME_STATUS] == FILTER_ISSUE_STATUS)]

In [7]:
# list of the stop words to be eliminated from the issue dataset 
stop_word_list = get_stop_words('turkish')
#print(stop_word_list)

# Turkish upper-case characters are lower-cased seperately so as to be sure of them
lower_map_turkish = {
    ord(u'I'): u'ı',
    ord(u'İ'): u'i',
    ord(u'Ç'): u'ç',
    ord(u'Ş'): u'ş',
    ord(u'Ö'): u'ö',
    ord(u'Ü'): u'ü',
    ord(u'Ğ'): u'ğ'
}

def filterNoise(text):
    """
    converts words to lowercase, eliminates non-alphanumeric characters, eliminates stop-words
    """
    # Remove all non-alphanumeric characters from the text via the regex[\W]+,
    # Convert the text into lowercase characters
    text_tr = text.translate(lower_map_turkish)
    lowerText = re.sub('[\W]+', ' ', text_tr.lower())

    #remove stopwords
    noStopWordsText = [word for word in lowerText.split() if word not in stop_word_list]

    return ' '.join(noStopWordsText)

In [8]:
inputFileName = input("Please enter the name of the csv dataset to read:")

Please enter the name of the csv dataset to read:issues.csv


In [9]:
# load the dataset
entireDataset = load(inputFileName)
print("Entire dataset length: " + str(len(entireDataset)))

Entire dataset length: 3650


In [10]:
train_year = int(input("Please enter the issue report year to be included in train dataset: "))
train_month_list = []
train_month = ""
while (train_month != "EXIT"):
    train_month = str(input("Please enter the issue report month to be included in train dataset (EXIT to stop): "))
    if (train_month != "EXIT"):
        train_month_list.append(train_month)

Please enter the issue report year to be included in train dataset: 2019
Please enter the issue report month to be included in train dataset (EXIT to stop): JANUARY
Please enter the issue report month to be included in train dataset (EXIT to stop): FEBRUARY
Please enter the issue report month to be included in train dataset (EXIT to stop): MARCH
Please enter the issue report month to be included in train dataset (EXIT to stop): EXIT


In [11]:
# filter training issue records
trainDataset = selectTrainingDatasetRecords(entireDataset, train_year, train_month_list)

3650
1709


In [12]:
# text preprocessing
trainDataset[CNAME_SUBJECT_DESCRIPTION] = trainDataset[CNAME_SUBJECT_DESCRIPTION].apply(filterNoise)

In [13]:
# print to check training records
print("Train dataset length : " + str(len(trainDataset)))
#print(trainDataset[CNAME_SUBJECT_DESCRIPTION].head(3))

Train dataset length : 1709


In [14]:
test_year = int(input("Please enter the issue report year to be included in test dataset: "))
test_month = str(input("Please enter the issue report month to be included in test dataset: "))

Please enter the issue report year to be included in test dataset: 2019
Please enter the issue report month to be included in test dataset: APRIL


In [15]:
# filter test issue records
testDataset = selectTestDatasetRecords(entireDataset, test_year, test_month)

In [16]:
#text preprocessing
testDataset[CNAME_SUBJECT_DESCRIPTION] = testDataset[CNAME_SUBJECT_DESCRIPTION].apply(filterNoise)

In [17]:
# print to check test records
print("Test length: " + str(len(testDataset)))
#print(testDataset[CNAME_SUBJECT_DESCRIPTION].head(3))

Test length: 466


In [18]:
# specify the input textual data to train (X_train) and related classes (Y_train)
X_train = trainDataset[CNAME_SUBJECT_DESCRIPTION].values
Y_train = trainDataset[CNAME_TEAMCODE].values

In [19]:
vectorizer_sd = TfidfVectorizer(ngram_range=(1,2), min_df=2)
train_vectors_sd = vectorizer_sd.fit_transform(X_train)
voc_sd = vectorizer_sd.vocabulary_

In [20]:
# specify the input textual data to test (X_test) and related classes (Y_test)
X_test = testDataset[CNAME_SUBJECT_DESCRIPTION].values
Y_test = testDataset[CNAME_TEAMCODE].values

In [21]:
# Td-idf conversion for test dataset
vectorizer_test_sd = TfidfVectorizer(ngram_range=(1, 2), vocabulary=voc_sd)
X_tfidf_test = vectorizer_test_sd.fit_transform(X_test)

In [22]:
Svm = LinearSVC()
cSvm = CalibratedClassifierCV(Svm)
cSvm.fit(train_vectors_sd, Y_train)



CalibratedClassifierCV(base_estimator=LinearSVC())

In [23]:
predictions = cSvm.predict(X_tfidf_test)
print("#################################################")
print("################ SVM_t_1 Results ################")
print("#################################################")
print(accuracy_score(Y_test, predictions))
print(confusion_matrix(Y_test, predictions))
print(classification_report(Y_test, predictions, digits=5))

#################################################
################ SVM_t_1 Results ################
#################################################
0.7274678111587983
[[2 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 7 ... 0 0 0]
 ...
 [0 0 0 ... 3 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
              precision    recall  f1-score   support

      UK0020    0.40000   1.00000   0.57143         2
      UK0025    0.50000   0.25000   0.33333         4
      UK0038    0.58333   0.70000   0.63636        10
      UK0066    0.00000   0.00000   0.00000         1
      UK0074    0.00000   0.00000   0.00000         0
      UK0100    0.00000   0.00000   0.00000         5
      UK0110    0.96552   0.93333   0.94915        30
      UK0125    1.00000   0.33333   0.50000         3
      UK0137    0.71429   0.83333   0.76923        18
      UK0167    1.00000   0.50000   0.66667         2
      UK0169    0.00000   0.00000   0.00000         1
      UK0172    0.00000   0.00000   0.00000         0
      UK

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
from keras import Input, Model
from keras.layers import Embedding, Dense, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout, concatenate;
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.base import BaseEstimator, TransformerMixin
from keras.utils import np_utils
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

In [49]:
def get_model_1Channel(maxlen, max_features, embedding_dims, class_num, last_activation):
    ###########################################################
    # Returns the CNN model with single channel               #
    # maxlen: Length of input sequences, when it is constant. #
    #         This argument is required if you are going to   #
    #         connect Flatten then Dense layers upstream      #
    # max_features: size of the vocabulary                    #
    # embedding_dims: dimension of the dense embedding        #
    # class_num: number of classes (dimension of the keras    #
    #         Dense output layer)                             #
    # last_activation: Activation function to use for the     #
    #         keras Dense layer                               #
    ###########################################################
    input = Input((maxlen,))
    # Word embeddings
    embedding = Embedding(max_features, embedding_dims, 
                          input_length=maxlen, trainable=True)(input)
    # Convolutional layer and Max Pooling
    convs = []
    for kernel_size in [2, 3, 4]:
        c = Conv1D(128, kernel_size, activation='relu')(embedding)
        c = Dropout(0.5)(c)
        c = GlobalMaxPooling1D()(c)
        convs.append(c)
    x = Concatenate()(convs)
    # Fully connected layer
    output = Dense(class_num, activation=last_activation)(x)
    model = Model(inputs=input, outputs=output)
    return model

In [50]:
class TextTransformer(BaseEstimator, TransformerMixin):
    # Class that transforma text data before CNN classification
    def __init__(self): 
        # max_features: 
        self.max_features = 50000  # the maximum number of words to keep, based on word frequency.
        self.max_length = 400      # maximum length of all sequences
        # create the tokenizer
        self.tokenizer = Tokenizer(num_words=self.max_features)
        
    def fit(self, X_train):
        # fit the tokenizer on the documents
        self.tokenizer.fit_on_texts(X_train)
            
    def transform(self, X_test):
        # sequence encode
        encoded_docs = self.tokenizer.texts_to_sequences(X_test)
        # pad sequences
        X_test_padded = pad_sequences(encoded_docs, maxlen=self.max_length, padding='post')

        return X_test_padded

In [51]:
train_tags = trainDataset[CNAME_TEAMCODE]
test_tags = testDataset[CNAME_TEAMCODE]
df = pandas.concat([trainDataset[CNAME_TEAMCODE], testDataset[CNAME_TEAMCODE]], axis=0)
num_classes = df.nunique()
print(num_classes)

# Encode target labels with value between 0 and n_classes-1
encoder = LabelEncoder()
encoder.fit(df)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

# Convert class vector (integers) to binary class matrix.
y_train = np_utils.to_categorical(y_train, num_classes)
y_test = np_utils.to_categorical(y_test, num_classes)
#print(y_train[0])
#print(y_test[0])

122


In [54]:
X_train = trainDataset[CNAME_SUBJECT_DESCRIPTION].values
vectorizer = TextTransformer()
vectorizer.fit(X_train)

X_test = testDataset[CNAME_SUBJECT_DESCRIPTION].values

# pad sequences
X_train_padded = vectorizer.transform(X_train)
X_test_padded = vectorizer.transform(X_test)

# define vocabulary size (largest integer value)
vocab_size = len(vectorizer.tokenizer.word_index) + 1
print(vocab_size)

embedding_dims = 300
max_length = 400
last_activation='softmax'
batch_size = 32
epochs = 2

13330


In [56]:
model_1channel = get_model_1Channel(max_length, vocab_size, embedding_dims, 
                                    class_num=num_classes, last_activation=last_activation)
model_1channel.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model_1channel.fit(X_train_padded, y_train, 
                   batch_size=batch_size, 
                   epochs=epochs, 
                   validation_data=(X_test_padded, y_test))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1d897d21e80>

In [57]:
pred_proba = model_1channel.predict(X_test_padded)
print("#################################################")
print("################ CNN_t_1 Results ################")
print("#################################################")

predictions = []
for proba in pred_proba:
    proba_list = list(proba)
    index = proba_list.index(max(proba_list))
    predictions.append(index)

Y_validation = []
for y in y_test:
    y_list = list(y)
    index = y_list.index(max(y_list))
    Y_validation.append(index)

print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions, digits=5))

#################################################
################ CNN_t_1 Results ################
#################################################
0.5
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000         2
           1    0.00000   0.00000   0.00000         4
           2    0.00000   0.00000   0.00000        10
           4    0.00000   0.00000   0.00000         1
           6    0.00000   0.00000   0.00000         5
           8    0.82353   0.93333   0.87500        30
           9    0.00000   0.00000   0.00000         3
          10    0.73684   0.77778   0.75676        18
          12    0.00000   0.00000   0.00000         2
          13    0.00000   0.00000   0.00000         1
          16    0.00000   0.00000   0.00000         7
          17    0.00000   0.00000   0.00000         6
          18    0.00000

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
