In [1]:
########################################################################################
# Analysis script that uses visual source only for issue report classification         #
# Section 5.3 Single-Source Approaches using Attachments Only                          #
########################################################################################
import re
import pandas
import numpy as np
from time import gmtime, strftime, time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn import model_selection
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from stop_words import get_stop_words

In [2]:
# Column names of the csv export of the issue tracker.
# Adapt the values on the right-hand side to the header of your own file.

# class label: the team an issue report is assigned to
CNAME_TEAMCODE = "TEAMCODE"
# text extracted beforehand (OCR) from the attached images
CNAME_ATTACHMENT_TEXT = "OCR_DATA"

# columns used to restrict the records, and the values that are kept
CNAME_RECORD_TYPE = "RECORDTYPE"
FILTER_ISSUE_TYPE = 'ISSUE'
CNAME_STATUS = "STATUS"
FILTER_ISSUE_STATUS = 'CLOSED'

# creation date of the issue report, split into year and month
CNAME_YEAR_OPENED = "CREATIONYEAR"
CNAME_MONTH_OPENED = "CREATIONMONTH"

In [3]:
def selectRecordsOpenedAtYearMonth(dataset, year, month):
    """
    returns the issue records belonging to the input year and month

    dataset: DataFrame holding the CNAME_YEAR_OPENED and CNAME_MONTH_OPENED columns
    year, month: values compared with == against those two columns. They must have the
                 same type as the column values (an int never equals a str); a type
                 mismatch silently selects nothing.

    The matching rows are returned in their original order.
    """
    # A single vectorized mask replaces two groupby/filter passes that called a
    # Python lambda once per group; the selected rows are the same.
    opened_in_period = ((dataset[CNAME_YEAR_OPENED] == year) &
                        (dataset[CNAME_MONTH_OPENED] == month))

    return dataset[opened_in_period]

In [4]:
def load(filePath):
    '''
    Read the semicolon-delimited csv file at filePath into a DataFrame.

    The file is decoded as ISO-8859-9 so that the characters specific to the Turkish
    language are read correctly. Column names are returned without leading or
    trailing spaces.
    '''
    raw = pandas.read_csv(filePath, delimiter=";", encoding="ISO-8859-9")

    # map every header to its whitespace-trimmed form
    trimmed_names = {name: name.strip() for name in raw.columns}

    return raw.rename(columns=trimmed_names)

In [5]:
def selectTrainingDatasetRecords(dataset, train_year, train_month_list):
    """
    filter issue records from the training dataset such that
     * only records of type FILTER_ISSUE_TYPE with status FILTER_ISSUE_STATUS are kept
       (unresolved ones are eliminated) and
     * they are opened in train_year, in one of the months of train_month_list

    The result is the concatenation of the per-month selections, in the order of
    train_month_list (a month listed twice contributes its records twice).
    An empty train_month_list yields an empty DataFrame with the same columns.
    """
    closed_issues = dataset[(dataset[CNAME_RECORD_TYPE] == FILTER_ISSUE_TYPE) &
                            (dataset[CNAME_STATUS] == FILTER_ISSUE_STATUS)]

    # one selection per requested month
    frames = [selectRecordsOpenedAtYearMonth(closed_issues, train_year, train_month)
              for train_month in train_month_list]

    if not frames:
        # pandas.concat raises ValueError on an empty list; return "no records" instead
        return closed_issues.iloc[0:0].copy()

    return pandas.concat(frames)

In [6]:
def selectTestDatasetRecords(dataset, test_year, test_month):
    """
    filter issue records from the test dataset such that
     * only records of type FILTER_ISSUE_TYPE with status FILTER_ISSUE_STATUS are kept
       (unresolved ones are eliminated) and
     * they are opened in test_year / test_month

    Returns an independent copy, so the caller can add or overwrite columns
    without touching (or being warned about) the source DataFrame.
    """
    # select year and month
    opened_in_period = selectRecordsOpenedAtYearMonth(dataset, test_year, test_month)

    closed_issues = opened_in_period[
        (opened_in_period[CNAME_RECORD_TYPE] == FILTER_ISSUE_TYPE) &
        (opened_in_period[CNAME_STATUS] == FILTER_ISSUE_STATUS)]

    # explicit copy: a boolean-mask selection is flagged by pandas as a slice of its
    # parent, and assigning a column to it raises SettingWithCopyWarning
    return closed_issues.copy()

In [7]:
# Stop words that filterNoise removes from the issue texts;
# replace this with your own list if you prefer.
stop_word_list = get_stop_words('turkish')

# Turkish upper-case characters are lower-cased separately (via str.translate, before
# str.lower() is applied) so as to be sure of them.
lower_map_turkish = {
    ord(upper): lower
    for upper, lower in (
        (u'I', u'ı'),
        (u'İ', u'i'),
        (u'Ç', u'ç'),
        (u'Ş', u'ş'),
        (u'Ö', u'ö'),
        (u'Ü', u'ü'),
        (u'Ğ', u'ğ'),
    )
}

def filterNoise(text):
    """
    converts words to lowercase, replaces every run of non-word characters (regex \\W)
    with a single space and eliminates stop-words

    text: the raw text. A value without string methods (e.g. a float NaN standing in
          for a missing cell) is treated as empty text.

    Returns the remaining words joined by single spaces ('' if nothing remains).
    """
    try:
        # Turkish capitals first, then the generic lower-casing
        text_tr = text.translate(lower_map_turkish)
        # raw string: '\W' inside a normal string literal is an invalid escape sequence
        # (DeprecationWarning / SyntaxWarning); the pattern itself is unchanged
        lowerText = re.sub(r'[\W]+', ' ', text_tr.lower())
    except AttributeError:
        # not a string -> nothing to clean
        lowerText = ""

    #remove stopwords
    noStopWordsText = [word for word in lowerText.split() if word not in stop_word_list]

    return ' '.join(noStopWordsText)

In [8]:
# Ask for the path of the csv file that contains the issue records.
inputFileName = input("Please enter the name of the csv dataset to read:")

In [9]:
# load the dataset (all records, before any filtering) and report its size
entireDataset = load(inputFileName)
print("Entire dataset length: " + str(len(entireDataset)))

In [10]:
# Training period: one year, then as many months as the user enters before typing EXIT.
# NOTE(review): the months are kept as strings while the year is converted to int; the
# month filter compares with ==, so this presumably relies on the month column holding
# strings - confirm against the csv.
train_year = int(input("Please enter the issue report year to be included in train dataset: "))
train_month_list = []
while True:
    train_month = str(input("Please enter the issue report month to be included in train dataset (EXIT to stop): "))
    if train_month == "EXIT":
        break
    train_month_list.append(train_month)

In [11]:
# filter training issue records: keep the records selected by
# selectTrainingDatasetRecords for train_year and the months in train_month_list
trainDataset = selectTrainingDatasetRecords(entireDataset, train_year, train_month_list)
#print(trainDataset[CNAME_ATTACHMENT_TEXT].head(30))

In [12]:
# text preprocessing: clean every attachment text with filterNoise;
# the column is overwritten with the cleaned text
trainDataset[CNAME_ATTACHMENT_TEXT] = trainDataset[CNAME_ATTACHMENT_TEXT].apply(filterNoise)

In [13]:
# print the number of training records as a sanity check
print("Train dataset length : " + str(len(trainDataset)))
#print(trainDataset[CNAME_ATTACHMENT_TEXT].head(3))

In [14]:
# Test period: a single year and month.
# NOTE(review): the month is kept as a string while the year is converted to int; the
# month filter compares with ==, so this presumably relies on the month column holding
# strings - confirm against the csv.
test_year = int(input("Please enter the issue report year to be included in test dataset: "))
test_month = str(input("Please enter the issue report month to be included in test dataset: "))

In [15]:
# filter test issue records: keep the records selected by
# selectTestDatasetRecords for test_year / test_month
testDataset = selectTestDatasetRecords(entireDataset, test_year, test_month)

In [16]:
# text preprocessing: same cleaning (filterNoise) as applied to the training texts;
# the column is overwritten with the cleaned text
testDataset[CNAME_ATTACHMENT_TEXT] = testDataset[CNAME_ATTACHMENT_TEXT].apply(filterNoise)

In [17]:
# print the number of test records as a sanity check
print("Test length: " + str(len(testDataset)))
#print(testDataset[CNAME_ATTACHMENT_TEXT].head(3))

In [18]:
# specify the input textual data to train (X_train: cleaned attachment texts)
# and the related classes (Y_train: team codes)
X_train = trainDataset[CNAME_ATTACHMENT_TEXT].values
Y_train = trainDataset[CNAME_TEAMCODE].values

In [19]:
# Unigram TF-IDF features fitted on the training texts.
# min_df / max_df limit the vocabulary by document frequency: a term has to occur in
# at least 100 training documents and in at most half of them.
vectorizer_a = TfidfVectorizer(ngram_range=(1,1), min_df=100, max_df=0.5)
train_vectors_a = vectorizer_a.fit_transform(X_train)
# vocabulary learned from the training texts (term -> feature index)
voc_a = vectorizer_a.vocabulary_

In [20]:
# specify the input textual data to test (X_test: cleaned attachment texts)
# and the related classes (Y_test: team codes)
X_test = testDataset[CNAME_ATTACHMENT_TEXT].values
Y_test = testDataset[CNAME_TEAMCODE].values

In [21]:
# TF-IDF conversion for the test dataset.
# The vectorizer fitted on the training texts is reused: fitting a second vectorizer on
# X_test (even with the training vocabulary) recomputes the IDF weights from the test
# set, so the test features would be weighted differently from the features the
# classifier was trained on.
vectorizer_test_a = vectorizer_a  # name kept for code that still refers to it
X_tfidf_test = vectorizer_test_a.transform(X_test)

In [22]:
#################################################
#       Test and Train with SVM_a_1  Model      #
#################################################

# Train with the LinearSVC model, calibrate to get probability results.
# The calibrated wrapper (cSvm) is the object that is fitted on the training
# TF-IDF vectors and used for prediction afterwards.
Svm = LinearSVC()
cSvm = CalibratedClassifierCV(Svm)
cSvm.fit(train_vectors_a, Y_train)

In [23]:
# prediction results on the test TF-IDF vectors:
# accuracy, confusion matrix and per-class report against the true team codes
predictions = cSvm.predict(X_tfidf_test)
print("#################################################")
print("################ SVM_a_1 Results ################")
print("#################################################")
print(accuracy_score(Y_test, predictions))
print(confusion_matrix(Y_test, predictions))
print(classification_report(Y_test, predictions, digits=5))

In [24]:
#################################################
#       Test and Train with CNN_a_1  Model      #
#################################################

from keras import Input, Model
from keras.layers import Embedding, Dense, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout, concatenate;
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.base import BaseEstimator, TransformerMixin
from keras.utils import np_utils
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

In [25]:
def get_model_1Channel(maxlen, max_features, embedding_dims, class_num, last_activation):
    """
    Returns the (uncompiled) CNN model with a single channel.

    maxlen: length of the input sequences (constant for all inputs)
    max_features: size of the vocabulary
    embedding_dims: dimension of the dense embedding
    class_num: number of classes (dimension of the keras Dense output layer)
    last_activation: activation function to use for the keras Dense output layer
    """
    token_ids = Input((maxlen,))
    # Word embeddings, trained together with the rest of the network
    embedded = Embedding(max_features, embedding_dims,
                         input_length=maxlen, trainable=True)(token_ids)

    def conv_branch(kernel_size):
        # convolution -> dropout -> max over the sequence, for one kernel width
        features = Conv1D(128, kernel_size, activation='relu')(embedded)
        features = Dropout(0.5)(features)
        return GlobalMaxPooling1D()(features)

    # one branch per kernel width, merged into a single feature vector
    pooled = [conv_branch(kernel_size) for kernel_size in [2, 3, 4]]
    merged = Concatenate()(pooled)

    # Fully connected output layer
    class_scores = Dense(class_num, activation=last_activation)(merged)
    return Model(inputs=token_ids, outputs=class_scores)

In [26]:
class TextTransformer(BaseEstimator, TransformerMixin):
    """
    Transforms text data into padded integer sequences before CNN classification.

    max_features: the maximum number of words to keep, based on word frequency
                  (passed to the keras Tokenizer as num_words)
    max_length:   length of every returned sequence

    The fitted keras Tokenizer is available as the `tokenizer` attribute.
    """
    def __init__(self, max_features=50000, max_length=400):
        self.max_features = max_features
        self.max_length = max_length
        # create the tokenizer
        self.tokenizer = Tokenizer(num_words=self.max_features)

    def fit(self, X_train, y=None):
        """
        Fit the tokenizer on the documents in X_train.

        y is ignored; it is accepted so that the estimator can be used where a
        fit(X, y) signature is expected. Returns self, which fit_transform()
        (inherited from TransformerMixin) relies on.
        """
        self.tokenizer.fit_on_texts(X_train)
        return self

    def transform(self, X_test):
        """
        Encode the documents in X_test as sequences of word indices and bring all of
        them to max_length (shorter sequences are padded at the end).
        """
        # sequence encode
        encoded_docs = self.tokenizer.texts_to_sequences(X_test)
        # pad sequences
        return pad_sequences(encoded_docs, maxlen=self.max_length, padding='post')

In [27]:
# Team codes of both splits. The encoder is fitted on their union, so a team that
# occurs in only one of the splits still gets an index.
train_tags = trainDataset[CNAME_TEAMCODE]
test_tags = testDataset[CNAME_TEAMCODE]
df = pandas.concat([train_tags, test_tags], axis=0)
num_classes = df.nunique()
print(num_classes)

# Encode target labels with value between 0 and n_classes-1
encoder = LabelEncoder()
encoder.fit(df)

# Convert each class vector (integers) to a binary class matrix (one-hot rows).
y_train = np_utils.to_categorical(encoder.transform(train_tags), num_classes)
y_test = np_utils.to_categorical(encoder.transform(test_tags), num_classes)

In [28]:
# cleaned attachment texts of the training split; the tokenizer is fitted on them only
X_train = trainDataset[CNAME_ATTACHMENT_TEXT].values
vectorizer = TextTransformer()
vectorizer.fit(X_train)

X_test = testDataset[CNAME_ATTACHMENT_TEXT].values

# encode the texts as integer sequences and pad them
X_train_padded = vectorizer.transform(X_train)
X_test_padded = vectorizer.transform(X_test)

# define vocabulary size (largest integer value)
vocab_size = len(vectorizer.tokenizer.word_index) + 1
print(vocab_size)

# hyper-parameters of the CNN
embedding_dims = 300
# taken from the transformer so that the model input length cannot drift away from
# the length the sequences were padded to
max_length = vectorizer.max_length
last_activation='softmax'
batch_size = 32
epochs = 2

In [29]:
# Build the single-channel CNN and compile it for multi-class training
# (categorical cross-entropy on the one-hot encoded labels).
model_1channel = get_model_1Channel(max_length, vocab_size, embedding_dims, 
                                    class_num=num_classes, last_activation=last_activation)
model_1channel.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE: the test split is passed as validation_data, so the validation metrics
# reported during training are computed on the test set.
model_1channel.fit(X_train_padded, y_train, 
                   batch_size=batch_size, 
                   epochs=epochs, 
                   validation_data=(X_test_padded, y_test))

In [30]:
# class scores of the CNN for every test sample (one row per sample)
pred_proba = model_1channel.predict(X_test_padded)
print("#################################################")
print("################ CNN_a_1 Results ################")
print("#################################################")

# predicted class = index of the highest score in each row
# (np.argmax returns the first maximum, like list.index(max(...)))
predictions = np.argmax(pred_proba, axis=1).tolist()

# y_test is one-hot encoded; recover the integer class index of every test sample
Y_validation = np.argmax(y_test, axis=1).tolist()

print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions, digits=5))

In [31]:
#################################################
#       Test and Train with VGG_a_1  Model      #
#################################################
from matplotlib import pyplot
from keras.applications.vgg16 import VGG16
from keras.models import Model, Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten
from tensorflow.keras.optimizers import SGD
from keras.preprocessing.image import ImageDataGenerator
from PIL import ImageFile
# PIL setting: allow truncated (incomplete) image files to be loaded
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [32]:
# define VGG model
def define_VGG_model(class_mode, num_classes):
    """
    Returns a compiled image classifier: the VGG16 layers (without the top
    classifier, input 224x224x3) with all of them frozen, followed by a new
    Flatten -> Dense(128, relu) -> Dense(num_classes, softmax) head.

    class_mode: 'categorical' or 'binary'; selects the loss '<class_mode>_crossentropy'
    num_classes: number of units of the softmax output layer

    NOTE(review): the output layer is always a softmax over num_classes units; whether
    that is intended for class_mode='binary' should be confirmed.
    """
    # load model
    base = VGG16(include_top=False, input_shape=(224, 224, 3))
    # mark loaded layers as not trainable
    for layer in base.layers:
        layer.trainable = False
    # add new classifier layers
    flat1 = Flatten()(base.layers[-1].output)
    class1 = Dense(128, activation='relu', kernel_initializer='he_uniform')(flat1)
    output = Dense(num_classes, activation='softmax')(class1)
    # define new model
    model = Model(inputs=base.inputs, outputs=output)
    # compile model; `learning_rate` replaces the deprecated `lr` argument
    opt = SGD(learning_rate=0.001, momentum=0.9)
    model.compile(optimizer=opt, loss=class_mode+'_crossentropy', metrics=['accuracy'])
    return model

In [33]:
# Evaluate the model
def train_test_with_images(train_data_path, test_data_path, class_mode, num_classes, epochs, VGG_flag):
    """
    Trains an image classifier on the images under train_data_path, evaluates it on the
    images under test_data_path and prints accuracy, confusion matrix and
    classification report. Nothing is returned.

    train_data_path, test_data_path: directories read with flow_from_directory
    class_mode: 'categorical' or 'binary' (used for the iterators and the loss)
    num_classes: number of units of the output layer
    epochs: number of training epochs
    VGG_flag: True  -> model from define_VGG_model
              False -> model from define_model
              NOTE(review): define_model is not defined in this notebook; it has to be
              provided elsewhere, otherwise this branch raises NameError.
    """
    # define model
    if VGG_flag:
        model = define_VGG_model(class_mode, num_classes)
    else:
        model = define_model(class_mode, num_classes)
    # create data generator
    datagen = ImageDataGenerator(featurewise_center=True)
    # specify imagenet mean values for centering
    datagen.mean = [123.68, 116.779, 103.939]
    # prepare iterators; the test iterator is not shuffled so that the order of the
    # predictions matches test_it.classes
    train_it = datagen.flow_from_directory(train_data_path, class_mode=class_mode,
                                           batch_size=64, target_size=(224, 224))
    test_it = datagen.flow_from_directory(test_data_path, class_mode=class_mode, shuffle=False,
                                          batch_size=64, target_size=(224, 224))
    # fit model (fit / evaluate / predict accept generators directly; the *_generator
    # variants are deprecated)
    model.fit(train_it, steps_per_epoch=len(train_it),
              validation_data=test_it, validation_steps=len(test_it), epochs=epochs, verbose=0)

    # evaluate model
    _, acc = model.evaluate(test_it, steps=len(test_it), verbose=0)
    print('> %.3f' % (acc * 100.0))

    #Confusion Matrix and Classification Report
    Y_pred = model.predict(test_it, verbose=0)
    print(Y_pred)
    # predicted class = index of the highest score per sample
    y_pred = np.argmax(Y_pred, axis=1)
    print(y_pred)
    print("=================")
    print(test_it.classes)
    print('Confusion Matrix')
    print(confusion_matrix(test_it.classes, y_pred))
    print('Classification Report')
    print(classification_report(test_it.classes, y_pred, digits=5))
    print('Accuracy Score')
    print(accuracy_score(test_it.classes, y_pred))

In [34]:
print("#################################################")
print("################ VGG_a_1 Results ################")
print("#################################################")

# Training and test images should be placed in the following folders, organized
# according to the related classes (in other words, the teams they are assigned to).
train_data_path = 'train_images'
test_data_path = 'test_images'
# num_classes has been set before (from the team codes in the csv), reset if necessary
# NOTE(review): it presumably has to equal the number of classes found in the image
# folders - confirm.
class_mode = 'categorical' #binary or categorical
# replaces the value of `epochs` that was set for the CNN
epochs = 10
train_test_with_images(train_data_path, test_data_path, class_mode, num_classes, epochs, VGG_flag=True)