In [1]:
import pandas
import re
import jpype
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from numpy import mean
from numpy import std
from sklearn.utils import class_weight
import numpy as np
import ktrain

In [2]:
def load(filePath):
    """Read the semicolon-separated issue report file into a DataFrame.

    The file is decoded as ISO-8859-9 and every column name is returned with
    its leading/trailing whitespace removed.

    filePath: path of the CSV file to read.
    Returns the loaded pandas DataFrame.
    """
    rawFrame = pandas.read_csv(filePath, delimiter=";", encoding="ISO-8859-9")

    # Header cells may carry padding blanks; strip them so lookups by name work.
    return rawFrame.rename(columns=lambda columnName: columnName.strip())

def filterProject(dataset, projectCode, projectColumn=None):
    """Split the dataset into "all other projects" and "the given project".

    The former stray ``self`` parameter was dropped: this is a module-level
    function and it is called as ``filterProject(dataset, projectCode)``.

    dataset: DataFrame holding one row per issue report.
    projectCode: value identifying the project to separate out.
    projectColumn: name of the column holding the project code. When omitted,
        the module-level ``CNAME_PROJECT`` constant is used.
        NOTE(review): CNAME_PROJECT is not defined in the visible cells -
        confirm it is defined before this function is called without
        ``projectColumn``.

    Returns a tuple ``(datasetO, datasetP)``: the rows of every other project,
    followed by the rows of ``projectCode``.
    """
    if projectColumn is None:
        projectColumn = CNAME_PROJECT

    belongsToProject = dataset[projectColumn] == projectCode
    datasetP = dataset[belongsToProject]
    datasetO = dataset[~belongsToProject]

    return datasetO, datasetP

In [3]:
zemberek = None  # replaced below by the Zemberek analyzer object

# ====================================================================================================++
# Start JVM
# Change the address below according to the java version and the location of the jar file on your computer.
# JPype refuses to start a second JVM in the same process, so the start-up is
# skipped when the cell is re-run in a kernel where the JVM is already running.
if not jpype.isJVMStarted():
    jpype.startJVM(".../jvm.dll",
                   "-Djava.class.path=zemberek-tum-2.0.jar", "-ea")

# set the language as Turkish (as spoken in Turkey)
Tr = jpype.JClass("net.zemberek.tr.yapi.TurkiyeTurkcesi")
# tr object
tr = Tr()
# load the Zemberek class
Zemberek = jpype.JClass("net.zemberek.erisim.Zemberek")
# Java class handle; it is loaded here but not used by the visible cells.
ZemberekC = jpype.JClass("net.zemberek.araclar.turkce.YaziIsleyici")
# zemberek object used by the morphological extractors
zemberek = Zemberek(tr)
# ====================================================================================================++

In [4]:
# Stop words that n_grams_based_extractor drops from the text (none configured).
stop_word_list = []

# Upper-case letters that need an explicit Turkish lower-case form, keyed by
# code point so the mapping can be handed straight to str.translate().
lower_map_turkish = {
    ord(upperLetter): lowerLetter
    for upperLetter, lowerLetter in (
        (u'I', u'ı'),
        (u'İ', u'i'),
        (u'Ç', u'ç'),
        (u'Ş', u'ş'),
        (u'Ö', u'ö'),
        (u'Ü', u'ü'),
        (u'Ğ', u'ğ'),
    )
}

# Keyword lists consulted by patterns_based_extractor, one pair of lists per
# classification option (ob, eb, s2r).

# *_non_root: words compared against the text as-is, without morphological analysis.
ob_non_root = [
    "inaktif", "olarak", "şeklinde", "birbirine", "defa", "ancak",
    "hala", "hiç", "eski", "ancak", "lakin", "fakat",
    "rağmen", "sadece", "inaktif", "mükerrer", "tekrar", "ne",
]
eb_non_root = []
s2r_non_root = []

# *_root: word roots compared against the root reported by the morphological
# analysis, i.e. after the suffixes have been removed.
ob_root = [
    "ek", "hata", "uyarı", "mesaj", "sorun", "fark", "geri",
    "takı", "aşırı", "şifre", 'geç', "örnek", "iptal", "şikayet",
    "hal", "uyuş", "askı", "eksik", "deney", "don", "yut",
]
eb_root = ["bekle", "iste", "gerek"]
s2r_root = []

In [5]:
def n_grams_based_extractor(text):
    """Normalize a text into a space-separated string of lower-case words.

    Digits and the punctuation marks ``,`` ``;`` ``.`` are replaced by blanks,
    the Turkish-specific capitals are lowered through ``lower_map_turkish``
    before the generic ``str.lower()``, every remaining run of non-word
    characters becomes a single blank, and finally stop words
    (``stop_word_list``) and one-character words are dropped.

    text: the raw text. ``None`` and NaN (what pandas yields for an empty CSV
        cell) are treated as an empty text instead of raising.
    Returns the remaining words joined by single blanks.
    """
    # Missing values reach this function as None or float('nan'); NaN is the
    # only value that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Remove digits and basic punctuation. Raw strings keep the regex escapes
    # from being read as (invalid) Python string escapes.
    textNoDigit = re.sub(r'\d+|[,;.]', ' ', text)

    # Lower the Turkish-specific letters first: str.lower() alone would turn
    # 'I' into 'i' rather than into the dotless 'ı'.
    text_tr = textNoDigit.translate(lower_map_turkish)
    lowerText = re.sub(r'\W+', ' ', text_tr.lower())

    # Remove stop words and single-character leftovers.
    noStopWordsText = [word for word in lowerText.split()
                       if len(word) > 1 and word not in stop_word_list]

    return ' '.join(noStopWordsText)

In [6]:
def ma_based_extractor(text, include_input):
    """
    morphological analysis based extractor: use zemberek to analyze each word
    of the text and return the suffix labels of the words whose analysis
    contains a verb ("FIIL...") label, to be used as features.

    Only the first analysis returned by zemberek for a word is used. Words
    that zemberek rejects, or for which it returns no analysis, are reported
    on stdout and contribute no features.

    text: whitespace-separated words to analyze.
    include_input: 0>Return only ma results, otherwise>Return the input text
        followed by a blank and the ma results

    Returns the features as one space-separated string.
    """
    resultWordList = []
    for word in text.split():
        if not zemberek.kelimeDenetle(word): # Zemberek checks if the word is valid
            print("{} UNKNOWN WORD".format(word))
            continue

        yanit = zemberek.kelimeCozumle(word) # Zemberek analyzes the word
        if not yanit:
            print("{} COULD NOT BE ANALYZED".format(word))
            continue

        # ekler() is rendered like "[A, B, C]"; strip the brackets and blanks
        # so that it splits into the individual suffix labels.
        morphemes = str(yanit[0].ekler()).replace('[', '').replace(']', '').replace(' ', '').split(",")

        # Keep the suffix labels only when at least one of them is a verb label.
        if any(morpheme[:4] == "FIIL" for morpheme in morphemes):
            resultWordList.extend(morphemes)

    if include_input == 0:
        return ' '.join(resultWordList)
    return text + " " + ' '.join(resultWordList)

In [7]:
def patterns_based_extractor(text, include_input, classification_option, addAllVerbSuffixes):
    """
    after morphological analysis, patterns extractor uses the matching parts of
    the morphological analysis results as features

    For every word of the text:
      * a word found in the option's non-root list is kept as-is;
      * otherwise zemberek analyzes it (first analysis only); when the
        analysis contains a verb ("FIIL...") label, the suffix labels matching
        the option's patterns are added, and when the root is in the option's
        root list the root is added as well.
    Words that zemberek rejects, or for which it returns no analysis, are
    reported on stdout and contribute no features.

    text: whitespace-separated words to analyze.

    include_input: 0>Return only patterns results, otherwise>Return the input
        text followed by a blank and the patterns results

    classification_option: ob, eb, s2r (any other value matches no list and
        no pattern)

    addAllVerbSuffixes: True -> patterns+ma / False: patterns

    Returns the features as one space-separated string.
    """
    # Keyword lists of the selected option; an unknown option matches nothing.
    nonRootWords = {'ob': ob_non_root, 'eb': eb_non_root, 's2r': s2r_non_root}.get(classification_option, ())
    rootWords = {'ob': ob_root, 'eb': eb_root, 's2r': s2r_root}.get(classification_option, ())

    resultWordList = []
    for word in text.split():
        if word in nonRootWords:
            resultWordList.append(word)
            continue

        if not zemberek.kelimeDenetle(word): # Zemberek checks if the word is valid
            print("{} UNKNOWN WORD".format(word))
            continue

        yanit = zemberek.kelimeCozumle(word) # Zemberek analyzes the word
        if not yanit:
            print("{} COULD NOT BE ANALYZED".format(word))
            continue

        analysis = yanit[0]
        # ekler() is rendered like "[A, B, C]"; strip the brackets and blanks
        # so that it splits into the individual suffix labels.
        morphemes = str(analysis.ekler()).replace('[', '').replace(']', '').replace(' ', '').split(",")

        if any(morpheme[:4] == "FIIL" for morpheme in morphemes):
            # Reset per word: the previous suffix must never be undefined nor
            # carried over from the preceding word.
            prevMorpheme = None
            for morpheme in morphemes:
                if addAllVerbSuffixes:
                    resultWordList.append(morpheme) # add the morphemes to the list of words to be returned
                if classification_option == 'ob':
                    if morpheme == "FIIL_OLUMSUZLUK_ME": # OB pattern -me -ma
                        resultWordList.append("FIIL_OLUMSUZLUK_ME")
                elif classification_option == 'eb':
                    if morpheme == "ISIM_BULUNMA_LI": # EB pattern -meli
                        # NOTE(review): FIIL_DONUSUM_ME is added without checking
                        # that it actually precedes ISIM_BULUNMA_LI - confirm intended.
                        resultWordList.append("FIIL_DONUSUM_ME")
                        resultWordList.append("ISIM_BULUNMA_LI")
                    if morpheme == "FIIL_YETENEK_EBIL": # EB pattern -ebil
                        resultWordList.append("FIIL_YETENEK_EBIL")
                elif classification_option == 's2r':
                    if morpheme == "ISIM_KALMA_DE": # S2R pattern -de -da
                        resultWordList.append("ISIM_KALMA_DE")
                        if prevMorpheme == "FIIL_MASTAR_MEK":
                            resultWordList.append("FIIL_MASTAR_MEK")
                    if morpheme == "IMEK_ZAMAN_KEN": # S2R pattern -ken
                        resultWordList.append("IMEK_ZAMAN_KEN")
                    if morpheme == "FIIL_GECMISZAMAN_MIS": # S2R pattern -miş
                        resultWordList.append("FIIL_GECMISZAMAN_MIS")
                prevMorpheme = morpheme

        # The first blank-separated token of kok() is compared with the root list.
        root = str(analysis.kok()).split(" ")[0]
        if root in rootWords:
            resultWordList.append(root)

    if include_input == 0:
        return ' '.join(resultWordList)
    return text + " " + ' '.join(resultWordList)

In [8]:
# CSV file holding the issue reports, relative to the notebook's working directory.
inputfileName = "data/issueReports.csv"

# Read the reports and show how many rows were loaded.
dataset = load(inputfileName)
print("Dataset length: {}".format(len(dataset)))

In [9]:
# Header names in the input file
CNAME_DESCRIPTION = "DESCRIPTION"
CNAME_QUAL_FLAG = "OB"
# function inputs
classification_option = 'ob' #ob, eb, s2r
include_input = 0 # 0: Return only ma/patterns results, 1: Return the input text as well

# options> 'n_grams', 'ma', 'patterns', 'patterns+ma', 'n_grams+ma', 'n_grams+patterns', 'n_grams+ma+patterns'
features = 'n_grams'
addAllVerbSuffixes = False # True: patterns+ma

# feature option -> (extractor, include_input, addAllVerbSuffixes override).
# None as the whole entry means "n-grams only"; None as the override keeps the
# addAllVerbSuffixes value configured above.
_FEATURE_PIPELINES = {
    'n_grams': None,
    'ma': (ma_based_extractor, 0, None),
    'patterns': (patterns_based_extractor, 0, None),
    'patterns+ma': (patterns_based_extractor, 0, True),
    'n_grams+ma': (ma_based_extractor, 1, None),
    'n_grams+patterns': (patterns_based_extractor, 1, None),
    'n_grams+ma+patterns': (patterns_based_extractor, 1, True),
}

# Reject a mistyped option before the dataset is touched; it used to fall
# through silently and leave plain n-gram features in place.
if features not in _FEATURE_PIPELINES:
    raise ValueError("Unknown features option {!r}; expected one of {}".format(
        features, sorted(_FEATURE_PIPELINES)))

# NOTE: the description column is overwritten in place, so re-running this
# cell feeds already-extracted text back into the extractors. Re-load the
# dataset before re-running it.
dataset[CNAME_DESCRIPTION] = dataset[CNAME_DESCRIPTION].apply(n_grams_based_extractor)

_pipeline = _FEATURE_PIPELINES[features]
if _pipeline is not None:
    _extractor, include_input, _suffixOverride = _pipeline
    if _suffixOverride is not None:
        addAllVerbSuffixes = _suffixOverride
    if _extractor is ma_based_extractor:
        _extractorArgs = (include_input,)
    else:
        _extractorArgs = (include_input, classification_option, addAllVerbSuffixes)
    dataset[CNAME_DESCRIPTION] = dataset[CNAME_DESCRIPTION].apply(_extractor, args=_extractorArgs)

In [10]:
# Distinct label values in sorted order. The class names and the class weights
# are both derived from this one array so that position i refers to the same
# label in `classes`, `class_weights` and `weights` (a plain set() gives no
# ordering guarantee).
labelValues = np.unique(dataset[CNAME_QUAL_FLAG])
classes = labelValues.tolist()

# Handling class imbalance
# NOTE(review): the weights are computed over the whole dataset, i.e. including
# the rows later held out as the test project - confirm this is intended.
class_weights = class_weight.compute_class_weight(class_weight = "balanced",
                                                  classes = labelValues,
                                                  y = dataset[CNAME_QUAL_FLAG])
print(class_weights)

# Mapping from class position to its weight.
weights = dict(enumerate(class_weights))
print(weights)

In [11]:
# Leave-one-project-out evaluation: each project in turn is held out as the
# test set while the model is trained on the reports of all other projects.
projectCodeList = ["UK0811", "UK1440", "UK0137", "UK0451", "UK0698", "UK0213", "UK0196", "UK0440", "UK0199", "UK1148"]
# Running totals over the projects: [accuracy, precision, recall, f1]
results = [0.0, 0.0, 0.0, 0.0]

for projectCode in projectCodeList:
    # filterProject returns (rows of the other projects, rows of this project)
    datasetTrain, datasetTest = filterProject(dataset, projectCode)
    print("Dataset length: " + str(len(datasetTrain)), len(datasetTest))
    X_train = datasetTrain[CNAME_DESCRIPTION].values
    y_train = datasetTrain[CNAME_QUAL_FLAG].values

    X_test = datasetTest[CNAME_DESCRIPTION].values
    y_test = datasetTest[CNAME_QUAL_FLAG].values

    print(len(y_train), len(y_test))

    # summarize train and test composition (counts of the labels 0 and 1)
    train_0, train_1 = len(y_train[y_train==0]), len(y_train[y_train==1])
    test_0, test_1 = len(y_test[y_test==0]), len(y_test[y_test==1])
    print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))

    # Pre Processing
    # NOTE(review): the held-out project is passed as x_test/y_test and so
    # becomes `val` below - confirm it should also serve as validation data.
    trn, val, preproc = ktrain.text.texts_from_array(x_train=X_train, y_train=y_train,
                                                     x_test=X_test, y_test=y_test,
                                                     class_names=classes,
                                                     val_pct=0.1,
                                                     max_features=1000,
                                                     maxlen=100,
                                                     preprocess_mode='distilbert',
                                                     ngram_range=(1,2))

    # Model
    model = ktrain.text.text_classifier('distilbert', train_data=trn, preproc=preproc)

    learner =  ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=16)

    # Applying the weights
    learner.autofit(1e-4, 2, class_weight = weights)

    predictor = ktrain.get_predictor(learner.model, preproc)
    yhat = [predictor.predict(x_t) for x_t in X_test]

    # Compute every metric once, then both print and accumulate it.
    foldScores = [accuracy_score(y_test, yhat),
                  precision_score(y_test, yhat),
                  recall_score(y_test, yhat),
                  f1_score(y_test, yhat)]
    for position, score in enumerate(foldScores):
        print(score)
        results[position] = results[position] + score

# Totals over all projects, followed by the per-project averages.
print(results)
print("Mean over {} projects: {}".format(len(projectCodeList),
                                         [total / len(projectCodeList) for total in results]))