In [1]:
import pandas
import re
import jpype
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import model_selection
import joblib
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from numpy import mean
from numpy import std

In [2]:
def load(filePath):
    """Read the semicolon-delimited dataset and tidy its column names.

    The file is decoded as ISO-8859-9 and parsed with ';' as the field
    delimiter. Leading and trailing whitespace is stripped from every
    column name.

    filePath: location of the CSV file, passed straight to pandas.read_csv.

    Returns the loaded pandas DataFrame with the cleaned column names.
    """
    frame = pandas.read_csv(filePath, delimiter=";", encoding="ISO-8859-9")

    # Header cells may carry padding around the names; strip it so columns
    # can be addressed by their bare names.
    return frame.rename(columns=str.strip)

In [3]:
zemberek = None

# ====================================================================================================++
# Start JVM
# Change the address below according to the java version and the location of the jar file on your computer.
# Starting the JVM a second time in the same kernel raises an error, so only
# start it when it is not running yet; this keeps the cell safe to re-run.
if not jpype.isJVMStarted():
    jpype.startJVM(".../jvm.dll",
                   "-Djava.class.path=zemberek-tum-2.0.jar", "-ea")

# set the language as Turkish (as spoken in Turkey)
Tr = jpype.JClass("net.zemberek.tr.yapi.TurkiyeTurkcesi")
# tr object
tr = Tr()
# load the Zemberek class
Zemberek = jpype.JClass("net.zemberek.erisim.Zemberek")
# text-processing helper class; loaded here but not used by the extractors in this notebook
ZemberekC = jpype.JClass("net.zemberek.araclar.turkce.YaziIsleyici")
# zemberek object: the analyzer instance the extractor functions call
zemberek = Zemberek(tr)
# ====================================================================================================++
# ====================================================================================================++

In [4]:
# Stop words that n_grams_based_extractor removes from the text (none configured)
stop_word_list = []

# Translation table (code point -> replacement) for the capital letters that
# need Turkish-specific lowering; intended for str.translate().
lower_map_turkish = {
    ord(upper): lower
    for upper, lower in (
        (u'I', u'ı'),
        (u'İ', u'i'),
        (u'Ç', u'ç'),
        (u'Ş', u'ş'),
        (u'Ö', u'ö'),
        (u'Ü', u'ü'),
        (u'Ğ', u'ğ'),
    )
}

# Words and word roots that indicate the patterns of each classification
# option (ob, eb, s2r) are collected in the lists below.

# Words matched as they are, without morphological analysis.
# ("ancak" and "inaktif" appear twice; membership tests are unaffected.)
ob_non_root = [
    "inaktif", "olarak", "şeklinde", "birbirine", "defa", "ancak",
    "hala", "hiç", "eski", "ancak", "lakin", "fakat",
    "rağmen", "sadece", "inaktif", "mükerrer", "tekrar", "ne",
]
eb_non_root = []
s2r_non_root = []

# Word roots, compared against the root obtained from morphological analysis
# (i.e. after the suffixes are removed).
ob_root = [
    "ek", "hata", "uyarı", "mesaj", "sorun", "fark", "geri",
    "takı", "aşırı", "şifre", "geç", "örnek", "iptal", "şikayet",
    "hal", "uyuş", "askı", "eksik", "deney", "don", "yut",
]
eb_root = ["bekle", "iste", "gerek"]
s2r_root = []

In [5]:
def n_grams_based_extractor(text):
    """
    Normalise a raw text into a space-separated string of lowercase word tokens.

    Processing steps, in order:
      1. digits, commas, semicolons and periods are replaced by spaces;
      2. the capitals listed in lower_map_turkish are lowered explicitly, then
         the rest of the text is lowered with str.lower();
      3. every run of non-word characters is collapsed into a single space;
      4. tokens found in stop_word_list and tokens of a single character are dropped.

    text: the string to clean.

    Returns the remaining tokens joined by single spaces ('' when nothing remains).
    """
    # Raw strings are required here: '\d', '\.' and '\W' are not valid Python
    # string escapes and trigger an "invalid escape sequence" warning otherwise.
    digit_free = ' '.join(re.sub(r'\d+|[,;.]', ' ', word) for word in text.split())

    # Translate first: str.lower() alone would turn 'I' into 'i' rather than
    # the dotless 'ı' that lower_map_turkish asks for.
    lowered = digit_free.translate(lower_map_turkish).lower()
    cleaned = re.sub(r'\W+', ' ', lowered)

    # Build the lookup once per call instead of scanning the list for every token.
    stop_words = set(stop_word_list)
    return ' '.join(token for token in cleaned.split()
                    if len(token) > 1 and token not in stop_words)

In [6]:
def ma_based_extractor(text, include_input):
    """
    Morphological-analysis based extractor: uses zemberek to analyse every
    whitespace-separated word of the text and returns the morpheme names of
    the analysed words as features.

    Only the first analysis returned for a word is used, and its morphemes are
    kept only when at least one morpheme name starts with "FIIL" (the check the
    code uses to decide that the word is verb based). Words that zemberek
    rejects, or accepts but cannot analyse, are reported on stdout and skipped.

    text: space-separated words to analyse.
    include_input: 0 > return only the morpheme names,
                   any other value > return the input text followed by the morpheme names.

    Returns a single space-separated string.
    """
    morpheme_features = []
    for word in text.split():
        if not zemberek.kelimeDenetle(word):  # Zemberek checks if the word is valid
            print("{} UNKNOWN WORD".format(word))
            continue

        analyses = zemberek.kelimeCozumle(word)  # Zemberek analyzes the word
        if not analyses:
            print("{} COULD NOT BE ANALYZED".format(word))
            continue

        # The suffix list is parsed from its string form: brackets and blanks
        # are removed and the remainder is split on commas.
        suffix_text = str(analyses[0].ekler()).replace('[', '').replace(']', '').replace(' ', '')
        morphemes = suffix_text.split(",")
        if any(morpheme.startswith("FIIL") for morpheme in morphemes):
            morpheme_features.extend(morphemes)

    joined_features = ' '.join(morpheme_features)
    if include_input == 0:
        return joined_features
    return text + " " + joined_features

In [7]:
def patterns_based_extractor(text, include_input, classification_option):
    """
    Patterns extractor: after morphological analysis, only the parts of the
    analysis that match the patterns of the chosen classification option are
    returned as features.

    For every whitespace-separated word:
      * a word listed in the option's *_non_root list is emitted as is, without
        morphological analysis;
      * otherwise zemberek analyses the word (first analysis only). When at
        least one morpheme name starts with "FIIL", the option's morpheme
        patterns are emitted; independently, the word root is emitted when it
        is listed in the option's *_root list;
      * words that zemberek rejects, or accepts but cannot analyse, are
        reported on stdout and skipped.

    text: space-separated words to analyse.
    include_input: 0 > return only the pattern features,
                   any other value > return the input text followed by the pattern features.
    classification_option: 'ob', 'eb' or 's2r'. Any other value matches no
        pattern, so no features are produced.

    Returns a single space-separated string.
    """
    known_option = True
    if classification_option == 'ob':
        non_root_words, root_words = ob_non_root, ob_root
    elif classification_option == 'eb':
        non_root_words, root_words = eb_non_root, eb_root
    elif classification_option == 's2r':
        non_root_words, root_words = s2r_non_root, s2r_root
    else:
        known_option = False
        non_root_words, root_words = (), ()

    pattern_features = []
    for word in text.split():
        if word in non_root_words:
            pattern_features.append(word)
            continue

        if not zemberek.kelimeDenetle(word):  # Zemberek checks if the word is valid
            print("{} UNKNOWN WORD".format(word))
            continue

        analyses = zemberek.kelimeCozumle(word)  # Zemberek analyzes the word
        if not analyses:
            print("{} COULD NOT BE ANALYZED".format(word))
            continue

        first_analysis = analyses[0]
        # The suffix list is parsed from its string form: brackets and blanks
        # are removed and the remainder is split on commas.
        suffix_text = str(first_analysis.ekler()).replace('[', '').replace(']', '').replace(' ', '')
        morphemes = suffix_text.split(",")

        if any(morpheme.startswith("FIIL") for morpheme in morphemes):
            # Reset for every word: the "previous morpheme" must never come
            # from another word, and must be defined before its first use.
            prev_morpheme = None
            for morpheme in morphemes:
                if classification_option == 'ob':
                    if morpheme == "FIIL_OLUMSUZLUK_ME":  # OB pattern -me -ma
                        pattern_features.append("FIIL_OLUMSUZLUK_ME")
                elif classification_option == 'eb':
                    if morpheme == "ISIM_BULUNMA_LI":  # EB pattern -meli
                        pattern_features.append("FIIL_DONUSUM_ME")
                        pattern_features.append("ISIM_BULUNMA_LI")
                    elif morpheme == "FIIL_YETENEK_EBIL":  # EB pattern -ebil
                        pattern_features.append("FIIL_YETENEK_EBIL")
                elif classification_option == 's2r':
                    if morpheme == "ISIM_KALMA_DE":  # S2R pattern -de -da
                        pattern_features.append("ISIM_KALMA_DE")
                        # also emit the infinitive marker when it directly precedes the suffix
                        if prev_morpheme == "FIIL_MASTAR_MEK":
                            pattern_features.append("FIIL_MASTAR_MEK")
                    elif morpheme == "IMEK_ZAMAN_KEN":  # S2R pattern -ken
                        pattern_features.append("IMEK_ZAMAN_KEN")
                    elif morpheme == "FIIL_GECMISZAMAN_MIS":  # S2R pattern -miş
                        pattern_features.append("FIIL_GECMISZAMAN_MIS")
                prev_morpheme = morpheme

        if known_option:
            # The root is taken as the first space-separated token of the
            # root's string form.
            root = str(first_analysis.kok()).split(" ")[0]
            if root in root_words:
                pattern_features.append(root)

    joined_features = ' '.join(pattern_features)
    if include_input == 0:
        return joined_features
    return text + " " + joined_features

In [8]:
# Path of the semicolon-delimited input file read by load()
inputfileName = "data/issueReports.csv"

dataset = load(inputfileName)
# Report how many rows were read
print("Dataset length: {}".format(len(dataset)))

In [9]:
# Header names in the input file
CNAME_DESCRIPTION = "DESCRIPTION"
CNAME_QUAL_FLAG = "OB"
# function inputs
classification_option = 'ob' #ob, eb, s2r
# 0: Return only ma/patterns results, 1: Return the input text as well.
# This initial value is only kept for features == 'n_grams'; every other
# feature set overrides it below.
include_input = 0

features = 'n_grams' # options> 'n_grams', 'ma', 'patterns', 'n_grams+ma', 'n_grams+patterns', 'n_grams+ma+patterns'

# Validate the option before touching the dataset: a misspelled value would
# otherwise silently produce plain n-gram features.
SUPPORTED_FEATURES = ('n_grams', 'ma', 'patterns', 'n_grams+ma', 'n_grams+patterns', 'n_grams+ma+patterns')
if features not in SUPPORTED_FEATURES:
    raise ValueError("Unknown features option {!r}; expected one of {}".format(features, SUPPORTED_FEATURES))

# NOTE: the description column is overwritten with its processed form, so
# re-running this cell without re-running the load cell processes text that
# has already been processed.
# Every feature set starts from the cleaned n-gram text.
dataset[CNAME_DESCRIPTION] = dataset[CNAME_DESCRIPTION].apply(n_grams_based_extractor)

if features != 'n_grams':
    # Feature sets spelled 'n_grams+...' keep the cleaned text next to the
    # extracted features; 'ma' and 'patterns' alone return only the features.
    include_input = 1 if features.startswith('n_grams+') else 0
    requested_extractors = features.split('+')
    if 'ma' in requested_extractors:
        dataset[CNAME_DESCRIPTION] = dataset[CNAME_DESCRIPTION].apply(ma_based_extractor, args=(include_input,))
    if 'patterns' in requested_extractors:
        # For 'n_grams+ma+patterns' this runs on the output of the ma step above.
        dataset[CNAME_DESCRIPTION] = dataset[CNAME_DESCRIPTION].apply(patterns_based_extractor,
                                                                      args=(include_input, classification_option,))

X = dataset[CNAME_DESCRIPTION].values
Y = dataset[CNAME_QUAL_FLAG].values
seed = 7

In [10]:
# TF-IDF features over unigrams and bigrams.
# The text column is vectorized directly instead of going through `X`: a later
# cell rebinds `X` to the TF-IDF matrix, so using `X` here would break when
# this cell is re-run. On a fresh top-to-bottom run both hold the same values.
# NOTE(review): the vectorizer is fitted on the whole dataset before the
# cross-validation split, so its vocabulary and IDF weights also see the
# held-out folds. Confirm this is acceptable for the reported scores.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
X_tfidf = vectorizer.fit_transform(dataset[CNAME_DESCRIPTION].values)

In [11]:
# create dataset: the TF-IDF matrix is the input, the label column values are the target
X = X_tfidf
y = Y

In [12]:
# configure the outer cross-validation procedure:
# 10 stratified folds, shuffled with a fixed random state
cv_outer = model_selection.StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=1,
)

In [13]:
# accuracies of the outer folds are collected here, one entry per fold
outer_results = []

In [14]:
# Start from an empty list so that re-running this cell does not append to the
# results of a previous run (which would distort the summary statistics).
outer_results = []

# The inner cross-validation procedure and the search space do not depend on
# the outer fold, so they are configured once.
cv_inner = model_selection.StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
space = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

for train_ix, test_ix in cv_outer.split(X, y):
    # split data
    X_train, X_test = X[train_ix, :], X[test_ix, :]
    y_train, y_test = y[train_ix], y[test_ix]
    # define the model; seeded so repeated runs give the same fitted models
    model = LinearSVC(class_weight='balanced', random_state=seed)
    # define search: C is tuned on the training part only, using the inner folds
    search = model_selection.GridSearchCV(model, space, scoring='accuracy', cv=cv_inner, refit=True)
    # execute search
    result = search.fit(X_train, y_train)
    # get the best performing model fit on the whole training set
    best_model = result.best_estimator_
    # evaluate model on the hold out dataset
    yhat = best_model.predict(X_test)
    # evaluate the model
    acc = accuracy_score(y_test, yhat)
    print(acc)
    print(confusion_matrix(y_test, yhat))
    print(classification_report(y_test, yhat, digits=4))
    # store the result
    outer_results.append(acc)
    # report progress: outer-fold accuracy, best inner CV score and the chosen parameters
    print('>acc=%.3f, best=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
    print("====================================")

In [15]:
# summarize the estimated performance of the model:
# mean accuracy over the outer folds, with the standard deviation in parentheses
print('Accuracy: {:.3f} ({:.3f})'.format(mean(outer_results), std(outer_results)))