In [103]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import json
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
import tensorflow as tf
from keras import backend as K

In [104]:
import sklearn as sk

In [106]:
# Load the cleaned JDT bug-report table from its CSV file.
df = pd.read_csv(filepath_or_buffer='../Dataset/jdt_data_cleaned.csv')

In [107]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,bug_id,creation_ts,Assignee,Description,text,words
0,9790.0,2/13/2002 16:42,Olivier_Thomann,Add constructors from superclass inserts in wr...,add constructors from superclass inserts in wr...,78
1,15684.0,5/9/2002 17:54,daniel_megert,Invalid Menu Extension on search pages 508Foll...,invalid menu extension on search pages followi...,85
2,21904.0,7/25/2002 10:35,daniel_megert,[misc] Can't Ctrl+C copy from class file edito...,misc cant ctrlc copy from class file editor wi...,66
3,45408.0,10/22/2003 14:49,markus.kell.r,Enable assertions during unit tests [JUnit] As...,enable assertions during unit tests junit as u...,66
4,45507.0,10/24/2003 10:49,sarika.sinha,[evaluation] cannot access inner class fields ...,evaluation cannot access inner class fields in...,91


In [108]:
# Keep only rows that have text. The explicit copy makes `df` an independent
# frame, so the later column assignment on it is unambiguous and does not
# raise pandas' SettingWithCopyWarning.
df = df[df['text'].notnull()].copy()

In [109]:
# Name of the column that holds the classification target.
label_name = "Assignee"

In [110]:
# Give every distinct value of the label column a 1-based integer id.
unique_developers = df[label_name].unique()

# Keys are stringified because the label column is cast with `astype(str)`
# before it is mapped through this dict; keying on the raw values would make
# any non-string entry (e.g. a missing assignee) miss the lookup and turn
# into a NaN label.
developer_dict = {}
for idx, developer in enumerate(unique_developers, start=1):
    developer_dict[str(developer)] = idx

In [111]:
# Replace each label by its integer id: cast the column to text first, then
# look every value up in `developer_dict`.
labels_as_text = df[label_name].astype(str)
df[label_name] = labels_as_text.map(developer_dict)

In [112]:
# Model input: the 'text' column as a plain array.
X = df.loc[:, 'text'].values

In [113]:
# Number of components requested from TruncatedSVD.
n_components: int = 800

In [114]:
# Targets: the label column as a plain array.
Y = df.loc[:, label_name].values

In [115]:
# Fit a label encoder on the targets (fit returns the encoder itself), then
# encode the same targets with it.
encoder = LabelEncoder().fit(Y)
encoded_Y = encoder.transform(Y)

In [116]:
# One-hot encode the integer labels; the class count is inferred from the data.
Y_encoded = np_utils.to_categorical(encoded_Y, num_classes=None)

In [117]:
def top_k_categorical_accuracy(y_true, y_pred, k=5):
    """Mean rate at which the true class is among the ``k`` top-scored classes.

    Args:
        y_true: Targets with one entry per class on the last axis; the true
            class of each sample is taken as the argmax over that axis.
        y_pred: Per-class scores, handed to ``K.in_top_k`` as the predictions.
        k: How many of the highest-scored classes count as a hit.

    Returns:
        The backend tensor produced by ``K.mean`` over the per-sample hits.
    """
    true_class = K.argmax(y_true, axis=-1)
    is_hit = K.in_top_k(y_pred, true_class, k)
    return K.mean(is_hit)

In [118]:
# One list of per-fold score lists for each classifier.
svm_cvscores, rf_cvscores, nb_cvscores = [], [], []

# Shuffled, seeded 10-fold stratified splitter.
kfold = StratifiedKFold(random_state=101, shuffle=True, n_splits=10)

In [119]:
def _score_top_k(model, features, one_hot_targets, max_k=10):
    """Score a fitted classifier with top-1 .. top-``max_k`` accuracy.

    ``predict_proba`` only returns columns for the classes the model saw while
    fitting (``model.classes_``). The probabilities are therefore scattered
    into a matrix with the same shape as ``one_hot_targets`` so that every
    column lines up with the matching one-hot column, even when a class is
    absent from the training fold.

    Args:
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``;
            it must have been fitted on integer labels that are column
            indices of ``one_hot_targets``.
        features: Feature matrix of the samples to score.
        one_hot_targets: One-hot targets for the same samples.
        max_k: Largest ``k`` to evaluate.

    Returns:
        Tuple ``(aligned_probabilities, accuracies)`` where ``accuracies`` is
        a list with the top-1 .. top-``max_k`` accuracy values.
    """
    class_probabilities = model.predict_proba(features)
    aligned = np.zeros(one_hot_targets.shape, dtype=np.float32)
    aligned[:, model.classes_] = class_probabilities
    accuracies = [
        top_k_categorical_accuracy(one_hot_targets, aligned, k=k).numpy()
        for k in range(1, max_k + 1)
    ]
    return aligned, accuracies


# Reuse the splitter's seed for every stochastic estimator so that a full
# re-run reproduces the same scores.
fold_seed = kfold.random_state

for train, test in kfold.split(X, Y):
    x_train = X[train]
    x_test = X[test]

    y_train = Y_encoded[train]
    y_test = Y_encoded[test]

    # Integer labels 0..n_classes-1: each is the index of its one-hot column,
    # which is what lets _score_top_k align probabilities by `classes_`.
    labels_train = encoded_Y[train]

    # TF-IDF over space-separated tokens (unigrams and bigrams); fitted on the
    # training fold only and then applied to the test fold.
    vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(' '), ngram_range=(1, 2), dtype=np.float32)
    train_tfidf = vectorizer.fit_transform(x_train)
    test_tfidf = vectorizer.transform(x_test)

    # Dimensionality reduction, likewise fitted on the training fold only.
    svd = TruncatedSVD(n_components=n_components, algorithm='arpack', random_state=fold_seed)
    train_svd = svd.fit_transform(train_tfidf)
    test_svd = svd.transform(test_tfidf)

    # SVM
    svmModel = SVC(kernel='linear', probability=True, random_state=fold_seed)
    svmModel.fit(train_svd, labels_train)
    smv_y_pred, svm_top_accuracies = _score_top_k(svmModel, test_svd, y_test)
    svm_cvscores.append(svm_top_accuracies)

    # RF
    rfModel = RandomForestClassifier(n_estimators=200, random_state=fold_seed)
    rfModel.fit(train_svd, labels_train)
    rf_y_pred, rf_top_accuracies = _score_top_k(rfModel, test_svd, y_test)
    rf_cvscores.append(rf_top_accuracies)

    # NB
    nbModel = GaussianNB()
    nbModel.fit(train_svd, labels_train)
    nb_y_pred, nb_top_accuracies = _score_top_k(nbModel, test_svd, y_test)
    nb_cvscores.append(nb_top_accuracies)

In [1]:
# Average each top-k accuracy over the folds that were actually recorded,
# instead of dividing by a hard-coded fold count.
print(f'SVM: {np.array(svm_cvscores).mean(axis=0)}')
print(f'RF: {np.array(rf_cvscores).mean(axis=0)}')
print(f'NB: {np.array(nb_cvscores).mean(axis=0)}')