In [1]:
import pandas as pd
import numpy as np
import csv
import re
import string
from collections import defaultdict
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
# Published Google Sheets exports (TSV output) for the train and test splits.
# Each URL is split into host prefix, document id and query string for readability.
url_train = (
    'https://docs.google.com/spreadsheets/d/e/'
    '2PACX-1vQ2v-WuHn21qUQSz5RsEKmDCkxnKu9Zgcc-sVnV--hx_83vRfILre3Tc-btU4VUUg'
    '/pub?gid=207380422&single=true&output=tsv'
)
url_test = (
    'https://docs.google.com/spreadsheets/d/e/'
    '2PACX-1vTj6g3GFhmQ6N_K--THLISk2TzpfGkI6vTK2_jaENX6INU2BFPr1pIc6Fs5DryL2Q'
    '/pub?gid=1844931619&single=true&output=tsv'
)

# Load and inspect the data

In [3]:
from io import StringIO
import requests


def _fetch_labelled_bugs(url, timeout=30):
    """Download a two-column TSV sheet and return it as a labelled DataFrame.

    Parameters
    ----------
    url : str
        Address of the TSV export to download.
    timeout : float, default 30
        Seconds to wait for the server before giving up; without a timeout
        ``requests.get`` can block indefinitely.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``bug`` and ``label``; rows whose ``label`` is
        missing are dropped. The original row index is kept (not reset).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so that an error page is
        never parsed as if it were data.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    frame = pd.read_csv(StringIO(response.content.decode('utf8')), sep='\t')
    frame.columns = ['bug', 'label']
    # Return a new frame instead of mutating in place so re-running is side-effect free.
    return frame.dropna(subset=['label'])


df_train_dev = _fetch_labelled_bugs(url_train)
df_test = _fetch_labelled_bugs(url_test)

In [4]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" after each summary.
print('Infos train-dev-set:')
df_train_dev.info()
print('Infos test-set:')
df_test.info()

Infos train-dev-set:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 11916 entries, 0 to 11915
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   bug     11916 non-null  object
 1   label   11916 non-null  object
dtypes: object(2)
memory usage: 279.3+ KB
None
Infos test-set:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1199 entries, 0 to 1198
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   bug     1199 non-null   object
 1   label   1199 non-null   object
dtypes: object(2)
memory usage: 28.1+ KB
None


In [5]:
# Preview the first five rows of the train/dev frame.
df_train_dev.head(n=5)

Unnamed: 0,bug,label
0,Garbage characters in form list boxes.,blocker
1,Views that have visibility hidden still eat ev...,blocker
2,[BLOCKER]Need architecture for focused text wi...,blocker
3,navigator.javaEnabled() is always false,blocker
4,[BLOCK] Native Gtk combobox is a grey rectangl...,blocker


In [6]:
# Distinct label values, in order of first appearance.
print(df_train_dev['label'].unique())

['blocker' 'critical' 'major' 'normal' 'minor' 'trivial']


In [7]:
# Bar chart of the number of examples per label, most frequent label first.
label_sizes = df_train_dev.groupby('label').size()
label_sizes.sort_values(ascending=False).plot.bar(figsize=(20, 5))

<matplotlib.axes._subplots.AxesSubplot at 0x21c50764c88>

In [8]:
# Number of examples per label, largest class first.
(
    df_train_dev
    .groupby('label')
    .size()
    .sort_values(ascending=False)
)

label
trivial     2000
minor       2000
major       2000
critical    2000
normal      1965
blocker     1951
dtype: int64

# Process labels

In [9]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the train/dev labels only.
le_fitted = LabelEncoder()
le_fitted.fit(df_train_dev['label'])

In [10]:
# Map every test label that the encoder was not fitted on to 'und' (undefined).
# A boolean mask with .loc replaces the former per-row chained assignment
# (df_test['label'][i] = ...), which may write to a temporary copy and used the
# enumerate position as if it were an index label.
unseen_label_mask = ~df_test['label'].isin(le_fitted.classes_)
df_test.loc[unseen_label_mask, 'label'] = 'und'

# Sanity check: lists the test labels that do not occur in the train/dev labels.
# It is empty when nothing had to be mapped to 'und'; any entry here is a label
# the fitted encoder does not know.
train_dev_labels = set(df_train_dev['label'])
print([label for label in df_test['label'] if label not in train_dev_labels])

[]


In [11]:
# First ten labels of the train/dev frame, selected by position.
df_train_dev['label'].iloc[:10]

0    blocker
1    blocker
2    blocker
3    blocker
4    blocker
5    blocker
6    blocker
7    blocker
8    blocker
9    blocker
Name: label, dtype: object

In [12]:
# Encode the string labels as integer class ids with the encoder fitted on the train/dev labels.
y_train_dev = le_fitted.transform(df_train_dev['label'])
y_test = le_fitted.transform(df_test['label'])

In [13]:
# Names used by the modelling cells: the raw bug texts are the features,
# the label-encoded classes are the targets.
x_train = df_train_dev['bug']
y_train = y_train_dev
x_test = df_test['bug']
# y_test already holds the encoded test labels; the former `y_test = y_test`
# self-assignment was a no-op and has been removed.

In [14]:
# Feature-extraction pipeline: word n-gram counts followed by tf-idf weighting.
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Counts of word unigrams, bigrams and trigrams per document.
ngram_counter = CountVectorizer(ngram_range=(1, 3), analyzer='word')
# Re-weights the raw counts to tf-idf values.
tfidf_weigher = TfidfTransformer(smooth_idf=True)

text_clf = Pipeline(steps=[
    ('ngram_count_vect', ngram_counter),
    ('tfidf_ngram_transformer', tfidf_weigher),
])

# Fit the vocabulary and idf weights on the training texts and vectorise them.
x_train_prepared = text_clf.fit_transform(x_train)

# Vectorise the test texts with the already-fitted pipeline (no re-fitting).
x_test_prepared = text_clf.transform(x_test)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Train a multi-layer perceptron classifier with two hidden layers (400 and 200 units).
# random_state pins weight initialisation, shuffling and the early-stopping
# validation split so that a re-run reproduces the same score.
mlp_clf = MLPClassifier(
    activation='relu',
    alpha=0.001,
    early_stopping=True,
    hidden_layer_sizes=(400, 200),
    learning_rate='adaptive',
    solver='adam',
    random_state=42,
)
mlp_clf.fit(x_train_prepared, y_train)

# accuracy_score expects (y_true, y_pred) in that order.
accuracy_score(y_test, mlp_clf.predict(x_test_prepared))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Train a multinomial Naïve Bayes classifier on the tf-idf features.
nb_clf = MultinomialNB()
nb_clf.fit(x_train_prepared, y_train)

# accuracy_score expects (y_true, y_pred) in that order.
accuracy_score(y_test, nb_clf.predict(x_test_prepared))

In [None]:
from sklearn.linear_model import SGDClassifier

# Train a linear classifier with stochastic gradient descent.
# random_state fixes the data shuffling so that a re-run reproduces the same score.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(x_train_prepared, y_train)

# accuracy_score expects (y_true, y_pred) in that order.
accuracy_score(y_test, sgd_clf.predict(x_test_prepared))

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score


def _report_cv_scores(title, estimator, features, targets, folds=3):
    """Print cross-validated precision, recall and F1 for one classifier.

    Parameters
    ----------
    title : str
        Classifier name inserted into the printed heading.
    estimator : scikit-learn classifier
        Model passed to ``cross_val_predict``, which fits clones of it per fold.
    features, targets
        Training matrix and the matching encoded labels.
    folds : int, default 3
        Number of cross-validation folds.

    Returns
    -------
    The out-of-fold predictions for every training example.
    """
    print(f'Scores of {title} classifier:')
    predictions = cross_val_predict(estimator, features, targets, cv=folds)
    # NOTE: with average="micro" on single-label multiclass targets, precision,
    # recall and F1 all equal the accuracy, so the three lines show one number.
    for metric_name, metric in (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    ):
        print(f'{metric_name}: {metric(targets, predictions, average="micro")}')
    return predictions


# Scores of NB classifier
y_train_predictions_nb = _report_cv_scores('Naïve Bayes', nb_clf, x_train_prepared, y_train)

# Scores of SGD classifier
y_train_predictions_sgd = _report_cv_scores('SGD', sgd_clf, x_train_prepared, y_train)

# Scores of MLP classifier (the heading previously read "SGD" by mistake)
y_train_predictions_mlp = _report_cv_scores('MLP', mlp_clf, x_train_prepared, y_train)


In [None]:
import matplotlib.pyplot as plt

# Confusion matrices of the cross-validated predictions
# (rows: true class, columns: predicted class).
conf_mx_nb = confusion_matrix(y_train, y_train_predictions_nb)
conf_mx_sgd = confusion_matrix(y_train, y_train_predictions_sgd)
conf_mx_mlp = confusion_matrix(y_train, y_train_predictions_mlp)

# One grayscale figure per classifier. plt.show() is now actually called
# (it was referenced without parentheses before), and each figure is titled so
# the three plots can be told apart.
for clf_name, conf_mx in (
    ('Naïve Bayes', conf_mx_nb),
    ('SGD', conf_mx_sgd),
    ('MLP', conf_mx_mlp),
):
    plt.matshow(conf_mx, cmap=plt.cm.gray)
    plt.title(f'{clf_name} classifier')
    plt.xlabel('predicted class')
    plt.ylabel('true class')
    plt.show()