In [16]:
import pandas as pd
import numpy as np
import csv
import re
import string
from collections import defaultdict
#from google.colab import drive
#drive.mount('/content/drive')

In [17]:
url_train_dev = 'https://docs.google.com/spreadsheets/d/1KYIk6fyiTIXe0RkgqNuCvfNnsQtH6lp2fu9bBIorI1s/edit#gid=1591461788'

#url_test = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vT-KNR9nuYatLkSbzSRgpz6Ku1n4TN4w6kKmFLkA6QJHTfQzmX0puBsLF7PAAQJQAxUpgruDd_RRgK7/pub?gid=417546901&single=true&output=tsv'

# Load and inspect the data

In [33]:
from io import StringIO
import requests

def load_dataset(url):
    """Download a tab-separated dataset and return it as a three-column DataFrame.

    :param url: address of the TSV resource to download. The response body is
        parsed with ``sep='\\t'``, so the URL has to serve raw tab-separated
        text (NOTE(review): a spreadsheet "edit" page would presumably return
        HTML instead — confirm the URL points at a TSV export).
    :return: DataFrame whose columns are renamed to ``'bug-id'``, ``'bug'``
        and ``'label'``.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    # Fetch the URL that was passed in; the previous version ignored the
    # argument and always downloaded the global `url_train_dev`.
    r = requests.get(url)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as TSV.
    r.raise_for_status()
    data = r.content.decode('utf8')
    df = pd.read_csv(StringIO(data), sep='\t')
    df.columns = ['bug-id', 'bug', 'label']
    return df

In [56]:
# NOTE(review): absolute, machine-specific path — the notebook only runs on a
# machine that has this exact file location.
# NOTE(review): the recorded run of this cell stopped with
# "ParserError: ... Expected 3 fields in line 6, saw 4"; presumably some rows
# contain an unquoted delimiter — confirm the file's separator/quoting.
link = "C:/Users/David/Documents/GitHub/Data_Science_for_Software_Engineering/Project/training_list.csv"
df = pd.read_csv(link)
df.columns = ['bug-id', 'bug', 'label']
print(df.head(100))

# First 7000 rows are the train/dev split, the remainder is the test split.
# Both splits are materialized as independent frames (not slices of `df`) so
# later edits to their columns neither warn nor leak into `df`; the test split
# also gets a fresh 0-based index instead of row labels starting at 7000.
df_train_dev = df[:7000].copy()
df_test = df[7000:].reset_index(drop=True)

ParserError: Error tokenizing data. C error: Expected 3 fields in line 6, saw 4


In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
print('Infos train-dev-set:')
df_train_dev.info()
print('Infos test-set:')
df_test.info()

In [None]:
df_train_dev.head()

In [None]:
print(df_train_dev.label.unique())

In [None]:
df_train_dev.groupby('label').size().sort_values(ascending = False).plot.bar(figsize=(20, 5))

In [None]:
df_train_dev.groupby('label').size().sort_values(ascending = False)

# Process labels

In [None]:
from sklearn.preprocessing import LabelEncoder
le_fitted = LabelEncoder().fit(df_train_dev['label'])

In [None]:
# map all classes that are not in train_dev to undefined
# Done as one vectorized replacement that rebinds `df_test`. The previous loop
# used chained assignment (`df_test['label'][i] = ...`) with a 0-based counter
# `i`, which does not match the row labels of a frame sliced from row 7000 on,
# so the intended rows were not updated.
# NOTE(review): 'und' itself has to be one of le_fitted.classes_ for the later
# `le_fitted.transform(df_test['label'])` to succeed — confirm against the data.
known_labels = set(le_fitted.classes_)
is_known = df_test['label'].isin(known_labels)
df_test = df_test.assign(label=df_test['label'].where(is_known, 'und'))
# check if it worked: should return an empty list
train_dev_labels = set(df_train_dev['label'])  # built once, not per element
print([label for label in df_test['label'] if label not in train_dev_labels])

In [None]:
df_train_dev['label'][:10]

In [None]:
# Turn the string labels of both splits into the integer ids of the fitted encoder.
y_train_dev = le_fitted.transform(df_train_dev['label'])
y_test = le_fitted.transform(df_test['label'])

# Preprocess bugs

Pipeline classes:

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder


class TweetNormalizer(BaseEstimator, TransformerMixin):
    """Stateless transformer that normalizes an iterable of text strings.

    ``fit`` learns nothing; ``transform`` applies ``_normalize_tweet`` to every
    element and returns the results as a NumPy array.
    """

    # Translation table that deletes every character in string.punctuation;
    # built once instead of once per text.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` for the transformer API."""
        return self

    def _normalize_tweet(self, tweet):
        """Remove mentions, URL-like tokens, punctuation and newlines; lowercase.

        :param tweet: string
        :return: normalized string
        """
        # Mentions and URL-like tokens must be removed BEFORE punctuation is
        # stripped: their patterns are anchored on '@' and '//', both of which
        # are punctuation characters. In the previous order those characters
        # were already gone, so the two substitutions could never match.
        tweet = re.sub(r'@\w+\b', r'', tweet)
        tweet = re.sub(r'\b\S+//\S+\b', r'', tweet)
        tweet = tweet.translate(self._PUNCTUATION_TABLE)
        # Newlines are deleted (not replaced by a space), as before.
        tweet = re.sub(r'\n', r'', tweet)
        tweet = tweet.lower()
        return tweet

    def transform(self, X, y=None):
        """Normalize every string in ``X``.

        :param X: iterable of strings
        :return: ``np.ndarray`` of normalized strings, in input order
        """
        return np.array([self._normalize_tweet(tweet) for tweet in X])


class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Compute three numerical features per text and min-max scale them.

    Features, in column order: vowel/consonant ratio, capitalization ratio,
    and the number of adjacent identical character pairs. ``transform``
    returns the untouched texts together with the scaled feature matrix so
    that later pipeline steps can process both.
    """

    vowels = set('aeiouäöüàéèëï')
    # Full set of Latin consonants; the previous literal lacked 'j' and listed
    # 'l' twice.
    consonants = set('bcdfghjklmnpqrstvwxyz')

    def __init__(self):
        self.scaler = MinMaxScaler()

    def _to_bigrams(self, tweet):
        """Return all overlapping two-character substrings of ``tweet``."""
        return [first + second for first, second in zip(tweet, tweet[1:])]

    def _get_vowel_consonant_ratio(self, tweet):
        """Return ``#vowels / (#consonants + 1)`` for the lowercased text.

        The ``+ 1`` in the denominator avoids division by zero for texts
        without consonants.
        """
        vf = 0
        cf = 0
        for c in tweet.lower():
            if c in self.vowels:
                # Was `vf =+ 1`, which assigned 1 instead of incrementing and
                # capped the vowel count at 1.
                vf += 1
            elif c in self.consonants:
                cf += 1
        return vf / (cf + 1)

    def _get_capitalization_ratio(self, tweet):
        """Return the share of characters that are unchanged by ``upper()``.

        NOTE(review): ``c.upper() == c`` is also true for digits, whitespace
        and punctuation, so this counts every non-lowercase character, not
        only capital letters — confirm whether that is intended.
        """
        up_count = 0
        for c in tweet:
            if c.upper() == c:
                up_count += 1
        return up_count / (len(tweet) + 1)

    def _get_double_char_freq(self, tweet):
        """Return how many adjacent character pairs consist of the same character."""
        return sum(1 for bg in self._to_bigrams(tweet) if bg[0] == bg[1])

    def _extract_num_features(self, tweets):
        """Build the (n_texts, 3) matrix of unscaled numerical features."""
        num_features = [
            [
                self._get_vowel_consonant_ratio(tweet),
                self._get_capitalization_ratio(tweet),
                self._get_double_char_freq(tweet),
            ]
            for tweet in tweets
        ]
        # reshape keeps the result two-dimensional even for an empty input.
        return np.array(num_features, dtype=float).reshape(-1, 3)

    def fit(self, X, y=None):
        """Fit the min-max scaler on the numerical features of ``X``."""
        numerical_features = self._extract_num_features(X)
        self.scaler.fit(numerical_features)
        return self

    def transform(self, X, y=None):
        """Return ``(X, scaled_features)``; ``X`` itself is passed through unchanged."""
        numerical_features = self._extract_num_features(X)
        return X, self.scaler.transform(numerical_features)


class MatrixToArrayConverter1(BaseEstimator, TransformerMixin):
    """Pipeline step that densifies the first element of a two-part input.

    The input is indexed as ``X[0]`` (an object offering ``toarray()``) and
    ``X[1]`` (forwarded untouched).
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Stateless step: nothing is learned."""
        return self

    def transform(self, X, y=None):
        """Return ``(X[0].toarray(), X[1])``."""
        dense_part = X[0].toarray()
        return dense_part, X[1]


class MatrixUnifier(BaseEstimator, TransformerMixin):
    """Pipeline step that merges a two-part input into one dense feature matrix.

    ``X[0]`` must offer ``toarray()``; ``X[1]`` is an array with the same
    number of rows. Both are concatenated column-wise.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Stateless step: nothing is learned."""
        return self

    def transform(self, X, y=None):
        """Return ``X[0]`` (densified) and ``X[1]`` stacked side by side.

        ``toarray()`` is used instead of ``todense()`` so the result is a
        plain ``np.ndarray``; ``todense()`` produces an ``np.matrix``, which
        recent scikit-learn input validation rejects.
        """
        return np.concatenate([X[0].toarray(), X[1]], axis=1)


class CountVectorizerWrapper(BaseEstimator, TransformerMixin):
    """Apply a ``CountVectorizer`` to the text half of a ``(texts, features)`` pair.

    The numerical features are forwarded unchanged. The class now derives from
    ``BaseEstimator``/``TransformerMixin`` like the other pipeline steps in
    this notebook, so it exposes ``get_params``/``set_params`` and can be
    cloned and tuned by scikit-learn utilities.

    :param ngram_range: forwarded to ``CountVectorizer``
    :param analyzer: forwarded to ``CountVectorizer``
    :param max_features: forwarded to ``CountVectorizer``
    :param binary: forwarded to ``CountVectorizer``
    """

    def __init__(self, ngram_range, analyzer, max_features, binary):
        # Constructor arguments are stored under their own names, as required
        # by BaseEstimator.get_params(). The former debug print was dropped:
        # cloning re-runs __init__, which would flood the output.
        self.ngram_range = ngram_range
        self.analyzer = analyzer
        self.max_features = max_features
        self.binary = binary
        self.countvec = self._build_vectorizer()

    def _build_vectorizer(self):
        """Create a fresh CountVectorizer from the current parameter values."""
        return CountVectorizer(
            ngram_range=self.ngram_range,
            analyzer=self.analyzer,
            max_features=self.max_features,
            binary=self.binary,
        )

    def fit(self, X, y=None):
        """Fit the vectorizer on the texts in ``X[0]``-position of the pair."""
        tweets, numerical_features = X
        # Rebuilt here so values changed through set_params() take effect.
        self.countvec = self._build_vectorizer()
        self.countvec.fit(tweets)
        return self

    def transform(self, X, y=None):
        """Return ``(vectorized_texts, numerical_features)``."""
        tweets, numerical_features = X
        return self.countvec.transform(tweets), numerical_features


class OneHotEncoderWrapper(BaseEstimator, TransformerMixin):
    """Apply a ``OneHotEncoder`` to the first half of a two-part input.

    ``X[1]`` is forwarded unchanged. The class now derives from
    ``BaseEstimator``/``TransformerMixin`` like the other pipeline steps in
    this notebook, so it exposes ``get_params``/``set_params`` and can be
    cloned by scikit-learn utilities.

    :param handle_unknown: forwarded to ``OneHotEncoder``
    """

    def __init__(self, handle_unknown):
        # Stored under its own name, as required by BaseEstimator.get_params().
        self.handle_unknown = handle_unknown
        self.ohe = OneHotEncoder(handle_unknown=handle_unknown)

    def fit(self, X, y=None):
        """Fit the encoder on ``X[0]``."""
        # Rebuilt here so a value changed through set_params() takes effect.
        self.ohe = OneHotEncoder(handle_unknown=self.handle_unknown)
        self.ohe.fit(X[0])
        return self

    def transform(self, X, y=None):
        """Return ``(encoded X[0], X[1])``."""
        return self.ohe.transform(X[0]), X[1]


Helper classes for the pipeline:

In [None]:
class GenericClassifier(BaseEstimator):
    """Thin wrapper that delegates the classifier API to a wrapped estimator.

    :param estimator: object providing ``fit``, ``predict``, ``predict_proba``
        and ``score``; every call on the wrapper is forwarded to it.
    """

    def __init__(self, estimator):
        # Was `self.clf = clf`, which referenced an undefined name instead of
        # the constructor argument. The argument is stored under its own name
        # because BaseEstimator.get_params() looks it up as `self.estimator`.
        self.estimator = estimator

    @property
    def clf(self):
        """Alias for ``estimator``, kept so existing ``.clf`` accesses still work."""
        return self.estimator

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped estimator; extra keyword arguments are passed through."""
        # Keyword arguments used to be accepted but silently dropped.
        self.estimator.fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped estimator's predictions for ``X`` (``y`` is ignored)."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's class probabilities for ``X``."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped estimator's score on ``(X, y)``."""
        return self.estimator.score(X, y)


class Debug(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that drops into the debugger on ``transform``.

    Intended for interactive inspection of the data flowing between two
    pipeline steps; the data itself is returned unchanged.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn."""
        return self

    def transform(self, X):
        """Open a pdb session, then hand ``X`` back untouched."""
        import pdb
        pdb.set_trace()
        return X

# GridSearch and Training

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

In [None]:
# Search space per classifier: name -> [estimator class, GridSearchCV param grid].
# The 'CLF__' prefix addresses the pipeline step that is named 'CLF' in the
# training cell.
# NOTE(review): recent scikit-learn releases name the logistic loss 'log_loss';
# confirm that 'log' is still accepted by the installed version.
clf_param_grid = {
    'MultinomialNB': [MultinomialNB, {'CLF__alpha': [0.1, 1]}],
    'SGDClassifier': [SGDClassifier, {'CLF__loss': ['hinge', 'log'], 'CLF__penalty': ['l2', 'l1'], 'CLF__max_iter': [100, 300], 'CLF__early_stopping': [True, False]}]
}

In [None]:
# Run one 10-fold grid search per configured classifier and keep the fitted
# searches in `models`, in the iteration order of `clf_param_grid`.
models = []
for clf_name, clf_spec in clf_param_grid.items():
    print(30*'-')
    print(clf_name)
    param_grid = clf_spec[1]
    print(param_grid)
    bigram_vec_args = dict(ngram_range=(2,2), analyzer='char_wb', max_features=100, binary=True)
    # Preprocessing chain followed by a fresh instance of the classifier class.
    steps = [
        ('TweetNormalizer', TweetNormalizer()),
        ('FeatureExtractor', FeatureExtractor()),
        ('BigramVectorizer', CountVectorizerWrapper(**bigram_vec_args)),
        ('MatrixToArrayConverter', MatrixToArrayConverter1()),
        ('OneHotEncoder', OneHotEncoderWrapper(handle_unknown='ignore')),
        ('MatrixUnifier', MatrixUnifier()),
        ('CLF', clf_spec[0]()),
    ]
    pipe = Pipeline(steps=steps, verbose=True)
    grid = GridSearchCV(pipe, n_jobs=1, param_grid=param_grid, scoring='f1_micro', cv=10)
    grid.fit(df_train_dev['bug'].to_numpy(), y_train_dev)
    models.append(grid)

# Results

Micro f1-score of the naive Bayes models on the dev set:

In [None]:
models[0].cv_results_

Micro and macro f1-score of the best naive bayes model on the test set:

In [None]:
from sklearn.metrics import f1_score
# models[0] is the grid search fitted for the first entry of clf_param_grid.
preds = models[0].predict(df_test['bug'].to_numpy())
# f1_score expects (y_true, y_pred); the arguments were previously swapped.
f1_micro = f1_score(y_test, preds, average='micro')
f1_macro = f1_score(y_test, preds, average='macro')
print(f'F1-micro-score on the testset: {f1_micro}')
print(f'F1-macro-score on the testset: {f1_macro}')

Micro f1-Score of the SGD models on the dev set:

In [None]:
models[1].cv_results_

Micro and macro f1-score of the best SGD model on the test set:

In [None]:
# models[1] is the grid search fitted for the second entry of clf_param_grid.
# `preds` is rebound here, so the confusion-matrix cell below uses these
# predictions.
preds = models[1].predict(df_test['bug'].to_numpy())
# f1_score expects (y_true, y_pred); the arguments were previously swapped.
f1_micro = f1_score(y_test, preds, average='micro')
f1_macro = f1_score(y_test, preds, average='macro')
print(f'F1-micro-score on the testset: {f1_micro}')
print(f'F1-macro-score on the testset: {f1_macro}')

Let's check the confusion matrix:

In [None]:
num_classes = len(le_fitted.classes_)
def create_confusion_matrix(num_classes, preds, y_test, class_names=None):
    """Create confusion matrix 'by hand' since test set does not contain all labels (thanks to Sarah Kiener).

    :param num_classes: number of classes; the matrix is num_classes x num_classes
    :param preds: iterable of predicted class ids (integers)
    :param y_test: iterable of true class ids (integers), aligned with ``preds``
    :param class_names: labels used for rows and columns; defaults to the
        classes of the global ``le_fitted`` encoder when omitted
    :return: integer DataFrame in which ROWS are predicted classes and
        COLUMNS are true classes
    """
    if class_names is None:
        class_names = le_fitted.classes_
    counts = np.zeros((num_classes, num_classes), dtype=int)
    row_ids = np.asarray(preds, dtype=int)
    col_ids = np.asarray(y_test, dtype=int)
    # np.add.at accumulates repeated (row, col) pairs correctly, which a plain
    # fancy-indexed `counts[row_ids, col_ids] += 1` would not.
    np.add.at(counts, (row_ids, col_ids), 1)
    return pd.DataFrame(counts, index=class_names, columns=class_names)
# Stored under its own name so the raw dataset frame `df` loaded at the top of
# the notebook is not overwritten by the confusion matrix.
confusion_df = create_confusion_matrix(num_classes, preds, y_test)
confusion_df