# Loading the Dataset

In [1]:
from array import array
from pathlib import Path

import confuse
import pandas

from classifier import DialogueActClassifierFactory
from pandas import DataFrame
from sklearn import metrics

cfg = confuse.LazyConfig('ccc4prc', __name__)
# Add overrides on top of config.yaml for the workspace.
cfg.set_file('./config.workspace.yaml')

dac_factory = DialogueActClassifierFactory()
dac_labels = dac_factory.get_classifier(
    Path(cfg['dialogue_act_classification']['classifier_file'].as_filename()),
    cfg['dialogue_act_classification']['test_set_percentage'].as_number()).labels()

dataset_dir = Path(cfg['machine_learning']['labeled_seed_excel_file'].as_filename()).parent
training_dataset_file = dataset_dir / ('training_dataset.csv')
test_dataset_file = dataset_dir / ('test_dataset.csv')

training_dataset = pandas.read_csv(training_dataset_file)
test_dataset = pandas.read_csv(test_dataset_file)

FEATURES = ['body', 'dialogue_act_classification_ml', 'comment_is_by_author']
LABEL = 'code_comprehension_related'

entire_dataset = pandas.concat([training_dataset, test_dataset], axis=0).reset_index(drop=True)

# Determine the Machine Learning Algorithm

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

is_author_categories = [
    False,  # 0 should come before 1 for numerical columns.
    True
]

from nlp import LemmaTokenizer

column_transformer = ColumnTransformer(
    transformers=[
        (
            'body_tfidf_vectorizer',
            TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words='english', ngram_range=(1, 2)),
            'body'
        ),
        (
            'dac_transformer',
            OneHotEncoder(categories=[dac_labels]),
            ['dialogue_act_classification_ml']
        ),
        (
            'is_author_transformer',
            OneHotEncoder(categories=[is_author_categories]),
            ['comment_is_by_author']
        ),
    ],
    transformer_weights={
        'body_tfidf_vectorizer': 4,
        'dac_transformer': 1,
        'is_author_transformer': 2,
    },
    verbose=False)

from sklearn.linear_model import LogisticRegression

clf_logistic_regression = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('classifier', LogisticRegression(C=500000, solver='lbfgs')),
    ],
    verbose=False)

from sklearn.svm import SVC

clf_svc = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('classifier', SVC(kernel='linear', C=0.1, probability=True)),
    ],
    verbose=False)

from sklearn.tree import DecisionTreeClassifier

clf_decision_tree = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('classifier', DecisionTreeClassifier())],
    verbose=False)

from sklearn.naive_bayes import MultinomialNB

clf_multinominal_nb = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('classifier', MultinomialNB(fit_prior=True))],
    verbose=False)

from sklearn.linear_model import SGDClassifier

clf_sgd = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('classifier', SGDClassifier())
    ],
    verbose=False)

# Performance with splitting into Training and Test Dataset

In [3]:
X_train = training_dataset[FEATURES]
X_test = test_dataset[FEATURES]
y_train = training_dataset[LABEL]
y_true = test_dataset[LABEL]

clf_logistic_regression.fit(X_train, y_train)
y_pred = clf_logistic_regression.predict(X_test)

print(metrics.classification_report(y_true, y_pred, digits=8))

precision    recall  f1-score   support

          No  0.92879257 0.95238095 0.94043887       315
         Yes  0.75806452 0.67142857 0.71212121        70

    accuracy                      0.90129870       385
   macro avg  0.84342854 0.81190476 0.82628004       385
weighted avg  0.89775111 0.90129870 0.89892657       385



In [4]:
clf_svc.fit(X_train, y_train)
y_pred = clf_svc.predict(X_test)

print(metrics.classification_report(y_true, y_pred, digits=8))

precision    recall  f1-score   support

          No  0.92834891 0.94603175 0.93710692       315
         Yes  0.73437500 0.67142857 0.70149254        70

    accuracy                      0.89610390       385
   macro avg  0.83136195 0.80873016 0.81929973       385
weighted avg  0.89308093 0.89610390 0.89426794       385



In [5]:
clf_decision_tree.fit(X_train, y_train)
y_pred = clf_decision_tree.predict(X_test)

print(metrics.classification_report(y_true, y_pred, digits=8))

precision    recall  f1-score   support

          No  0.93114754 0.90158730 0.91612903       315
         Yes  0.61250000 0.70000000 0.65333333        70

    accuracy                      0.86493506       385
   macro avg  0.77182377 0.80079365 0.78473118       385
weighted avg  0.87321162 0.86493506 0.86834800       385



In [6]:
clf_multinominal_nb.fit(X_train, y_train)
y_pred = clf_multinominal_nb.predict(X_test)

print(metrics.classification_report(y_true, y_pred, digits=8))

precision    recall  f1-score   support

          No  0.82198953 0.99682540 0.90100430       315
         Yes  0.66666667 0.02857143 0.05479452        70

    accuracy                      0.82077922       385
   macro avg  0.74432810 0.51269841 0.47789941       385
weighted avg  0.79374901 0.82077922 0.74714798       385



In [7]:
clf_sgd.fit(X_train, y_train)
y_pred = clf_sgd.predict(X_test)

print(metrics.classification_report(y_true, y_pred, digits=8))

precision    recall  f1-score   support

          No  0.90214067 0.93650794 0.91900312       315
         Yes  0.65517241 0.54285714 0.59375000        70

    accuracy                      0.86493506       385
   macro avg  0.77865654 0.73968254 0.75637656       385
weighted avg  0.85723735 0.86493506 0.85986619       385



# Performance with an Entire Dataset and K-folds Cross-Validation

In [8]:
X = entire_dataset[FEATURES]
y = entire_dataset[LABEL]

from sklearn.model_selection import cross_val_score, cross_val_predict

yes_f1_scorer = metrics.make_scorer(metrics.f1_score, pos_label="Yes")

# cross_val_score

In [9]:
scores = cross_val_score(clf_logistic_regression, X, y, cv=5, scoring=yes_f1_scorer, n_jobs=-1)
print(scores)
scores = pandas.Series(scores)
print(f'Min: {scores.min()}, Mean: {scores.mean()}, Max: {scores.max()}')

[0.53846154 0.44444444 0.5        0.61538462 0.74193548]
Min: 0.4444444444444445, Mean: 0.5680452164323132, Max: 0.7419354838709677


In [10]:
scores = cross_val_score(clf_svc, X, y, cv=5, scoring=yes_f1_scorer, n_jobs=-1)
print(scores)
scores = pandas.Series(scores)
print(f'Min: {scores.min()}, Mean: {scores.mean()}, Max: {scores.max()}')

[0.52830189 0.41860465 0.59649123 0.62745098 0.73015873]
Min: 0.4186046511627907, Mean: 0.5802014953152612, Max: 0.7301587301587301


In [11]:
scores = cross_val_score(clf_decision_tree X, y, cv=5, scoring=yes_f1_scorer, n_jobs=-1)
print(scores)
scores = pandas.Series(scores)
print(f'Min: {scores.min()}, Mean: {scores.mean()}, Max: {scores.max()}')

SyntaxError: invalid syntax (<ipython-input-11-1e69d8bd8a39>, line 1)

In [12]:
scores = cross_val_score(clf_multinominal_nb, X, y, cv=5, scoring=yes_f1_scorer, n_jobs=-1)
print(scores)
scores = pandas.Series(scores)
print(f'Min: {scores.min()}, Mean: {scores.mean()}, Max: {scores.max()}')

[0.06451613 0.12903226 0.06666667 0.12903226 0.12903226]
Min: 0.06451612903225806, Mean: 0.10365591397849463, Max: 0.12903225806451613


In [13]:
scores = cross_val_score(clf_sgd, X, y, cv=5, scoring=yes_f1_scorer, n_jobs=-1)
print(scores)
scores = pandas.Series(scores)
print(f'Min: {scores.min()}, Mean: {scores.mean()}, Max: {scores.max()}')

[0.57692308 0.48979592 0.5        0.68965517 0.73684211]
Min: 0.4897959183673469, Mean: 0.598643254593475, Max: 0.736842105263158


# cross_val_predict

In [14]:
y_pred = cross_val_predict(clf_logistic_regression, X, y, cv=5, n_jobs=-1)
print(metrics.classification_report(y, y_pred, digits=8))

precision    recall  f1-score   support

          No  0.89596273 0.92320000 0.90937746       625
         Yes  0.61904762 0.53793103 0.57564576       145

    accuracy                      0.85064935       770
   macro avg  0.75750518 0.73056552 0.74251161       770
weighted avg  0.84381638 0.85064935 0.84653188       770



In [15]:
y_pred = cross_val_predict(clf_svc, X, y, cv=5, n_jobs=-1)
print(metrics.classification_report(y, y_pred, digits=8))

precision    recall  f1-score   support

          No  0.89814815 0.93120000 0.91437549       625
         Yes  0.64754098 0.54482759 0.59176030       145

    accuracy                      0.85844156       770
   macro avg  0.77284457 0.73801379 0.75306790       770
weighted avg  0.85095589 0.85844156 0.85362328       770



In [16]:
y_pred = cross_val_predict(clf_decision_tree, X, y, cv=5, n_jobs=-1)
print(metrics.classification_report(y, y_pred, digits=8))

precision    recall  f1-score   support

          No  0.88593750 0.90720000 0.89644269       625
         Yes  0.55384615 0.49655172 0.52363636       145

    accuracy                      0.82987013       770
   macro avg  0.71989183 0.70187586 0.71003953       770
weighted avg  0.82340082 0.82987013 0.82623890       770



In [17]:
y_pred = cross_val_predict(clf_multinominal_nb, X, y, cv=5, n_jobs=-1)
print(metrics.classification_report(y, y_pred, digits=8))

precision    recall  f1-score   support

          No  0.81997372 0.99840000 0.90043290       625
         Yes  0.88888889 0.05517241 0.10389610       145

    accuracy                      0.82077922       770
   macro avg  0.85443130 0.52678621 0.50216450       770
weighted avg  0.83295125 0.82077922 0.75043571       770



In [18]:
y_pred = cross_val_predict(clf_sgd, X, y, cv=5, n_jobs=-1)
print(metrics.classification_report(y, y_pred, digits=8))

precision    recall  f1-score   support

          No  0.89531250 0.91680000 0.90592885       625
         Yes  0.60000000 0.53793103 0.56727273       145

    accuracy                      0.84545455       770
   macro avg  0.74765625 0.72736552 0.73660079       770
weighted avg  0.83970170 0.84545455 0.84215595       770

