# Loading the Dataset

In [1]:
from array import array
from pathlib import Path

import confuse
import pandas

from classifier import DialogueActClassifierFactory
from pandas import DataFrame
from sklearn import metrics

# Project configuration for the 'ccc4prc' application.
cfg = confuse.LazyConfig('ccc4prc', __name__)
# Add overrides on top of config.yaml for the workspace.
cfg.set_file('./config.workspace.yaml')

# Labels known to the dialogue act classifier. They are used further down as
# the fixed category list when one-hot encoding 'dialogue_act_classification_ml'.
dac_factory = DialogueActClassifierFactory()
dac_cfg = cfg['dialogue_act_classification']
dac_classifier_file = Path(dac_cfg['classifier_file'].as_filename())
dac_test_set_percentage = dac_cfg['test_set_percentage'].as_number()
dac_labels = dac_factory.get_classifier(dac_classifier_file, dac_test_set_percentage).labels()

# The training/test CSV files sit in the same directory as the labeled seed Excel file.
labeled_seed_excel_file = Path(cfg['machine_learning']['labeled_seed_excel_file'].as_filename())
dataset_dir = labeled_seed_excel_file.parent
training_dataset_file = dataset_dir / 'training_dataset.csv'
test_dataset_file = dataset_dir / 'test_dataset.csv'

training_dataset, test_dataset = (
    pandas.read_csv(csv_file)
    for csv_file in (training_dataset_file, test_dataset_file)
)

# Input columns fed to the model, and the target column to predict.
FEATURES = ['body', 'dialogue_act_classification_ml', 'comment_is_by_author']
LABEL = 'code_comprehension_related'

# Training rows followed by test rows, re-indexed 0..n-1 so that row labels
# from the two source files do not collide.
entire_dataset = pandas.concat([training_dataset, test_dataset], axis=0, ignore_index=True)

# Determine the Machine Learning Algorithm

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

from nlp import LemmaTokenizer

# Explicit category order for the boolean 'comment_is_by_author' column:
# False (0) is listed before True (1).
is_author_categories = [False, True]

# One transformer per feature column.
body_vectorizer = TfidfVectorizer(
    tokenizer=LemmaTokenizer(),
    stop_words='english',
    ngram_range=(1, 2))
dac_encoder = OneHotEncoder(categories=[dac_labels])
is_author_encoder = OneHotEncoder(categories=[is_author_categories])

# NOTE(review): 'tdidf' in the first step name looks like a typo of 'tfidf'.
# It is kept as-is because the step name is a runtime identifier and must match
# the key in `transformer_weights`.
# 'body' is selected with a scalar column name (1-D input for the text
# vectorizer); the categorical columns are selected with lists (2-D input for
# the encoders).
column_transformer = ColumnTransformer(
    transformers=[
        ('body_tdidf_vectorizer', body_vectorizer, 'body'),
        ('dac_transformer', dac_encoder, ['dialogue_act_classification_ml']),
        ('is_author_transformer', is_author_encoder, ['comment_is_by_author']),
    ],
    # Multiplicative weight applied to each transformer's output.
    transformer_weights=dict(
        body_tdidf_vectorizer=4,
        dac_transformer=1,
        is_author_transformer=2,
    ),
    verbose=False)

# Preprocessing followed by the logistic regression classifier.
logistic_regression = LogisticRegression(C=500000, solver='lbfgs')
full_pipeline = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('classifier', logistic_regression),
    ],
    verbose=False)

# Alias under which the following cells refer to the pipeline.
clf_logistic_regression = full_pipeline

# Performance with a Training/Test Dataset Split

In [3]:
# Hold-out evaluation: fit on the training split, score on the test split.
X_train, y_train = training_dataset[FEATURES], training_dataset[LABEL]
X_test, y_true = test_dataset[FEATURES], test_dataset[LABEL]

# Pipeline.fit returns the fitted pipeline, so prediction can be chained.
y_pred = clf_logistic_regression.fit(X_train, y_train).predict(X_test)

holdout_report = metrics.classification_report(y_true, y_pred, digits=8)
print(holdout_report)

precision    recall  f1-score   support

          No  0.92879257 0.95238095 0.94043887       315
         Yes  0.75806452 0.67142857 0.71212121        70

    accuracy                      0.90129870       385
   macro avg  0.84342854 0.81190476 0.82628004       385
weighted avg  0.89775111 0.90129870 0.89892657       385



# Performance on the Entire Dataset with Cross-Validation

In [9]:
from sklearn.model_selection import cross_val_score, cross_val_predict

# Features and target taken from the combined (training + test) dataset.
X, y = entire_dataset[FEATURES], entire_dataset[LABEL]

# Score each fold by the F1 of the "Yes" class only.
yes_f1_scorer = metrics.make_scorer(metrics.f1_score, pos_label="Yes")

# 5-fold cross-validation of the pipeline; one F1 score per fold.
fold_scores = cross_val_score(
    clf_logistic_regression,
    X,
    y,
    cv=5,
    scoring=yes_f1_scorer,
    n_jobs=-1)
print(fold_scores)

# Wrap the per-fold scores in a Series for the summary statistics below.
scores = pandas.Series(fold_scores)
print(f'Min: {scores.min()}, Mean: {scores.mean()}, Max: {scores.max()}')

[0.53846154 0.44444444 0.5        0.61538462 0.74193548]
Min: 0.4444444444444445, Mean: 0.5680452164323132, Max: 0.7419354838709677


In [10]:
# Cross-validated predictions for every row of the combined dataset (5 folds),
# summarised in a single classification report.
# Note: this rebinds `y_pred`, replacing the hold-out predictions from the
# train/test-split cell.
y_pred = cross_val_predict(
    clf_logistic_regression,
    X,
    y,
    cv=5,
    n_jobs=-1)
cross_val_report = metrics.classification_report(y, y_pred, digits=8)
print(cross_val_report)

precision    recall  f1-score   support

          No  0.89596273 0.92320000 0.90937746       625
         Yes  0.61904762 0.53793103 0.57564576       145

    accuracy                      0.85064935       770
   macro avg  0.75750518 0.73056552 0.74251161       770
weighted avg  0.84381638 0.85064935 0.84653188       770

