In [49]:
# Setup
from pathlib import Path

import confuse
import pandas

from dialogueactclassification import Classifier
from ml import MachineLearning

cfg = confuse.LazyConfig('ccc4prc', __name__)
# Add overrides on top of config.yaml for the workspace.
cfg.set_file('./config.workspace.yaml')

labeled_seed_excel_file = cfg['machine_learning']['labeled_seed_excel_file'].as_filename()
dataset_dir = Path(labeled_seed_excel_file).parent
training_dataset_file = dataset_dir / ('training_dataset.csv')
test_dataset_file = dataset_dir / ('test_dataset.csv')

training_dataset = pandas.read_csv(training_dataset_file)
test_dataset = pandas.read_csv(test_dataset_file)

unlabeled_dataset = pandas.read_csv(cfg['machine_learning']['unlabeled_csv_file'].as_filename())

In [50]:
# Setup
from array import array
from pathlib import Path

import numpy
import pandas
from matplotlib import pyplot
from pandas import DataFrame
from sklearn import metrics
from sklearn.model_selection import train_test_split

from classifier import CodeComprehensionClassifierFactory

FEATURES = ['body', 'dialogue_act_classification_ml', 'comment_is_by_author']
LABEL = 'code_comprehension_related'

classifier = CodeComprehensionClassifierFactory.get_classifier()

In [51]:
# Original Classifier Model Performance
X_train = training_dataset[FEATURES]
X_test = test_dataset[FEATURES]
y_train = training_dataset[LABEL]
y_true = test_dataset[LABEL]

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

report = metrics.classification_report(y_true, y_pred, digits=8)
print(report)

precision    recall  f1-score   support

          No  0.85135135 0.91304348 0.88111888       138
         Yes  0.58620690 0.43589744 0.50000000        39

    accuracy                      0.80790960       177
   macro avg  0.71877912 0.67447046 0.69055944       177
weighted avg  0.79292969 0.80790960 0.79714353       177



In [52]:
# Show the training dataset value counts
len(training_dataset.loc[training_dataset[LABEL] == 'Yes'])

71

In [53]:
# Show the training dataset value counts
len(training_dataset.loc[training_dataset[LABEL] == 'No'])

237

In [54]:
# Show the test dataset value counts
len(test_dataset.loc[training_dataset[LABEL] == 'Yes'])

42

In [55]:
# Show the test dataset value counts
len(test_dataset.loc[training_dataset[LABEL] == 'No'])

135

In [56]:
# Evenly distribute the training data labelled as "No" and "Yes".
min_label_count = y_train.value_counts().min()
labelled_yes = training_dataset.loc[training_dataset[LABEL] == 'Yes'].head(min_label_count)
labelled_no = training_dataset.loc[training_dataset[LABEL] == 'No'].head(min_label_count)
evenly_distrubted_training_dataset = pandas.concat([labelled_yes, labelled_no])

# Shuffle the order, in order to properly train the model.
evenly_distrubted_training_dataset = evenly_distrubted_training_dataset.sample(frac=1)

In [57]:
# Evenly distribute the test data labelled as "No" and "Yes".
min_label_count = y_true.value_counts().min()
labelled_yes = test_dataset.loc[test_dataset[LABEL] == 'Yes'].head(min_label_count)
labelled_no = test_dataset.loc[test_dataset[LABEL] == 'No'].head(min_label_count)
evenly_distrubted_test_dataset = pandas.concat([labelled_yes, labelled_no])

In [58]:
# Performance after evenly distributed only the training dataset with labels "No" and "Yes".
X_train = evenly_distrubted_training_dataset[FEATURES]
y_train = evenly_distrubted_training_dataset[LABEL]

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

report = metrics.classification_report(y_true, y_pred, digits=8)
print(report)

precision    recall  f1-score   support

          No  0.91566265 0.55072464 0.68778281       138
         Yes  0.34042553 0.82051282 0.48120301        39

    accuracy                      0.61016949       177
   macro avg  0.62804409 0.68561873 0.58449291       177
weighted avg  0.78891549 0.61016949 0.64226522       177



In [59]:
# Performance after evenly distributed both the training and test datasets with labels "No" and "Yes".
X_train = evenly_distrubted_training_dataset[FEATURES]
X_test = evenly_distrubted_test_dataset[FEATURES]
y_train = evenly_distrubted_training_dataset[LABEL]
y_true = evenly_distrubted_test_dataset[LABEL]

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

report = metrics.classification_report(y_true, y_pred, digits=8)
print(report)

precision    recall  f1-score   support

          No  0.73076923 0.48717949 0.58461538        39
         Yes  0.61538462 0.82051282 0.70329670        39

    accuracy                      0.65384615        78
   macro avg  0.67307692 0.65384615 0.64395604        78
weighted avg  0.67307692 0.65384615 0.64395604        78

