# Setup

In [1]:
from pathlib import Path

import confuse
import pandas

from dialogueactclassification import Classifier

# Project configuration for the 'ccc4prc' application, with the workspace
# overrides layered on top of config.yaml.
cfg = confuse.LazyConfig('ccc4prc', __name__)
cfg.set_file('./config.workspace.yaml')

# The training/test CSV files are read from the directory that contains the
# labeled seed Excel file referenced by the configuration.
labeled_seed_excel_file = cfg['machine_learning']['labeled_seed_excel_file'].as_filename()
dataset_dir = Path(labeled_seed_excel_file).parent
training_dataset_file = dataset_dir / 'training_dataset.csv'
test_dataset_file = dataset_dir / 'test_dataset.csv'

training_dataset, test_dataset = (
    pandas.read_csv(csv_file)
    for csv_file in (training_dataset_file, test_dataset_file)
)

In [2]:
from array import array
from pathlib import Path

import numpy
import pandas
from matplotlib import pyplot
from pandas import DataFrame
from sklearn import metrics
from sklearn.model_selection import train_test_split

from classifier import CodeComprehensionClassifierFactory

# Input columns handed to the classifier, and the column holding the target label.
FEATURES = [
    'body',
    'dialogue_act_classification_ml',
    'comment_is_by_author',
]
LABEL = 'code_comprehension_related'

# Classifier under evaluation, as produced by the project's factory.
classifier = CodeComprehensionClassifierFactory.get_classifier()

# Original Classifier Model Performance

In [3]:
# Baseline: fit on the unmodified training set and evaluate on the test set.
X_train, y_train = training_dataset[FEATURES], training_dataset[LABEL]
X_test, y_true = test_dataset[FEATURES], test_dataset[LABEL]

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Per-label precision / recall / F1, printed with 8 decimal places.
report = metrics.classification_report(y_true=y_true, y_pred=y_pred, digits=8)
print(report)

precision    recall  f1-score   support

          No  0.82550336 0.90441176 0.86315789       136
         Yes  0.53571429 0.36585366 0.43478261        41

    accuracy                      0.77966102       177
   macro avg  0.68060882 0.63513271 0.64897025       177
weighted avg  0.75837707 0.77966102 0.76392972       177



# Evenly distribute the training data labelled as "No" and "Yes".

In [4]:
# Fixed seed for the shuffle below, so that "Restart Kernel -> Run All"
# reproduces the same row order (and therefore the same fitted model inputs).
SHUFFLE_RANDOM_STATE = 42

# Undersample the training data so both labels have as many rows as the rarer
# label. The count is derived from `training_dataset` directly (not from
# `y_train`, which later cells rebind) so this cell is safe to re-run.
min_label_count = training_dataset[LABEL].value_counts().min()

# NOTE(review): `.head()` keeps the *first* `min_label_count` rows of each
# label, not a random subset. If the CSV is ordered (e.g. by project or date),
# the kept "No" rows may not be representative -- confirm this is intended.
labelled_yes = training_dataset.loc[training_dataset[LABEL] == 'Yes'].head(min_label_count)
labelled_no = training_dataset.loc[training_dataset[LABEL] == 'No'].head(min_label_count)

# Concatenate, then shuffle: without the shuffle all "Yes" rows would precede
# all "No" rows.
evenly_distrubted_training_dataset = pandas.concat([labelled_yes, labelled_no]).sample(
    frac=1, random_state=SHUFFLE_RANDOM_STATE)

# Count each label once per dataset instead of re-filtering for every number printed.
training_yes_count = int((training_dataset[LABEL] == 'Yes').sum())
training_no_count = int((training_dataset[LABEL] == 'No').sum())
test_yes_count = int((test_dataset[LABEL] == 'Yes').sum())
test_no_count = int((test_dataset[LABEL] == 'No').sum())
balanced_yes_count = int((evenly_distrubted_training_dataset[LABEL] == 'Yes').sum())
balanced_no_count = int((evenly_distrubted_training_dataset[LABEL] == 'No').sum())

# Show the datasets value counts
print(f'Training DataSet - Label "Yes": {training_yes_count} v.s. "No": {training_no_count}, ratio: {training_yes_count / training_no_count}')
print(f'Test Dataset - Label "Yes": {test_yes_count} v.s. "No": {test_no_count}, ratio: {test_yes_count / test_no_count}')
print(f'Training DataSet (evenly distributed) - Label "Yes": {balanced_yes_count} v.s. "No": {balanced_no_count}')

Training DataSet - Label "Yes": 76 v.s. "No": 262, ratio: 0.2900763358778626
Test Dataset - Label "Yes": 41 v.s. "No": 136, ratio: 0.3014705882352941
Training DataSet (evenly distributed) - Label "Yes": 76 v.s. "No": 76


In [5]:
# Re-fit on the balanced ("No"/"Yes" evenly distributed) training set only;
# the test set keeps its original label distribution.
X_train, y_train = (
    evenly_distrubted_training_dataset[FEATURES],
    evenly_distrubted_training_dataset[LABEL],
)
X_test, y_true = test_dataset[FEATURES], test_dataset[LABEL]

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Per-label precision / recall / F1, printed with 8 decimal places.
report = metrics.classification_report(y_true=y_true, y_pred=y_pred, digits=8)
print(report)

precision    recall  f1-score   support

          No  0.90000000 0.59558824 0.71681416       136
         Yes  0.36781609 0.78048780 0.50000000        41

    accuracy                      0.63841808       177
   macro avg  0.63390805 0.68803802 0.60840708       177
weighted avg  0.77672576 0.63841808 0.66659167       177



In [6]:
%%script false --no-raise-error
# Experiment only: performance when BOTH the training and the test datasets are
# evenly distributed between the labels "No" and "Yes".
# In practice the incoming data is likely to be skewed towards "No", so the
# numbers from this cell are not representative of real usage. The cell is
# switched off by the `%%script false --no-raise-error` magic on its first line.
#
# Evenly distribute the test data labelled as "No" and "Yes": keep as many rows
# per label as the rarer label has (the first rows of each label, via `.head()`).
min_label_count = y_true.value_counts().min()
labelled_yes = test_dataset.loc[test_dataset[LABEL] == 'Yes'].head(min_label_count)
labelled_no = test_dataset.loc[test_dataset[LABEL] == 'No'].head(min_label_count)
evenly_distrubted_test_dataset = pandas.concat([labelled_yes, labelled_no])

# Train on the balanced training set, evaluate on the balanced test set.
# NOTE(review): if this cell is ever enabled it rebinds X_test / y_true to the
# balanced test set for any later cell that does not reassign them.
X_train = evenly_distrubted_training_dataset[FEATURES]
X_test = evenly_distrubted_test_dataset[FEATURES]
y_train = evenly_distrubted_training_dataset[LABEL]
y_true = evenly_distrubted_test_dataset[LABEL]

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Per-label precision / recall / F1, printed with 8 decimal places.
report = metrics.classification_report(y_true, y_pred, digits=8)
print(report)

# Try using a different vectorization - TF-IDF

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC

# Explicit category lists for the one-hot encoder, one list per encoded column
# (same order as the columns passed to the 'categorical_transformer' below).
dialogue_act_categories = [
    'Accept', 'Bye', 'Clarify', 'Continuer', 'Emotion',
    'Emphasis', 'Greet', 'Other', 'Reject', 'Statement',
    'System', 'whQuestion', 'yAnswer', 'nAnswer', 'ynQuestion',
]
# 0 should come before 1 for numerical columns.
comment_is_by_author_categories = [False, True]

one_hot_encoder_categories = [
    dialogue_act_categories,
    comment_is_by_author_categories,
]

# Text column -> TF-IDF over word bigrams; the two categorical columns -> one-hot.
# The transformer keeps the name 'body_bow_vectorizer' although it is now TF-IDF,
# because that name is used as a key in the weights and parameter names.
body_vectorizer = TfidfVectorizer(stop_words=None, ngram_range=(2, 2))
categorical_encoder = OneHotEncoder(categories=one_hot_encoder_categories)

column_transformer = ColumnTransformer(
    transformers=[
        ('body_bow_vectorizer', body_vectorizer, 'body'),
        (
            'categorical_transformer',
            categorical_encoder,
            ['dialogue_act_classification_ml', 'comment_is_by_author'],
        ),
    ],
    # Relative weighting of the two feature groups in the combined matrix.
    transformer_weights={
        'body_bow_vectorizer': 1,
        'categorical_transformer': 10,
    },
    verbose=False)

# Preprocessing followed by a linear-kernel SVM.
support_vector_classifier = SVC(kernel='linear', C=2, probability=True)
full_pipeline = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('classifier', support_vector_classifier),
    ],
    verbose=False)

# From here on, `classifier` refers to the TF-IDF based pipeline.
classifier = full_pipeline

In [8]:
# Fit the TF-IDF pipeline on the balanced training set and evaluate it on the
# test set with its original label distribution.
X_train, y_train = (
    evenly_distrubted_training_dataset[FEATURES],
    evenly_distrubted_training_dataset[LABEL],
)
X_test, y_true = test_dataset[FEATURES], test_dataset[LABEL]

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Per-label precision / recall / F1, printed with 8 decimal places.
report = metrics.classification_report(y_true=y_true, y_pred=y_pred, digits=8)
print(report)

precision    recall  f1-score   support

          No  0.92631579 0.64705882 0.76190476       136
         Yes  0.41463415 0.82926829 0.55284553        41

    accuracy                      0.68926554       177
   macro avg  0.67047497 0.73816356 0.65737515       177
weighted avg  0.80779066 0.68926554 0.71347861       177



# Tuning the model with Grid Search

In [9]:
# Candidates are ranked by precision on the "Yes" label.
scorer = metrics.make_scorer(metrics.precision_score, pos_label="Yes")

# Transformer weight candidates: body weight fixed at 1 while the categorical
# weight varies, followed by categorical weight fixed at 1 while the body
# weight varies.
transformer_weight_candidates = (
    [{'body_bow_vectorizer': 1, 'categorical_transformer': weight} for weight in (1, 2, 4, 6, 10)]
    + [{'body_bow_vectorizer': weight, 'categorical_transformer': 1} for weight in (2, 4, 6, 10)]
)

# All (min_n, max_n) n-gram ranges with 1 <= min_n <= 3 and min_n <= max_n <= 5.
ngram_range_candidates = [
    (min_n, max_n)
    for min_n in (1, 2, 3)
    for max_n in range(min_n, 6)
]

# NOTE(review): `param_grid` is a list of single-parameter dicts, so each
# parameter is searched on its own while the others keep the pipeline's
# current values -- the combinations across parameters are NOT explored.
# That is why `best_params_` reports a single parameter. Merge the dicts into
# one if a joint search is wanted (at a much higher cost).
grid_search_cv_params = [
    {'preprocessor__transformer_weights': transformer_weight_candidates},
    {'preprocessor__body_bow_vectorizer__stop_words': [None, 'english']},
    {'preprocessor__body_bow_vectorizer__ngram_range': ngram_range_candidates},
    {'classifier__kernel': ['linear', 'rbf']},
    {'classifier__C': [0.1, 0.5, 1, 2, 4, 8, 16, 32]},
]
classifier = GridSearchCV(
    full_pipeline,
    param_grid=grid_search_cv_params,
    cv=5,
    scoring=scorer,
)

classifier.fit(X_train, y_train)
classifier.best_params_

{'preprocessor__transformer_weights': {'body_bow_vectorizer': 1,
  'categorical_transformer': 2}}

In [10]:
# Evaluate the estimator selected by the grid search on the test set.
y_pred = classifier.predict(X_test)

# Per-label precision / recall / F1, printed with 8 decimal places.
report = metrics.classification_report(y_true=y_true, y_pred=y_pred, digits=8)
print(report)

precision    recall  f1-score   support

          No  0.91666667 0.64705882 0.75862069       136
         Yes  0.40740741 0.80487805 0.54098361        41

    accuracy                      0.68361582       177
   macro avg  0.66203704 0.72596844 0.64980215       177
weighted avg  0.79870266 0.68361582 0.70820758       177

