# Setup

In [56]:
from pathlib import Path

import confuse
import pandas

from dialogueactclassification import Classifier

cfg = confuse.LazyConfig('ccc4prc', __name__)
# Add overrides on top of config.yaml for the workspace.
cfg.set_file('./config.workspace.yaml')

dataset_dir = Path(cfg['machine_learning']['labeled_seed_excel_file'].as_filename()).parent
training_dataset_file = dataset_dir / ('training_dataset.csv')
test_dataset_file = dataset_dir / ('test_dataset.csv')

training_dataset = pandas.read_csv(training_dataset_file)
test_dataset = pandas.read_csv(test_dataset_file)

In [57]:
from array import array
from pathlib import Path

import numpy
import pandas
from matplotlib import pyplot
from pandas import DataFrame
from sklearn import metrics

FEATURES = ['body', 'dialogue_act_classification_ml', 'comment_is_by_author']
LABEL = 'code_comprehension_related'

# Original Classifier Model Performance

In [58]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC

one_hot_encoder_categories = [
    [
        'Accept',
        'Bye',
        'Clarify',
        'Continuer',
        'Emotion',
        'Emphasis',
        'Greet',
        'Other',
        'Reject',
        'Statement',
        'System',
        'whQuestion',
        'yAnswer',
        'nAnswer',
        'ynQuestion'
    ],
    [
        False,  # 0 should come before 1 for numerical columns.
        True
    ]
]

column_transformer = ColumnTransformer(
    transformers=[
        ('body_bow_pipeline', CountVectorizer(stop_words='english'), 'body'),
        ('categorical_transformer', OneHotEncoder(categories=one_hot_encoder_categories),
            ['dialogue_act_classification_ml', 'comment_is_by_author']),
    ],
    transformer_weights={
        'body_bow_pipeline': 1.0,
        'categorical_transformer': 1.0,
    },
    verbose=False)

classifier = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('classifier', SVC(kernel='linear', probability=True))],
    verbose=False)

X_train = training_dataset[FEATURES]
X_test = test_dataset[FEATURES]
y_train = training_dataset[LABEL]
y_true = test_dataset[LABEL]

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

report = metrics.classification_report(y_true, y_pred, digits=8)
print(report)

precision    recall  f1-score   support

          No  0.82781457 0.91240876 0.86805556       137
         Yes  0.53846154 0.35000000 0.42424242        40

    accuracy                      0.78531073       177
   macro avg  0.68313805 0.63120438 0.64614899       177
weighted avg  0.76242405 0.78531073 0.76775880       177



# Evenly distribute the training data labelled as "No" and "Yes".

In [59]:
min_label_count = y_train.value_counts().min()
labelled_yes = training_dataset.loc[training_dataset[LABEL] == 'Yes'].head(min_label_count)
labelled_no = training_dataset.loc[training_dataset[LABEL] == 'No'].head(min_label_count)
evenly_distrubted_training_dataset = pandas.concat([labelled_yes, labelled_no])

# Shuffle the order, in order to properly train the model.
evenly_distrubted_training_dataset = evenly_distrubted_training_dataset.sample(frac=1)

# Show the datasets value counts
print(f'Training DataSet - Label "Yes": {len(training_dataset.loc[training_dataset[LABEL] == "Yes"])} v.s. "No": {len(training_dataset.loc[training_dataset[LABEL] == "No"])}, ratio: {len(training_dataset.loc[training_dataset[LABEL] == "Yes"])/len(training_dataset.loc[training_dataset[LABEL] == "No"])}')
print(f'Test Dataset - Label "Yes": {len(test_dataset.loc[test_dataset[LABEL] == "Yes"])} v.s. "No": {len(test_dataset.loc[test_dataset[LABEL] == "No"])}, ratio: {len(test_dataset.loc[test_dataset[LABEL] == "Yes"])/len(test_dataset.loc[test_dataset[LABEL] == "No"])}')
print(f'Training DataSet (evenly distributed) - Label "Yes": {len(evenly_distrubted_training_dataset.loc[evenly_distrubted_training_dataset[LABEL] == "Yes"])} v.s. "No": {len(evenly_distrubted_training_dataset.loc[evenly_distrubted_training_dataset[LABEL] == "No"])}')

Training DataSet - Label "Yes": 67 v.s. "No": 271, ratio: 0.24723247232472326
Test Dataset - Label "Yes": 40 v.s. "No": 137, ratio: 0.291970802919708
Training DataSet (evenly distributed) - Label "Yes": 67 v.s. "No": 67


In [60]:
# Performance after evenly distributed only the training dataset with labels "No" and "Yes".
X_train = evenly_distrubted_training_dataset[FEATURES]
X_test = test_dataset[FEATURES]
y_train = evenly_distrubted_training_dataset[LABEL]
y_true = test_dataset[LABEL]

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

report = metrics.classification_report(y_true, y_pred, digits=8)
print(report)

precision    recall  f1-score   support

          No  0.92682927 0.55474453 0.69406393       137
         Yes  0.35789474 0.85000000 0.50370370        40

    accuracy                      0.62146893       177
   macro avg  0.64236200 0.70237226 0.59888382       177
weighted avg  0.79825649 0.62146893 0.65104467       177



In [61]:
%%script false --no-raise-error
# Performance after evenly distributed both the training and test datasets with labels "No" and "Yes".
# But in practice, the incoming data is likely to be skewed to "No", so this is experiment only.
# Evenly distribute the test data labelled as "No" and "Yes".
min_label_count = y_true.value_counts().min()
labelled_yes = test_dataset.loc[test_dataset[LABEL] == 'Yes'].head(min_label_count)
labelled_no = test_dataset.loc[test_dataset[LABEL] == 'No'].head(min_label_count)
evenly_distrubted_test_dataset = pandas.concat([labelled_yes, labelled_no])

X_train = evenly_distrubted_training_dataset[FEATURES]
X_test = evenly_distrubted_test_dataset[FEATURES]
y_train = evenly_distrubted_training_dataset[LABEL]
y_true = evenly_distrubted_test_dataset[LABEL]

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

report = metrics.classification_report(y_true, y_pred, digits=8)
print(report)

# Tuning - using a different vectorization - TDIDF

In [62]:
# Setup
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC

one_hot_encoder_categories = [
    [
        'Accept',
        'Bye',
        'Clarify',
        'Continuer',
        'Emotion',
        'Emphasis',
        'Greet',
        'Other',
        'Reject',
        'Statement',
        'System',
        'whQuestion',
        'yAnswer',
        'nAnswer',
        'ynQuestion'
    ],
    [
        False,  # 0 should come before 1 for numerical columns.
        True
    ]
]

X_train = evenly_distrubted_training_dataset[FEATURES]
X_test = test_dataset[FEATURES]
y_train = evenly_distrubted_training_dataset[LABEL]
y_true = test_dataset[LABEL]

In [63]:
column_transformer = ColumnTransformer(
    transformers=[
        ('body_bow_vectorizer', TfidfVectorizer(stop_words=None, ngram_range=(2, 2)), 'body'),
        ('categorical_transformer', OneHotEncoder(categories=one_hot_encoder_categories),
            ['dialogue_act_classification_ml', 'comment_is_by_author']),
        # ('author_transformer', OneHotEncoder(categories=[[False, True]]), ['comment_is_by_author']),
    ],
    transformer_weights={
        'body_bow_vectorizer': 1,
        'categorical_transformer': 3.8,
        # 'author_transformer': 0,
    },
    verbose=False)

full_pipeline = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('classifier', SVC(kernel='linear', C=1.2, probability=True))],
    verbose=False)

classifier = full_pipeline

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

report = metrics.classification_report(y_true, y_pred, digits=8)
print(report)

precision    recall  f1-score   support

          No  0.94382022 0.61313869 0.74336283       137
         Yes  0.39772727 0.87500000 0.54687500        40

    accuracy                      0.67231638       177
   macro avg  0.67077375 0.74406934 0.64511892       177
weighted avg  0.82040939 0.67231638 0.69895880       177



# Tuning - add Stemming

In [64]:
from nltk.stem.snowball import SnowballStemmer

class StemmedCountVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        stemmer = SnowballStemmer('english', ignore_stopwords=True)
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

column_transformer = ColumnTransformer(
    transformers=[
        ('body_bow_vectorizer', StemmedCountVectorizer(stop_words=None, ngram_range=(2, 2)), 'body'),
        ('categorical_transformer', OneHotEncoder(categories=one_hot_encoder_categories),
            ['dialogue_act_classification_ml', 'comment_is_by_author']),
    ],
    transformer_weights={
        'body_bow_vectorizer': 1,
        'categorical_transformer': 3.8,
    },
    verbose=False)

full_pipeline = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('classifier', SVC(kernel='linear', C=1.2, probability=True))],
    verbose=False)

classifier = full_pipeline

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

report = metrics.classification_report(y_true, y_pred, digits=8)
print(report)

precision    recall  f1-score   support

          No  0.94382022 0.61313869 0.74336283       137
         Yes  0.39772727 0.87500000 0.54687500        40

    accuracy                      0.67231638       177
   macro avg  0.67077375 0.74406934 0.64511892       177
weighted avg  0.82040939 0.67231638 0.69895880       177



# Tuning - try adding Lemmatization

In [65]:
from nlp import LemmaTokenizer

column_transformer = ColumnTransformer(
    transformers=[
        ('body_bow_vectorizer', TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words=None, ngram_range=(1, 2)), 'body'),
        ('categorical_transformer', OneHotEncoder(categories=one_hot_encoder_categories),
            ['dialogue_act_classification_ml', 'comment_is_by_author']),
    ],
    transformer_weights={
        'body_bow_vectorizer': 3,
        'categorical_transformer': 1,
    },
    verbose=False)

full_pipeline = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('classifier', SVC(kernel='linear', C=1.2, probability=True))],
    verbose=False)

classifier = full_pipeline

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

report = metrics.classification_report(y_true, y_pred, digits=8)
print(report)

precision    recall  f1-score   support

          No  0.96039604 0.70802920 0.81512605       137
         Yes  0.47368421 0.90000000 0.62068966        40

    accuracy                      0.75141243       177
   macro avg  0.71704013 0.80401460 0.71790785       177
weighted avg  0.85040467 0.75141243 0.77118562       177



# Tuning - try using MultinominalNB

In [66]:
from sklearn.naive_bayes import MultinomialNB

column_transformer = ColumnTransformer(
    transformers=[
        ('body_bow_vectorizer', TfidfVectorizer(stop_words=None, ngram_range=(2, 2)), 'body'),
        ('categorical_transformer', OneHotEncoder(categories=one_hot_encoder_categories),
            ['dialogue_act_classification_ml', 'comment_is_by_author']),
    ],
    transformer_weights={
        'body_bow_vectorizer': 1,
        'categorical_transformer': 10,
    },
    verbose=False)

full_pipeline = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('classifier', MultinomialNB(fit_prior=True))],
    verbose=False)

classifier = full_pipeline

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

report = metrics.classification_report(y_true, y_pred, digits=8)
print(report)

precision    recall  f1-score   support

          No  0.92857143 0.56934307 0.70588235       137
         Yes  0.36559140 0.85000000 0.51127820        40

    accuracy                      0.63276836       177
   macro avg  0.64708141 0.70967153 0.60858027       177
weighted avg  0.80134430 0.63276836 0.66190401       177



# Tuning - try using Logistic Regression

In [67]:
from sklearn.linear_model import LogisticRegression

column_transformer = ColumnTransformer(
    transformers=[
        ('body_bow_vectorizer', TfidfVectorizer(stop_words=None, ngram_range=(2, 2)), 'body'),
        ('categorical_transformer', OneHotEncoder(categories=one_hot_encoder_categories),
            ['dialogue_act_classification_ml', 'comment_is_by_author']),
    ],
    transformer_weights={
        'body_bow_vectorizer': 1,
        'categorical_transformer': 10,
    },
    verbose=False)

full_pipeline = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('classifier', LogisticRegression())],
    verbose=False)

classifier = full_pipeline

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

report = metrics.classification_report(y_true, y_pred, digits=8)
print(report)

precision    recall  f1-score   support

          No  0.92941176 0.57664234 0.71171171       137
         Yes  0.36956522 0.85000000 0.51515152        40

    accuracy                      0.63841808       177
   macro avg  0.64948849 0.71332117 0.61343161       177
weighted avg  0.80289277 0.63841808 0.66729133       177



# Tuning - try using Decision Tree

In [68]:
from sklearn.tree import DecisionTreeClassifier

column_transformer = ColumnTransformer(
    transformers=[
        ('body_bow_vectorizer', TfidfVectorizer(stop_words=None, ngram_range=(2, 2)), 'body'),
        ('categorical_transformer', OneHotEncoder(categories=one_hot_encoder_categories),
            ['dialogue_act_classification_ml', 'comment_is_by_author']),
    ],
    transformer_weights={
        'body_bow_vectorizer': 1,
        'categorical_transformer': 10,
    },
    verbose=False)

full_pipeline = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('classifier', DecisionTreeClassifier())],
    verbose=False)

classifier = full_pipeline

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

report = metrics.classification_report(y_true, y_pred, digits=8)
print(report)

precision    recall  f1-score   support

          No  0.87368421 0.60583942 0.71551724       137
         Yes  0.34146341 0.70000000 0.45901639        40

    accuracy                      0.62711864       177
   macro avg  0.60757381 0.65291971 0.58726682       177
weighted avg  0.75340832 0.62711864 0.65755095       177



# Tuning the model with Grid Search

In [36]:
column_transformer = ColumnTransformer(
    transformers=[
        ('body_bow_vectorizer', TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words=None, ngram_range=(1, 2)), 'body'),
        ('categorical_transformer', OneHotEncoder(categories=one_hot_encoder_categories),
            ['dialogue_act_classification_ml', 'comment_is_by_author']),
    ],
    transformer_weights={
        'body_bow_vectorizer': 1,
        'categorical_transformer': 3,
    },
    verbose=False)

full_pipeline = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('classifier', SVC(kernel='linear', C=1.2, probability=True))],
    verbose=False)

scorer = metrics.make_scorer(metrics.precision_score, pos_label="Yes")

grid_search_cv_params = [
    {
        'preprocessor__transformer_weights': [
            {'body_bow_vectorizer': 1, 'categorical_transformer': 1},
            {'body_bow_vectorizer': 1, 'categorical_transformer': 4},
            {'body_bow_vectorizer': 1, 'categorical_transformer': 6},
            {'body_bow_vectorizer': 1, 'categorical_transformer': 10},
            {'body_bow_vectorizer': 2, 'categorical_transformer': 1},
            {'body_bow_vectorizer': 4, 'categorical_transformer': 1},
            {'body_bow_vectorizer': 6, 'categorical_transformer': 1},
            {'body_bow_vectorizer': 10, 'categorical_transformer': 1},                                      
        ]
    },
    {'preprocessor__body_bow_vectorizer__stop_words': [None, 'english']},
    {'preprocessor__body_bow_vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 5), (2, 2), (2, 3), (2, 4), (2, 5), (3, 4), (3, 5)]},
    {'classifier__kernel': ['linear', 'rbf', 'poly', 'sigmoid']},
    {'classifier__C': [0.8, 1, 1.2, 2, 4]},    
]
classifier = GridSearchCV(full_pipeline, param_grid=grid_search_cv_params, cv=5, scoring='accuracy')

classifier.fit(X_train, y_train)
classifier.best_params_

{'preprocessor__transformer_weights': {'body_bow_vectorizer': 4,
  'categorical_transformer': 1}}

In [37]:
y_pred = classifier.predict(X_test)

report = metrics.classification_report(y_true, y_pred, digits=8)
print(report)

precision    recall  f1-score   support

          No  0.94174757 0.70802920 0.80833333       137
         Yes  0.45945946 0.85000000 0.59649123        40

    accuracy                      0.74011299       177
   macro avg  0.70060352 0.77901460 0.70241228       177
weighted avg  0.83275591 0.74011299 0.76045941       177

