In [16]:
from classifier import DialogueActClassifierFactory
from joblib import load
from pandas import DataFrame, option_context, read_csv
from pathlib import Path

# Column names used to slice the labelled test set.
LABEL = 'program_comprehension_challenge'
FEATURES = ['body', 'dialogue_act_classification_ml', 'comment_is_by_author']

# Restore the pickled program-comprehension-challenge classifier.
pcc_clf = load('./models/program_comprehension_challenge_classifier.pickle')

# Set up the dialogue-act classifier factory. The return value of
# get_classifier() is intentionally not kept here; presumably the call
# prepares the classifier inside the factory -- TODO confirm against
# classifier.DialogueActClassifierFactory.
dac_factory = DialogueActClassifierFactory()
dac_factory.get_classifier(
    classifier_file=Path('./models/dialogue_act_classifier.pickle'),
    test_set_percentage=10,
)

# Labelled hold-out data, split into feature columns and the target column.
test_dataset = read_csv('../master-of-engineering/Assets/BigQuery/test_dataset.csv')
X_test, y_true = test_dataset[FEATURES], test_dataset[LABEL]

In [17]:
from sklearn import metrics

# Score the classifier on the hold-out features and print per-class
# precision / recall / F1 with 8 decimal places.
y_pred = pcc_clf.predict(X_test)
report = metrics.classification_report(y_true=y_true, y_pred=y_pred, digits=8)
print(report)

              precision    recall  f1-score   support

          No  0.88695652 0.91891892 0.90265487       111
         Yes  0.74285714 0.66666667 0.70270270        39

    accuracy                      0.85333333       150
   macro avg  0.81490683 0.79279279 0.80267878       150
weighted avg  0.84949068 0.85333333 0.85066730       150



In [18]:
# Load the cleaned experiment export and report its row count.
# NOTE(review): this is an absolute, machine-specific path (a RAM disk);
# the notebook will not run elsewhere until it is made configurable.
experiment_dataset = read_csv('/Volumes/RamDisk/results_20190503_1403_cleaned.csv')
experiment_dataset_total_rows = len(experiment_dataset.index)
print(experiment_dataset_total_rows)

1036743


In [19]:
# Comments that were already drawn into the content-analysis sample.
content_analysis_dataset = read_csv('../master-of-engineering/Assets/BigQuery/content_analysis_dataset.csv')

# Mark every experiment row whose comment_id already appears in that sample,
# then collect the index labels of the marked rows as a plain list.
is_already_sampled = experiment_dataset['comment_id'].isin(content_analysis_dataset['comment_id'])
indices_already_sampled = list(experiment_dataset.index[is_already_sampled.values])
print(indices_already_sampled)

[509, 726, 5155, 6263, 8100, 9837, 13301, 13701, 17596, 17798, 24155, 28557, 32099, 36464, 40634, 41192, 41333, 41958, 43032, 45140, 48764, 54431, 56313, 59456, 61060, 63502, 65612, 67282, 67466, 69258, 70808, 73270, 80162, 80998, 90511, 105051, 108957, 110483, 110503, 116450, 129628, 131264, 132855, 133232, 133516, 135587, 146050, 146853, 148757, 150592, 160426, 163355, 164728, 167086, 167931, 168984, 173525, 178384, 180627, 185363, 185939, 186070, 186924, 188374, 189728, 191929, 196339, 197341, 197447, 197645, 198210, 201840, 203269, 209669, 214812, 218597, 219589, 222980, 226954, 227514, 227854, 228884, 232697, 234579, 235018, 246296, 248600, 252758, 258339, 259236, 259657, 259829, 259928, 262816, 266850, 270277, 272491, 272499, 274453, 276299, 281703, 283658, 284698, 288057, 294139, 297264, 298021, 302694, 303594, 306570, 312405, 312712, 315270, 315545, 318150, 324654, 325582, 330692, 331449, 331514, 333599, 335101, 338031, 340239, 340343, 342352, 344104, 346345, 346493, 349093, 34

# Resample more rows due to inaccurate prediction

In [20]:
from random import choice

from pandas import concat

# Draw previously unsampled rows at random, classify each one, and keep the
# rows predicted 'Yes' until `sample_size` of them have been collected. The
# kept rows are added to content_analysis_dataset, which is then written
# back to its CSV file.
#
# NOTE(review): no random seed is set, so each run draws different rows.

sample_size = 70
experiment_dataset_range = range(0, experiment_dataset_total_rows)

# Set mirror of indices_already_sampled for O(1) membership tests; the list
# itself is still appended to so its contents stay visible to other cells.
already_sampled_lookup = set(indices_already_sampled)

# Number of indices inside the sampling range that have not been drawn yet.
remaining_candidates = experiment_dataset_total_rows - sum(
    1 for index in already_sampled_lookup
    if 0 <= index < experiment_dataset_total_rows)

# Rows predicted 'Yes'; combined with content_analysis_dataset once, after
# the loop, instead of growing the DataFrame row by row.
newly_labelled_rows = []

counter_yes = 0
counter_total = 0
while counter_yes < sample_size:
    if remaining_candidates <= 0:
        # Nothing left to draw: stop and keep what was collected so far.
        print(f'No unsampled rows left; stopping with {counter_yes} of {sample_size} rows.')
        break

    # Rejection sampling: draw uniformly from the whole range and retry on a
    # repeat. This yields a uniform draw over the unsampled indices without
    # rebuilding the full candidate list on every iteration.
    random_index = choice(experiment_dataset_range)
    if random_index in already_sampled_lookup:
        continue

    counter_total += 1
    already_sampled_lookup.add(random_index)
    indices_already_sampled.append(random_index)
    remaining_candidates -= 1

    # Copy so the assignments below act on an independent Series rather than
    # on a slice that may still be tied to experiment_dataset.
    row = experiment_dataset.loc[random_index].copy()
    body = row['body']
    comment_is_by_author = row['comment_is_by_author']
    dialogue_act_classification = dac_factory.classify(body)
    prediction = pcc_clf.predict(
        DataFrame(
            {
                'body': [body],
                'comment_is_by_author': [comment_is_by_author],
                'dialogue_act_classification_ml': [dialogue_act_classification]
            }))

    if prediction[0] == 'Yes':
        row['dialogue_act_classification_ml'] = dialogue_act_classification
        row['topic_keywords'] = ''
        row['program_comprehension_challenge'] = prediction[0]
        row['problem_encountered'] = ''
        newly_labelled_rows.append(row)
        counter_yes += 1

    print(f'Counter (Total): {counter_total}, Counter (Yes): {counter_yes}, Size of already sampled: {len(indices_already_sampled)}, Random Index: {random_index}')

if newly_labelled_rows:
    # Single concatenation; sort=False keeps the existing column order and
    # places any columns that only the new rows have after them.
    content_analysis_dataset = concat(
        [content_analysis_dataset, DataFrame(newly_labelled_rows).infer_objects()],
        sort=False)

content_analysis_dataset.to_csv('../master-of-engineering/Assets/BigQuery/content_analysis_dataset.csv', index=False, header=True, mode='w')

0, Size of already sampled: 535, Random Index: 556171
Counter (Total): 139, Counter (Yes): 30, Size of already sampled: 536, Random Index: 107543
Counter (Total): 140, Counter (Yes): 30, Size of already sampled: 537, Random Index: 371138
Counter (Total): 141, Counter (Yes): 30, Size of already sampled: 538, Random Index: 589208
Counter (Total): 142, Counter (Yes): 30, Size of already sampled: 539, Random Index: 9387
Counter (Total): 143, Counter (Yes): 30, Size of already sampled: 540, Random Index: 606942
Counter (Total): 144, Counter (Yes): 30, Size of already sampled: 541, Random Index: 319479
Counter (Total): 145, Counter (Yes): 30, Size of already sampled: 542, Random Index: 813627
Counter (Total): 146, Counter (Yes): 30, Size of already sampled: 543, Random Index: 521271
Counter (Total): 147, Counter (Yes): 30, Size of already sampled: 544, Random Index: 987238
Counter (Total): 148, Counter (Yes): 30, Size of already sampled: 545, Random Index: 521598
Counter (Total): 149, Counte