In [1]:
from classifier import DialogueActClassifierFactory
from joblib import load
from pandas import DataFrame, option_context, read_csv
from pathlib import Path

# Column names used to slice both the training and the test CSV.
FEATURES = ['body', 'dialogue_act_classification_ml', 'comment_is_by_author']
LABEL = 'program_comprehension_challenge'

# Persisted estimators. NOTE: joblib.load unpickles arbitrary objects, so these
# files must come from a trusted source.
clf_grid_search_cv = load('./models/program_comprehension_challenge_gridsearchcv.pickle')
clf_pipeline_decision_tree = load('./models/program_comprehension_challenge_classifier.pickle')

# Dialogue act classifier obtained through the project's factory; the factory
# itself is used later on to classify comment bodies.
dac_factory = DialogueActClassifierFactory()
dac_clf = dac_factory.get_classifier(
    classifier_file=Path('./models/dialogue_act_classifier.pickle'),
    test_set_percentage=10,
)

# Labelled datasets.
training_dataset = read_csv('../master-of-engineering/Assets/BigQuery/training_dataset.csv')
test_dataset = read_csv('../master-of-engineering/Assets/BigQuery/test_dataset.csv')

# Feature frames and label columns for fitting and evaluation.
X_train, y_train = training_dataset[FEATURES], training_dataset[LABEL]
X_test, y_true = test_dataset[FEATURES], test_dataset[LABEL]

In [2]:
from sklearn import metrics

# Refit the loaded pipeline on the training split, then predict the test split.
clf_pipeline_decision_tree.fit(X_train, y_train)
y_pred = clf_pipeline_decision_tree.predict(X_test)

# Text classification report for the test split, printed with 8 decimal digits.
report = metrics.classification_report(y_true, y_pred, digits=8)

print(report)

              precision    recall  f1-score   support

          No  0.92857143 0.81981982 0.87081340       111
         Yes  0.61538462 0.82051282 0.70329670        39

    accuracy                      0.82000000       150
   macro avg  0.77197802 0.82016632 0.78705505       150
weighted avg  0.84714286 0.82000000 0.82725906       150



In [4]:
# Load the experiment results CSV that later cells draw random rows from.
# NOTE(review): this is an absolute path to a local volume ('/Volumes/RamDisk'),
# so the cell only runs on a machine where that volume is mounted -- consider a
# configurable data directory.
experiment_dataset = read_csv('/Volumes/RamDisk/results_20190503_1403_cleaned.csv')

In [4]:
from pandas import concat

# Collect the rows labelled 'Yes' from the training and the test dataset
# (in that order) into a single frame.
yes_rows_per_split = [
    labelled.loc[labelled['program_comprehension_challenge'] == 'Yes']
    for labelled in (training_dataset, test_dataset)
]
content_analysis_dataset = concat(yes_rows_per_split)

# Persist the starting point of the content analysis dataset (overwrites the file).
content_analysis_dataset.to_csv(
    '../master-of-engineering/Assets/BigQuery/content_analysis_dataset.csv',
    mode='w',
    header=True,
    index=False,
)

# Target sample size is 384 (95% confidence level, 5% confidence interval,
# population of 1036743). The 'Yes' rows taken from the training and test
# datasets were already analyzed, so only the remainder still has to be sampled.
sample_size = 384 - len(content_analysis_dataset)

In [5]:
# Number of rows available for sampling.
experiment_dataset_total_rows = len(experiment_dataset)
print(experiment_dataset_total_rows)

# Index labels of the experiment rows whose comment_id already appears in the
# content analysis dataset; these must not be drawn again.
already_analyzed = experiment_dataset['comment_id'].isin(content_analysis_dataset['comment_id'])
indices_already_sampled = list(experiment_dataset.loc[already_analyzed].index)
print(indices_already_sampled)

1036743
[509, 726, 17798, 24155, 32099, 36464, 43032, 54431, 56313, 69258, 73270, 80162, 90511, 105051, 110503, 129628, 132855, 133232, 133516, 163355, 167086, 185939, 189728, 197447, 209669, 214812, 232697, 235018, 246296, 252758, 258339, 259928, 270277, 274453, 284698, 297264, 315545, 330692, 333599, 338031, 340343, 346345, 354605, 364763, 368760, 373140, 380565, 380639, 389432, 403936, 405359, 413884, 424418, 432183, 436822, 456947, 461248, 474321, 506840, 513293, 522487, 526100, 527975, 534546, 536108, 542999, 547634, 565844, 566844, 579035, 595607, 596015, 600716, 609584, 611753, 615305, 619572, 622248, 626921, 632953, 644196, 664099, 668030, 668995, 690642, 701256, 705033, 707035, 715644, 726526, 733106, 752516, 761659, 762103, 762279, 765988, 775082, 776089, 805526, 810905, 813272, 814306, 818502, 818629, 824256, 827388, 836138, 841892, 843924, 857463, 866385, 868061, 886945, 887040, 907182, 916756, 919472, 935851, 947274, 947500, 951413, 956529, 976020, 987354, 990899, 991399, 

In [6]:
from random import choice

from pandas import concat

# Draw random, not-yet-sampled rows from the experiment dataset, classify each
# one, and keep the rows predicted 'Yes' until `sample_size` of them have been
# collected. The enlarged content analysis dataset is then written back to CSV.
#
# NOTE(review): no random seed is set, so every run draws different rows.
experiment_dataset_range = range(0, experiment_dataset_total_rows)

# Set mirror of `indices_already_sampled`: membership tests are O(1) instead of
# scanning the list for each of the ~1M candidate indices on every iteration.
sampled_index_set = set(indices_already_sampled)

# Accepted rows are collected here and concatenated once after the loop;
# growing the frame row by row is quadratic, and DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
newly_predicted_yes_rows = []

counter_yes = 0
counter_total = 0
while counter_yes < sample_size:
    if len(sampled_index_set) >= experiment_dataset_total_rows:
        # Same exception type the original `choice([])` raised, but with a message.
        raise IndexError('No unsampled rows left in the experiment dataset')

    counter_total += 1

    # Rejection sampling: redraw until an unused index comes up. This is still a
    # uniform draw over the unsampled indices and needs no per-iteration list;
    # it is cheap as long as only a small share of the rows has been sampled.
    random_index = choice(experiment_dataset_range)
    while random_index in sampled_index_set:
        random_index = choice(experiment_dataset_range)
    sampled_index_set.add(random_index)
    indices_already_sampled.append(random_index)

    # Copy so that the columns added below never touch `experiment_dataset`.
    row = experiment_dataset.loc[random_index].copy()
    body = row['body']
    comment_is_by_author = row['comment_is_by_author']
    dialogue_act_classification = dac_factory.classify(body)

    # Single-row frame with the columns in the same order (FEATURES) as the
    # frames the pipeline was fitted on.
    prediction = clf_pipeline_decision_tree.predict(
        DataFrame(
            {
                'body': [body],
                'comment_is_by_author': [comment_is_by_author],
                'dialogue_act_classification_ml': [dialogue_act_classification]
            })[FEATURES])

    if prediction[0] == 'Yes':
        row['dialogue_act_classification_ml'] = dialogue_act_classification
        row['topic_keywords'] = ''
        row['program_comprehension_challenge'] = prediction[0]
        row['problem_encountered'] = ''
        newly_predicted_yes_rows.append(row)
        counter_yes += 1

    print(f'Counter (Total): {counter_total}, Counter (Yes): {counter_yes}, Size of already sampled: {len(indices_already_sampled)}, Random Index: {random_index}')

if newly_predicted_yes_rows:
    # sort=False keeps the existing column order and puts any new columns last.
    content_analysis_dataset = concat(
        [content_analysis_dataset, DataFrame(newly_predicted_yes_rows).infer_objects()],
        sort=False)

content_analysis_dataset.to_csv('../master-of-engineering/Assets/BigQuery/content_analysis_dataset.csv', index=False, header=True, mode='w')

): 686, Counter (Yes): 190, Size of already sampled: 817, Random Index: 388023
Counter (Total): 687, Counter (Yes): 190, Size of already sampled: 818, Random Index: 936853
Counter (Total): 688, Counter (Yes): 191, Size of already sampled: 819, Random Index: 645060
Counter (Total): 689, Counter (Yes): 191, Size of already sampled: 820, Random Index: 9722
Counter (Total): 690, Counter (Yes): 191, Size of already sampled: 821, Random Index: 839795
Counter (Total): 691, Counter (Yes): 192, Size of already sampled: 822, Random Index: 547019
Counter (Total): 692, Counter (Yes): 192, Size of already sampled: 823, Random Index: 905737
Counter (Total): 693, Counter (Yes): 192, Size of already sampled: 824, Random Index: 147694
Counter (Total): 694, Counter (Yes): 193, Size of already sampled: 825, Random Index: 540420
Counter (Total): 695, Counter (Yes): 194, Size of already sampled: 826, Random Index: 16136
Counter (Total): 696, Counter (Yes): 194, Size of already sampled: 827, Random Index: 7

# Resample 1 more row due to inaccurate prediction

In [7]:
# Reload the content analysis dataset from disk and rebuild the bookkeeping
# from it: only experiment rows whose comment_id is in the reloaded file are
# treated as already sampled.
content_analysis_dataset = read_csv('../master-of-engineering/Assets/BigQuery/content_analysis_dataset.csv')
experiment_dataset_total_rows = len(experiment_dataset)

already_analyzed = experiment_dataset['comment_id'].isin(content_analysis_dataset['comment_id'])
indices_already_sampled = list(experiment_dataset.loc[already_analyzed].index)
print(indices_already_sampled)

[509, 726, 5214, 13246, 15721, 16136, 17798, 18033, 18485, 23560, 24155, 25351, 27372, 31189, 31224, 32099, 36464, 37069, 43032, 46187, 48764, 50359, 54431, 56313, 64978, 68549, 69258, 73270, 73625, 80162, 81554, 81865, 83585, 83869, 87230, 87962, 90511, 92806, 105051, 106425, 110503, 111094, 112501, 120123, 122380, 123414, 124241, 129628, 132855, 133232, 133322, 133516, 139354, 139913, 147488, 148545, 149293, 159932, 163355, 163821, 167086, 167857, 168095, 170801, 171532, 176238, 178384, 178896, 183477, 185939, 187565, 189728, 191399, 192144, 192233, 194173, 194404, 197447, 199034, 201840, 203370, 209669, 214552, 214812, 220419, 221723, 223203, 232697, 235018, 245152, 246296, 251501, 252758, 252787, 258339, 259928, 263737, 270277, 272302, 274453, 279697, 281703, 284698, 287628, 287773, 289801, 297264, 298107, 314753, 315545, 315794, 330692, 333599, 336061, 338031, 338199, 340343, 343384, 345160, 345881, 346345, 346918, 351878, 354162, 354605, 360921, 364763, 368760, 372518, 372560, 37

In [8]:
from random import choice

from pandas import concat

# Draw random, not-yet-sampled rows until one is predicted 'Yes', add that row
# to the content analysis dataset and write the dataset back to CSV.
#
# NOTE(review): no random seed is set, so every run draws different rows.
candidate_indices = range(0, experiment_dataset_total_rows)

# Set mirror of `indices_already_sampled`: membership tests are O(1) instead of
# scanning the list for each of the ~1M candidate indices on every iteration.
sampled_index_set = set(indices_already_sampled)

while True:
    if len(sampled_index_set) >= experiment_dataset_total_rows:
        # Same exception type the original `choice([])` raised, but with a message.
        raise IndexError('No unsampled rows left in the experiment dataset')

    # Rejection sampling: redraw until an unused index comes up. This is still a
    # uniform draw over the unsampled indices and needs no per-iteration list.
    random_index = choice(candidate_indices)
    while random_index in sampled_index_set:
        random_index = choice(candidate_indices)
    sampled_index_set.add(random_index)
    indices_already_sampled.append(random_index)

    # Copy so that the columns added below never touch `experiment_dataset`.
    row = experiment_dataset.loc[random_index].copy()
    body = row['body']
    comment_is_by_author = row['comment_is_by_author']
    dialogue_act_classification = dac_factory.classify(body)

    # Single-row frame with the columns in the same order (FEATURES) as the
    # frames the pipeline was fitted on.
    prediction = clf_pipeline_decision_tree.predict(
        DataFrame(
            {
                'body': [body],
                'comment_is_by_author': [comment_is_by_author],
                'dialogue_act_classification_ml': [dialogue_act_classification]
            })[FEATURES])

    if prediction[0] == 'Yes':
        row['dialogue_act_classification_ml'] = dialogue_act_classification
        row['topic_keywords'] = ''
        row['program_comprehension_challenge'] = prediction[0]
        row['problem_encountered'] = ''
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # sort=False keeps the existing column order and puts new columns last.
        content_analysis_dataset = concat(
            [content_analysis_dataset, DataFrame([row]).infer_objects()],
            sort=False)
        break

    # Only rejected draws are reported; the accepted draw leaves the loop above.
    print(f'Size of already sampled: {len(indices_already_sampled)}, Random Index: {random_index}')

content_analysis_dataset.to_csv('../master-of-engineering/Assets/BigQuery/content_analysis_dataset.csv', index=False, header=True, mode='w')

Size of already sampled: 385, Random Index: 73260
Size of already sampled: 386, Random Index: 679283
Size of already sampled: 387, Random Index: 137546
Size of already sampled: 388, Random Index: 491776
