In [1]:
from classifier import DialogueActClassifierFactory
from joblib import load
from pandas import DataFrame, option_context, read_csv
from pathlib import Path

# --- Pre-trained classifiers -------------------------------------------------
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files that come from a trusted source.
pcc_clf = load('./models/program_comprehension_challenge_classifier.pickle')

dac_factory = DialogueActClassifierFactory()
dac_clf = dac_factory.get_classifier(
    classifier_file=Path('./models/dialogue_act_classifier.pickle'),
    test_set_percentage=10,
)

# --- Labelled datasets (training first, then test) ---------------------------
training_dataset, test_dataset = map(read_csv, (
    '../master-of-engineering/Assets/BigQuery/training_dataset.csv',
    '../master-of-engineering/Assets/BigQuery/test_dataset.csv',
))

# Column the classifier predicts, and the input columns it is given.
LABEL = 'program_comprehension_challenge'
FEATURES = ['body', 'dialogue_act_classification_ml', 'comment_is_by_author']

# Feature frames and label series for each dataset.
X_train, y_train = training_dataset[FEATURES], training_dataset[LABEL]
X_test, y_true = test_dataset[FEATURES], test_dataset[LABEL]

In [2]:
from sklearn import metrics

# Predict labels for the test dataset with the loaded classifier, then print a
# per-class precision / recall / F1 report (8 decimal places) against y_true.
y_pred = pcc_clf.predict(X_test)
report = metrics.classification_report(y_true=y_true, y_pred=y_pred, digits=8)
print(report)

              precision    recall  f1-score   support

          No  0.88695652 0.91891892 0.90265487       111
         Yes  0.74285714 0.66666667 0.70270270        39

    accuracy                      0.85333333       150
   macro avg  0.81490683 0.79279279 0.80267878       150
weighted avg  0.84949068 0.85333333 0.85066730       150



In [3]:
# DISABLED CELL (kept for reference only; nothing below is executed).
# When enabled it reloaded content_analysis_dataset.csv, ran pcc_clf.predict on
# each row individually, stored the result in a new
# 'program_comprehension_challenge_new' column and wrote the frame to a CSV on
# a RAM-disk path.
#
# content_analysis_dataset = read_csv('../master-of-engineering/Assets/BigQuery/content_analysis_dataset.csv')
# content_analysis_dataset['program_comprehension_challenge_new'] = ''

# for idx, row in content_analysis_dataset.iterrows():
#     body = row['body']
#     comment_is_by_author = row['comment_is_by_author']
#     dialogue_act_classification = row['dialogue_act_classification_ml']
#     prediction = pcc_clf.predict(
#         DataFrame(
#             {
#                 'body': [body], 
#                 'comment_is_by_author': [comment_is_by_author], 
#                 'dialogue_act_classification_ml': [dialogue_act_classification]
#             }))

#     row['program_comprehension_challenge_new'] = prediction[0]
#     content_analysis_dataset.loc[idx] = row

# content_analysis_dataset.to_csv('/Volumes/RamDisk/content_analysis_dataset_new.csv', index=False, header=True, mode='w')

In [4]:
# Load the experiment dataset that later cells draw their random sample from.
# NOTE(review): this is an absolute, machine-specific path (a RAM disk volume);
# the cell fails on any machine where that volume is not mounted.
experiment_dataset_path = '/Volumes/RamDisk/results_20190503_1403_cleaned.csv'
experiment_dataset = read_csv(experiment_dataset_path)

In [5]:
from pandas import concat

# Start the content-analysis set from every comment that is already labelled
# 'Yes' in the training and test datasets (training rows first).
content_analysis_dataset = concat([
    labelled.loc[labelled['program_comprehension_challenge'] == 'Yes']
    for labelled in (training_dataset, test_dataset)
])

# Target sample size: 95% confidence level and 5% margin of error for a
# population of 1036743 comments gives 384. The 'Yes' rows collected above have
# already been analysed, so only the remainder still has to be sampled.
REQUIRED_SAMPLE_SIZE = 384
sample_size = REQUIRED_SAMPLE_SIZE - len(content_analysis_dataset)
sample_size

253

In [6]:
# Size of the pool that the random sample is drawn from.
experiment_dataset_total_rows = len(experiment_dataset)
print(experiment_dataset_total_rows)

# Index labels of experiment rows whose comment_id is already part of the
# content-analysis set; these must not be drawn again.
is_already_analysed = experiment_dataset['comment_id'].isin(
    content_analysis_dataset['comment_id']
)
already_analysed_rows = experiment_dataset.loc[is_already_analysed]
indices_already_sampled = list(already_analysed_rows.index)
print(indices_already_sampled)

1036743
[509, 726, 17798, 24155, 32099, 36464, 43032, 54431, 56313, 69258, 73270, 80162, 90511, 105051, 110503, 129628, 132855, 133232, 133516, 163355, 167086, 185939, 189728, 197447, 209669, 214812, 232697, 235018, 246296, 252758, 258339, 259928, 270277, 274453, 284698, 297264, 315545, 330692, 333599, 338031, 340343, 346345, 354605, 364763, 368760, 373140, 380565, 380639, 389432, 403936, 405359, 413884, 424418, 432183, 436822, 456947, 461248, 474321, 506840, 513293, 522487, 526100, 527975, 534546, 536108, 542999, 547634, 565844, 566844, 579035, 595607, 596015, 600716, 609584, 611753, 615305, 619572, 622248, 626921, 632953, 644196, 664099, 668030, 668995, 690642, 701256, 705033, 707035, 715644, 726526, 733106, 752516, 761659, 762103, 762279, 765988, 775082, 776089, 805526, 810905, 813272, 814306, 818502, 818629, 824256, 827388, 836138, 841892, 843924, 857463, 866385, 868061, 886945, 887040, 907182, 916756, 919472, 935851, 947274, 947500, 951413, 956529, 976020, 987354, 990899, 991399, 

In [7]:
from random import Random

from pandas import concat

# Draw random, not-yet-sampled rows from experiment_dataset, classify each one,
# and keep the rows predicted 'Yes' until `sample_size` of them were collected.
# The collected rows are added to content_analysis_dataset, which is then
# written back to CSV.
#
# Reads:   experiment_dataset, experiment_dataset_total_rows, sample_size,
#          indices_already_sampled, content_analysis_dataset, pcc_clf, dac_factory
# Updates: indices_already_sampled (extended in place), content_analysis_dataset
# Raises:  RuntimeError if every row was drawn before enough 'Yes' rows were found

# Dedicated, seeded generator so a fresh "Restart & Run All" draws the same sample.
SAMPLING_SEED = 42
rng = Random(SAMPLING_SEED)

# Assumes experiment_dataset has the default 0..n-1 index (as produced by
# read_csv), so positions in this range are also valid .loc labels.
experiment_dataset_range = range(0, experiment_dataset_total_rows)

# Set mirror of indices_already_sampled for O(1) membership tests; the list is
# still extended so that later cells and the progress line see the same state.
already_sampled_lookup = set(indices_already_sampled)

# Rows predicted 'Yes'; concatenated once after the loop instead of growing the
# DataFrame row by row (DataFrame.append was removed in pandas 2.0).
newly_sampled_rows = []

counter_yes = 0
counter_total = 0
while counter_yes < sample_size:
    if len(already_sampled_lookup) >= experiment_dataset_total_rows:
        raise RuntimeError(
            f'All {experiment_dataset_total_rows} rows have been sampled, '
            f'but only {counter_yes} of {sample_size} rows were predicted Yes.'
        )

    # Rejection sampling: draw from the whole range and retry on a repeat.
    # This is uniform over the unsampled indices without rebuilding a list of
    # every remaining index for each draw.
    random_index = rng.choice(experiment_dataset_range)
    if random_index in already_sampled_lookup:
        continue

    counter_total += 1
    already_sampled_lookup.add(random_index)
    indices_already_sampled.append(random_index)

    # Copy so the assignments below never write through to experiment_dataset.
    row = experiment_dataset.loc[random_index].copy()
    body = row['body']
    comment_is_by_author = row['comment_is_by_author']
    dialogue_act_classification = dac_factory.classify(body)
    prediction = pcc_clf.predict(
        DataFrame(
            {
                'body': [body],
                'comment_is_by_author': [comment_is_by_author],
                'dialogue_act_classification_ml': [dialogue_act_classification]
            }))

    if prediction[0] == 'Yes':
        row['dialogue_act_classification_ml'] = dialogue_act_classification
        row['topic_keywords'] = ''
        row['program_comprehension_challenge'] = prediction[0]
        row['problem_encountered'] = ''
        newly_sampled_rows.append(row)
        counter_yes += 1

    print(f'Counter (Total): {counter_total}, Counter (Yes): {counter_yes}, Size of already sampled: {len(indices_already_sampled)}, Random Index: {random_index}')

if newly_sampled_rows:
    # Each Series keeps its experiment_dataset index label as its row label.
    content_analysis_dataset = concat([content_analysis_dataset, DataFrame(newly_sampled_rows)])

content_analysis_dataset.to_csv('../master-of-engineering/Assets/BigQuery/content_analysis_dataset.csv', index=False, header=True, mode='w')

r (Yes): 211, Size of already sampled: 1296, Random Index: 806099
Counter (Total): 1166, Counter (Yes): 211, Size of already sampled: 1297, Random Index: 389064
Counter (Total): 1167, Counter (Yes): 211, Size of already sampled: 1298, Random Index: 102192
Counter (Total): 1168, Counter (Yes): 211, Size of already sampled: 1299, Random Index: 346876
Counter (Total): 1169, Counter (Yes): 211, Size of already sampled: 1300, Random Index: 626523
Counter (Total): 1170, Counter (Yes): 211, Size of already sampled: 1301, Random Index: 197768
Counter (Total): 1171, Counter (Yes): 211, Size of already sampled: 1302, Random Index: 53484
Counter (Total): 1172, Counter (Yes): 212, Size of already sampled: 1303, Random Index: 259236
Counter (Total): 1173, Counter (Yes): 212, Size of already sampled: 1304, Random Index: 766109
Counter (Total): 1174, Counter (Yes): 212, Size of already sampled: 1305, Random Index: 822989
Counter (Total): 1175, Counter (Yes): 212, Size of already sampled: 1306, Random

# Resample 1 more row due to inaccurate prediction

In [8]:
# DISABLED CELL (kept for reference only; nothing below is executed).
# When enabled it reloaded the saved content-analysis CSV and rebuilt
# indices_already_sampled from its comment_id column, as set-up for the
# one-row resampling cell that follows.
#
# content_analysis_dataset = read_csv('../master-of-engineering/Assets/BigQuery/content_analysis_dataset.csv')
# experiment_dataset_total_rows = experiment_dataset.shape[0]
# indices_already_sampled = list(experiment_dataset.loc[experiment_dataset['comment_id'].isin(content_analysis_dataset['comment_id'])].index)
# print(indices_already_sampled)

In [9]:
# DISABLED CELL (kept for reference only; nothing below is executed).
# When enabled it drew random, not-yet-sampled rows until one was predicted
# 'Yes', appended that single row to content_analysis_dataset and rewrote the
# CSV.
# NOTE(review): before re-enabling, replace DataFrame.append (removed in
# pandas 2.0) and note that the list comprehension passed to choice() scans the
# whole index range on every draw.
#
# from random import choice

# while True:
#     random_index = choice([i for i in range(0, experiment_dataset_total_rows) if i not in indices_already_sampled])
#     indices_already_sampled.append(random_index)
#     row = experiment_dataset.loc[random_index]
#     body = row['body']
#     comment_is_by_author = row['comment_is_by_author']
#     dialogue_act_classification = dac_factory.classify(body)
#     prediction = pcc_clf.predict(
#         DataFrame(
#             {
#                 'body': [body], 
#                 'comment_is_by_author': [comment_is_by_author], 
#                 'dialogue_act_classification_ml': [dialogue_act_classification]
#             }))

#     if prediction[0] == 'Yes':
#         row['dialogue_act_classification_ml'] = dialogue_act_classification
#         row['topic_keywords'] = ''
#         row['program_comprehension_challenge'] = prediction[0]
#         row['problem_encountered'] = ''
#         content_analysis_dataset = content_analysis_dataset.append(row)
#         break

#     print(f'Size of already sampled: {len(indices_already_sampled)}, Random Index: {random_index}')

# content_analysis_dataset.to_csv('../master-of-engineering/Assets/BigQuery/content_analysis_dataset.csv', index=False, header=True, mode='w')