In [None]:
from classifier import DialogueActClassifierFactory
from joblib import load
from pandas import DataFrame, option_context, read_csv
from pathlib import Path

# Names of the model input columns and of the ground-truth label column.
LABEL = 'program_comprehension_challenge'
FEATURES = ['body', 'dialogue_act_classification_ml', 'comment_is_by_author']

# Restore the persisted program-comprehension-challenge classifier.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files from a trusted source.
pcc_clf = load('./models/program_comprehension_challenge_classifier.pickle')

# Set up the dialogue-act classifier factory from its pickle file.
# NOTE(review): the return value of get_classifier() is discarded; presumably
# the call initialises state inside the factory that classify() relies on
# later -- confirm against the `classifier` module.
dac_factory = DialogueActClassifierFactory()
dac_factory.get_classifier(
    classifier_file=Path('./models/dialogue_act_classifier.pickle'),
    test_set_percentage=10,
)

# Read the test dataset and separate the label column from the feature columns.
test_dataset = read_csv('../master-of-engineering/Assets/BigQuery/test_dataset.csv')
y_true = test_dataset[LABEL]
X_test = test_dataset[FEATURES]

In [None]:
from sklearn import metrics

# Predict the label for every test row, then print a per-class
# precision / recall / F1 report formatted with 8 decimal digits.
y_pred = pcc_clf.predict(X_test)
report = metrics.classification_report(
    y_true=y_true,
    y_pred=y_pred,
    digits=8,
)
print(report)

In [None]:
# Load the cleaned experiment results and report how many rows they contain.
# NOTE(review): this is a machine-specific absolute path (RAM disk volume);
# the cell only runs where that volume is mounted.
experiment_dataset = read_csv('/Volumes/RamDisk/results_20190503_1403_cleaned.csv')
experiment_dataset_total_rows = len(experiment_dataset)
print(experiment_dataset_total_rows)

In [None]:
# Load the rows that were already selected for content analysis.
content_analysis_dataset = read_csv('../master-of-engineering/Assets/BigQuery/content_analysis_dataset.csv')

# Flag the experiment rows whose comment_id already occurs in the
# content-analysis dataset and collect their index labels.
is_already_sampled = experiment_dataset['comment_id'].isin(content_analysis_dataset['comment_id'])
indices_already_sampled = experiment_dataset.loc[is_already_sampled].index.tolist()
print(indices_already_sampled)

# Resample more rows due to inaccurate prediction

In [None]:
# `choice` is no longer used in this cell but stays imported in case other
# cells rely on it.
from random import choice, shuffle

from pandas import concat

# Number of additional rows predicted 'Yes' to collect.
sample_size = 70

# Build the pool of not-yet-sampled row indices once, using a set for O(1)
# membership tests, then shuffle it. Walking a shuffled pool draws uniformly
# without replacement -- the same distribution as calling choice() on a freshly
# filtered list each iteration -- without rescanning the dataset on every draw.
already_sampled = set(indices_already_sampled)
experiment_dataset_range = range(0, experiment_dataset_total_rows)
candidate_indices = [i for i in experiment_dataset_range if i not in already_sampled]
shuffle(candidate_indices)

# Rows predicted 'Yes' are collected as plain dicts and concatenated once after
# the loop: DataFrame.append was removed in pandas 2.0, and growing a frame row
# by row copies it on every iteration.
new_rows = []
counter_yes = 0
counter_total = 0
for random_index in candidate_indices:
    if counter_yes >= sample_size:
        break

    counter_total += 1
    # Keep the shared list up to date so the row is never drawn again.
    indices_already_sampled.append(random_index)

    row = experiment_dataset.loc[random_index]
    body = row['body']
    comment_is_by_author = row['comment_is_by_author']

    # The challenge classifier takes the dialogue-act label as a feature, so
    # classify the comment body first.
    dialogue_act_classification = dac_factory.classify(body)
    prediction = pcc_clf.predict(
        DataFrame(
            {
                'body': [body],
                'comment_is_by_author': [comment_is_by_author],
                'dialogue_act_classification_ml': [dialogue_act_classification]
            }))

    if prediction[0] == 'Yes':
        # Work on a dict copy instead of assigning into the Series returned by
        # .loc, which can raise SettingWithCopyWarning.
        record = row.to_dict()
        record['dialogue_act_classification_ml'] = dialogue_act_classification
        record['topic_keywords'] = ''
        record['program_comprehension_challenge'] = prediction[0]
        record['problem_encountered'] = ''
        new_rows.append(record)
        counter_yes += 1

    print(f'Counter (Total): {counter_total}, Counter (Yes): {counter_yes}, Size of already sampled: {len(indices_already_sampled)}, Random Index: {random_index}')

# Previously an exhausted candidate pool crashed with IndexError from
# choice([]); report the shortfall instead and keep what was found.
if counter_yes < sample_size:
    print(f'Ran out of unsampled rows: found {counter_yes} of {sample_size} requested rows predicted Yes')

if new_rows:
    content_analysis_dataset = concat(
        [content_analysis_dataset, DataFrame(new_rows)],
        ignore_index=True,
        sort=False,
    )

content_analysis_dataset.to_csv('../master-of-engineering/Assets/BigQuery/content_analysis_dataset.csv', index=False, header=True, mode='w')