In [1]:
import os
import yaml
import pickle

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

# Data

In [2]:
# Load the four annotated post-test pickles from ../../data/post_test.
# The loaded objects are passed to process_scores further down, which uses
# df[...] and df.apply on them, so they are presumably pandas DataFrames
# -- TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# run this cell on files produced by a trusted source.
with open('../../data/post_test/some_annotated.pkl', 'rb') as fp:
    some = pickle.load(fp)
    
with open('../../data/post_test/some_questions_annotated.pkl', 'rb') as fp:
    some_questions = pickle.load(fp)
    
with open('../../data/post_test/questions_annotated.pkl', 'rb') as fp:
    questions = pickle.load(fp)
    
with open('../../data/post_test/rankings_annotated.pkl', 'rb') as fp:
    rankings = pickle.load(fp)

In [3]:
def process_q1(label):
    """
    Scores the q1 label (about width change): 1 for correct, 0 for anything else.

    Only the label 'correct' scores 1. Every other value scores 0, which covers
    the known wrong labels ('none', 'unrelated', 'inverse', 'to check', 'larger',
    'smaller', 'nans', 'same') as well as any label that is not listed here.

    NOTE(review): 'to check' looks like an annotation that still needs review;
    it is scored 0 like a wrong answer -- confirm that this is intended.

    :param label: the annotated q1 label
    :return: 1 if the label is correct, 0 otherwise
    """
    correct = ('correct',)
    return 1 if label in correct else 0
    
def process_q2(label):
    """
    Scores the q2 label (about concentration change): 1 for correct, 0 for anything else.

    Only the label 'correct' scores 1. Every other value scores 0, which covers
    the known wrong labels ('nans', 'larger', 'inverse', 'unrelated',
    'mathematical', 'smaller', 'same') as well as any label that is not listed here.

    :param label: the annotated q2 label
    :return: 1 if the label is correct, 0 otherwise
    """
    correct = ('correct',)
    return 1 if label in correct else 0
    
def process_q3(label):
    """
    Scores the q3 label (about width and concentration change): 1 for correct, 0 for anything else.

    Unlike the other label questions, two labels score 1 here: 'correct' and
    'same'. Every other value scores 0, which covers the known wrong labels
    ('nans', 'half', 'double', 'unrelated', 'larger', 'smaller', 'quadruple')
    as well as any label that is not listed here.

    :param label: the annotated q3 label
    :return: 1 if the label is correct, 0 otherwise
    """
    correct = ('correct', 'same')
    return 1 if label in correct else 0
    
def process_q4(label):
    """
    Scores the q4 label (about width and concentration change): 1 for correct, 0 for anything else.

    Only the label 'correct' scores 1. Every other value scores 0, which covers
    the known wrong labels ('nans', 'smaller', 'fourth', 'larger', 'same',
    'inverse', 'half', 'unrelated', 'quadruple', 'double', 'third', 'to check',
    'triple') as well as any label that is not listed here.

    NOTE(review): 'to check' looks like an annotation that still needs review;
    it is scored 0 like a wrong answer -- confirm that this is intended.

    :param label: the annotated q4 label
    :return: 1 if the label is correct, 0 otherwise
    """
    correct = ('correct',)
    return 1 if label in correct else 0

def process_conf(conf):
    """
    Converts a raw confidence entry into a float.

    The sentinel string 'missing' is mapped to 0.0; any other value is passed
    to float(), so the result is always a float regardless of the branch taken.

    :param conf: the raw confidence value, or the string 'missing'
    :return: the confidence as a float (0.0 when missing)
    :raises ValueError, TypeError: whatever float() raises for a value it cannot convert
    """
    if conf == 'missing':
        # Return a float here too, so callers never get a mix of int and float.
        return 0.0
    return float(conf)
    
def process_q5(row, weight_violet=0, weight_blue=1, weight_yellow=0, weight_red=0):
    """
    Binary score for the 5th question (so labelled in the dataframe, but actually the 7th from the last version [4]).

    Reads the four confidences q5_colour0..q5_colour3 (violet, blue, yellow, red)
    and returns 1 when np.argmax picks q5_colour1 (blue), 0 otherwise.

    np.argmax returns the first maximum, so a tie that involves q5_colour0 scores 0
    (this includes the case where all four are 'missing'), while a tie between
    q5_colour1 and only q5_colour2 / q5_colour3 scores 1.

    NOTE(review): the weight_* parameters are accepted but never used here; the
    answer that counts is always the one at position 1.
    """
    columns = ('q5_colour0', 'q5_colour1', 'q5_colour2', 'q5_colour3')
    confidences = [process_conf(row[column]) for column in columns]
    return int(np.argmax(confidences) == 1)
    
def process_q5_conf(row, weight_violet=0, weight_blue=1, weight_yellow=0, weight_red=0):
    """
    Weighted confidence score for the 5th question (so labelled in the dataframe, but actually the 7th from the last version [4]).

    Each of the four confidences q5_colour0..q5_colour3 (violet, blue, yellow, red)
    is multiplied by its weight and the products are added up. With the default
    weights only q5_colour1 (blue) contributes, so for finite confidences the
    score is the blue confidence itself.
    """
    columns = ('q5_colour0', 'q5_colour1', 'q5_colour2', 'q5_colour3')
    weights = (weight_violet, weight_blue, weight_yellow, weight_red)

    confidences = [process_conf(row[column]) for column in columns]
    products = [confidence * weight for confidence, weight in zip(confidences, weights)]
    return products[0] + products[1] + products[2] + products[3]

def process_q6(row, weight_cobalt=0, weight_kchromate=1, weight_copper=0, weight_kpermanganate=0):
    """
    Binary score for the 6th question (so labelled in the dataframe, but actually the 8th from the last version [4]).

    Reads the four confidences q6_colour0..q6_colour3 (cobalt, kchromate, copper,
    kpermanganate) and returns 1 when np.argmax picks q6_colour1 (kchromate),
    0 otherwise.

    np.argmax returns the first maximum, so a tie that involves q6_colour0 scores 0
    (this includes the case where all four are 'missing'), while a tie between
    q6_colour1 and only q6_colour2 / q6_colour3 scores 1.

    NOTE(review): the weight_* parameters are accepted but never used here; the
    answer that counts is always the one at position 1.
    """
    columns = ('q6_colour0', 'q6_colour1', 'q6_colour2', 'q6_colour3')
    confidences = [process_conf(row[column]) for column in columns]
    return int(np.argmax(confidences) == 1)

def process_q6_conf(row, weight_cobalt=0, weight_kchromate=1, weight_copper=0, weight_kpermanganate=0):
    """
    Weighted confidence score for the 6th question (so labelled in the dataframe, but actually the 8th from the last version [4]).

    Each of the four confidences q6_colour0..q6_colour3 (cobalt, kchromate, copper,
    kpermanganate) is multiplied by its weight and the products are added up.
    With the default weights only q6_colour1 (kchromate) contributes, so for
    finite confidences the score is the kchromate confidence itself.
    """
    columns = ('q6_colour0', 'q6_colour1', 'q6_colour2', 'q6_colour3')
    weights = (weight_cobalt, weight_kchromate, weight_copper, weight_kpermanganate)

    confidences = [process_conf(row[column]) for column in columns]
    products = [confidence * weight for confidence, weight in zip(confidences, weights)]
    return products[0] + products[1] + products[2] + products[3]
    

In [4]:
def process_scores(df):
    """
    Adds the score columns for questions 1 to 6 to the dataframe.

    q1..q4 are scored from their '<q>_label' column; q5 and q6 are scored row by
    row, once as a binary score and once as a confidence score.

    The columns are written onto the dataframe that is passed in (it is modified
    in place) and that same dataframe is returned.

    :param df: dataframe with the q1..q4 label columns and the q5/q6 colour columns
    :return: the same dataframe, with the eight score columns added
    """
    # (new column, label column, scoring function) -- applied value by value
    label_scorers = (
        ('q1_score', 'q1_label', process_q1),
        ('q2_score', 'q2_label', process_q2),
        ('q3_score', 'q3_label', process_q3),
        ('q4_score', 'q4_label', process_q4),
    )
    for target, source, scorer in label_scorers:
        df[target] = df[source].apply(scorer)

    # (new column, scoring function) -- applied to whole rows
    row_scorers = (
        ('q5_score', process_q5),
        ('q6_score', process_q6),
        ('q5_score_conf', process_q5_conf),
        ('q6_score_conf', process_q6_conf),
    )
    for target, scorer in row_scorers:
        df[target] = df.apply(scorer, axis=1)

    return df

In [5]:
# Score every dataset, in the same order as before, and rebind each name to the result.
some, some_questions, questions, rankings = [
    process_scores(frame)
    for frame in (some, some_questions, questions, rankings)
]

In [6]:
# Write the four scored objects to ../../data/post_test as *_scored.pkl.
# Files are opened in 'wb' mode, so an existing file with the same name is
# overwritten.
with open('../../data/post_test/some_scored.pkl', 'wb') as fp:
    pickle.dump(some, fp)
    
with open('../../data/post_test/some_questions_scored.pkl', 'wb') as fp:
    pickle.dump(some_questions, fp)
    
with open('../../data/post_test/questions_scored.pkl', 'wb') as fp:
    pickle.dump(questions, fp)
    
with open('../../data/post_test/rankings_scored.pkl', 'wb') as fp:
    pickle.dump(rankings, fp)