In [1]:
import pandas as pd
import numpy as np
import sys
from parsing import parse_chart_name 
import os

In [2]:
def get_MTurk_data(csv_file):
    """Read a CSV file into a DataFrame and print a short summary banner.

    Parameters
    ----------
    csv_file : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as ``pd.read_csv`` parsed them.

    Notes
    -----
    The banner written to stdout is: a dashed rule, the file name, a blank
    line, the number of rows read, and a closing dashed rule.
    """
    frame = pd.read_csv(csv_file)

    rule = '--------------------------------'
    banner = [rule, csv_file, '', f'Initial df size: {len(frame)}', rule]
    # One print call; sep='\n' puts every banner entry on its own line.
    print(*banner, sep='\n')

    return frame

In [3]:
def process_MTurk_data(data, category, n_questions=12):
    """Reshape a wide MTurk results frame into one row per answered question.

    Each input row holds one worker's answers to ``n_questions`` questions,
    spread over the column pairs ``Input.img{i}`` / ``Answer.{label}{i}``
    (the first answer column carries no numeric suffix). The pairs are
    stacked vertically, question 1 first, then question 2, and so on.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``WorkerId`` plus the ``Input.img{i}`` and answer
        columns for ``i`` in ``1..n_questions``. It is not modified.
    category : str
        ``'trust'`` reads the ``Answer.trust*`` columns; ``'readability'``
        reads the ``Answer.explain*`` columns.
    n_questions : int, default 12
        Number of question/answer column pairs per input row.

    Returns
    -------
    pandas.DataFrame
        Columns ``question_id``, ``worker_id``, ``answer``.
        ``question_id`` is the text after the last ``/`` of the
        ``Input.img{i}`` value. The row index of ``data`` is kept as-is,
        so index labels repeat once per question.

    Raises
    ------
    ValueError
        If ``category`` is neither ``'trust'`` nor ``'readability'``.
    KeyError
        If an expected column is missing from ``data``.
    """
    if category == 'readability':
        label = 'explain'
    elif category == 'trust':
        label = 'trust'
    else:
        raise ValueError(f'Category "{category}" not known!')

    column_names = ['question_id', 'worker_id', 'answer']
    frames = []
    for i in range(1, n_questions + 1):
        # The first answer column has no numeric suffix; later ones do.
        answer_col = f'Answer.{label}' if i == 1 else f'Answer.{label}{i}'

        # Work on a copy so the caller's frame is never written to.
        part = data[[f'Input.img{i}', 'WorkerId', answer_col]].copy()
        # Assigning .columns avoids set_axis(inplace=...), which pandas 2 removed.
        part.columns = column_names
        # Keep only the last path component of the image link.
        part['question_id'] = part['question_id'].str.split('/').str[-1]
        frames.append(part)

    # A single concat at the end instead of growing the frame inside the loop.
    if frames:
        questions = pd.concat(frames)
    else:
        questions = pd.DataFrame(columns=column_names)

    print(f'Size after processing data: {len(questions)} = {n_questions} questions * {len(data)} records.')
    print()
    return questions

In [4]:
def is_processed_batch(file, source):
    """Check whether ``source`` is already recorded in the CSV at ``file``.

    Parameters
    ----------
    file : str or path-like
        CSV file to inspect. When it exists it must have a ``source`` column.
    source : str
        Batch identifier to look for in the ``source`` column.

    Returns
    -------
    tuple[bool, dict]
        ``(processed, params)``. ``processed`` is ``True`` only when the file
        exists and already contains ``source``. ``params`` holds extra keyword
        arguments for ``DataFrame.to_csv``:

        * file missing -> ``{}`` (plain write, header included);
        * file present, ``source`` absent -> ``{'mode': 'a', 'header': False}``
          so new rows are appended without repeating the header;
        * file present, ``source`` found -> ``{}``.
    """
    params = {}

    if not os.path.isfile(file):
        return (False, params)

    # Only the 'source' column is needed to answer the question.
    recorded = pd.read_csv(file, usecols=['source'])['source']
    if bool(recorded.eq(source).any()):
        return (True, params)

    params['mode'] = 'a'
    # to_csv documents header as bool or list of str; False is the explicit
    # "write no header row" value.
    params['header'] = False
    return (False, params)

In [5]:
def update_ML_csv(Readability, Trust, batch_number,
                  readability_csv='MLR.csv', trust_csv='MLT.csv'):
    """Write one batch of readability and trust answers to their CSV files.

    For each of the two frames: if the batch tag ``mturk-batch{batch_number}``
    is already present in the target file (as reported by
    ``is_processed_batch``), nothing is written. Otherwise the ``worker_id``
    column is dropped, a ``source`` column holding the batch tag is added,
    and the rows are written with the keyword arguments that
    ``is_processed_batch`` returned.

    Parameters
    ----------
    Readability, Trust : pandas.DataFrame
        Frames containing a ``worker_id`` column. They are not modified.
    batch_number : int or str
        Suffix used to build the batch tag.
    readability_csv, trust_csv : str, default 'MLR.csv' / 'MLT.csv'
        Destination files; the defaults are the original hard-coded names.

    Returns
    -------
    None
        Results are written to disk and a status line is printed per file.
    """
    source = f'mturk-batch{batch_number}'

    targets = (
        ('Readability', Readability, readability_csv),
        ('Trust', Trust, trust_csv),
    )
    for label, frame, csv_path in targets:
        processed, params = is_processed_batch(csv_path, source)
        if processed:
            print(f'{label} {source} already processed!')
            continue

        # drop() returns a new frame, so tagging it leaves the caller's data alone.
        batch = frame.drop(columns=['worker_id'])
        batch['source'] = source
        # NOTE(review): appended rows carry no header, so this assumes the
        # existing file has the same column order - confirm.
        batch.to_csv(csv_path, index=False, **params)
        print(f'Updated {csv_path} with the latest "{source}" !')

In [6]:
# Load the cleaned readability export and reshape it to one row per answer.
readability_raw = get_MTurk_data('MTurk_Read_Clean.csv')
Readability = process_MTurk_data(readability_raw, 'readability')

# Same two steps for the cleaned trust export.
trust_raw = get_MTurk_data('MTurk_Trust_Clean.csv')
Trust = process_MTurk_data(trust_raw, 'trust')

# Record both frames as batch 1 in the ML CSV files.
update_ML_csv(Readability=Readability, Trust=Trust, batch_number=1)

--------------------------------
MTurk_Read_Clean.csv

Initial df size: 1703
--------------------------------
Size after processing data: 20436 = 12 questions * 1703 records.

--------------------------------
MTurk_Trust_Clean.csv

Initial df size: 1702
--------------------------------
Size after processing data: 20424 = 12 questions * 1702 records.

Updated MLR.csv with the latest "mturk-batch1" !
Updated MLT.csv with the latest "mturk-batch1" !
