In [1]:
import numpy as np
import pandas as pd
import re
import json
from datetime import datetime
import json
from os import listdir
from os.path import isfile, join

# Step 0. Grab data

In [2]:
# Raw contributor answers for one datahunt task.
datahunt = pd.read_csv('../IAA-consensus/evidence_eric/Covid_Evidencev1-Task-2224-DataHunt.csv')
# Schema table; joined to the tag files on answer_uuid further down.
schema = pd.read_csv('../IAA-consensus/evidence_eric/45dce5251bd3ea6e908fa33ac9e6a8e17e6830215912ce1626cf4206e159819c.csv')
# Tags produced by the IAA step.
iaa = pd.read_csv('../IAA-consensus/evidence_eric/Covid_Evidencev1.IAA-edb1510f-1923-4d6f-a678-95f53d752bea-Tags.csv')
# Adjudicated tags.
adj = pd.read_csv('../IAA-consensus/evidence_eric/Covid_Evidence2020_03_21.adjudicated-edb1510f-1923-4d6f-a678-95f53d752bea-Tags.csv')

# The datahunt tracking information is stored as JSON, so it is parsed with
# json.load rather than read as a CSV.
with open('../urs-preprocessing/datahunt_tracking.json', 'r') as tracking_file:
    preprocessing = json.load(tracking_file)

#urs = pd.read_csv('../User Rep Score/score.csv')

# Step 1. Preprocessing

In [3]:
# Turn the ISO-format finish_time strings into datetime objects and order the
# rows chronologically by that parsed value.
finish_times = datahunt["finish_time"].map(datetime.fromisoformat)
datahunt["finish_time_object"] = finish_times
datahunt = datahunt.sort_values(by="finish_time_object")

In [4]:
# Matches the (possibly empty) run of non-hyphen characters at the current
# position; used with re.search on a file name it yields the text that
# precedes the first '-'.
datahunt_name_regex = re.compile('[^-]*')

In [5]:
# NOTE: the check in the next cell is commented out because it raises an error: deriving the datahunt name from the file name does not work correctly yet.

In [6]:
# if len(datahunt) >= preprocessing[re.search(datahunt_name_regex, 'Covid_Evidencev1-Task-2224-DataHunt.csv').group()]:
#     print("No unprocessed rows in this datahunt csv")
# else:
#     datahunt = datahunt.iloc[preprocessing[re.search(datahunt_name_regex, 'Covid_Evidencev1-Task-2224-DataHunt.csv').group()]]

# Step 2. Add IAA and adjudicated columns to datahunt csv

In [7]:
# Columns kept from the merged IAA tags. The merge suffixes question_type
# with _x / _y, so both variants are retained here.
iaa_columns = ["answer_uuid", "source_task_uuid", "tua_uuid",
               "target_text", "question_label", "answer_label",
               "question_type_x", "question_type_y", "answer_count",
               "alpha_distance"]

# Columns kept from the merged adjudicated / gold standard tags.
adj_columns = ["answer_uuid", "source_task_uuid", "tua_uuid",
               "target_text", "question_label", "answer_label",
               "question_type", "answer_count", "alpha_distance"]

# Inner-join each tag file with the schema on answer_uuid, trim to the columns
# above, and swap NaN for '' to prevent errors in the later lookups.
answers_iaa = (
    iaa.merge(schema, how="inner", on="answer_uuid")[iaa_columns]
    .replace(np.nan, '', regex=True)
)

answers_adj = (
    adj.merge(schema, how="inner", on="answer_uuid")[adj_columns]
    .replace(np.nan, '', regex=True)
)

In [8]:
def get_consensus(answers, question_label, quiz_task_uuid):
    """Return the distinct answer labels recorded for one question of one task.

    Parameters
    ----------
    answers : pandas.DataFrame
        Table with "question_label", "source_task_uuid" and "answer_label"
        columns.
    question_label
        Value to match against the "question_label" column.
    quiz_task_uuid
        Value to match against the "source_task_uuid" column.

    Returns
    -------
    list
        De-duplicated "answer_label" values of the matching rows, in set
        iteration order; empty when no row matches.
    """
    same_question = answers["question_label"] == question_label
    same_task = answers["source_task_uuid"] == quiz_task_uuid
    matching_labels = answers.loc[same_question & same_task, "answer_label"]

    return list(set(matching_labels.tolist()))

In [9]:
# For every datahunt row, look up the IAA answer labels of its question/task.
datahunt['iaa_consensus'] = datahunt.apply(
    lambda row: get_consensus(answers_iaa, row['question_label'], row['quiz_task_uuid']),
    axis=1,
)

In [10]:
# For every datahunt row, look up the adjudicated answer labels of its question/task.
datahunt['adj_consensus'] = datahunt.apply(
    lambda row: get_consensus(answers_adj, row['question_label'], row['quiz_task_uuid']),
    axis=1,
)

In [11]:
def get_question_meta(answers_iaa, answers_adj, question_label, quiz_task_uuid):
    """Look up the question type and answer count for one question of one task.

    The IAA table is consulted first; the adjudicated table is only used when
    the IAA table has no row for the question/task pair.

    Parameters
    ----------
    answers_iaa : pandas.DataFrame
        Needs "question_label", "source_task_uuid", "question_type_y",
        "answer_count" and "alpha_distance" columns.
    answers_adj : pandas.DataFrame
        Needs "question_label", "source_task_uuid", "question_type",
        "answer_count" and "alpha_distance" columns.
    question_label
        Value to match against "question_label".
    quiz_task_uuid
        Value to match against "source_task_uuid".

    Returns
    -------
    tuple
        ``(question_type, answer_count)`` taken from the first matching row.
        When the type is "RADIO" it is replaced by the upper-cased
        "alpha_distance" value of that row. ``()`` when neither table has a
        matching row.
    """
    def _rows_for(table):
        # Rows of `table` that belong to the requested question and task.
        wanted = ((table["question_label"] == question_label)
                  & (table["source_task_uuid"] == quiz_task_uuid))
        return table.loc[wanted]

    # Both tables are filtered up front, exactly as before, so a missing
    # column in either one surfaces regardless of which branch is taken.
    iaa_rows = _rows_for(answers_iaa)
    adj_rows = _rows_for(answers_adj)

    candidates = ((iaa_rows, "question_type_y"), (adj_rows, "question_type"))
    for rows, type_column in candidates:
        if len(rows[type_column]) > 0 and len(rows["answer_count"]) > 0:
            kind = rows[type_column].iloc[0]
            num_answer_choices = rows["answer_count"].iloc[0]

            # RADIO questions are reported by their alpha distance instead.
            if kind == "RADIO":
                kind = rows["alpha_distance"].iloc[0].upper()

            return (kind, num_answer_choices)

    return ()

In [12]:
# Attach the (question type, number of answer choices) tuple to every row.
datahunt['question_meta'] = datahunt.apply(
    lambda row: get_question_meta(
        answers_iaa,
        answers_adj,
        row['question_label'],
        row['quiz_task_uuid'],
    ),
    axis=1,
)

# Step 3. Update user rep scores 

In [13]:
def set_sum(series):
    """Return the union of all the iterables contained in ``series``.

    ``series`` is any iterable whose items are themselves iterables of
    hashable values; an empty ``series`` gives an empty set.
    """
    return set().union(*series)

def into_set(series):
    """Collect the items obtained by iterating ``series`` into a set."""
    return set(series)

In [14]:
data = datahunt
#data['adj_consensus'] = data['adj_consensus'].str.replace('\'', '').str.replace('[', '').str.replace(']', '').str.split(", ").apply(lambda x: set(x))
#data['iaa_consensus'] = data['iaa_consensus'].str.replace('\'', '').str.replace('[', '').str.replace(']', '').str.split(", ").apply(lambda x: set(x))


# Reviewers' current reputation score: taken from the score file when the
# contributor is listed there, otherwise a starting value of 0.5.
stored_scores = pd.read_csv("../User Rep Score/score.csv").set_index('contributor_uuid')
old_rep_scores = {}
for user in data["contributor_uuid"].unique():
    if user in stored_scores.index:
        old_rep_scores[user] = stored_scores.loc[user, "score"]
    else:
        old_rep_scores[user] = 0.5

# Every contributor starts this run with a fresh score of 0.
rep_scores = dict.fromkeys(old_rep_scores, 0)

# Pivot tables needed for calculating consensus: one row per article, one
# column per question, each cell the union of that group's consensus labels.
adj_table, iaa_table = (
    pd.pivot_table(data,
                   values=consensus_column,
                   index='article_number',
                   columns='question_label',
                   aggfunc=set_sum)
    for consensus_column in ('adj_consensus', 'iaa_consensus')
)

In [15]:
# Score every contributor against the adjudicated (gold standard) answers and
# blend the result 50/50 with the contributor's previous reputation score.
df = pd.pivot_table(data,
                    values='answer_label',
                    index='contributor_uuid',
                    columns='question_label',
                    aggfunc=into_set)

# Which questions have adjudicated answers does not depend on the contributor,
# so the gold answers are collected once instead of once per user.
# NOTE(review): as before, only the first row of adj_table (the first
# article_number) is consulted -- confirm this is intended when the datahunt
# covers more than one article.
adj_answers = {}
if len(adj_table) > 0:  # .iloc[0] on an empty table would raise IndexError
    for question_label in df.columns:
        if question_label not in adj_table.columns:
            continue
        adj_ans = adj_table[question_label].iloc[0]
        # A pivot cell with no data may hold NaN instead of a set; len() on
        # it would raise TypeError, so such cells are treated as "no answer".
        if isinstance(adj_ans, (set, frozenset)) and len(adj_ans) > 0:
            adj_answers[question_label] = adj_ans

for user in df.index:
    if not adj_answers:
        # Nothing to grade against: previously this divided by zero. Reusing
        # the old score makes the blend below leave the user unchanged.
        rep_scores[user] = old_rep_scores[user]
        continue

    n_correct = 0
    for question_label, adj_ans in adj_answers.items():
        user_ans = df.at[user, question_label]
        # user_ans is not a set when the user has no answer for this question.
        if isinstance(user_ans, set) and not user_ans.isdisjoint(adj_ans):
            n_correct += 1
    rep_scores[user] = n_correct / len(adj_answers)

for user in rep_scores.keys():
    rep_scores[user] = 0.5 * rep_scores[user] + 0.5 * old_rep_scores[user]

In [16]:
# The scores blended with the adjudicated results become the baseline for the
# IAA pass; the final score is 30% IAA agreement + 70% baseline.
# NOTE: this cell rebinds old_rep_scores, so re-running it on its own feeds
# already-updated scores back in.
old_rep_scores = rep_scores
rep_scores = {user: [] for user in old_rep_scores.keys()}

df = pd.pivot_table(data,
                    values='answer_label',
                    index='contributor_uuid',
                    columns='question_label',
                    aggfunc=into_set)

# Which questions have IAA answers does not depend on the contributor, so the
# IAA answers are collected once instead of once per user.
# NOTE(review): as before, only the first row of iaa_table (the first
# article_number) is consulted -- confirm this is intended when the datahunt
# covers more than one article.
iaa_answers = {}
if len(iaa_table) > 0:  # .iloc[0] on an empty table would raise IndexError
    for question_label in df.columns:
        if question_label not in iaa_table.columns:
            continue
        iaa_ans = iaa_table[question_label].iloc[0]
        # A pivot cell with no data may hold NaN instead of a set; len() on
        # it would raise TypeError, so such cells are treated as "no answer".
        if isinstance(iaa_ans, (set, frozenset)) and len(iaa_ans) > 0:
            iaa_answers[question_label] = iaa_ans

for user in df.index:
    if not iaa_answers:
        # Nothing to grade against (previously a division by zero); the empty
        # list is handled in the blending loop below.
        continue

    n_correct = 0
    for question_label, iaa_ans in iaa_answers.items():
        user_ans = df.at[user, question_label]
        # user_ans is not a set when the user has no answer for this question.
        if isinstance(user_ans, set) and not user_ans.isdisjoint(iaa_ans):
            n_correct += 1
    rep_scores[user].append(n_correct / len(iaa_answers))

for user in rep_scores.keys():
    if rep_scores[user]:
        rep_scores[user] = 0.3 * sum(rep_scores[user])/len(rep_scores[user]) + 0.7 * old_rep_scores[user]
    else:
        # No IAA score was recorded for this user (absent from the pivot or
        # no IAA answers at all): keep the baseline instead of dividing by
        # the length of an empty list.
        rep_scores[user] = old_rep_scores[user]

In [17]:
# Update the repscore csv file
csv = pd.read_csv("../User Rep Score/score.csv").set_index('contributor_uuid')
for user in rep_scores.keys():
    if user in csv.index:
        csv.loc[user, 'score'] = rep_scores[user]
    else:
        helper_dict = {user:rep_scores[user]}
        helper_df = pd.DataFrame.from_dict(helper_dict, orient="index")
        helper_df.columns = ['score']
        csv = csv.append(helper_df)
csv = csv.reset_index()
csv.columns = ['contributor_uuid', 'score']
csv.to_csv("../User Rep Score/score.csv", index=False)

# Step 4. Update preprocessing tracking file (datahunt_tracking.json)

In [18]:
# if datahunt_name_regex in preprocessing:
#         file_size = len(pd.read_csv(join("../IAA-consensus/evidence_eric/", 'Covid_Evidencev1-Task-2224-DataHunt.csv')))
#         preprocessing[datahunt_name_regex] = max(preprocessing[datahunt_name_regex], file_size)
# else:
#         file_size = len(pd.read_csv(join("../IAA-consensus/evidence_eric/", 'Covid_Evidencev1-Task-2224-DataHunt.csv')))
#         preprocessing[datahunt_name_regex] = file_size