# User Rep Scores
- Goal: output one CSV per user in the data hunt, with each row corresponding to a single answer.
- Only the parent questions are scored for now.
- So far this only works for the language task: the schemas are not up to date, and I have not yet gone through every task to identify which questions are "parent questions".
- The ordinal questions are not stored in numeric form, so their answers had to be converted to numbers manually (e.g. T1.Q12).




In [2]:
import warnings

import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

In [4]:
# Load Kai's convergence answer csv and display it
consensus_csv_path = "convergence/Answer_Consensus.csv"
answers_df = pd.read_csv(consensus_csv_path)
answers_df

Unnamed: 0.1,Unnamed: 0,Task File,Article Number,Question Label,Answer Label
0,0,Evidence,1712,T1.Q11,T1.Q11.A2
1,1,Evidence,1712,T1.Q13,T1.Q13.A5
2,2,Evidence,1712,T1.Q14,T1.Q14.A7
3,3,Evidence,1712,T1.Q3,0
4,4,Evidence,1712,T1.Q4,T1.Q4.A4
...,...,...,...,...,...
313,313,Probability,100026,T1.Q5,T1.Q5.A1
314,314,Probability,100026,T1.Q6,T1.Q6.A1
315,315,Reasoning,1712,T1.Q10,T1.Q10.A8
316,316,Reasoning,1712,T1.Q3,T1.Q3.A7


In [10]:
#array of questions that are "parent" questions
language_parents = ["T1.Q1"]
probability_parents = ["T1.Q1", "T1.Q2", "T1.Q5", "T1.Q6"]
reasoning_parents = ["T1.Q1"]
evidence_parents = ["T1.Q1", "T1.Q9", "T1.Q12"]

#corresponding list of question types, language_parents[n] maps to language_parents_types[n]
language_parents_types = ["select_all"]
# FIXME(review): six types are listed here but probability_parents has only four
# labels, so the positional pairing is off for this task (T1.Q5 -> "ordinal",
# T1.Q6 -> "select_one", last two types never used). The values are left
# untouched because the intended labels/types cannot be determined from here.
probability_parents_types = ["ordinal", "ordinal", "ordinal", "select_one", "ordinal", "ordinal"]
reasoning_parents_types = ["select_all"]
evidence_parents_types = ["select_one", "ordinal", "ordinal"]

#the two lists of a task are paired by position, so a length mismatch silently
#assigns the wrong question type; warn here instead of mis-scoring later
for _task_name, _labels, _types in (
    ("language", language_parents, language_parents_types),
    ("probability", probability_parents, probability_parents_types),
    ("reasoning", reasoning_parents, reasoning_parents_types),
    ("evidence", evidence_parents, evidence_parents_types),
):
    if len(_labels) != len(_types):
        warnings.warn(
            "{} task: {} parent questions but {} question types; "
            "the positional label->type lookup will be misaligned".format(
                _task_name, len(_labels), len(_types)
            )
        )

In [11]:
#fill this in, either "language", "probability", "reasoning", or "evidence"
# (the value is used as the key into task_parents / task_parents_types)
# NOTE(review): the data-loading cell reads a hard-coded "BETA_Reasoning-..." CSV
# regardless of this value - confirm that the file and task_type are meant to agree.
task_type = "language"

In [12]:
#df_huh = pd.read_csv("newDataFormat/BETA_Language-2020-01-18T0225-DataHuntSubmitted.csv")

In [13]:
#read the DataHunt csv into df and keep only the columns needed for scoring
# NOTE(review): the path is hard-coded to the Reasoning export and does not follow
# task_type - confirm that the file matches the task selected above.
df = pd.read_csv("newDataFormat/BETA_Reasoning-2020-03-01T0217-DataHunt.csv")
df = df[["contributor_uuid", "quiz_task_uuid", "article_number", "question_label",
         "answer_label", "answer_text", "quiz_taskrun_uuid", "finish_time"]]

#lookup tables keyed by task_type; the two lists of a task are aligned by position
task_parents = {"language": language_parents, "probability": probability_parents,
                "reasoning": reasoning_parents, "evidence": evidence_parents}
task_parents_types = {"language": language_parents_types, "probability": probability_parents_types,
                "reasoning": reasoning_parents_types, "evidence": evidence_parents_types}

#rows that answer a parent question of the selected task.
#.copy() makes parent_df an independent frame, so the later cell that inserts a
#column into it neither touches df nor triggers pandas' chained-assignment warning
parent_df = df.loc[df['question_label'].isin(task_parents[task_type])].copy()

In [14]:
# Converged ("correct") answers used for scoring: entry n is looked up with the
# index of the question in task_parents[task_type], so the order must match that list.
# NOTE(review): hard-coded for the single parent question T1.Q1 - presumably meant
# to be derived from Answer_Consensus.csv; confirm before using other task types.
converged_answers = [['T1.Q1.A2']]

In [15]:
# Show the consensus answers recorded for article 1712
answer_df = pd.read_csv("convergence/Answer_Consensus.csv")
article_1712_mask = answer_df["Article Number"] == 1712
answer_df[article_1712_mask]

Unnamed: 0.1,Unnamed: 0,Article Number,Question Label,Answer Label
0,0,1712,T1.Q11,T1.Q11.A2
1,1,1712,T1.Q13,T1.Q13.A5
2,2,1712,T1.Q14,T1.Q14.A7
3,3,1712,T1.Q3,0
4,4,1712,T1.Q4,T1.Q4.A4
...,...,...,...,...
249,249,1712,T1.Q7.A3,0
250,250,1712,T1.Q7.A1,0
315,315,1712,T1.Q10,T1.Q10.A8
316,316,1712,T1.Q3,T1.Q3.A7


In [16]:
#adds a column with the question_type of the question_label
# label -> type lookup built from the two position-aligned lists of the selected task
type_by_label = dict(zip(task_parents[task_type], task_parents_types[task_type]))
question_types = parent_df['question_label'].map(type_by_label).tolist()

# Drop any column left over from a previous run of this cell before inserting:
# the former insert(..., True) allowed duplicates, so re-running the cell created a
# second "question_type" column and row['question_type'] was no longer a scalar.
parent_df = parent_df.drop(columns=["question_type"], errors="ignore")
parent_df.insert(2, "question_type", question_types)
parent_df

Unnamed: 0,contributor_uuid,quiz_task_uuid,question_type,article_number,question_label,answer_label,answer_text,quiz_taskrun_uuid,finish_time
0,0e4b8a3e-2714-4b5f-ba10-bb576aaa5cc6,214c118d-3a84-45a0-a2ba-353626492df9,select_all,1712,T1.Q1,T1.Q1.A2,An evaluative judgment (claims about something...,4ed863f3-c5c5-407a-893f-86f947362e24,2019-11-02 00:13:48.572118


"Select All That Apply" Questions

In [17]:
def selectAllQ(selected, correct):
    """Score one chosen answer of a "select all that apply" question.

    selected -- a single answer_label
    correct  -- collection (e.g. list) of the correct answer_labels

    Returns 1 when `selected` is contained in `correct`, otherwise 0.
    """
    return int(selected in correct)

"Select One" Questions

In [18]:
def selectOneQ(selected, correct):
    """Score a "select one" question: 1 if `selected` equals `correct`, else 0."""
    return 1 if selected == correct else 0

Ordinal Data Questions

In [9]:
#uses the trailing number of the answer label, ex. 1 in "T1.Q9.A1" or 10 in "T1.Q9.A10",
#as the numerical representation of the answer
def question_to_int(selection):
    """Return the integer formed by the run of digits at the end of an answer label.

    Reading only the last character (the previous behaviour) turned multi-digit
    answers such as "T1.Q9.A10" into 0; the whole trailing number is parsed instead.

    selection -- answer label such as "T1.Q9.A1" (anything str() can render)

    Raises ValueError when the label does not end in a digit.
    """
    label = str(selection)
    # Everything rstrip() removes is the trailing run of digits.
    digits = label[len(label.rstrip("0123456789")):]
    if not digits:
        raise ValueError("answer label {!r} does not end in a number".format(label))
    return int(digits)

# otypes fixes the output dtype up front, so the vectorized form also accepts an
# empty array (np.vectorize cannot infer the dtype from size-0 input).
v_question_to_int = np.vectorize(question_to_int, otypes=[int])

In [10]:
#selected and correct are answer_labels, question is a question_label, all strings
def ordinalDataQ(selected, question, correct):
    """Score an ordinal answer by its distance from the correct answer.

    The distance between the numeric values of `selected` and `correct` is divided
    by the standard deviation of all answers given to `question` in parent_df, and
    the score is 1 - |distance| / 3 (so it can drop below 0 for far-off answers).

    selected -- answer_label chosen by the contributor
    question -- question_label whose answers define the spread
    correct  -- answer_label of the converged (correct) answer

    Raises ValueError if parent_df holds no answers for `question`.
    """
    # Exact comparison: str.match() treats `question` as a regex anchored only at
    # the start, so "T1.Q1" also selected "T1.Q12" rows and "." matched any character.
    answers = parent_df.loc[parent_df['question_label'] == question, 'answer_label'].values
    if answers.size == 0:
        raise ValueError("no answers found for question {!r}".format(question))
    #array of 1,2,3,4s
    data_int = v_question_to_int(answers)
    selected_int = question_to_int(selected)
    correct_int = question_to_int(correct)

    std = np.std(data_int)
    if std == 0:
        # Every contributor gave the same answer, so there is no spread to scale by
        # (dividing would give inf/nan). Full credit for the correct answer, none otherwise.
        return 1.0 if selected_int == correct_int else 0.0
    z_score = (selected_int - correct_int) / std
    return 1 - abs(z_score) / 3

In [11]:
# Build one score table per contributor: user_score_dict maps contributor_uuid to a
# DataFrame with one row per answer given to a parent question.
score_columns = ['quiz_task_uuid', 'quiz_taskrun_uuid', 'question_label', 'question_type',
                 'answer_label', 'answer_score', 'time_stamp']

test_dict = {}  # not used in this cell; kept in case another cell still refers to it
parent_labels = task_parents[task_type]

# Rows are collected in plain lists and turned into DataFrames once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame per row.
records_by_user = {}
for index, row in parent_df.iterrows():
    # converged_answers is aligned by position with the parent-question list
    correct = converged_answers[parent_labels.index(row['question_label'])]
    if row['question_type'] == "select_all":
        score = selectAllQ(row['answer_label'], correct)
    else:
        # select_one / ordinal scoring compares against ONE label. Entries stored as
        # a one-element list (like the select_all entries) are unwrapped; otherwise
        # selectOneQ would always return 0 and ordinalDataQ would raise.
        if isinstance(correct, (list, tuple)) and len(correct) == 1:
            correct = correct[0]
        if row['question_type'] == "select_one":
            score = selectOneQ(row['answer_label'], correct)
        else:
            score = ordinalDataQ(row['answer_label'], row['question_label'], correct)
    records_by_user.setdefault(row['contributor_uuid'], []).append({
        'quiz_task_uuid': row['quiz_task_uuid'],
        'quiz_taskrun_uuid': row['quiz_taskrun_uuid'],
        'question_label': row['question_label'],
        'question_type': row['question_type'],
        'answer_label': row['answer_label'],
        'answer_score': score,
        'time_stamp': row['finish_time'],
    })

user_score_dict = {
    contributor: pd.DataFrame(records, columns=score_columns)
    for contributor, records in records_by_user.items()
}
        
    

In [14]:
#example score table (the rows that would go into the csv) for a given contributor_uuid
example_contributor_uuid = '9edd2824-48d3-49db-97ae-9713b37592d3'
user_score_dict[example_contributor_uuid]

Unnamed: 0,quiz_task_uuid,quiz_taskrun_uuid,question_label,question_type,answer_label,answer_score,time_stamp
0,a25ed7d6-4f1b-40a4-9f72-54e13e5658eb,cc0923e0-258b-42f7-86df-d68ef9d4d54c,T1.Q1,select_all,T1.Q1.A2,1,2019-09-20 07:40:45.253905
1,a25ed7d6-4f1b-40a4-9f72-54e13e5658eb,cc0923e0-258b-42f7-86df-d68ef9d4d54c,T1.Q1,select_all,T1.Q1.A4,0,2019-09-20 07:40:45.253905
2,a25ed7d6-4f1b-40a4-9f72-54e13e5658eb,cc0923e0-258b-42f7-86df-d68ef9d4d54c,T1.Q1,select_all,T1.Q1.A5,0,2019-09-20 07:40:45.253905
3,a25ed7d6-4f1b-40a4-9f72-54e13e5658eb,cc0923e0-258b-42f7-86df-d68ef9d4d54c,T1.Q1,select_all,T1.Q1.A6,0,2019-09-20 07:40:45.253905
4,f65e3f32-7347-42b3-a6b2-39bc62e226c2,76439f35-ad5a-482c-a1d6-8ca8bf05332b,T1.Q1,select_all,T1.Q1.A1,0,2019-09-20 07:44:25.657454
5,f65e3f32-7347-42b3-a6b2-39bc62e226c2,76439f35-ad5a-482c-a1d6-8ca8bf05332b,T1.Q1,select_all,T1.Q1.A3,0,2019-09-20 07:44:25.657454
6,f65e3f32-7347-42b3-a6b2-39bc62e226c2,76439f35-ad5a-482c-a1d6-8ca8bf05332b,T1.Q1,select_all,T1.Q1.A3,0,2019-09-20 07:44:25.657454
7,f65e3f32-7347-42b3-a6b2-39bc62e226c2,76439f35-ad5a-482c-a1d6-8ca8bf05332b,T1.Q1,select_all,T1.Q1.A3,0,2019-09-20 07:44:25.657454
8,f65e3f32-7347-42b3-a6b2-39bc62e226c2,76439f35-ad5a-482c-a1d6-8ca8bf05332b,T1.Q1,select_all,T1.Q1.A4,0,2019-09-20 07:44:25.657454
9,f65e3f32-7347-42b3-a6b2-39bc62e226c2,76439f35-ad5a-482c-a1d6-8ca8bf05332b,T1.Q1,select_all,T1.Q1.A4,0,2019-09-20 07:44:25.657454


In [19]:
#group by question type, variance of the answer scores
def variances(df):
    """Return the sample variance (ddof=1) of answer_score per question_type.

    df -- DataFrame with at least the columns "question_type" and "answer_score"
          (e.g. one contributor's table from user_score_dict)

    Returns a DataFrame indexed by question_type with a single "answer_score" column.
    """
    scores = df[["question_type", "answer_score"]].assign(
        # score tables built row by row can carry an object-dtype score column
        answer_score=lambda frame: pd.to_numeric(frame["answer_score"])
    )
    # "var" pins pandas' sample variance. Passing np.var relied on pandas silently
    # substituting its own implementation, which newer versions deprecate.
    return scores.groupby("question_type").agg({"answer_score": "var"})
    

In [22]:
# per-question-type score variance for the example contributor
example_scores = user_score_dict['9edd2824-48d3-49db-97ae-9713b37592d3']
variances(example_scores)

Unnamed: 0_level_0,answer_score
question_type,Unnamed: 1_level_1
select_all,0.150612
