# User Rep Scores
- Goal: output a CSV for each user in the datahunt, with each row corresponding to a single answer.
- Only the parent questions are considered for now.
- Also, the ordinal questions are not in numeric form, so I had to convert them manually (e.g. T1.Q12).


In [150]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

In [177]:
# Kai's convergence answer csv
# Loaded from a path relative to the notebook; later cells read its
# "Task File" and "Question Label" columns.
answers_df = pd.read_csv("convergence/Answer_Consensus.csv")
answers_df

Unnamed: 0.1,Unnamed: 0,Task File,Article Number,Question Label,Answer Label
0,0,Evidence,1712,T1.Q11,T1.Q11.A2
1,1,Evidence,1712,T1.Q13,T1.Q13.A5
2,2,Evidence,1712,T1.Q14,T1.Q14.A7
3,3,Evidence,1712,T1.Q3,0
4,4,Evidence,1712,T1.Q4,T1.Q4.A4
...,...,...,...,...,...
313,313,Probability,100026,T1.Q5,T1.Q5.A1
314,314,Probability,100026,T1.Q6,T1.Q6.A1
315,315,Reasoning,1712,T1.Q10,T1.Q10.A8
316,316,Reasoning,1712,T1.Q3,T1.Q3.A7


In [178]:
def get_question_base_label(row):
    """Return the row's 'Question Label' with any answer suffix removed.

    A label without an "A" is returned unchanged. Otherwise the label is cut
    one character before the first "A", so "T1.Q1.A2" becomes "T1.Q1"
    (the "." separator in front of the "A" is dropped too).

    row: any mapping-like object exposing a 'Question Label' string.
    """
    label = row['Question Label']
    a_position = label.find("A")
    if a_position == -1:
        return label
    # Stop one character early so the separator before "A" is excluded.
    return label[:a_position - 1]

In [179]:
# Add a "Question Base Label" column by applying get_question_base_label to
# every row (axis=1 passes one row at a time), then display the frame.
answers_df["Question Base Label"] = answers_df.apply(get_question_base_label, axis=1)
answers_df

Unnamed: 0.1,Unnamed: 0,Task File,Article Number,Question Label,Answer Label,Question Base Label
0,0,Evidence,1712,T1.Q11,T1.Q11.A2,T1.Q11
1,1,Evidence,1712,T1.Q13,T1.Q13.A5,T1.Q13
2,2,Evidence,1712,T1.Q14,T1.Q14.A7,T1.Q14
3,3,Evidence,1712,T1.Q3,0,T1.Q3
4,4,Evidence,1712,T1.Q4,T1.Q4.A4,T1.Q4
...,...,...,...,...,...,...
313,313,Probability,100026,T1.Q5,T1.Q5.A1,T1.Q5
314,314,Probability,100026,T1.Q6,T1.Q6.A1,T1.Q6
315,315,Reasoning,1712,T1.Q10,T1.Q10.A8,T1.Q10
316,316,Reasoning,1712,T1.Q3,T1.Q3.A7,T1.Q3


In [180]:
#lists of the question labels that are "parent" questions, one list per task type
language_parents = ["T1.Q1"]
probability_parents = ["T1.Q1", "T1.Q2", "T1.Q5", "T1.Q6", ]
reasoning_parents = ["T1.Q1"]
evidence_parents = ["T1.Q1", "T1.Q9", "T1.Q12"]

#corresponding lists of question types, language_parents[n] maps to language_parents_types[n]
#(a later cell looks the type up by the label's position in the matching *_parents list)
language_parents_types = ["select_all"]
# NOTE(review): this list has 6 entries but probability_parents has only 4, so the
# positional pairing looks inconsistent (T1.Q6 currently pairs with "select_one" and the
# last two types are never used) -- confirm the intended questions/types.
probability_parents_types = ["ordinal", "ordinal", "ordinal", "select_one", "ordinal", "ordinal"]
reasoning_parents_types = ["select_all"]
evidence_parents_types = ["select_one", "ordinal", "ordinal"]

In [181]:
#fill this in, either "language", "probability", "reasoning", or "evidence"
#(later cells use it to pick the matching answers_df rows and parent-question lists)
task_type = "reasoning"

In [182]:
#cuts down answers_df to the rows with the relevant task type
# Each supported task_type maps to the value it has in the "Task File" column
# and to the list of parent question labels for that task.
task_selection = {
    "language": ("Language", language_parents),
    "probability": ("Probability", probability_parents),
    "reasoning": ("Reasoning", reasoning_parents),
    "evidence": ("Evidence", evidence_parents),
}

if task_type not in task_selection:
    # Stop here instead of printing and carrying on: otherwise task_answers_df
    # and parent_questions would be undefined (or left over from an earlier
    # run of this cell) and later cells would fail or score against the wrong task.
    raise ValueError("Invalid task type: {!r}".format(task_type))

task_file, parent_questions = task_selection[task_type]
task_answers_df = answers_df.loc[answers_df["Task File"] == task_file]


In [183]:
#load the DataHunt export, keep a fixed subset of its columns, then select the parent-question rows
# NOTE(review): the path names the Language DataHunt file regardless of task_type -- confirm
# this is the intended input when task_type is not "language".
df = pd.read_csv("testing-format/BETA_Language-2020-05-20T0110-DataHunt.csv")
df = df.loc[:, ["contributor_uuid", "quiz_task_uuid", "article_number", "question_label",
                "answer_label", "answer_text", "quiz_taskrun_uuid", "finish_time"]]

#per task type: the list of parent question labels, and the list of their question types
task_parents = dict(language=language_parents, probability=probability_parents,
                    reasoning=reasoning_parents, evidence=evidence_parents)
task_parents_types = dict(language=language_parents_types, probability=probability_parents_types,
                          reasoning=reasoning_parents_types, evidence=evidence_parents_types)


#rows of df whose question_label is one of the selected task's parent questions
parent_df = df[df['question_label'].isin(task_parents[task_type])]

In [184]:
#adds a column with the question_type of the question_label
# Build a label -> type look-up from the two parallel lists. setdefault keeps
# the first occurrence of a label, matching a list.index() search.
question_type_by_label = {}
for label, label_type in zip(task_parents[task_type], task_parents_types[task_type]):
    question_type_by_label.setdefault(label, label_type)

# A label without a type raises KeyError here rather than being scored silently.
question_types = [question_type_by_label[label] for label in parent_df['question_label']]

# Remove any question_type column left by an earlier run of this cell so that
# re-running it never produces a duplicate column (a duplicate would turn
# row['question_type'] into a Series in the scoring loop). drop() also returns
# a new frame, so the insert below does not touch a slice of df.
parent_df = parent_df.drop(columns="question_type", errors="ignore")
parent_df.insert(2, "question_type", question_types)
parent_df

Unnamed: 0,contributor_uuid,quiz_task_uuid,question_type,article_number,question_label,answer_label,answer_text,quiz_taskrun_uuid,finish_time
0,f9143626-bfe0-4e69-b652-6d1525ab4eb0,a25ed7d6-4f1b-40a4-9f72-54e13e5658eb,select_all,1712,T1.Q1,T1.Q1.A2,Exaggeration,834a1f2e-ec75-48b3-976c-f8afc38601fb,2019-09-04 22:50:32.477936
1,f9143626-bfe0-4e69-b652-6d1525ab4eb0,a25ed7d6-4f1b-40a4-9f72-54e13e5658eb,select_all,1712,T1.Q1,T1.Q1.A2,Exaggeration,834a1f2e-ec75-48b3-976c-f8afc38601fb,2019-09-04 22:50:32.477936
2,f9143626-bfe0-4e69-b652-6d1525ab4eb0,a25ed7d6-4f1b-40a4-9f72-54e13e5658eb,select_all,1712,T1.Q1,T1.Q1.A5,Shock or surprise,834a1f2e-ec75-48b3-976c-f8afc38601fb,2019-09-04 22:50:32.477936
3,f9143626-bfe0-4e69-b652-6d1525ab4eb0,a25ed7d6-4f1b-40a4-9f72-54e13e5658eb,select_all,1712,T1.Q1,T1.Q1.A6,Slang,834a1f2e-ec75-48b3-976c-f8afc38601fb,2019-09-04 22:50:32.477936
4,f9143626-bfe0-4e69-b652-6d1525ab4eb0,a25ed7d6-4f1b-40a4-9f72-54e13e5658eb,select_all,1712,T1.Q1,T1.Q1.A12,Other problem with language,834a1f2e-ec75-48b3-976c-f8afc38601fb,2019-09-04 22:50:32.477936
...,...,...,...,...,...,...,...,...,...
3269,782cf148-ffac-43f1-99aa-27a9d64b5212,c251992c-ad2b-4586-bc70-f98f62f8b402,select_all,100004,T1.Q1,T1.Q1.A7,Typos,06c101df-54fe-462d-8984-f32e77f304e0,2019-09-19 23:16:03.359649
3274,40e8da50-18a7-4eaf-aba1-69b3ee956fc1,c251992c-ad2b-4586-bc70-f98f62f8b402,select_all,100004,T1.Q1,T1.Q1.A1,Metaphorical Language,9d95dbe6-662e-4cf8-aaca-7ac6c7d3e65d,2019-09-20 07:57:36.239145
3279,3c97d749-8996-4a96-97af-73ecdc5a4716,c251992c-ad2b-4586-bc70-f98f62f8b402,select_all,100004,T1.Q1,T1.Q1.A7,Typos,a87ae1c1-7019-4ef8-b33a-d2bd22d689d9,2019-09-21 00:52:12.951237
3284,a31c365c-986b-4dbe-b752-bbb5986cc7f2,c251992c-ad2b-4586-bc70-f98f62f8b402,select_all,100004,T1.Q1,T1.Q1.A7,Typos,fbc4942e-379d-4726-b6be-629f28208278,2019-09-28 08:03:12.014527


In [186]:
# Display the consensus answers selected for the current task type.
# The commented-out lines are alternative views kept for manual inspection.
#task_answers_df.loc[task_answers_df["Question Base Label"] == 'T1.Q1']
task_answers_df
#answers_df.loc[answers_df["Task File"] == "Reasoning"]

Unnamed: 0.1,Unnamed: 0,Task File,Article Number,Question Label,Answer Label,Question Base Label
315,315,Reasoning,1712,T1.Q10,T1.Q10.A8,T1.Q10
316,316,Reasoning,1712,T1.Q3,T1.Q3.A7,T1.Q3
317,317,Reasoning,1712,T1.Q9,T1.Q9.A5,T1.Q9


In [119]:
#NEED TO FIX THIS SO THAT THE CONVERGED ANSWERS COME FROM KAIS CSV
#converged_answers = [['T1.Q1.A2']]
# Placeholder for now: one separate, empty answer list per parent question,
# in the same order as parent_questions.
converged_answers = [[] for _ in range(len(parent_questions))]

converged_answers

[[]]

"Select All That Apply" Questions

In [10]:
def selectAllQ(selected, correct):
    """Score one selected answer of a "select all that apply" question.

    selected: a single answer_label.
    correct:  the collection of correct answer_labels.
    Returns 1 if selected is contained in correct, otherwise 0.
    """
    return 1 if selected in correct else 0

"Select One" Questions

In [11]:
def selectOneQ(selected, correct):
    """Score a "select one" answer: 1 if selected equals correct, otherwise 0."""
    is_match = selected == correct
    return 1 if is_match else 0

Ordinal Data Questions

In [12]:
#uses the trailing number of the answer label, ex. 1 in "T1.Q9.A1" or 12 in "T1.Q1.A12",
#as the numerical representation of the answer
def question_to_int(selection):
    """Return the integer formed by the run of digits at the end of an answer label.

    Reading only the last character would turn "T1.Q1.A12" into 2, so the
    whole trailing digit run is used. Single-digit labels give the same
    result as reading the last character.

    selection: an answer label; it is converted with str() first.
    Raises ValueError if the label does not end in a digit.
    """
    label = str(selection)
    prefix = label.rstrip("0123456789")
    trailing_digits = label[len(prefix):]
    if not trailing_digits:
        raise ValueError("answer label does not end in a number: %r" % (label,))
    return int(trailing_digits)

# Element-wise version for arrays of answer labels.
v_question_to_int = np.vectorize(question_to_int)

In [13]:
#selected and correct are answer_label(s), question is a question_label, both strings
def ordinalDataQ(selected, question, correct):
    """Score an ordinal answer by its distance from the correct answer.

    The distance between the selected and correct answer numbers is divided
    by the standard deviation of every answer number given to `question` in
    parent_df, and the score is 1 - |z| / 3. The score is not clamped, so it
    is negative when the answer is more than three deviations away.

    selected: the answer_label that was chosen.
    question: the question_label the answer belongs to.
    correct:  the answer_label treated as correct.
    Raises ValueError if parent_df has no answers for `question`.
    """
    # Compare for equality: str.match treats `question` as a regular
    # expression anchored only at the start, so "T1.Q1" would also select
    # the rows of "T1.Q10", "T1.Q12", ...
    is_question = parent_df['question_label'] == question
    data = parent_df.loc[is_question, 'answer_label'].values
    if len(data) == 0:
        raise ValueError("no answers found for question %r" % (question,))
    #array of 1,2,3,4s
    data_int = v_question_to_int(data)
    selected_int = question_to_int(selected)
    correct_int = question_to_int(correct)

    std = np.std(data_int)
    if std == 0:
        # Every answer to this question is identical, so the z-score is
        # undefined (division by zero). Give full credit for an exact match
        # and none otherwise instead of returning nan/-inf.
        return 1.0 if selected_int == correct_int else 0.0
    z_score = (selected_int - correct_int)/std
    return 1 - abs(z_score)/3

In [14]:
# Score every parent-question answer and build one score table per contributor.
# Rows are collected as plain dicts and each DataFrame is built once at the end:
# DataFrame.append no longer exists in pandas 2.0+, and appending row by row
# copies the whole frame on every iteration.
score_columns = ['quiz_task_uuid', 'quiz_taskrun_uuid', 'question_label', 'question_type', 'answer_label', 'answer_score', 'time_stamp']
user_score_records = {}
test_dict = {}  # not used in this cell; kept in case another cell reads it
for index, row in parent_df.iterrows():
    # Converged answer(s) for this question, found by the question's position
    # in the task's parent-question list.
    correct = converged_answers[task_parents[task_type].index(row['question_label'])]
    if row['question_type'] == "select_all":
        score = selectAllQ(row['answer_label'], correct)
    elif row['question_type'] == "select_one":
        score = selectOneQ(row['answer_label'], correct)
    else:
        score = ordinalDataQ(row['answer_label'], row['question_label'], correct)
    user_score_records.setdefault(row['contributor_uuid'], []).append({
        'quiz_task_uuid': row['quiz_task_uuid'],
        'quiz_taskrun_uuid': row['quiz_taskrun_uuid'],
        'question_label': row['question_label'],
        'question_type': row['question_type'],
        'answer_label': row['answer_label'],
        'answer_score': score,
        'time_stamp': row['finish_time'],
    })

# contributor_uuid -> DataFrame of that contributor's scored answers, with the
# same column order and a fresh 0..n-1 index.
user_score_dict = {
    contributor: pd.DataFrame(records, columns=score_columns)
    for contributor, records in user_score_records.items()
}
        
    

In [15]:
#example output for a given contributor_uuid: that contributor's per-answer score table
#(the uuid is hard-coded, so it must be a key of user_score_dict)
user_score_dict['9edd2824-48d3-49db-97ae-9713b37592d3']

Unnamed: 0,quiz_task_uuid,quiz_taskrun_uuid,question_label,question_type,answer_label,answer_score,time_stamp
0,a25ed7d6-4f1b-40a4-9f72-54e13e5658eb,cc0923e0-258b-42f7-86df-d68ef9d4d54c,T1.Q1,select_all,T1.Q1.A2,1,2019-09-20 07:40:45.253905
1,a25ed7d6-4f1b-40a4-9f72-54e13e5658eb,cc0923e0-258b-42f7-86df-d68ef9d4d54c,T1.Q1,select_all,T1.Q1.A4,0,2019-09-20 07:40:45.253905
2,a25ed7d6-4f1b-40a4-9f72-54e13e5658eb,cc0923e0-258b-42f7-86df-d68ef9d4d54c,T1.Q1,select_all,T1.Q1.A5,0,2019-09-20 07:40:45.253905
3,a25ed7d6-4f1b-40a4-9f72-54e13e5658eb,cc0923e0-258b-42f7-86df-d68ef9d4d54c,T1.Q1,select_all,T1.Q1.A6,0,2019-09-20 07:40:45.253905
4,f65e3f32-7347-42b3-a6b2-39bc62e226c2,76439f35-ad5a-482c-a1d6-8ca8bf05332b,T1.Q1,select_all,T1.Q1.A1,0,2019-09-20 07:44:25.657454
5,f65e3f32-7347-42b3-a6b2-39bc62e226c2,76439f35-ad5a-482c-a1d6-8ca8bf05332b,T1.Q1,select_all,T1.Q1.A3,0,2019-09-20 07:44:25.657454
6,f65e3f32-7347-42b3-a6b2-39bc62e226c2,76439f35-ad5a-482c-a1d6-8ca8bf05332b,T1.Q1,select_all,T1.Q1.A3,0,2019-09-20 07:44:25.657454
7,f65e3f32-7347-42b3-a6b2-39bc62e226c2,76439f35-ad5a-482c-a1d6-8ca8bf05332b,T1.Q1,select_all,T1.Q1.A3,0,2019-09-20 07:44:25.657454
8,f65e3f32-7347-42b3-a6b2-39bc62e226c2,76439f35-ad5a-482c-a1d6-8ca8bf05332b,T1.Q1,select_all,T1.Q1.A4,0,2019-09-20 07:44:25.657454
9,f65e3f32-7347-42b3-a6b2-39bc62e226c2,76439f35-ad5a-482c-a1d6-8ca8bf05332b,T1.Q1,select_all,T1.Q1.A4,0,2019-09-20 07:44:25.657454


In [16]:
#per-question-type variance of one user's answer scores
def variances(df):
    """Return the variance of answer_score for each question_type.

    df: DataFrame with at least the columns "question_type" and
        "answer_score"; other columns are ignored.
    Returns a DataFrame indexed by question_type with a single
    "answer_score" column holding the sample variance (ddof=1) per group.
    """
    scores = df[["question_type", "answer_score"]]
    # "var" names pandas' own groupby variance. Passing the np.var callable
    # relies on pandas substituting that same method, a substitution newer
    # pandas versions warn about and plan to stop doing, which would change
    # the result to the population variance.
    return scores.groupby("question_type").agg({'answer_score': 'var'})
    

In [17]:
#score variance per question type for the example contributor
variances(user_score_dict['9edd2824-48d3-49db-97ae-9713b37592d3'])

Unnamed: 0_level_0,answer_score
question_type,Unnamed: 1_level_1
select_all,0.150612
