In [131]:
import pandas as pd
import numpy as np

# User Monitoring Pipeline:

### Create Tables in the AWS instance

In [132]:
# Use the following SQL code in MySQL Workbench in order to create the following tables in the User_Monitoring database:
# 1) ucs: two fields [uuid, score] --> [user's unique identifier, and their current UCS score]
# 2) task_scores: four fields [ts, quiz_task_uuid, user_uuid, task_score] -->
#                             [time stamp, task identifier, user's unique identifier, task_score]
# 3) datahunt_tracker: two fields [datahunt_id, num_rows_processed] --> 
#                                 [datahunt's unique id, number of rows in the datahunt that we processed already]
"""

CREATE DATABASE User_Monitoring;

USE User_Monitoring;

CREATE TABLE ucs (
	uuid INT,
    score DECIMAL(6,5) NOT NULL
);

desc UCS;

SELECT * FROM UCS;

----------------
CREATE TABLE task_scores (
	ts TIMESTAMP,
    quiz_task_uuid INT,
    user_uuid INT,
    task_score DECIMAL(6,5)
);

desc task_scores;

SELECT * FROM task_scores;


----------------
CREATE TABLE datahunt_tracker (
	datahunt_id INT,
    num_rows_processed INT
);

desc datahunt_tracker;

SELECT * FROM datahunt_tracker;


"""

'\n\nCREATE DATABASE User_Monitoring;\n\nUSE User_Monitoring;\n\nCREATE TABLE ucs (\n\tuuid INT,\n    score DECIMAL(6,5) NOT NULL\n);\n\ndesc UCS;\n\nSELECT * FROM UCS;\n\n----------------\nCREATE TABLE task_scores (\n\tts TIMESTAMP,\n    quiz_task_uuid INT,\n    user_uuid INT,\n    task_score DECIMAL(6,5)\n);\n\ndesc task_scores;\n\nSELECT * FROM task_scores;\n\n\n----------------\nCREATE TABLE datahunt_tracker (\n\tdatahunt_id INT,\n    num_rows_processed INT\n);\n\ndesc datahunt_tracker;\n\nSELECT * FROM datahunt_tracker;\n\n\n'

### Create handler functions to interact with AWS instance

In [140]:
# Install pymysql in the current Jupyter kernel
import sys
# !conda install --yes --prefix {sys.prefix} pymysql

import pymysql
import time
# 1. Install pymysql to local directory
# pip install -t $PWD pymysql

# 2. Write code, then zip it up

# Lambda Permissions:
# AWSLambdaVPCAccessExecutionRole

# Configuration Files
endpoint = 'user-monitoring-database.crnwwfmibeif.us-west-1.rds.amazonaws.com'
username = 'admin'
password = 'user_monitoring'
database_name = 'User_Monitoring'

# Connection
connection = pymysql.connect(
    host=endpoint, user=username, passwd=password, db=database_name)
cursor = connection.cursor()

def lambda_handler(event=None, context=None):
    insert_into_table(event['table'])

def table_to_df(table):
    cursor = connection.cursor()
    cursor.execute('SELECT * from {}'.format(table))
    rows = cursor.fetchall()
    field_names = [i[0] for i in cursor.description]
    df = pd.DataFrame(columns=field_names)
    for row in rows:
        df.loc[len(df.index)] = row
    return df

def display_table(table):
    df = table_to_df(table)
    print(table)
    display(df)

def table_to_csv(table):
    df = table_to_df(table)
    df.to_csv(f'{table}.csv', index=False)

def insert_into_table(table, df):
    """
    TODO: modify function to be dynamic. take in table name to insert into as well
          as dataframe to add into the selected table. add data validation to make sure
          df is correctly formatted for table
    """
    cursor = connection.cursor()
    mysql_query = None
    if table == "ucs":
        # df columns: 'contributor_uuid, score'
        
        insert_ucs = "INSERT INTO `ucs` (`uuid`, `score`) VALUES (%s, %s)"
        def ucs_query(row):
            data_ucs = (row['contributor_uuid'], row['score'])
            cursor.execute(insert_ucs, data_ucs)
            return row
        mysql_query = ucs_query
        
        # cursor.execute('SELECT * from ucs')
    elif table == "task_scores":
        # df columns: 'quiz_task_uuid, contributor_uuid, score'
        
        insert_task_scores = "INSERT INTO task_scores (ts, quiz_task_uuid, user_uuid, task_score) VALUES (%s, %s, %s, %s)"  
        def task_scores_query(row):
            ts = time.strftime("%Y-%m-%d %H:%M:%S")
            quiz_task_uuid = row['quiz_task_uuid']
            contributor_uuid = row['contributor_uuid']
            score = row['score']
            data_task_scores = (ts, quiz_task_uuid, contributor_uuid, score)
            cursor.execute(insert_task_scores, data_task_scores)
            return row
        mysql_query = task_scores_query
        
        # cursor.execute('SELECT * from task_scores')
    elif table == "datahunt_tracker":
        # df columns: 'datahunt_id, num_rows_processed'
        
        insert_datahunt_tracker = "INSERT INTO datahunt_tracker (datahunt_id, num_rows_processed) VALUES (%s, %s)"
        def datahunt_tracker_query(row):
            datahunt_id = row['datahunt_id']
            num_rows_processed = row['num_rows_processed']
            data_datahunt_tracker = (datahunt_id, num_rows_processed)
            cursor.execute(insert_datahunt_tracker, data_datahunt_tracker)
            return row
        
        # cursor.execute('SELECT * from datahunt_tracker')
    
    # run the appropriate query on each row of the given dataframe
    df = df.apply(mysql_query, axis=1)
    connection.commit()
    
    display_table(table)

In [141]:
display_table('task_scores')

task_scores


Unnamed: 0,ts,quiz_task_uuid,user_uuid,task_score
0,2022-04-02 17:59:18,1,10,0.6
1,2022-04-02 17:59:18,2,20,0.8


In [142]:
table_to_csv('task_scores')

### Creating Task Scores
This part of the notebook will include steps 2 and 3 from the User Monitoring Pipeline, which includes identifying the consensus answer from the IAA and Gold Standard data (step 2) and creating the corresponding task scores for users who completed this task (step 3).

## Hardcoded Evidence Schema
Most of this information will be in some sort of schema file (see file 'Evidence2021_05_19-Schema.csv'), but I'm not sure where the schema file is for this specific set of tasks. Thus, I hard coded it with the schema data from https://github.com/Goodly/PEUserMonitoring/blob/master/task-schema/Evidence.txt. Getting this information with the right schema file should be fairly straightforward.


__TODO__: Get schema file and implement method to retrieve the set of scored questions as well as a nested dictionary represented the question schema.

In [40]:
scored_questions = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}

question_schema = {1:{'type':'select_one_nominal', 'num_choices':3},
           2:{'type':'select_all', 'num_choices':9},
           3:{'type':'select_one_nominal', 'num_choices':1},
           4:{'type':'select_one_ordinal', 'num_choices':6},
           5:{'type':'select_one_nominal', 'num_choices':5},
           6:{'type':'select_one_nominal', 'num_choices':3},
           7:{'type':'select_one_ordinal', 'num_choices':1},
           8:{'type':'select_one_ordinal', 'num_choices':5},
           9:{'type':'select_one_ordinal', 'num_choices':3},
           10:{'type':'select_one_ordinal', 'num_choices':5},
           11:{'type':'select_one_ordinal', 'num_choices':5},
           12:{'type':'select_one_ordinal', 'num_choices':4},
           13:{'type':'select_one_ordinal', 'num_choices':10},
           14:{'type':'select_one_ordinal', 'num_choices':10}}

## Preprocessing of IAA and Gold Standard Data

In [41]:
# read in the data
adjudicated = pd.read_csv('evidence_eric/evidence_eric/Covid_Evidence2020_03_21.adjudicated-edb1510f-1923-4d6f-a678-95f53d752bea-Tags.csv')
iaa = pd.read_csv('evidence_eric/evidence_eric/Covid_Evidencev1.IAA-edb1510f-1923-4d6f-a678-95f53d752bea-Tags.csv')

# getting rid of some rows where the answer was invalid, probably represents some other metadata
iaa = iaa[iaa.answer_uuid.str.len() > 3]

In [42]:
# these are the only relevant columns for scoring for now, notice highlight data is not included here
cols = ['answer_uuid', 'question_Number', 'agreed_Answer']

# getting rid of some rows where the above columns were the same, this may represent different 
# highlights for the same question and answer?
adjudicated = adjudicated[cols].drop_duplicates()
iaa = iaa[cols].drop_duplicates()

Below cells just show the format of the preprocessed IAA and Adjudicated / Gold Standard data.

In [43]:
adjudicated.head(3)

Unnamed: 0,answer_uuid,question_Number,agreed_Answer
0,73d7a14a-9ec6-404c-b2b7-a55508af3b76,1,1
1,5a1fb1f4-d8b7-45c0-bce5-7d4c3b91c55f,2,1
3,ba2d1638-2509-4ce8-9130-39ea26d1d424,2,2


In [44]:
iaa.head(3)

Unnamed: 0,answer_uuid,question_Number,agreed_Answer
0,73d7a14a-9ec6-404c-b2b7-a55508af3b76,1,1
3,5a1fb1f4-d8b7-45c0-bce5-7d4c3b91c55f,2,1
4,ba2d1638-2509-4ce8-9130-39ea26d1d424,2,2


## Creating The Answer Key

In [45]:
# consensus answer key
consensus_answers = {}

def get_answer(question, answer_source):
    """
    Take in the question and the answer_source, either IAA or Adjudicated / Gold Standard, and adds the
    converged consensus answer to the consensus_answer answer key. This will be an single
    int for select_one questions, or a list of ints for select_all questions.
    """
    question_type = question_schema[question]['type']
    
    if question_type == 'select_one_nominal' or question_type == 'select_one_ordinal':
        assert len(answer_source[answer_source.question_Number == question].agreed_Answer) == 1
        consensus_answers[question] = answer_source[answer_source.question_Number == question].agreed_Answer.iloc[0]
    elif question_type == 'select_all':
        consensus_answers[question] = list(answer_source[answer_source.question_Number == question].agreed_Answer)
    else:
        raise ValueError('Invalid question type')

In [46]:
# create a set of questions that the Adjudicated / Gold Standard data determined converged to a consensus
adjudicated_consensus_questions = set(adjudicated.question_Number)

# create a set of questions that the IAA data determined converged to a consensus
iaa_consensus_questions = set(iaa.question_Number)

# uses get_answer function to fill in the consensus_answers answer key
for question in scored_questions:
    if question in adjudicated_consensus_questions:
        get_answer(question, adjudicated)
    elif question in iaa_consensus_questions:
        get_answer(question, iaa)
    else:
        consensus_answers[question] = -1

What the consensus key looks like:

In [47]:
consensus_answers

{1: 1,
 2: [1, 2, 3, 5],
 3: -1,
 4: 2,
 5: 5,
 6: '3',
 7: 1,
 8: 4,
 9: 1,
 10: 4,
 11: 4,
 12: 2}

## Scoring Users

In [48]:
def scoring_select_one_nominal(question, answer):
    """
    Takes in a question and the selected answer, returns a score of 0 if the consensus 
    answer is different, and 1 if the consensus answer is the same.
    """
    consensus_answer = consensus_answers[question]
    return int(consensus_answer == answer)

In [49]:
def scoring_select_one_ordinal(question, answer):
    """
    Takes in a question and the selected answer, returns a score between 0 and 1 depending
    on how far off the answer is from the consensus answer.
    """
    consensus_answer = consensus_answers[question]
    num_choices = question_schema[question]['num_choices']
    return 1 - (abs(answer - consensus_answer) / num_choices)

In [50]:
def scoring_select_all(question, answer_list):
    """
    Takes in a question and the selected answer, returns a score between 0 and 1 depending
    on the accuracy ((True Positive + True Negative) / Total) of the answer selections
    compared to the consensus answer selections.
    """
    answer_set = set(answer_list)
    consensus_answer_set = set(consensus_answers[question])
    num_choices = question_schema[question]['num_choices']
    
    total_correct = 0
    for answer in range(1, num_choices+1):
        if (answer in answer_set) and (answer in consensus_answer_set):
            total_correct += 1
        elif (answer not in answer_set) and (answer not in consensus_answer_set):
            total_correct += 1
        else:
            total_correct += 0
        
    return total_correct / num_choices

In [51]:
def scoring(row):
    """
    This is a Pandas apply function, to be applied on axis=1 (on each row).
    Makes a call to one of scoring_select_one_nominal, scoring_select_one_ordinal, and
    scoring_select_all depending on the type of question, returns the outputted score.
    
    An important note is that right now if neither IAA nor Gold Standard have a consensus
    answer for a question, the consensus_answers answer key will contain a -1 for that 
    question. I currently assume this question should not have been answered due to it
    being a child-question from an incorrectly answered parent question, so I score it
    """
    question = int(row['question_label'])
    answer_list = [int(i) for i in row['answer_label']]
    
    if consensus_answers[question] == -1:
        return 0
    
    question_type = question_schema[question]['type']
    if question_type == 'select_one_nominal':
        return scoring_select_one_nominal(question, answer_list[0])
    elif question_type == 'select_one_ordinal':
        return scoring_select_one_ordinal(question, answer_list[0])
    elif question_type == 'select_all':
        return scoring_select_all(question, answer_list)
    else:
        raise ValueError('Invalid question type')

In [52]:
# read in the datahunt
df_full = pd.read_csv('evidence_eric/evidence_eric/Covid_Evidencev1-Task-2224-DataHunt.csv')

In [53]:
df_full

Unnamed: 0,namespace,schema_sha256,quiz_task_uuid,task_url,tua_uuid,article_batch_name,article_number,article_filename,article_sha256,article_text_length,...,answer_label,answer_content,answer_uuid,submitted_tua_uuid,answer_text,case_number,highlight_count,start_pos,end_pos,target_text
0,Covid_Evidence2020_03_21,45dce5251bd3ea6e908fa33ac9e6a8e17e6830215912ce...,edb1510f-1923-4d6f-a678-95f53d752bea,https://pe.goodlylabs.org/project/Covid_Eviden...,a723537a-f11a-41dd-bf5b-668cef67a5de,CovidArticles/Covid_article_for_PE_S&S&S.txt,100059,Covid_article_for_PE_S&S&S.txt,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,6794,...,T1.Q1.A1,"General Causation (In general, X causes Y.)",73d7a14a-9ec6-404c-b2b7-a55508af3b76,50a87210-bcda-459b-9be6-5587a1459012,"General Causation (In general, X causes Y.)",1,1,7,57,Social distancing comes with psychological fal...
1,Covid_Evidence2020_03_21,45dce5251bd3ea6e908fa33ac9e6a8e17e6830215912ce...,edb1510f-1923-4d6f-a678-95f53d752bea,https://pe.goodlylabs.org/project/Covid_Eviden...,a723537a-f11a-41dd-bf5b-668cef67a5de,CovidArticles/Covid_article_for_PE_S&S&S.txt,100059,Covid_article_for_PE_S&S&S.txt,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,6794,...,T1.Q2.A1,Correlation,5a1fb1f4-d8b7-45c0-bce5-7d4c3b91c55f,09412d04-f88d-4328-a8c0-556bdc47d669,Correlation,1,1,1401,1937,Many\xa0quarantined individuals experienced bo...
2,Covid_Evidence2020_03_21,45dce5251bd3ea6e908fa33ac9e6a8e17e6830215912ce...,edb1510f-1923-4d6f-a678-95f53d752bea,https://pe.goodlylabs.org/project/Covid_Eviden...,a723537a-f11a-41dd-bf5b-668cef67a5de,CovidArticles/Covid_article_for_PE_S&S&S.txt,100059,Covid_article_for_PE_S&S&S.txt,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,6794,...,T1.Q2.A2,Cause precedes effect,ba2d1638-2509-4ce8-9130-39ea26d1d424,b1f0cf0f-a6ae-4fb1-8e6d-551b652c88f4,Cause precedes effect,1,1,1213,1266,psychological outcomes of people who were quar...
3,Covid_Evidence2020_03_21,45dce5251bd3ea6e908fa33ac9e6a8e17e6830215912ce...,edb1510f-1923-4d6f-a678-95f53d752bea,https://pe.goodlylabs.org/project/Covid_Eviden...,a723537a-f11a-41dd-bf5b-668cef67a5de,CovidArticles/Covid_article_for_PE_S&S&S.txt,100059,Covid_article_for_PE_S&S&S.txt,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,6794,...,T1.Q2.A3,The correlation appears across multiple indepe...,fee5e5ec-ce13-43e7-aca6-5babee4eb8a5,170e0c43-a9f0-447c-aa35-4be7864e1dcf,The correlation appears across multiple indepe...,1,1,1326,1399,"SARS, H1N1 flu, Ebola and other infectious dis..."
4,Covid_Evidence2020_03_21,45dce5251bd3ea6e908fa33ac9e6a8e17e6830215912ce...,edb1510f-1923-4d6f-a678-95f53d752bea,https://pe.goodlylabs.org/project/Covid_Eviden...,a723537a-f11a-41dd-bf5b-668cef67a5de,CovidArticles/Covid_article_for_PE_S&S&S.txt,100059,Covid_article_for_PE_S&S&S.txt,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,6794,...,T1.Q4.A6,Can't tell; not enough info,7369a857-fd09-4a36-8e26-e64e8d5a6641,44fc7d46-3135-41c9-a2ee-244e76de09af,Can't tell; not enough info,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
768,Covid_Evidence2020_03_21,45dce5251bd3ea6e908fa33ac9e6a8e17e6830215912ce...,edb1510f-1923-4d6f-a678-95f53d752bea,https://pe.goodlylabs.org/project/Covid_Eviden...,a723537a-f11a-41dd-bf5b-668cef67a5de,CovidArticles/Covid_article_for_PE_S&S&S.txt,100059,Covid_article_for_PE_S&S&S.txt,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,6794,...,T1.Q10.A5,Very Unlikely,443d94fe-c0a2-4a8b-819f-9253a570f788,d0db2afe-8fba-4f2a-b0fc-86a9182a2627,Very Unlikely,0,0,0,0,
769,Covid_Evidence2020_03_21,45dce5251bd3ea6e908fa33ac9e6a8e17e6830215912ce...,edb1510f-1923-4d6f-a678-95f53d752bea,https://pe.goodlylabs.org/project/Covid_Eviden...,a723537a-f11a-41dd-bf5b-668cef67a5de,CovidArticles/Covid_article_for_PE_S&S&S.txt,100059,Covid_article_for_PE_S&S&S.txt,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,6794,...,T1.Q11.A4,Somewhat Unlikely,6a8b58aa-2748-4697-995b-c1a34149f70d,c934c5a4-eb4d-4fa5-9d7c-411d5d566b14,Somewhat Unlikely,0,0,0,0,
770,Covid_Evidence2020_03_21,45dce5251bd3ea6e908fa33ac9e6a8e17e6830215912ce...,edb1510f-1923-4d6f-a678-95f53d752bea,https://pe.goodlylabs.org/project/Covid_Eviden...,a723537a-f11a-41dd-bf5b-668cef67a5de,CovidArticles/Covid_article_for_PE_S&S&S.txt,100059,Covid_article_for_PE_S&S&S.txt,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,6794,...,T1.Q12.A2,"Yes, implicitly",a0972088-cb65-4764-a4ee-3be2b2bd100a,bcb8cfd5-5a02-4c96-babe-8af03b0fa1e9,"Yes, implicitly",1,1,1023,1106,Research on the psychological toll of social d...
771,Covid_Evidence2020_03_21,45dce5251bd3ea6e908fa33ac9e6a8e17e6830215912ce...,edb1510f-1923-4d6f-a678-95f53d752bea,https://pe.goodlylabs.org/project/Covid_Eviden...,a723537a-f11a-41dd-bf5b-668cef67a5de,CovidArticles/Covid_article_for_PE_S&S&S.txt,100059,Covid_article_for_PE_S&S&S.txt,4b537e0ed21179a29ed28da28057d338e67330ae12123c...,6794,...,T1.Q13.A6,6,c296eb9d-def0-4a2e-91a4-f27215a4333c,3f7c3cfe-c478-4d76-a21a-d22e005c96a1,6,0,0,0,0,


In [54]:
# narrow down the datahunt to the relevant columns for scoring, getting rid of some rows
# where the data for the below columns were the same, this may represent different highlights 
# for the same question and answer? not certain.
df = df_full[['contributor_uuid', 'question_label', 'answer_label']].drop_duplicates()

# the question and answer labels in the datahunt are in the form 'T1.QX' and 'T1.QX.AX'
# the below lines strip down to only question number and answer number
df['question_label'] = df['question_label'].str.split('Q').str[1].astype(int)
df['answer_label'] = df['answer_label'].str.split('A').str[1]

In [55]:
df

Unnamed: 0,contributor_uuid,question_label,answer_label
0,e1ae8875-a398-4dde-8f4e-4b21109784e3,1,1
1,e1ae8875-a398-4dde-8f4e-4b21109784e3,2,1
2,e1ae8875-a398-4dde-8f4e-4b21109784e3,2,2
3,e1ae8875-a398-4dde-8f4e-4b21109784e3,2,3
4,e1ae8875-a398-4dde-8f4e-4b21109784e3,4,6
...,...,...,...
768,bd786026-bad5-4fa8-9a3a-38ca03a16412,10,5
769,bd786026-bad5-4fa8-9a3a-38ca03a16412,11,4
770,bd786026-bad5-4fa8-9a3a-38ca03a16412,12,2
771,bd786026-bad5-4fa8-9a3a-38ca03a16412,13,6


In [56]:
# we want to groupby contributor_uuid and question_label to get all the answers a user
# selected for a particular question, to account for select_all questions. Now, the
# granularity of df_grouped will be one row per contributor answering a question.
df_grouped = df.groupby(['contributor_uuid', 'question_label']).agg(list).reset_index()

In [57]:
df_grouped

Unnamed: 0,contributor_uuid,question_label,answer_label
0,00f548b7-6b63-4b47-828e-8e416b6ca0e2,1,[1]
1,00f548b7-6b63-4b47-828e-8e416b6ca0e2,2,"[3, 5, 8, 4]"
2,00f548b7-6b63-4b47-828e-8e416b6ca0e2,3,[1]
3,00f548b7-6b63-4b47-828e-8e416b6ca0e2,4,[6]
4,00f548b7-6b63-4b47-828e-8e416b6ca0e2,5,[2]
...,...,...,...
549,fd6f6837-7881-4943-8129-b7ea0f0fe1b6,10,[4]
550,fd6f6837-7881-4943-8129-b7ea0f0fe1b6,11,[4]
551,fd6f6837-7881-4943-8129-b7ea0f0fe1b6,12,[2]
552,fd6f6837-7881-4943-8129-b7ea0f0fe1b6,13,[2]


In [58]:
# we only want to score the rows with scored questions (not survey questions like 13 and 14)
# so we'll filter those out
df_grouped = df_grouped[df_grouped.question_label.isin(scored_questions)]

In [59]:
df_grouped

Unnamed: 0,contributor_uuid,question_label,answer_label
0,00f548b7-6b63-4b47-828e-8e416b6ca0e2,1,[1]
1,00f548b7-6b63-4b47-828e-8e416b6ca0e2,2,"[3, 5, 8, 4]"
2,00f548b7-6b63-4b47-828e-8e416b6ca0e2,3,[1]
3,00f548b7-6b63-4b47-828e-8e416b6ca0e2,4,[6]
4,00f548b7-6b63-4b47-828e-8e416b6ca0e2,5,[2]
...,...,...,...
547,fd6f6837-7881-4943-8129-b7ea0f0fe1b6,8,[5]
548,fd6f6837-7881-4943-8129-b7ea0f0fe1b6,9,[1]
549,fd6f6837-7881-4943-8129-b7ea0f0fe1b6,10,[4]
550,fd6f6837-7881-4943-8129-b7ea0f0fe1b6,11,[4]


In [60]:
# using the scoring function defined above, we'll create a new column containing the scores
# for each contributor answering a question.
df_grouped['score'] = df_grouped.apply(scoring, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_grouped['score'] = df_grouped.apply(scoring, axis=1)


This is the current format of df_grouped.

In [61]:
df_grouped.head(3)

Unnamed: 0,contributor_uuid,question_label,answer_label,score
0,00f548b7-6b63-4b47-828e-8e416b6ca0e2,1,[1],1.0
1,00f548b7-6b63-4b47-828e-8e416b6ca0e2,2,"[3, 5, 8, 4]",0.555556
2,00f548b7-6b63-4b47-828e-8e416b6ca0e2,3,[1],0.0


In [62]:
# lastly, we want to get the average score for all task responses, this will be their
# task score. this is done by a simple groupby on contributor_uuid and mean() aggregate function
calculated_task_scores = df_grouped[['contributor_uuid', 'score']].groupby('contributor_uuid').mean()

This is what the final task_scores output will look like:

In [63]:
calculated_task_scores

Unnamed: 0_level_0,score
contributor_uuid,Unnamed: 1_level_1
00f548b7-6b63-4b47-828e-8e416b6ca0e2,0.603241
070268de-067c-463b-9ad3-5c88292d881e,0.878889
082a8363-a579-41b4-8918-c166fec3a3a4,0.513333
09df3ada-e5a8-4419-b78a-e0d1e9b37484,0.277778
09f279ac-1c34-4a84-8972-3d92b93605a7,0.361111
0c22ce7c-4641-4bb1-97f4-7a7355f70f25,0.831481
0e51ab2d-1a03-4d18-be33-fd21a829d19b,0.75463
1b69eee8-ab95-49dd-8979-9fff7655964d,0.277778
21ffd986-c219-43a0-b82f-4cc460da628d,0.831481
24640f45-b90b-40dc-a848-9e03fdfbbf91,0.588889


The below value is the number of rows processed in this datahunt, to be used for updating the datahunt tracking table.

In [64]:
rows_processed = len(df_full)
rows_processed

773

The below value represents an identifier for each datahunt, meaning an identifier for each task since each datahunt corresponds to one task.

In [65]:
quiz_task_uuid = df_full['quiz_task_uuid'][0]
quiz_task_uuid

'edb1510f-1923-4d6f-a678-95f53d752bea'

## Updating the User-Monitoring Database

In [71]:
# ucs function that reads accepts values from a csv and a current user's ucs score
def ucs_update_score(user_id, cur_ucs):
    def logistic(x, k, offset):
        return 1 / (1 + np.e**(-k * (x - offset)))
    # each row of the csv corresponds to one user, with the first value being the user_id and the rest being task scores
    def task_scores_from_csv(user_id):
        #      0   1   2   3
        # id1  11  12  13  14
        # id2  21  22  23  24
        # id3  31  32  33  34
        df = pd.read_csv('task_scores.csv', header=None, index_col=0)
        task_scores = df.loc[[user_id]].to_numpy()[0]
        return task_scores
    task_scores = task_scores_from_csv(user_id)
    a = 1000
    num_task_scores = len(task_scores)
    n = min(10, int(np.sqrt(num_task_scores)) + 1)
    if num_task_scores == 0:
        return 0.5
    else:
        var_scores = np.var(task_scores[-n:])
        c = logistic(var_scores / (np.log(num_task_scores + 1) / (np.log(a))), 10, 0.2)
        new_ucs = cur_ucs * (1 - c) + (c) * task_scores[-1] #task_scores[-1] = latest task score

    return new_ucs

#### Step 1) Insert calculated_task_scores into the task scores table and check that the task_scores table updated properly

#### Step 2) Update User Credibility Score:
Algorithm: Once per row in calculated_task_scores:
--> if contributor_uuid not in 