In [1]:
import numpy as np
import pandas as pd

## Goal: create CSV files that can be used to test the convergence algorithm

### 1. Set up some hyper parameters:
    - path: the location you want to store the generated csv of testing data
    - task: one of 'Evidence', 'Language', 'Probability', 'Reasoning'
    - n_articles: int, number of articles in the test
    - n_users: int, number of users in each article
    - n_multiple_choice: int, number of multiple choice questions
    - n_single_choice: int, number of single choice questions
    - random: bool, set to True to have every simulated user answer independently at random; if False, a single random answer sheet is drawn once and shared by all users in all articles

We assume that all questions have four choices

In [2]:
# --- Hyper parameters (edit these) ---
path = '.'               # directory the generated csv is written to
task = 'Evidence'        # one of 'Evidence', 'Language', 'Probability', 'Reasoning'
n_articles = 3           # number of articles in the test
n_users = 3              # number of simulated users per article
n_multiple_choice = 5    # number of multiple choice questions
n_single_choice = 8      # number of single choice questions
random = False           # True: every simulated user answers randomly
seed = None              # int to make the generated answers reproducible; None keeps them different on every run

#Don't change anything below
# Seed numpy's global RNG only when asked to, so the default behaviour is unchanged.
if seed is not None:
    np.random.seed(seed)

# A multiple choice question expands into four labels 'Q<i>.A1' .. 'Q<i>.A4'
# (one per choice); a single choice question contributes the single label 'Q<i>'.
# Single choice question numbers continue right after the multiple choice ones.
multiple_choice_labels = [
    'Q' + str(q) + '.A' + str(a)
    for q in range(n_multiple_choice)
    for a in range(1, 5)
]
single_choice_labels = [
    'Q' + str(q)
    for q in range(n_multiple_choice, n_multiple_choice + n_single_choice)
]
questions = multiple_choice_labels + single_choice_labels

# n_question is the number of label rows per user (4 per multiple choice
# question + 1 per single choice question), not the number of questions.
n_question = len(questions)


### 2. Create the Dataframe 

In [3]:
# Helper that simulates users who pick their answers at random.
# Answer labels are plain integers for simplicity.
def rand_users(n_users, n_single_choice, n_multiple_choice, sig_cutoff=[0.5, 0.5], multi_probs=[0.25, 0.25, 0.25, 0.25]):
    """Draw one random answer sheet per user and return them concatenated.

    For each user the sheet consists of ``n_multiple_choice * 4`` draws from
    {0, 1} followed by ``n_single_choice`` draws from {0, 1, 2, 3}.

    Parameters
    ----------
    n_users : int
        Number of answer sheets to generate.
    n_single_choice : int
        Number of 4-way draws per user.
    n_multiple_choice : int
        Number of multiple choice questions; each yields four 0/1 draws.
    sig_cutoff : sequence of 2 floats
        Probabilities of drawing 0 and 1 for the binary (multiple choice) draws.
        Note: despite its name, this is the one applied to the 0/1 draws.
    multi_probs : sequence of 4 floats
        Probabilities of drawing 0..3 for the single choice draws.

    Returns
    -------
    list
        Flat list of length ``n_users * (n_multiple_choice * 4 + n_single_choice)``.
    """
    answers = []
    n_binary_draws = n_multiple_choice * 4
    for _ in range(n_users):
        # Keep the draw order (binary first, then 4-way) so the sheet lines up
        # with a label list that puts the 'Q<i>.A<j>' entries before the 'Q<i>' ones.
        binary_draws = np.random.choice(2, size=n_binary_draws, p=sig_cutoff)
        four_way_draws = np.random.choice(4, size=n_single_choice, p=multi_probs)
        answers.extend(binary_draws)
        answers.extend(four_way_draws)
    return answers

In [5]:
# Build the frame one column at a time. Rows are ordered article -> user -> question,
# so every (article, user) pair owns n_question consecutive rows.

# article_number: natural numbers 0..n_articles-1, each repeated for all of its rows.
rows_per_article = n_users * n_question
col_article_number = np.array([
    article
    for article in range(n_articles)
    for _ in range(rows_per_article)
])
df = pd.DataFrame({'article_number': col_article_number})

# quiz_taskrun_uuid: user ids 0..n_users-1, each held for n_question rows;
# the same id sequence restarts for every article.
col_quiz_task_uuid = np.array([
    user
    for _ in range(n_articles)
    for user in range(n_users)
    for _ in range(n_question)
])
df['quiz_taskrun_uuid'] = col_quiz_task_uuid

# question_label: the full label list once per (article, user) pair.
df['question_label'] = questions * n_users * n_articles

# answer_label
if not random:
    # One answer sheet is drawn (0/1 for every multiple choice option, then
    # 0..3 for every single choice question) and reused by every user in
    # every article.
    shared_sheet = list(np.random.choice(2, size=n_multiple_choice*4))
    shared_sheet += list(np.random.choice(4, size=n_single_choice))
    df['answer_label'] = shared_sheet * n_users * n_articles
else:
    # Fresh random sheets for all users, generated article by article.
    helper = [
        answer
        for _ in range(n_articles)
        for answer in rand_users(n_users, n_single_choice, n_multiple_choice)
    ]
    df['answer_label'] = helper

In [7]:
# Spot-check the generated rows of the article numbered 2.
is_article_2 = df['article_number'] == 2
df.loc[is_article_2]

Unnamed: 0,article_number,quiz_taskrun_uuid,question_label,answer_label
168,2,0,Q0.A1,1
169,2,0,Q0.A2,1
170,2,0,Q0.A3,1
171,2,0,Q0.A4,1
172,2,0,Q1.A1,1
...,...,...,...,...
247,2,2,Q8,3
248,2,2,Q9,1
249,2,2,Q10,2
250,2,2,Q11,1


### 3. Saving as csv

In [None]:
# Every column the convergence code needs is now in df; write it to <path>/<task>.csv.
csv_destination = '/'.join((path, task)) + '.csv'
df.to_csv(csv_destination)

## Further improvement

1. Better data generation methods; more than uniform or completely random generation.
2. Allow changing numbers of questions for different articles/tasks.