# 1. Transform human data (pre-step)
We use the following file from [(Gilardi et al. 2023)](https://doi.org/10.1073/pnas.2305016120) in this notebook.

- ` ... /mTurk_data/batch_results_topics_final.csv`

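This file is available at: https://doi.org/10.7910/DVN/PQYF6M. The code below expects it at `./original/human-predictions/data/mTurk_data/batch_results_topics_final.csv`, relative to the notebook's working directory.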

In [31]:
import pandas as pd
import numpy as np
import polars as pl

In [32]:
from io import StringIO
import re

In [33]:
from sklearn.metrics import confusion_matrix, accuracy_score

### Load data and formatting

In [34]:
# Read the raw MTurk batch results into memory as one string so the text can
# be cleaned up before it is handed to a CSV parser.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
# NOTE(review): UTF-8 is assumed for the downloaded file — confirm.
with open(
    "./original/human-predictions/data/mTurk_data/batch_results_topics_final.csv",
    "r",
    encoding="utf-8",
) as f:
    text = f.read()

In [35]:
# Mask every double-quoted span with the bare word `string`.
# The pattern matches an opening quote, any run of non-quote characters
# (commas and line breaks included) and the closing quote, so the content of
# quoted fields is discarded and can no longer disturb the comma-based row
# splitting done further down.
quoted_span = re.compile(r'"[^"]*"')
text = quoted_span.sub('string', text)

In [36]:
# Rebuild the CSV text row by row, based on the number of comma-separated
# fields in each row (N is the expected column count):
#   * exactly N fields -> kept, terminated with a newline
#   * more than N      -> reported via print() and dropped
#   * fewer than N     -> kept WITHOUT a trailing newline, i.e. glued onto
#                         whatever is appended next
# NOTE(review): the previous comment said rows without exactly N columns are
# deleted, but short rows are in fact kept (unterminated). The behaviour is
# preserved here — confirm whether gluing short rows is intended.
N = 40
kept_parts = []  # collected pieces, joined once at the end
for row in text.splitlines():
    # Number of fields == number of commas + 1 (same as len(row.split(","))).
    n_fields = row.count(",") + 1
    if n_fields < N:
        kept_parts.append(row)
    elif n_fields > N:
        print(f"Error: More than {N} columns in a row", row)
    else:
        kept_parts.append(row + "\n")
new_text = "".join(kept_parts)

### Create dataframe

In [37]:
# Parse the cleaned CSV text with polars from an in-memory buffer.
# `schema_overrides` is the current name of the keyword formerly called
# `dtypes`; the old spelling triggers a deprecation warning.
# "Input.status_id" is forced to a string column instead of being inferred.
# ignore_errors=True lets parsing continue past values that do not fit the
# inferred schema — NOTE(review): this can hide malformed rows; confirm it is
# still needed after the cleaning steps above.
f = StringIO(new_text)
df = pl.read_csv(
    f,
    schema_overrides={"Input.status_id": pl.String},
    ignore_errors=True,
)

  df = pl.read_csv(f, dtypes={"Input.status_id": pl.String}, ignore_errors=True)


In [38]:
# Keep only the columns used in the rest of the notebook: the task id, the
# worker id, the ground-truth flag and topic, and the worker's answer.
kept_columns = [
    "Input.status_id",
    "WorkerId",
    "Input.topic_check",
    "Input.topic",
    "Answer.category.label",
]
df = df.select(kept_columns)

In [39]:
# Following the original study's code: keep only the tasks whose
# "Input.topic_check" flag is true, i.e. tasks with a valid ground truth.
has_valid_gt = pl.col("Input.topic_check").eq(True)
df = df.filter(has_valid_gt)

In [40]:
# Number of rows left after filtering.
df.shape[0]

1222

In [41]:
# Per-column count of missing values (sanity check before encoding labels).
df.select(pl.all().null_count())

Input.status_id,WorkerId,Input.topic_check,Input.topic,Answer.category.label
u32,u32,u32,u32,u32
0,0,0,0,0


### Providing correct labels

In [42]:
# Canonical label names; a label's position in this list is used as its
# integer code further down.
labels = [
    "SECTION 230",
    "TRUMP BAN",
    "TWITTER SUPPORT",
    "PLATFORM POLICIES",
    "COMPLAINT",
    "OTHER",
]

In [43]:
# Distinct ground-truth topics as they appear in the raw data.
df.get_column("Input.topic").unique().sort()

Input.topic
str
"""general complaint"""
"""personal complaint"""
"""platform policies"""
"""section 230"""
"""trump ban"""
"""twitter support"""


In [44]:
# Distinct labels chosen by the workers, as they appear in the raw data.
df.get_column("Answer.category.label").unique().sort()

Answer.category.label
str
"""COMPLAINTS"""
"""OTHER"""
"""PLATFORM POLICIES"""
"""SECTION 230"""
"""TRUMP BAN"""
"""TWITTER SUPPORT"""


In [45]:
def _merge_complaints(column: pl.Expr) -> pl.Expr:
    """Map every value containing "COMPLAINT" to the single label "COMPLAINT".

    Values that do not contain "COMPLAINT" are passed through unchanged.
    """
    return (
        pl.when(column.str.contains("COMPLAINT"))
        .then(pl.lit("COMPLAINT"))
        .otherwise(column)
    )


# Ground-truth topics are lower-case in the raw data, so they are upper-cased
# first; after that, both columns get their complaint variants merged into a
# single "COMPLAINT" label.
topic_upper = pl.col("Input.topic").str.to_uppercase()
df = df.with_columns(
    _merge_complaints(topic_upper).alias("Input.topic"),
    _merge_complaints(pl.col("Answer.category.label")).alias("Answer.category.label"),
)

In [46]:
# Check: ground-truth topics after normalisation.
df.get_column("Input.topic").unique().sort()

Input.topic
str
"""COMPLAINT"""
"""PLATFORM POLICIES"""
"""SECTION 230"""
"""TRUMP BAN"""
"""TWITTER SUPPORT"""


In [47]:
# Check: worker labels after normalisation.
df.get_column("Answer.category.label").unique().sort()

Answer.category.label
str
"""COMPLAINT"""
"""OTHER"""
"""PLATFORM POLICIES"""
"""SECTION 230"""
"""TRUMP BAN"""
"""TWITTER SUPPORT"""


In [48]:
# Label name -> integer code, where the code is the label's index in `labels`.
code_dict = {name: index for index, name in enumerate(labels)}

In [49]:
# Add a "label" column holding the code of the worker's answer, looked up
# in `code_dict`.
df = df.with_columns(
    label=pl.col("Answer.category.label").replace(code_dict),
)

In [50]:
# Add a "gt" column holding the code of the ground-truth topic, looked up
# in `code_dict`.
df = df.with_columns(
    gt=pl.col("Input.topic").replace(code_dict),
)

In [51]:
# Switch to the short column names used in the exported file.
column_renames = {"Input.status_id": "task", "WorkerId": "worker"}
df = df.rename(column_renames)

In [52]:
# Final layout: one row per (task, worker) annotation with its label code and
# the ground-truth code.
output_columns = ["task", "worker", "label", "gt"]
df = df.select(output_columns)

In [53]:
# Display the final frame (task, worker, label, gt) for a visual check before
# it is written to disk.
df

task,worker,label,gt
str,str,str,str
"""1251376725306589184""","""A2T5CROJ0FWIJY""","""3""","""4"""
"""1251376725306589184""","""A3RYI5HXC2MJLN""","""4""","""4"""
"""1303113292475371520""","""A2Q1YS118AO2BP""","""3""","""3"""
"""1303113292475371520""","""A2T5CROJ0FWIJY""","""4""","""3"""
"""1266342527654940672""","""A2YYQJ3JWA2KGD""","""4""","""4"""
…,…,…,…
"""1343390061346369536""","""A2YYQJ3JWA2KGD""","""0""","""0"""
"""1375866710394290176""","""A2JPOXYZM5AJZZ""","""5""","""4"""
"""1375866710394290176""","""A2Q1YS118AO2BP""","""5""","""4"""
"""1366840878233493504""","""A2F48NO7AD9VKK""","""5""","""0"""


In [56]:
# Create the output directory if it does not exist yet. This uses the standard
# library instead of a shell command, so the cell does not depend on a
# POSIX `mkdir` being available.
import os

os.makedirs("ds1task5", exist_ok=True)

In [57]:
# Export the prepared annotations as CSV.
output_path = "ds1task5/human_data_with_gt.csv"
df.write_csv(output_path)