# Parsing
Parse the JSON result files in the zip archive into a single DataFrame.

In [None]:
import ast
import hashlib
import json
import zipfile

import pandas as pd

pd.options.display.max_columns = None

# Collect one flat dict per survey submission and build the frame once,
# instead of creating and concatenating a one-row DataFrame per file.
rows = []
with zipfile.ZipFile("gptsurvey_results.zip", "r") as zip_ref:
    for info in zip_ref.infolist():
        # Directory entries carry no JSON payload and would make json.load fail
        if info.is_dir():
            continue
        with zip_ref.open(info) as json_file:
            data = json.load(json_file)
        row = {
            "CourseID": data["CourseID"],
            "task_name": data["task_name"],
            "SciperID": data["SciperID"],
        }
        # Every answer is kept as its string representation
        for key, value in data["results"].items():
            row[key] = str(value)
        row["server_timestamp"] = data["server_timestamp"]
        rows.append(row)
df = pd.DataFrame(rows)

# Work on string IDs so the filter and the hashing below behave the same
# whether the JSON stored the ID as a string or as a number
df["SciperID"] = df["SciperID"].astype(str)

# Remove dummy scipers (any ID containing "00000")
df = df[~df["SciperID"].str.contains("00000", regex=False)]
df = df.reset_index(drop=True)

# Hash scipers
# The built-in hash() is salted per interpreter process for str (and is the
# identity for small ints), so it gives neither stable nor opaque pseudonyms.
# A SHA-256 digest is reproducible across runs.
# NOTE(review): an unsalted digest of a short ID can be brute-forced; switch to
# a keyed hash (hmac with a secret) if real anonymity is required.
df["SciperID"] = df["SciperID"].apply(
    lambda sciper: hashlib.sha256(sciper.encode("utf-8")).hexdigest()
)

# Convert date (server_timestamp is interpreted as milliseconds since the epoch)
df["human_timestamp"] = pd.to_datetime(df["server_timestamp"], unit="ms")

df

Parse Likerts

In [None]:
# Each likert cell is a stringified matrix such as
# "[[True, False, ...], [False, True, ...], ...]": one inner list per question.
# parse_likert expands every such cell into one "<column>_ans<i>" entry per
# question, holding the index of the option marked True.
likert_cols = [
    "Q1_likert_AI",
    "Q1_likert_human",
    "Q3_likert_AI",
    "Q3_likert_human",
    "Q3_extra_likert_human",
    "Q3_extra_likert_AI",
]


def parse_likert(row, likert_cols=likert_cols):
    """Expand the stringified likert matrices of one row into answer entries.

    Args:
        row: Mapping-like record (e.g. a DataFrame row passed by
            ``df.apply(..., axis=1)``) holding one string per likert column.
        likert_cols: Names of the entries in ``row`` to expand.

    Returns:
        The same ``row`` object, modified in place: for every likert column
        and every question ``i`` in it, ``row["<likert>_ans<i>"]`` is set to
        the index of the option equal to True. A question with no True option
        gets no entry; if several options are True, the last one wins.

    Raises:
        ValueError, SyntaxError: If a likert cell is not a plain Python
            literal.
    """
    for likert in likert_cols:
        # The cell content comes from external survey files, so it is parsed
        # with literal_eval (literals only) rather than eval(), which would
        # execute arbitrary expressions embedded in a submission.
        likert_array = ast.literal_eval(row[likert])
        for i, options in enumerate(likert_array):
            for k, selected in enumerate(options):
                # Equality (not identity) is kept on purpose so that a 1/0
                # encoding of the flags is still recognised.
                if selected == True:  # noqa: E712
                    row[f"{likert}_ans{i}"] = k
    return row

# Expand each raw likert cell into its per-question answer columns, then
# discard the raw likert columns, which are no longer needed
df = df.apply(parse_likert, axis=1).drop(columns=likert_cols)

# Identifier columns come first; all remaining columns follow alphabetically
df = df[
    ["CourseID", "task_name", "SciperID"]
    + sorted(
        col
        for col in df.columns
        if col not in ("CourseID", "task_name", "SciperID")
    )
]
df

# Stats

In [None]:
# Summary statistics of the survey answers.
# Both timestamp columns are excluded: pandas >= 2.0 summarises datetime
# columns in describe(), which would put Timestamp/Timedelta values into the
# "mean" and "std" rows and break the numeric bar plot built from them.
stats = df.drop(columns=["server_timestamp", "human_timestamp"]).describe()
stats

Plot

In [None]:
# Bar chart of the per-column means, with the standard deviations as error bars
stats.loc["mean"].plot(kind="bar", yerr=stats.loc["std"])

# Duplicate results
Rows where the survey was submitted more than once with the same SciperID.

In [None]:
# Show every row whose SciperID occurs more than once (all occurrences kept)
df.loc[df.duplicated(subset="SciperID", keep=False)]

# Save to csv

In [None]:
# Write the processed results to disk without the positional index column
df.to_csv(path_or_buf="gptsurvey_results.csv", index=False)