**NOTE**: to get this working, you must install the `data-analysis` group dependencies using Poetry:

```shell
poetry install --with=data-analysis
```

You should also run this if you are using git:

```shell
poetry run nbstripout --install
```

This will automatically clear output when committing to git :)

In [None]:
import json
import sqlite3
from operator import itemgetter

import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score

from data_analysis_utils import agreement_as_label

Ways to index and group data:

In [None]:
# Keys identifying one Java source code file (a "unit").
# TODO: rename to "context"?
BY_UNIT = ["srcml_path", "version"]
# A scenario is a unit plus the PEM variant shown to the rater.
BY_SCENARIO = [*BY_UNIT, "variant"]
# Adding the rater uniquely identifies one particular data point.
BY_RATER = [*BY_SCENARIO, "rater"]

We'll create a basic dataframe from `answers.sqlite3`. However, the `answers` column will need to be parsed as JSON.

In [None]:
# Read answers.sqlite3 into a DataFrame.
# The try/finally guarantees the connection is closed even when the query
# fails (e.g. the `answers` table is missing), so the file handle never leaks.
conn = sqlite3.connect("answers.sqlite3")
try:
    df = pd.read_sql_query("SELECT * FROM answers", conn)
finally:
    conn.close()
df

Now time to clean the data — extract it from that JSON column.

Instead of creating a nice schema in `answers.sqlite3`, I decided to defer the job of making nice columns to the data analysis stage. So now we have to parse the answers column as JSON and extract data.


As of 2023-05-02, these are the columns:

In [None]:
# Decode the JSON answers of the row labelled 0 to inspect the available keys.
json.loads(df["answers"][0])

In [None]:
def json_to_columns(df):
    """
    Expand the JSON-encoded `answers` column into one typed column per answer.

    Every value of `df["answers"]` is decoded with `json.loads`, then each key
    is pulled out into its own column and cast to a suitable dtype.
    Returns a new DataFrame; the original `answers` column is left in place.
    """
    decoded = df["answers"].apply(json.loads)

    def answer(key):
        "Extract a single key from every decoded answer."
        return decoded.apply(itemgetter(key))

    def categorical(key):
        "Extract a single key as a categorical column."
        return answer(key).astype("category")

    return df.assign(
        jargon=answer("jargon").astype(int),
        sentence_structure=categorical("sentence_structure"),
        # Explanation should already be a boolean column, that is completely filled in:
        explanation=answer("explanation"),
        explanation_correctness=categorical("explanation_correctness"),
        # I wish I had a better name for this column, but it's basically,
        # "if the explanation is MAYBE correct, WHY is it maybe correct?"
        explanation_maybe=categorical("explanation_maybe"),
        # A missing fix becomes the explicit "N/A" level rather than a missing value.
        fix=answer("fix").fillna("N/A").astype("category"),
        fix_correctness=categorical("fix_correctness"),
        additional_errors=categorical("additional_errors"),
        notes=answer("notes").astype("string"),
        length=answer("length").astype(int),
    )

def variant_as_categorical(df):
    """
    Check that exactly four distinct variants are present, then return `df`.

    NOTE(review): despite the name, the categorical conversion is only used
    for the sanity check; the frame is returned as-is, so `variant` keeps its
    original dtype. Confirm whether the column was meant to be replaced.
    """
    distinct_variants = df["variant"].astype("category").cat.categories
    assert len(distinct_variants) == 4
    return df


def rater_as_categorical(df):
    """
    Return a copy of `df` whose `rater` column is categorical.

    Asserts that exactly three distinct raters are present.
    """
    as_category = df["rater"].astype("category")
    n_raters = len(as_category.cat.categories)
    assert n_raters == 3
    return df.assign(rater=as_category)


def set_empty_notes_to_na(df):
    """
    Return a copy of `df` where empty-string notes are missing values (`pd.NA`).

    Notes that contain any text are left untouched.
    """
    cleaned_notes = df["notes"].replace("", pd.NA)
    return df.assign(notes=cleaned_notes)

# Run the cleaning steps in order: expand the JSON answers, drop the raw JSON
# column, check/convert the variant and rater columns, and blank out empty notes.
expanded_df = (
    df.pipe(json_to_columns)
    .drop(columns=["answers"])
    .pipe(variant_as_categorical)
    .pipe(rater_as_categorical)
    .pipe(set_empty_notes_to_na)
)

expanded_df.sample(5)

In [None]:
# Show the column dtypes and non-null counts after cleaning.
expanded_df.info()

What are the levels of the categorical variables?

In [None]:
def get_levels(df, skip=()):
    """
    Print the levels of every categorical column in `df`.

    For each categorical column not named in `skip`, prints its value counts,
    then the set of its categories, then a blank line.
    """
    categorical_columns = df.select_dtypes("category").columns
    for name in (col for col in categorical_columns if col not in skip):
        series = df[name]
        print(series.value_counts())
        print(set(series.cat.categories))
        print()


# Print the levels of the categorical answer columns, leaving out `variant` and `rater`.
get_levels(expanded_df, skip=["variant", "rater"])

For further data analysis, it's useful to think of some of the categorical responses as being ordinal. It will also smooth over inter-rater reliability (you could argue that it's cooking the books), since if two raters answer "yes" and "maybe", that's more agreement than two raters saying "yes" and "no".

There are also some columns that can be made binary, like "does it provide a correct fix?" or "is the explanation definitely correct?"

In [None]:
# Lookup tables from a categorical answer to a number.
# `*_ordinal` tables rank the answers; `*_binary` tables collapse them to 0/1.
REMAPPINGS = {
    # --- ordinal encodings ---
    "sentence_structure_ordinal": {"unclear": 0, "could-be-clearer": 1, "clear": 2},
    "explanation_correctness_ordinal": {"no": 0, "maybe": 1, "yes": 2},
    "fix_ordinal": {
        "no": 0,
        "implicit-suggestion": 1,
        "generic": 2,
        "hint": 3,
        "confident": 4,
    },
    "fix_correctness_ordinal": {"no": 0, "maybe": 1, "yes": 2},
    "additional_errors_ordinal": {"no": 0, "maybe": 1, "yes": 2},
    # --- binary encodings ---
    # Is the sentence structure clear?
    "sentence_structure_binary": {"unclear": 0, "could-be-clearer": 0, "clear": 1},
    # Can the explanation be reasonably considered to be correct?
    "explanation_correctness_binary": {"no": 0, "maybe": 1, "yes": 1},
    # Is **any** kind of fix suggested?
    "fix_binary_levels": {
        "no": 0,
        "implicit-suggestion": 1,
        "generic": 1,
        "hint": 1,
        "confident": 1,
    },
    # Is the fix DEFINITELY correct?
    "fix_correctness_binary": {"no": 0, "maybe": 0, "yes": 1},
    "additional_errors_binary": {"no": 0, "maybe": 1, "yes": 1},
}

def with_correct_dtype(df, column_name):
    """
    Map the column `column_name` of `df` through `REMAPPINGS[column_name]`.

    NOTE(review): the same name is used both as the DataFrame column and as
    the REMAPPINGS key, so this only works for a column that already carries a
    remapping's name. Nothing in the visible code calls it; confirm it is
    still needed.
    """
    values = df[column_name]
    return values.map(REMAPPINGS[column_name])


def add_ordinal_columns(df):
    """
    Return a copy of `df` with an `<answer>_ordinal` column per categorical answer.

    Each new column is the source column mapped through the REMAPPINGS entry of
    the same name; answers absent from the mapping become missing values.
    """
    source_of = {
        "sentence_structure_ordinal": "sentence_structure",
        "explanation_correctness_ordinal": "explanation_correctness",
        "fix_ordinal": "fix",
        "fix_correctness_ordinal": "fix_correctness",
        "additional_errors_ordinal": "additional_errors",
    }
    ordinal_columns = {
        target: df[source].map(REMAPPINGS[target])
        for target, source in source_of.items()
    }
    return df.assign(**ordinal_columns)


def add_binary_columns(df):
    """
    Return a copy of `df` with an `<answer>_binary` column per categorical answer.

    Each new column is the source column mapped through a 0/1 table from
    REMAPPINGS; answers absent from the mapping become missing values.
    """
    # (new column, source column, REMAPPINGS key).
    # Note that `fix_binary` is stored under the key "fix_binary_levels".
    plan = [
        ("sentence_structure_binary", "sentence_structure", "sentence_structure_binary"),
        ("explanation_correctness_binary", "explanation_correctness", "explanation_correctness_binary"),
        ("fix_binary", "fix", "fix_binary_levels"),
        ("fix_correctness_binary", "fix_correctness", "fix_correctness_binary"),
        ("additional_errors_binary", "additional_errors", "additional_errors_binary"),
    ]
    binary_columns = {
        target: df[source].map(REMAPPINGS[mapping_key])
        for target, source, mapping_key in plan
    }
    return df.assign(**binary_columns)


# Append the ordinal and binary encodings to the cleaned answers.
full_df = (
    expanded_df
    .pipe(add_ordinal_columns)
    .pipe(add_binary_columns)
)

full_df.sample(5)

Every rating **must** state whether an explanation was provided or not. Assert this here:

In [None]:
# No rating may leave `explanation` missing.
assert full_df["explanation"].isna().sum() == 0

The length should be equal regardless of the rater, so let's assert that here and get a table we can join with later.

In [None]:
def assert_lengths(df):
    """
    Ensure that each rater has seen a message with the same length.

    Expects `df` to carry ("length", "min") and ("length", "max") columns and
    asserts they agree on every row. Returns `df` unchanged so that it can be
    used inside a `.pipe()` chain.
    """
    lengths = df["length"]
    disagreement = lengths["min"] != lengths["max"]
    assert not disagreement.any()
    return df


def as_sql_style_table(df):
    """
    Flatten the aggregated frame into a plain table with single-level columns.

    Keeps the scenario keys (`srcml_path`, `version`, `variant`) and uses the
    ("length", "min") column as the single `length` column.
    """
    flat = {key: df[key] for key in ("srcml_path", "version", "variant")}
    flat["length"] = df["length"]["min"]
    return pd.DataFrame(flat)


# A common pitfall of using a categorical variable with groupby is that the
# result contains a row for every combination of category levels (a Cartesian
# product with the other keys), including combinations that never occur in
# the data. That is not what we want: those phantom rows have missing
# min/max lengths and would make `assert_lengths` fail.
# Passing `observed=True` keeps only the combinations that actually appear.
#
# See: https://stackoverflow.com/a/67645084
message_lengths = full_df\
    .groupby(BY_SCENARIO, as_index=False, observed=True)\
    .agg({"length": ["min", "max"]})\
    .pipe(assert_lengths)\
    .pipe(as_sql_style_table)

message_lengths.sample(5)

In [None]:
# Show the column dtypes and non-null counts, now including the ordinal and binary columns.
full_df.info()

Assert that all scenarios were answered by at least two raters:

In [None]:
# A scenario is a unit *plus* the variant shown (BY_SCENARIO). Grouping by the
# unit alone would pool the raters of all variants together and could hide a
# scenario that only one rater answered, so group at the scenario level.
# `observed=True` skips key combinations that never occur in the data.
two_ratings = full_df.groupby(BY_SCENARIO, observed=True).filter(
    lambda scenario: scenario["rater"].nunique() >= 2
)
assert len(two_ratings) == len(full_df), "Had fewer than two ratings for some scenarios"

Combine the ratings of all of the raters into one big table:

In [None]:
# Columns that describe the scenario or rater rather than the rater's answers.
FIXED_COLUMNS = {"srcml_path", "version", "variant", "rater", "length"}

# Everything else is a response, kept in the DataFrame's column order.
RESPONSE_COLUMNS = [name for name in full_df.columns if name not in FIXED_COLUMNS]
RESPONSE_COLUMNS

In [None]:
def add_assigned(df):
    """
    Append one boolean `("assigned", <rater>)` column per rater.

    A rater counts as assigned to a scenario when their `explanation` answer
    is present, since every real rating gives True or False there.
    This only makes sense AFTER pivoting the DataFrame, when the columns are
    a two-level index of (response, rater).
    """
    was_answered = df["explanation"].notna()
    # Rebuild the two-level column index so the new block lines up with `df`.
    assigned_columns = pd.MultiIndex.from_product([["assigned"], was_answered.columns])
    assigned_block = pd.DataFrame(
        was_answered.values,
        index=was_answered.index,
        columns=assigned_columns,
    )
    # DataFrame.assign does not work for hierarchical columns, hence the concat.
    return pd.concat([df, assigned_block], axis=1)

# Put all the raters side-by-side: one row per scenario, and a hierarchical
# (response, rater) column index. `length` is dropped first because it is a
# property of the scenario, not a response.
ratings = (
    full_df
    .drop(columns=["length"])
    .pivot(index=BY_SCENARIO, columns="rater", values=RESPONSE_COLUMNS)
    .pipe(add_assigned)
)

ratings.head(8)

It's nice to see this data in an Excel spreadsheet, so OPTIONALLY export it:

In [None]:
# Export only when a global named EXPORT_EXCEL exists and is truthy;
# when it is not defined at all, the lookup falls back to False.
if globals().get("EXPORT_EXCEL", False):
    ratings.to_excel("hierarchical_responses.xlsx")

**TODO**: perhaps Scott's π is a better measure of agreement for this data:

See: https://stats.stackexchange.com/a/525640

In [None]:
def group_two_raters(rater1, rater2):
    """
    Return the rows of the global `ratings` table that both raters were assigned to.
    """
    assigned = ratings["assigned"]
    both_assigned = assigned[rater1] & assigned[rater2]
    return ratings[both_assigned]

def kappa_for_two_raters(column, rater1, rater2):
    """
    Return Cohen's kappa between two raters for one response column.

    Only scenarios that both raters were assigned to are compared.
    """
    shared = group_two_raters(rater1, rater2)
    answers = shared[column].astype("category")
    first, second = answers[rater1], answers[rater2]
    return cohen_kappa_score(first, second)

def kappa_for_two_raters_with_label(column, rater1, rater2):
    """
    Return `(kappa, label)` for two raters on one response column.

    The label is whatever `agreement_as_label` returns for the kappa value.
    """
    score = kappa_for_two_raters(column, rater1, rater2)
    label = agreement_as_label(score)
    return score, label

In [None]:
# Eddie vs. Prajish: agreement on the `explanation` column.
kappa_for_two_raters_with_label("explanation", "eddie", "prajish")

In [None]:
# Eddie vs. Brett: agreement on the `explanation` column.
kappa_for_two_raters_with_label("explanation", "eddie", "brett")

In [None]:
# Prajish vs. Brett: agreement on the `explanation` column.
kappa_for_two_raters_with_label("explanation", "prajish", "brett")

In [None]:
# Eddie vs. Prajish: agreement on the `fix` column.
kappa_for_two_raters_with_label("fix", "eddie", "prajish")

In [None]:
# Eddie vs. Brett: agreement on the `fix` column.
kappa_for_two_raters_with_label("fix", "eddie", "brett")

In [None]:
# Prajish vs. Brett: agreement on the `fix` column.
kappa_for_two_raters_with_label("fix", "prajish", "brett")

Synthesise two raters by null-coalescing our three raters:

In [None]:
def null_coalesce_rater1(row):
    """
    Synthetic rater 1: Eddie's answer, or Brett's when Eddie's is missing.
    """
    preferred = row["eddie"]
    if pd.isna(preferred):
        return row["brett"]
    return preferred

def null_coalesce_rater2(row):
    """
    Synthetic rater 2: Prajish's answer, or Brett's when Prajish's is missing.
    """
    preferred = row["prajish"]
    if pd.isna(preferred):
        return row["brett"]
    return preferred

In [None]:
def coallesce_agreement(column, type_="category", **kwargs):
    """
    Return `(kappa, label)` for the two synthetic raters on one response column.

    The three real raters are first collapsed into two via
    `null_coalesce_rater1` and `null_coalesce_rater2`. Extra keyword arguments
    (e.g. `weights="linear"`) are forwarded to `cohen_kappa_score`.
    The `type_` parameter is accepted but not used by this function.
    """
    per_rater = ratings[column]

    # Pivoting the table introduced missing values and reverted the dtypes to
    # object, so each synthetic rater is cast back to a proper dtype.
    first, second = (
        to_correct_dtype(per_rater.apply(pick, axis=1), column)
        for pick in (null_coalesce_rater1, null_coalesce_rater2)
    )

    for synthetic_rater in (first, second):
        assert synthetic_rater.isnull().sum() == 0

    score = cohen_kappa_score(first, second, **kwargs)
    return score, agreement_as_label(score)


def to_correct_dtype(column, column_name):
    """
    Cast `column` to the dtype implied by its name.

    Names ending in `_binary` become bool; `jargon` and names ending in
    `_ordinal` become int; anything else becomes categorical.
    Asserts that the column has no missing values.
    """
    suffix = column_name.split("_")[-1]
    assert column.isnull().sum() == 0, f"Column {column_name} has missing values"

    if suffix == "binary":
        target = bool
    elif suffix == "ordinal" or column_name == "jargon":
        target = int
    else:
        target = "category"
    return column.astype(target)

In [None]:
# NOTE(review): presumably the response columns that every rating fills in;
# nothing in the visible code reads this list — confirm it is still needed.
NON_NULL_COLUMNS = [
    "jargon",
    "sentence_structure",
    "explanation",
    "fix",
    "additional_errors",
    "notes"
]

In [None]:
# Agreement on the integer `jargon` column, with quadratic weights passed on to the kappa computation.
# NOTE(review): the other weighted calls in this notebook use linear weights — confirm quadratic is intended here.
coallesce_agreement("jargon", weights="quadratic")

In [None]:
# Unweighted agreement of the two synthetic raters on `explanation`.
coallesce_agreement("explanation")

In [None]:
# Unweighted agreement of the two synthetic raters on the categorical `fix` column.
coallesce_agreement("fix")

In [None]:
# Unweighted agreement of the two synthetic raters on `additional_errors`.
coallesce_agreement("additional_errors")

In [None]:
# Agreement on the 0/1 `fix_binary` column (is any kind of fix suggested?).
coallesce_agreement("fix_binary")

For ordinal data, apply weighting to smooth the agreement.

See here for weighting: https://github.com/jmgirard/mReliability/wiki/Weighting-scheme

I think linear weighting is fair. Quadratic is waaaaay too forgiving.

In [None]:
# Linearly weighted agreement on the ordinal encoding of `sentence_structure`.
coallesce_agreement("sentence_structure_ordinal", weights="linear")

In [None]:
# Linearly weighted agreement on the ordinal encoding of `fix`.
coallesce_agreement("fix_ordinal", weights="linear")

In [None]:
# Linearly weighted agreement on the ordinal encoding of `additional_errors`.
coallesce_agreement("additional_errors_ordinal", weights="linear")

TODO: How to measure the agreement of columns with nulls?

In [None]:
def column_for_two_raters(df, column_name):
    """
    Collapse the per-rater answers of one response column into two synthetic raters.

    Returns a DataFrame indexed like `df`, with the two hierarchical columns
    `(column_name, "rater1")` and `(column_name, "rater2")`. The values are
    left as they come out of the coalescing step: no dtype correction is
    applied, and missing values are allowed.
    """
    per_rater = df[column_name]
    second = per_rater.apply(null_coalesce_rater2, axis=1)
    first = per_rater.apply(null_coalesce_rater1, axis=1)

    synthetic = {
        (column_name, "rater1"): first,
        (column_name, "rater2"): second,
    }
    two_level_columns = pd.MultiIndex.from_product(
        [[column_name], ["rater1", "rater2"]]
    )
    return pd.DataFrame(synthetic, columns=two_level_columns, index=df.index)

# Build the two-synthetic-rater view of every response column and place the
# per-column frames side by side.
two_raters = pd.concat([column_for_two_raters(ratings, column) for column in RESPONSE_COLUMNS], axis=1)
# Export only when a global named EXPORT_EXCEL exists and is truthy.
if globals().get("EXPORT_EXCEL", False):
    two_raters.to_excel("two_raters.xlsx")
two_raters.head(8)