In [None]:
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.colors import rgb_to_hsv, hsv_to_rgb, to_rgb
import pandas as pd
import seaborn as sns
import scipy
from teeplot import teeplot as tp

import random


In [None]:
# adapted from https://stackoverflow.com/a/78998203/17332200
def darken(color, amount):
    """Scale the HSV value (brightness) channel of a color by `amount`.

    Parameters
    ----------
    color
        Any color specification accepted by `matplotlib.colors.to_rgb`.
    amount : float
        Multiplier applied to the HSV value channel. Factors below 1 darken
        the color and 0 yields black.

    Returns
    -------
    numpy.ndarray
        RGB components of the adjusted color.
    """
    hue, saturation, value = rgb_to_hsv(to_rgb(color))
    # Clamp the scaled value to [0, 1]: a factor above 1 (or below 0) would
    # otherwise produce RGB components outside the valid color range.
    scaled_value = min(max(value * amount, 0.0), 1.0)
    return hsv_to_rgb((hue, saturation, scaled_value))


In [None]:
def darken_palette(palette: str, amount: float) -> list:
    """Return every color of a seaborn palette after passing it to `darken`.

    Parameters
    ----------
    palette : str
        Palette specification forwarded to `sns.color_palette`.
    amount : float
        Factor forwarded unchanged to `darken` for each color.

    Returns
    -------
    list
        One darkened color per entry of the palette, in palette order.
    """
    darkened_colors = []
    for base_color in sns.color_palette(palette):
        darkened_colors.append(darken(base_color, amount))
    return darkened_colors


In [None]:
# Seed both NumPy's global RNG and the stdlib `random` module with the same
# value so that stochastic steps repeat across runs
SEED = 42
np.random.seed(SEED)
random.seed(SEED)


## Get Data


In [None]:
# Download the parquet file hosted on OSF and load it into `df`, the frame
# the following cells add derived columns to.
df = pd.read_parquet("https://osf.io/gk2ty/download")


In [None]:
# Show the column names of the loaded frame for reference.
df.columns


## Prep Data


In [None]:
# Lookup from task name to its "Components" score, listed by ascending score.
# NOTE(review): presumably a task-complexity measure — confirm with the
# dataset documentation.
# Task names missing from the table map to NaN.
components_by_task = {
    "NAND": 1,
    "NOT": 1,
    "AND": 2,
    "ORNOT": 2,
    "ANDNOT": 3,
    "OR": 3,
    "NOR": 4,
    "XOR": 4,
    "EQUALS": 5,
}
df["Components"] = df["Task"].map(components_by_task)


In [None]:
# Sum the "Is Task Coding Site" flag over all rows that share the same site,
# lineage generation index, treatment and run; `transform` broadcasts the
# group total back onto every row of the group.
site_step_keys = ["Site", "Lineage Generation Index", "Treatment", "Run ID"]
coding_flag_by_site = df.groupby(site_step_keys, observed=True)[
    "Is Task Coding Site"
]
df["codes for num tasks"] = coding_flag_by_site.transform("sum")


In [None]:
# Same per-site total as "codes for num tasks", but computed from the
# "Prev Is Task Coding Site" flag instead.
prev_site_step_keys = ["Site", "Lineage Generation Index", "Treatment", "Run ID"]
prev_coding_flag_by_site = df.groupby(prev_site_step_keys, observed=True)[
    "Prev Is Task Coding Site"
]
df["prev codes for num tasks"] = prev_coding_flag_by_site.transform("sum")


In [None]:
# Collapse the per-site total into a flag: True wherever
# "prev codes for num tasks" is nonzero.
df["prev codes for tasks"] = df["prev codes for num tasks"].astype(bool)


In [None]:
# Flag rows whose (site, lineage generation index, treatment, run) group has
# a nonzero total of "Prev Is Task Coding Site Cumulative Count".
cumulative_site_keys = ["Site", "Lineage Generation Index", "Treatment", "Run ID"]
df["prev coded for tasks"] = (
    df.groupby(cumulative_site_keys, observed=True)[
        "Prev Is Task Coding Site Cumulative Count"
    ]
    .transform("sum")
    .astype(bool)
)


In [None]:
# Blank out (NaN) the birth generation on rows where "has task" is zero, then
# take the smallest remaining generation within each (treatment, run, task)
# group and broadcast it back onto every row of that group.
born_while_task_present = df["Generation Born"].where(df["has task"] != 0)
df["First Task Generation"] = born_while_task_present.groupby(
    [df["Treatment"], df["Run ID"], df["Task"]],
    observed=True,
).transform("min")


In [None]:
# True on every row of a (lineage generation index, treatment, run) group if
# any row of that group has a truthy "SLIP_INSERTION_BOOL_MASK".
lineage_step_keys = ["Lineage Generation Index", "Treatment", "Run ID"]
df["SLIP_INSERTION_BOOL_MASK any"] = (
    df.groupby(lineage_step_keys, observed=True)["SLIP_INSERTION_BOOL_MASK"]
    .transform("any")
)


In [None]:
# Row filters, named individually for readability
task_just_gained = df["delta has task"] == 1
born_at_first_task_generation = (
    df["Generation Born"] == df["First Task Generation"]
)
in_slip_duplicate_treatment = df["Treatment"] == "Slip-duplicate"
not_previously_coding = ~df["prev coded for tasks"]
# An additional restriction to df["Is Task Coding Site"] is intentionally
# left disabled.

dff = df[
    task_just_gained
    & born_at_first_task_generation
    & in_slip_duplicate_treatment
    & not_previously_coding
].reset_index(drop=True)
dff


In [None]:
# Show the column names of the filtered frame for reference.
dff.columns


In [None]:
# Mean of "Is Task Coding Site" within each (treatment, run, task) group,
# aligned row-for-row with `dff`.
task_group_keys = ["Treatment", "Run ID", "Task"]
dfy = dff.groupby(task_group_keys, observed=True)[
    "Is Task Coding Site"
].transform("mean")
print(dfy)

# Keep the group mean next to the raw flag; it is the denominator of the
# ratio computed from these columns further down.
dff["Mean Is Task Coding Site"] = dfy


In [None]:
# Split each (treatment, run, task) group by whether the cumulative slip
# insertion count is nonzero (cast to bool) and summarise each part.
slip_column = "Prev Slip Insertion Cumulative Count"
summary_spec = {
    "Mean Is Task Coding Site": "first",
    "Is Task Coding Site": "mean",
    "Components": "first",
    "SLIP_INSERTION_BOOL_MASK any": "first",
}
dfp = (
    dff.astype({slip_column: bool})
    .groupby(["Treatment", "Run ID", "Task", slip_column], observed=True)
    .agg(summary_spec)
)

# Express each part's coding-site fraction relative to the mean of its whole
# (treatment, run, task) group, so 1.0 means "same as the group overall".
dfp["Is Task Coding Site"] = (
    dfp["Is Task Coding Site"] / dfp["Mean Is Task Coding Site"]
)
dfp = dfp.reset_index()
dfp


## base


In [None]:
# Work on a copy so `dfp` keeps its boolean column, and replace the boolean
# slip-insertion flag with human-readable labels.
slip_labels = {True: "Slip Inserted", False: "Not Slip Inserted"}
dfp_ = dfp.copy()
dfp_["Prev Slip Insertion Cumulative Count"] = dfp_[
    "Prev Slip Insertion Cumulative Count"
].map(slip_labels)
dfp_


In [None]:
# Keep only the slip-inserted rows, then take the median of the remaining
# numeric columns per combination of the listed keys.
is_slip_inserted = (
    dfp_["Prev Slip Insertion Cumulative Count"] == "Slip Inserted"
)
median_group_keys = [
    "Prev Slip Insertion Cumulative Count",
    "Run ID",
    "Is Task Coding Site",
    "Components",
    "Task",
]
data = (
    dfp_[is_slip_inserted]
    .groupby(median_group_keys, observed=True)
    .median(numeric_only=True)
    .reset_index()
)
print(len(data))
data


In [None]:
# One-sample, one-sided Wilcoxon signed-rank test within each "Components"
# group, against the null hypothesis median = 1.0.
# The tested column is selected before `.apply` so the callback receives only
# those values; applying over the whole frame would also hand it the grouping
# column, which recent pandas versions deprecate.
wilcoxon_results = data.groupby("Components")["Is Task Coding Site"].apply(
    lambda enrichment: scipy.stats.wilcoxon(
        enrichment - 1.0,  # shift by 1.0 so the null median sits at zero
        alternative="greater",
    )
)

# Unpack each (statistic, p-value) result into one row per group
wilcoxon_df = pd.DataFrame(
    wilcoxon_results.tolist(),
    index=wilcoxon_results.index,
    columns=["W-statistic", "p-value"],
)

display(wilcoxon_df)

# Mean and standard deviation of the tested values in each group
group_stats = data.groupby(["Components", "Prev Slip Insertion Cumulative Count"])["Is Task Coding Site"].agg(["mean", "std"]).reset_index()
display(group_stats)

# Number of observations in each group
group_sizes = data.groupby(["Components", "Prev Slip Insertion Cumulative Count"]).size().reset_index(name='size')
display(group_sizes)


In [None]:
# Violin layer drawn through teeplot; the `teeplot_*` keywords are consumed
# by teeplot itself (NOTE(review): presumably they control the saved output
# name — confirm against the teeplot documentation).
with tp.teed(
    sns.violinplot,
    data=data,
    y="Is Task Coding Site",
    x="Components",
    hue="Components",
    density_norm="width",
    cut=0,
    gap=0.2,
    inner=None,
    legend=False,
    palette="Pastel1",
    teeplot_outexclude="palette",
    teeplot_outattrs={"prevcoding": False},
) as ax:
    # Arguments shared by the two layers overlaid on the violins
    overlay_kwargs = dict(
        data=data,
        y="Is Task Coding Site",
        x="Components",
        hue="Components",
        ax=ax,
        legend=False,
    )
    # Notched boxes in a slightly darkened version of the violin palette
    sns.boxplot(
        gap=0.2,
        notch=True,
        palette=darken_palette("Pastel1", 0.8),
        color="black",
        width=0.5,
        linewidth=1,
        fliersize=0,
        **overlay_kwargs,
    )
    # Individual observations in a strongly darkened palette
    sns.stripplot(
        jitter=0.2,
        palette=darken_palette("Pastel1", 0.2),
        size=4,
        alpha=0.3,
        **overlay_kwargs,
    )

    # Reference line at 1
    ax.axhline(1, color="black", linestyle="--", lw=1)
    ax.set_ylabel("Coding Site Enrichment\nin Slip-inserted Regions")
    ax.set_xlabel("Novel Task Complexity")
    ax.figure.set_size_inches(3.5, 2.5)

    for hidden_side in ("right", "top"):
        ax.spines[hidden_side].set_visible(False)

    # Significance stars from the p-values in wilcoxon_df, one per tick.
    # Cutoffs are checked from strictest to loosest; anything else is "ns".
    star_cutoffs = ((0.001, "***"), (0.01, "**"), (0.05, "*"))
    axis_top = ax.get_ylim()[1]
    tick_positions = ax.get_xticks()

    for tick_idx, component in enumerate(sorted(wilcoxon_df.index)):
        p_value = wilcoxon_df.loc[component, "p-value"]
        star_label = next(
            (stars for cutoff, stars in star_cutoffs if p_value < cutoff),
            "ns",
        )
        ax.text(
            tick_positions[tick_idx],
            axis_top * 0.95,
            star_label,
            ha="center",
            va="bottom",
            fontsize=10,
            color="black",
        )


In [None]:
# One-sample, one-sided Wilcoxon signed-rank test on all rows pooled together
# (not split by "Components"), against the null hypothesis median = 1.0.
# The constant "one" column puts every row in a single group so the result
# keeps the same grouped layout as a per-group test.
data["one"] = 1
# The tested column is selected before `.apply` so the callback receives only
# those values; applying over the whole frame would also hand it the grouping
# column, which recent pandas versions deprecate.
wilcoxon_results = data.groupby("one")["Is Task Coding Site"].apply(
    lambda enrichment: scipy.stats.wilcoxon(
        enrichment - 1.0,  # shift by 1.0 so the null median sits at zero
        alternative="greater",
    )
)

# Unpack the (statistic, p-value) result into a one-row table
wilcoxon_df = pd.DataFrame(
    wilcoxon_results.tolist(),
    index=wilcoxon_results.index,
    columns=["W-statistic", "p-value"],
)

display(wilcoxon_df)

# Mean and standard deviation of the pooled values
group_stats = data.groupby(["one", "Prev Slip Insertion Cumulative Count"])["Is Task Coding Site"].agg(["mean", "std"]).reset_index()
display(group_stats)

# Number of pooled observations
group_sizes = data.groupby(["one", "Prev Slip Insertion Cumulative Count"]).size().reset_index(name='size')
display(group_sizes)


In [None]:
# Single violin over all rows of `data` (no x / hue split), drawn through
# teeplot; the `teeplot_*` keywords are consumed by teeplot itself
# (NOTE(review): presumably they control the saved output name — confirm
# against the teeplot documentation).
with tp.teed(
    sns.violinplot,
    data=data,
    y="Is Task Coding Site",
    density_norm="width",
    cut=0,
    gap=0.2,
    inner=None,
    legend=False,
    palette="Pastel1",
    teeplot_outexclude="palette",
    teeplot_outattrs={"prevcoding": False},
) as ax:
    # Arguments shared by the two layers overlaid on the violin
    layer_common = {
        "data": data,
        "y": "Is Task Coding Site",
        "ax": ax,
        "legend": False,
    }
    # Notched box in a slightly darkened version of the violin palette
    sns.boxplot(
        **layer_common,
        gap=0.2,
        notch=True,
        palette=darken_palette("Pastel1", 0.8),
        color="black",
        width=0.5,
        linewidth=1,
        fliersize=0,
    )
    # Individual observations in a strongly darkened palette
    sns.stripplot(
        **layer_common,
        jitter=0.2,
        palette=darken_palette("Pastel1", 0.2),
        size=4,
        alpha=0.3,
    )

    # Reference line at 1
    ax.axhline(1, color="black", linestyle="--", lw=1)
    ax.set_ylabel("Coding Site Enrichment\nin Slip-inserted Regions")
    ax.set_xlabel("Novel Task Complexity")
    ax.figure.set_size_inches(3.5, 2.5)

    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    # Significance stars: one label per row of wilcoxon_df (in sorted index
    # order), placed at the x tick with the same position.
    upper_ylim = ax.get_ylim()[1]
    xtick_locs = ax.get_xticks()

    for row_pos, row_key in enumerate(sorted(wilcoxon_df.index)):
        pval = wilcoxon_df.loc[row_key, "p-value"]
        marker = (
            "***" if pval < 0.001
            else "**" if pval < 0.01
            else "*" if pval < 0.05
            else "ns"
        )
        ax.text(
            xtick_locs[row_pos],
            upper_ylim * 0.95,
            marker,
            ha="center",
            va="bottom",
            fontsize=10,
            color="black",
        )
