In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import statsmodels.formula.api as smf

import tasks.postprocessing
import tasks.constants
import tasks.graphs
import tasks.stats

In [2]:
# Symmetric colour-scale limits, passed as vmin/vmax to the post-hoc heatmaps.
VMIN, VMAX = -0.2, 0.2

# Register tqdm's pandas integration.
tqdm.pandas()

## Load datasets

In [3]:
# Columns kept from the main dataset. Selecting by name (instead of by
# position) keeps this cell correct if the upstream column order changes and
# matches how the ablation dataset is sliced.
MAIN_COLUMNS = ["conv_id", "message_id", "message", "model"]

main_df = tasks.postprocessing.get_main_dataset()
main_df = main_df.loc[:, MAIN_COLUMNS]
# one row per message
main_df = main_df.drop_duplicates("message_id")
# label the main dataset's turn-taking / prompt configuration
main_df["turns"] = "random_weighted"
main_df["prompts"] = "Original"
# remove seed comments from analysis
main_df = main_df[main_df.model != "hardcoded"]
main_df

Unnamed: 0,conv_id,message_id,message,model,turns,prompts
11,3bd075d6-b91b-4f60-9138-c33feff56d2c,-1870930993328192561,"@SkepticalInvestor77, you're missing the point...",,random_weighted,Original
22,3bd075d6-b91b-4f60-9138-c33feff56d2c,-1583336252148959998,"@CitySlicker05, thank you for clarifying your ...",,random_weighted,Original
33,3bd075d6-b91b-4f60-9138-c33feff56d2c,838628372202429525,"@SkepticalInvestor77, I understand your perspe...",,random_weighted,Original
44,3bd075d6-b91b-4f60-9138-c33feff56d2c,1852804548951377081,"@GentleTherapist56, thank you for your thought...",,random_weighted,Original
55,3bd075d6-b91b-4f60-9138-c33feff56d2c,-1794484532174709680,"@GentleTherapist56, you're reaching for the mo...",,random_weighted,Original
...,...,...,...,...,...,...
57266,8fd69250-71ad-4b24-9dca-0658d6eb95fc,1972079638350772560,"@WanderlustNomad22, thank you for bringing up ...",Qwen 2.5,random_weighted,Original
57277,8fd69250-71ad-4b24-9dca-0658d6eb95fc,1183976761754781031,I completely agree that addressing systemic is...,Qwen 2.5,random_weighted,Original
57288,8fd69250-71ad-4b24-9dca-0658d6eb95fc,1769825593077317243,"@NatureLover88, thank you for sharing your per...",Qwen 2.5,random_weighted,Original
57299,8fd69250-71ad-4b24-9dca-0658d6eb95fc,1394900428090388415,"@NatureLover88, I totally agree with you about...",Qwen 2.5,random_weighted,Original


In [4]:
ABLATION_COLUMNS = ["conv_id", "message_id", "message", "turns", "prompts"]

abl_df = tasks.postprocessing.get_ablation_df().loc[:, ABLATION_COLUMNS]

# Entries whose string form is "nan" receive the default label.
turns_missing = abl_df.turns.map(str) == "nan"
prompts_missing = abl_df.prompts.map(str) == "nan"

abl_df = abl_df.assign(
    turns=np.where(turns_missing, "random_weighted", abl_df.turns),
    prompts=np.where(prompts_missing, "original", abl_df.prompts),
    model="Qwen 2.5",
)
abl_df

ValueError: too many values to unpack (expected 2)

Data from CeRI http://archive.regulationroom.org/

Any opinions, findings, and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the CeRI (Cornell e-Rulemaking Initiative).

In [None]:
# Human comments carry the label "Human" in all three grouping columns.
human_df = tasks.postprocessing.get_human_df().assign(
    model="Human",
    turns="Human",
    prompts="Human",
)

human_df

In [None]:
# Stack the three sources into one frame with a fresh 0..n-1 index.
source_frames = [main_df, abl_df, human_df]
df = pd.concat(source_frames, ignore_index=True)
df

In [7]:
def prioritize_values(arr, priority=("Human", "Original")):
    """
    Reorders elements of `arr` to put `priority` items first,
    in order, preserving the rest.

    :param arr: iterable of values to reorder; it is not modified.
    :param priority: iterable of values to move to the front, in the order
        given. Values that do not occur in `arr` are skipped.
    :return: a new list with the prioritized values first, followed by the
        remaining values in their original order.
    """
    # Materialise both inputs once: `priority` is scanned several times below,
    # which would silently exhaust a one-shot iterator/generator.
    values = list(arr)
    priority = list(priority)
    leading = [item for item in priority if item in values]
    rest = [item for item in values if item not in priority]
    return leading + rest

In [None]:
# Missing messages become empty strings. Fill *before* casting to str so that
# a missing value never turns into literal text and a genuine "nan" message is
# not erased.
df.message = df.message.fillna("").astype(str)

# rename values for graphs
# (Series.map leaves any value that is absent from the mapping as NaN)
TURNS_LABELS = {
    "roundrobin": "Round Robin",
    "random": "Random",
    "random_weighted": "Original",
    "Human": "Human",
    "Original": "Original",
}
PROMPTS_LABELS = {
    "nosdb": "No SDBs",
    "basicinstructions": "Basic Instructions",
    "original": "Original",
    "Original": "Original",
    "noroles": "No roles",
    "Human": "Human",
}
df.turns = df.turns.map(TURNS_LABELS)
df.prompts = df.prompts.map(PROMPTS_LABELS)

# keep consistent hues across graphs
model_hue_order = prioritize_values(df.model.unique())
prompts_hue_order = prioritize_values(df.prompts.unique())
turns_hue_order = prioritize_values(df.turns.unique())

# NOTE: this rename removes the `turns` / `prompts` / `model` column names read
# above, so re-running this cell requires rebuilding `df` first.
df = df.rename(
    columns={
        "turns": "Turn taking function",
        "prompts": "Instruction prompt",
        "model": "Model",
    }
)
df

### Comment length

In [None]:
# Build a new frame instead of aliasing `df`, so adding the length column
# (and the later clipping) does not modify `df` as a side effect.
# comment_length = number of whitespace-separated tokens in the message.
len_df = df.assign(comment_length=df.message.str.split().str.len())

len_df.sort_values("comment_length", ascending=False).head(10)

In [None]:
# Summary statistics of the per-comment word counts.
len_df["comment_length"].describe()

In [11]:
# Upper cap applied to comment lengths before plotting.
MAX_COMMENT_LENGTH = 400

# Produce a new frame rather than assigning into the existing one: this keeps
# the cell safe to re-run (no write into a filtered slice) and leaves any other
# frame sharing the same data untouched.
len_df = len_df.assign(
    comment_length=len_df.comment_length.clip(upper=MAX_COMMENT_LENGTH)
)
# drop empty comments
len_df = len_df[len_df.comment_length > 0]

In [None]:
# Comment length grouped by turn-taking function; the output path is handed to
# save_plot afterwards.
turns_len_plot_path = tasks.constants.GRAPH_OUTPUT_DIR / "comment_len_turns.png"
tasks.graphs.comment_len_plot(
    len_df,
    length_col="comment_length",
    feature_col="Turn taking function",
    hue_order=turns_hue_order,
)
tasks.graphs.save_plot(turns_len_plot_path)

In [None]:
# Comment length grouped by model; the output path is handed to save_plot
# afterwards.
model_len_plot_path = tasks.constants.GRAPH_OUTPUT_DIR / "comment_len_model.png"
tasks.graphs.comment_len_plot(
    len_df,
    length_col="comment_length",
    feature_col="Model",
    hue_order=model_hue_order,
)
tasks.graphs.save_plot(model_len_plot_path)

In [None]:
# Comment length grouped by instruction prompt; the output path is handed to
# save_plot afterwards.
prompts_len_plot_path = (
    tasks.constants.GRAPH_OUTPUT_DIR / "comment_len_prompts.png"
)
tasks.graphs.comment_len_plot(
    len_df,
    length_col="comment_length",
    feature_col="Instruction prompt",
    hue_order=prompts_hue_order,
)
tasks.graphs.save_plot(prompts_len_plot_path)

### Diversity

In [None]:
def _drop_mentions(text):
    """Return `text` without the whitespace-separated tokens starting with "@"."""
    kept_tokens = [token for token in text.split() if not token.startswith("@")]
    return " ".join(kept_tokens)


similarity_df = df.copy()
# delete @ usernames
similarity_df.message = similarity_df.message.map(_drop_mentions)

# One row per conversation/configuration, holding the list of its messages.
conversation_keys = [
    "conv_id",
    "Model",
    "Instruction prompt",
    "Turn taking function",
]
similarity_df = (
    similarity_df.groupby(conversation_keys)["message"].agg(list).reset_index()
)

comments_ls = similarity_df["message"].tolist()
similarity_df["rougel_similarity"] = tasks.stats.rougel_similarity(comments_ls)
# keep only conversations for which a similarity value was produced
similarity_df = similarity_df[similarity_df.rougel_similarity.notna()]
similarity_df.rougel_similarity.describe()

In [None]:
# ROUGE-L similarity grouped by model; the output path is handed to save_plot
# afterwards.
rougel_model_path = tasks.constants.GRAPH_OUTPUT_DIR / "rougel_model.png"
tasks.graphs.rougel_plot(
    df=similarity_df,
    rougel_col="rougel_similarity",
    feature_col="Model",
    hue_order=model_hue_order,
)
tasks.graphs.save_plot(rougel_model_path)

In [None]:
# ROUGE-L similarity grouped by instruction prompt; the output path is handed
# to save_plot afterwards.
rougel_prompts_path = tasks.constants.GRAPH_OUTPUT_DIR / "rougel_prompts.png"
tasks.graphs.rougel_plot(
    df=similarity_df,
    rougel_col="rougel_similarity",
    feature_col="Instruction prompt",
    hue_order=prompts_hue_order,
)
tasks.graphs.save_plot(rougel_prompts_path)

In [None]:
# ROUGE-L similarity grouped by turn-taking function; the output path is handed
# to save_plot afterwards.
rougel_turns_path = tasks.constants.GRAPH_OUTPUT_DIR / "rougel_turns.png"
tasks.graphs.rougel_plot(
    df=similarity_df,
    rougel_col="rougel_similarity",
    feature_col="Turn taking function",
    hue_order=turns_hue_order,
)
tasks.graphs.save_plot(rougel_turns_path)

## Statistical tests

In [None]:
# Compare rougel_similarity across the groups of the "Model" column.
tasks.stats.mean_comp_test(
    df=similarity_df,
    feature_col="Model",
    score_col="rougel_similarity",
)

In [None]:
# Post-hoc heatmap of rougel_similarity between "Model" groups, using the
# shared VMIN/VMAX limits.
tasks.graphs.posthoc_heatmap(
    similarity_df,
    "rougel_similarity",
    group_col="Model",
    vmin=VMIN,
    vmax=VMAX,
)

In [None]:
# Compare rougel_similarity across the groups of the "Instruction prompt" column.
prompt_test_kwargs = {
    "df": similarity_df,
    "feature_col": "Instruction prompt",
    "score_col": "rougel_similarity",
}
tasks.stats.mean_comp_test(**prompt_test_kwargs)

In [None]:
# Post-hoc heatmap of rougel_similarity between "Instruction prompt" groups,
# using the shared VMIN/VMAX limits.
prompt_heatmap_limits = {"vmin": VMIN, "vmax": VMAX}
tasks.graphs.posthoc_heatmap(
    similarity_df,
    "rougel_similarity",
    group_col="Instruction prompt",
    **prompt_heatmap_limits,
)

In [None]:
# Compare rougel_similarity across the groups of the "Turn taking function" column.
turns_test_kwargs = {
    "df": similarity_df,
    "feature_col": "Turn taking function",
    "score_col": "rougel_similarity",
}
tasks.stats.mean_comp_test(**turns_test_kwargs)

In [None]:
# Post-hoc heatmap of rougel_similarity between "Turn taking function" groups,
# using the shared VMIN/VMAX limits.
turns_heatmap_limits = {"vmin": VMIN, "vmax": VMAX}
tasks.graphs.posthoc_heatmap(
    similarity_df,
    "rougel_similarity",
    group_col="Turn taking function",
    **turns_heatmap_limits,
)

## Diversity x Comment length correlation

In [None]:
# Total comment length per (conversation, model) pair.
length_by_conversation = len_df.groupby(["conv_id", "Model"])["comment_length"]
conv_len_df = length_by_conversation.sum().reset_index()
conv_len_df

In [None]:
# Join per-conversation total length with per-conversation similarity.
# Both frames carry "conv_id" and "Model"; merging on both keys avoids the
# suffixed Model_x / Model_y columns and prevents a conversation's length from
# being paired with a similarity row that belongs to a different model.
corr_df = (
    conv_len_df.merge(similarity_df, on=["conv_id", "Model"])
    # boolean flag: True for human conversations
    .assign(human=lambda merged: merged["Model"] == "Human")
    .loc[:, ["rougel_similarity", "comment_length", "human"]]
)
corr_df

In [None]:
# The only explicit right-hand-side term is the interaction between
# comment_length and the categorical `human` flag.
LENGTH_BY_HUMAN_FORMULA = "rougel_similarity ~ comment_length : C(human)"

model = smf.ols(LENGTH_BY_HUMAN_FORMULA, data=corr_df)
# Fit the model and show the regression table
result = model.fit()
result.summary()