In [1]:
%load_ext autoreload
%autoreload 2
import re

import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats
import matplotlib.pyplot as plt
from tqdm.auto import tqdm


import tasks.postprocessing
import tasks.constants
import tasks.graphs
import tasks.stats


tqdm.pandas()

## Load datasets

In [None]:
# Load the main dataset and keep four columns, selected by position.
# NOTE(review): positions [0, 1, 9, 4] presumably include the conv_id,
# message_id, message and model columns that later cells refer to by name —
# confirm against get_main_dataset() and consider selecting by name instead.
main_df = (
    tasks.postprocessing.get_main_dataset()
    .iloc[:, [0, 1, 9, 4]]
    .drop_duplicates("message_id")
    # .assign returns a new frame, so the constant labels are not written into
    # a frame derived from a slice (avoids pandas' chained-assignment warning).
    .assign(turns="random_weighted", prompts="original")
)
# remove seed comments from analysis
main_df = main_df[main_df["model"] != "hardcoded"]
main_df

In [None]:
# Load the ablation frame and keep five columns, selected by name.
abl_df = tasks.postprocessing.get_ablation_df().loc[
    :, ["conv_id", "message_id", "message", "turns", "prompts"]
]

# Values that stringify to "nan" are replaced with the same labels that
# main_df is given ("random_weighted" / "original").
turns_missing = abl_df["turns"].map(lambda value: str(value) == "nan")
abl_df["turns"] = np.where(turns_missing, "random_weighted", abl_df["turns"])

prompts_missing = abl_df["prompts"].map(lambda value: str(value) == "nan")
abl_df["prompts"] = np.where(prompts_missing, "original", abl_df["prompts"])

# Every ablation row is labelled with a single model name.
abl_df["model"] = "Qwen 2.5"
abl_df

Data from CeRI (Cornell e-Rulemaking Initiative): <http://archive.regulationroom.org/>

Any opinions, findings, and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the CeRI (Cornell e-Rulemaking Initiative).

In [None]:
# Load the human-written comments and tag all three grouping columns with
# "human" so they line up with the columns of the other frames.
human_df = tasks.postprocessing.get_human_df().assign(
    model="human",
    turns="human",
    prompts="human",
)

human_df

In [None]:
# Stack the three frames row-wise into one frame with a fresh 0..n-1 index.
df = pd.concat(
    objs=(main_df, abl_df, human_df),
    axis="index",
    ignore_index=True,
)
df

In [None]:
# Normalise `message` to plain strings, with missing messages as "".
# Missing values are filled *before* the string cast: casting first and then
# matching the text "nan" does not catch None / pd.NA (they would become the
# text "None" / "<NA>" and later be counted as one-word comments).
df["message"] = df["message"].fillna("").astype(str)
# Kept for backward compatibility: a message whose text is exactly "nan" is
# still treated as empty, as before.
df["message"] = np.where(df["message"] == "nan", "", df["message"])
df

### Comment length

In [None]:
# Word count per message (whitespace-separated tokens).
# Note: `len_df` is a second name for `df`, not a copy, so the new
# `comment_length` column is added to `df` itself.
df["comment_length"] = df["message"].map(lambda text: len(text.split()))
len_df = df

# Ten longest comments.
len_df.sort_values(by="comment_length", ascending=False).head(10)

In [None]:
# Comment-length plot with "turns" as the feature column.
# NOTE(review): presumably reads the `comment_length` column added to `df`
# in the cell above — confirm in tasks.graphs.comment_len_plot.
tasks.graphs.comment_len_plot(df, feature_col="turns")

In [None]:
# Comment-length plot with "model" as the feature column.
# NOTE(review): presumably reads the `comment_length` column added to `df`
# earlier — confirm in tasks.graphs.comment_len_plot.
tasks.graphs.comment_len_plot(df, feature_col="model")

In [None]:
# Comment-length plot with "prompts" as the feature column.
# NOTE(review): presumably reads the `comment_length` column added to `df`
# earlier — confirm in tasks.graphs.comment_len_plot.
tasks.graphs.comment_len_plot(df, feature_col="prompts")

### Diversity

In [None]:
# Build one row per (conv_id, model, prompts, turns) group holding the list of
# that group's messages, then score each list with
# tasks.stats.rougel_similarity.
similarity_df = df.copy()
# delete @ usernames
similarity_df["message"] = similarity_df["message"].apply(
    lambda msg: " ".join(
        word for word in msg.split() if not word.startswith("@")
    )
)
# NOTE(review): groupby drops rows whose key columns are NaN by default —
# confirm every source frame provides conv_id/model/prompts/turns.
similarity_df = (
    similarity_df.groupby(["conv_id", "model", "prompts", "turns"])["message"]
    .apply(list)
    .reset_index()
)
messages_list = similarity_df["message"].tolist()
similarity_df["rougel_similarity"] = tasks.stats.rougel_similarity(
    messages_list
)
# Keep only groups that received a score. pandas Series has no `.isnan()`
# method (calling it raises AttributeError); `.notna()` is the supported check.
similarity_df = similarity_df[similarity_df["rougel_similarity"].notna()]
similarity_df["rougel_similarity"].describe()

In [None]:
# Plot the per-group similarity scores, with the "model" column as feature.
tasks.graphs.rougel_plot(
    similarity_df["rougel_similarity"],
    feature=similarity_df["model"],
)

In [None]:
# Plot the per-group similarity scores, with the "prompts" column as feature.
tasks.graphs.rougel_plot(
    similarity_df["rougel_similarity"],
    feature=similarity_df["prompts"],
)

In [None]:
# Plot the per-group similarity scores, with the "turns" column as feature.
tasks.graphs.rougel_plot(
    similarity_df["rougel_similarity"],
    feature=similarity_df["turns"],
)

In [None]:
# Run the project's mean-comparison test on rougel_similarity, using "model"
# as the feature column (test details live in tasks.stats.mean_comp_test,
# not shown here).
tasks.stats.mean_comp_test(
    df=similarity_df, feature_col="model", score_col="rougel_similarity"
)

In [None]:
# Heatmap from tasks.graphs.posthoc_dunn_heatmap for rougel_similarity with
# "model" as the group column — judging by the helper's name, post-hoc Dunn
# test results; confirm in tasks.graphs.
tasks.graphs.posthoc_dunn_heatmap(
    similarity_df, "rougel_similarity", group_col="model"
)

In [None]:
# Run the project's mean-comparison test on rougel_similarity, using "prompts"
# as the feature column (test details live in tasks.stats.mean_comp_test,
# not shown here).
tasks.stats.mean_comp_test(
    df=similarity_df, feature_col="prompts", score_col="rougel_similarity"
)

In [None]:
# Heatmap from tasks.graphs.posthoc_dunn_heatmap for rougel_similarity with
# "prompts" as the group column — judging by the helper's name, post-hoc Dunn
# test results; confirm in tasks.graphs.
tasks.graphs.posthoc_dunn_heatmap(
    similarity_df, "rougel_similarity", group_col="prompts"
)

In [None]:
# Run the project's mean-comparison test on rougel_similarity, using "turns"
# as the feature column (test details live in tasks.stats.mean_comp_test,
# not shown here).
tasks.stats.mean_comp_test(
    df=similarity_df, feature_col="turns", score_col="rougel_similarity"
)

In [None]:
# Heatmap from tasks.graphs.posthoc_dunn_heatmap for rougel_similarity with
# "turns" as the group column — judging by the helper's name, post-hoc Dunn
# test results; confirm in tasks.graphs.
tasks.graphs.posthoc_dunn_heatmap(
    similarity_df, "rougel_similarity", group_col="turns"
)