## Imports

In [1]:
# Mount Google Drive at /content/drive inside the Colab runtime.
# NOTE(review): presumably the data/output paths configured below are meant to
# point into this mount — confirm.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
# Display settings: show up to 400 characters per cell and never truncate rows.
# Use the full option key ('display.max_colwidth'); the bare 'max_colwidth'
# shorthand only resolves through pattern matching and can become ambiguous if
# pandas adds another option containing that name.
pd.set_option('display.max_colwidth', 400)
pd.set_option('display.max_rows', None)

import re
from sklearn.model_selection import train_test_split

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# File locations used by the notebook. All three are empty strings here.
# NOTE(review): presumably blanked before sharing — fill them in before running.
# DATA_PATH: CSV that is read into `df` with pd.read_csv.
DATA_PATH = ""
# IRR_PATH: CSV destination for the human-IRR partition (`df_irr`).
IRR_PATH = ""
# JUDGE_PATH: CSV destination for the LLM-judge partition (`df_llm`).
JUDGE_PATH = ""

# SEED: passed as random_state to train_test_split.
SEED = 312
# LLM_SPLIT: passed as test_size to train_test_split; the resulting test
# partition becomes the LLM ids, the remainder the human-IRR ids.
LLM_SPLIT = 0.80

## Data

In [None]:
# Load the CSV and keep only the rows that belong to assessments 2, 3 and 4.
df = pd.read_csv(DATA_PATH)

df = df[df['Assessment ID'].isin([2, 3, 4])]
df = df.reset_index(drop=True)

# Schema checks on the filtered frame; every assertion names what went wrong.
assert not df.empty, "no rows left after filtering to Assessment ID 2, 3, 4"

# Text columns: accept the legacy object dtype as well as pandas' dedicated
# string dtype, so the check does not depend on the pandas version's default.
assert pd.api.types.is_string_dtype(df['XYZ ID'].dtype), "'XYZ ID' is not a text column"
assert df['XYZ ID'].apply(lambda x: bool(re.fullmatch(r'XYZ25\d{3}', str(x)))).all(), \
    "'XYZ ID' contains values that do not look like XYZ25 followed by three digits"

# Integer columns: `dtype == int` depends on the platform's default integer
# width, so test for "any integer dtype" instead.
assert pd.api.types.is_integer_dtype(df['Assessment ID']), "'Assessment ID' is not an integer column"
# Check every row, not just the first one.
assert df['Assessment ID'].isin([2, 3, 4]).all(), "'Assessment ID' has values outside {2, 3, 4}"
assert pd.api.types.is_integer_dtype(df['Conversation No.']), "'Conversation No.' is not an integer column"

assert pd.api.types.is_string_dtype(df['Role'].dtype), "'Role' is not a text column"
assert df['Role'].isin(["system", "user", "assistant"]).all(), \
    "'Role' has values other than system / user / assistant"
assert pd.api.types.is_string_dtype(df['Response'].dtype), "'Response' is not a text column"

df.head()

## Split

In [4]:
# Rows whose "Conversation No." equals 0 are treated as conversation openers;
# the running count of such rows labels every row with its conversation.
df["convo_id"] = df["Conversation No."].eq(0).cumsum()

# One row per conversation (its first row), indexed by convo_id and carrying
# only the Assessment ID that the split is stratified on.
convos = (
    df.drop_duplicates(subset="convo_id")
      .set_index("convo_id")
      .loc[:, ["Assessment ID"]]
)

# Split at conversation level: the test partition (fraction LLM_SPLIT) becomes
# the LLM ids, the remaining conversations the human-IRR ids.
irr_ids, llm_ids = train_test_split(
    convos.index,
    stratify=convos["Assessment ID"],
    test_size=LLM_SPLIT,
    random_state=SEED,
)

In [5]:
# Materialise each partition as an independent frame with a fresh 0..n-1 index.
df_irr = df.loc[df["convo_id"].isin(irr_ids)].reset_index(drop=True)
df_llm = df.loc[df["convo_id"].isin(llm_ids)].reset_index(drop=True)

# Report the number of conversations and rows in each partition.
print(
    f"Human IRR conversations: {df_irr['convo_id'].nunique()}  "
    f"rows: {df_irr.shape[0]}"
)
print(
    f"LLM judge conversations: {df_llm['convo_id'].nunique()}  "
    f"rows: {df_llm.shape[0]}"
)

Human IRR conversations: 57  rows: 1222
LLM judge conversations: 231  rows: 5604


In [6]:
def convo_distribution(ids, label, convo_table=None):
    """Print how many conversations of each Assessment ID a split contains.

    Parameters
    ----------
    ids : index-like
        Conversation ids (labels of ``convo_table``'s index) that make up the split.
    label : str
        Name of the split; used in the printed heading.
    convo_table : pandas.DataFrame, optional
        Frame indexed by conversation id with an "Assessment ID" column.
        When omitted, the notebook-level ``convos`` frame is used, so existing
        two-argument calls behave exactly as before.

    Returns
    -------
    None
        The per-assessment counts and percentages are printed, not returned.
    """
    if convo_table is None:
        # Fall back to the frame built in the split cell.
        convo_table = convos

    counts = convo_table.loc[ids, "Assessment ID"].value_counts().sort_index()
    pct    = counts / counts.sum() * 100
    print(f"\n{label} split — conversations per Assessment ID")
    print(pd.concat([counts.rename("convo_count"),
                     pct.round(1).rename("%")], axis=1))

# Derive the percentages shown in the headings from LLM_SPLIT, so the labels
# cannot drift from the split that was actually requested.
# Assumes LLM_SPLIT is a fraction between 0 and 1.
llm_pct = round(LLM_SPLIT * 100)
convo_distribution(irr_ids, f"Human-IRR ({100 - llm_pct} %)")
convo_distribution(llm_ids, f"LLM-coding ({llm_pct} %)")


Human-IRR (20 %) split — conversations per Assessment ID
               convo_count     %
Assessment ID                   
2                       19  33.3
3                       19  33.3
4                       19  33.3

LLM-coding (80 %) split — conversations per Assessment ID
               convo_count     %
Assessment ID                   
2                       78  33.8
3                       78  33.8
4                       75  32.5


In [None]:
# Preview the first five rows of the human-IRR partition.
df_irr.head(n=5)

In [None]:
# Preview the first five rows of the LLM-judge partition.
df_llm.head(n=5)

## Save

In [9]:
# Write both partitions to their configured CSV paths, without the row index.
for frame, out_path in ((df_irr, IRR_PATH), (df_llm, JUDGE_PATH)):
    frame.to_csv(out_path, index=False)