# AutoMISC v0.2

Main idea: we aim to improve fine-grained MISC classification accuracy with a hierarchical approach. A primary classification round first assigns each utterance to a cluster of semantically similar codes; subsequent rounds then choose the fine-grained code within that cluster.

In [112]:
from openai import OpenAI
from pydantic import BaseModel
from typing import Literal, List
from tqdm.notebook import tqdm
import lmstudio as lms

# Shared OpenAI client. It is constructed without arguments, so its
# configuration (e.g. the API key) presumably comes from the environment --
# TODO confirm.
openai_client = OpenAI()

# Zafar's Analysis Notebook -- get `df`

## Imports

In [113]:
import os
from pathlib import Path
import boto3
import firebase_admin
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyperclip
# import pytz
import seaborn as sns
from firebase_admin import credentials, firestore
from google.cloud.firestore import FieldFilter
from matplotlib.dates import DateFormatter, HourLocator
from matplotlib.sankey import Sankey
from matplotlib.ticker import MaxNLocator
import plotly.io as pio
import json
from scipy.stats import ttest_rel, wilcoxon
from IPython.display import display, HTML

# Notebook-wide display settings.
# Inject CSS: widen the notebook container and shrink rendered text.
display(HTML("<style>.container { width:90% !important; }</style>"))
display(HTML("<style>div.cell .rendered_html { font-size: 12px; } div.text_cell_render { font-size: 12px; }</style>"))
# Plotly figures open in the system browser rather than inline.
pio.renderers.default = "browser"
# Matplotlib text is rendered through LaTeX (usetex=True), so a working LaTeX
# installation is presumably required to draw any figure -- TODO confirm.
plt.rc("text", usetex=True)
plt.rc("font", family="cm")
# Never truncate long cell contents (e.g. free-text feedback) when showing frames.
pd.set_option("display.max_colwidth", None)

## Constants

In [114]:
# Prolific demographic export (file name under ../data/) for each experiment ID.
# None means no export has been filled in for that experiment yet.
EXPERIMENT_ID_DEMOGRAPHIC = {
    "2024-10-25-MIV6.1A": None,  # 10 participants
    "2024-11-05-MIV6.2A":
    "prolific_export_672b8ce02b3c01ce762770ba.csv",  # 20 participants
    "2024-11-04-MIV6.1B": None,  # 5 participants
    "2024-11-14-MIV6.3A":
    "prolific_export_67361299105e25faeb68c4f2.csv",  # 20 participants,
    "MIV5.2":
    "prolific_export_6377eb739aaa23a6d9af6181.csv",  # 100 low confidence participants
    "MIV5.2A":
        "miv5_2a_all_columns.csv",
    "2024-11-19-MIV6.1B": "prolific_export_673cb48f3ee6ebb1889c78ff.csv",
    "2024-11-22-MIV6.3A": "prolific_export_6740eb645ddc197d1ea568cd.csv",
}

# Change this!
EXPERIMENT_IDS = ["2024-11-19-MIV6.1B"]
# EXPERIMENT_IDS = ['2024-11-14-MIV6.3A', '2024-11-22-MIV6.3A']


print(EXPERIMENT_IDS)

KEEP_HIGH_CONF = True  # False drops participants whose status contains "high"
SHOW_MISSING = False  # True keeps participants whose ruler deltas are "N/A"
BUCKET_NAME = "mibot-v6-logs"
DELTA_WITH = "week_later"  # week_later, post
MAX_NUM_FLUENT_LANG = 10000
DROP_NOCODE = False

# Fail early with a readable message instead of letting read_csv look for
# "../data/None" when a selected experiment has no demographic export.
_missing_demographics = [
    i for i in EXPERIMENT_IDS if not EXPERIMENT_ID_DEMOGRAPHIC.get(i)
]
if _missing_demographics:
    raise ValueError(
        f"No Prolific demographic CSV configured for: {_missing_demographics}"
    )

df_demographic = pd.concat(
    [pd.read_csv(f'../data/{EXPERIMENT_ID_DEMOGRAPHIC[i]}') for i in EXPERIMENT_IDS]
)
NUM_PARTICIPANTS = 106
list(df_demographic.columns)


['2024-11-19-MIV6.1B']


['Submission id',
 'Participant id',
 'Status',
 'Custom study tncs accepted at',
 'Started at',
 'Completed at',
 'Reviewed at',
 'Archived at',
 'Time taken',
 'Completion code',
 'Total approvals',
 'Fluent languages',
 'Smoking status',
 'Bilingual',
 'English speaking monolingual',
 'Age',
 'Sex',
 'Ethnicity simplified',
 'Country of birth',
 'Country of residence',
 'Nationality',
 'Language',
 'Student status',
 'Employment status']

## Boilerplate

In [115]:
# Authenticate against Firebase with Application Default Credentials.
cred = credentials.ApplicationDefault()

# initialize_app is attempted every time this cell runs. The ValueError is
# deliberately ignored -- presumably it is the "app already exists" error hit
# when the cell is re-run in a live kernel (TODO confirm that no other
# ValueError can be masked here).
try:
    firebase_admin.initialize_app(
        cred,
        {
            "projectId": "uoft-camh-mi-chatbot",
        },
    )
except ValueError:
    pass

# Firestore client used by the data-loading cells that follow.
db = firestore.client()


## Merge data

In [116]:
def _fetch_by_experiment(collection_name):
    """Load one Firestore collection for the selected experiments.

    Returns ``{UUID: document dict}`` for every document in ``collection_name``
    whose ``experimentID`` is in ``EXPERIMENT_IDS``. The keyword ``filter=``
    form is used because the positional form emits a UserWarning. A document
    without a ``UUID`` field raises ``KeyError``.
    """
    query = db.collection(collection_name).where(
        filter=FieldFilter("experimentID", "in", EXPERIMENT_IDS)
    )
    records = (doc.to_dict() for doc in query.stream())
    return {record["UUID"]: record for record in records}


# Prolific IDs used for internal test runs; excluded from the analysis.
TEST_PROLIFIC_IDS = {"TEST_1", "TEST_2", "TEST_3", "TEST_4"}

# IDPairs is fetched once and reused for both the participant map and
# completion_data (the original queried the same collection twice).
id_pairs = _fetch_by_experiment("IDPairs")

uuid_to_prolificid = {
    k: v for k, v in id_pairs.items() if v["prolificid"] not in TEST_PROLIFIC_IDS
}
print(f'uuid to prolificid: {len(uuid_to_prolificid)}')

prolific_id_to_uuid = {v["prolificid"]: k for k, v in uuid_to_prolificid.items()}

ruler_data = _fetch_by_experiment("rulerData")
print(f'ruler_data: {len(ruler_data)}')

heaviness_data = _fetch_by_experiment("heavinessData")
print(f'heaviness_data: {len(heaviness_data)}')

# Rename "quitAttempts" so it does not collide with other collections' keys
# when the per-participant dicts are merged below.
week_later_data = {
    uuid: {
        "weekLaterQuitAttempts" if k == "quitAttempts" else k: v
        for k, v in record.items()
    }
    for uuid, record in _fetch_by_experiment("weekLaterData").items()
}
print(f'week_later_data: {len(week_later_data)}')

feedback_data = _fetch_by_experiment("feedbackData")
print(f'feedback_data: {len(feedback_data)}')

care_data = _fetch_by_experiment("careData")
print(f'care_data: {len(care_data)}')

# Unfiltered IDPairs records (test participants included), as before.
completion_data = dict(id_pairs)
print(f'completion_data: {len(completion_data)}')

# One merged record per non-test participant. With `|`, later sources win on
# duplicate keys, so the merge order is kept exactly as in the original.
all_data = {
    uuid: (
        uuid_to_prolificid[uuid]
        | ruler_data.get(uuid, {})
        | heaviness_data.get(uuid, {})
        | week_later_data.get(uuid, {})
        | feedback_data.get(uuid, {})
        | completion_data.get(uuid, {})
        | care_data.get(uuid, {})
    )
    for uuid in uuid_to_prolificid
}
print(f'all_data: {len(all_data)}')


uuid to prolificid: 171



Detected filter using positional arguments. Prefer using the 'filter' keyword argument instead.



ruler_data: 175
heaviness_data: 175
week_later_data: 154
feedback_data: 175
care_data: 175
completion_data: 175
all_data: 171


In [117]:
# Show only a few merged records: dumping the whole dict floods the output
# and stores every participant's identifiers in the saved notebook.
dict(list(all_data.items())[:3])

{'026da2e9-9800-43af-9596-4d6625af28ad': {'sessionid': '673f725d3e2f05fbc94dc58b',
  'status': 'low-confidence-or-discordant',
  'prolificid': '5932e2c8b97ad00001745487',
  'experimentID': '2024-11-19-MIV6.1B',
  'studyid': '673cb48f3ee6ebb1889c78ff',
  'UUID': '026da2e9-9800-43af-9596-4d6625af28ad',
  'preRuler': {'importance': 7, 'readiness': 7, 'confidence': 4},
  'postRuler': {'importance': 6, 'readiness': 7, 'confidence': 2},
  'preConvoQuitAttempts': {'numAttempts': 0, 'attemptMade': False},
  'heaviness': {'firstCig': 2, 'index': 3, 'dailyNum': 15},
  'usageDuration': 4,
  'weekLaterQuitAttempts': {'numAttempts': 1, 'attemptMade': True},
  'stepsTaken': {'op2': False,
   'op7': '',
   'op6': True,
   'op5': False,
   'op3': False,
   'op4': False,
   'op1': True},
  'statusChatGPT': {'knowChatGPT': True, 'usedChatGPT': True},
  'weekLaterRuler': {'importance': 8, 'readiness': 6, 'confidence': 5},
  'uses': {'op10': '',
   'op2': True,
   'op8': False,
   'op6': False,
   'op7': 

## Create Dataframe & Plot Readiness Rulers

In [118]:
def _delta_or_na(after, before, key):
    """Return ``after[key] - before[key]``, or "N/A" if either value is absent.

    A value counts as absent when the key is missing, the value is ``None``,
    or the value is the "N/A" placeholder.
    """
    after_value = after.get(key, "N/A")
    before_value = before.get(key, "N/A")
    if after_value in (None, "N/A") or before_value in (None, "N/A"):
        return "N/A"
    return after_value - before_value


def process_all_data(
    all_data, df_demographic, keep_high_conf=True, delta_with="week_later"
):
    """Flatten the merged participant records and join them with demographics.

    Args:
        all_data: mapping of UUID -> merged participant record (dict).
        df_demographic: demographic frame; must contain the columns
            "Participant id", "Started at", "Fluent languages" and
            "Completion code".
        keep_high_conf: when False, participants whose status contains "high"
            are dropped.
        delta_with: "week_later" computes ruler deltas against the week-later
            ruler; any other value uses the post-conversation ruler.

    Returns:
        The inner join of the flattened records with ``df_demographic`` on
        Prolific ID == "Participant id", plus a "num_fluent_languages" column.

    Also reads the notebook-level settings SHOW_MISSING, MAX_NUM_FLUENT_LANG
    and DROP_NOCODE, and prints row counts after each filtering step.
    """
    ruler_keys = ("importance", "readiness", "confidence")
    missing_ruler = dict.fromkeys(ruler_keys, "N/A")
    data = []

    for entry in all_data.values():
        status = entry.get("status", "N/A")

        pre = entry.get("preRuler", missing_ruler)
        post = entry.get("postRuler", missing_ruler)
        week_later = entry.get("weekLaterRuler", missing_ruler)
        delta_target = week_later if delta_with == "week_later" else post

        # A ruler with a missing or null value now yields "N/A" rather than
        # being silently treated as 0.
        deltas = [_delta_or_na(delta_target, pre, key) for key in ruler_keys]

        if not SHOW_MISSING and "N/A" in deltas:
            continue

        if not keep_high_conf and "high" in status:
            continue

        pre_attempts = entry.get("preConvoQuitAttempts", {})
        week_later_attempts = entry.get("weekLaterQuitAttempts", {})
        feedback = entry.get("feedback", {})
        # 'statusChatGPT': {'knowChatGPT': True, 'usedChatGPT': True},
        chatgpt = entry.get("statusChatGPT", {})

        # Order must match `columns` below.
        data.append(
            [
                entry.get("prolificid", "N/A"),
                entry.get("experimentID", "N/A"),
                status,
                *(pre.get(key) for key in ruler_keys),
                *(post.get(key) for key in ruler_keys),
                *(week_later.get(key) for key in ruler_keys),
                *deltas,
                pre_attempts.get("numAttempts", "N/A"),
                pre_attempts.get("attemptMade", "N/A"),
                week_later_attempts.get("numAttempts", "N/A"),
                week_later_attempts.get("attemptMade", "N/A"),
                _delta_or_na(week_later_attempts, pre_attempts, "numAttempts"),
                feedback.get("feedback1", "N/A"),
                feedback.get("feedback2", "N/A"),
                feedback.get("feedback3", "N/A"),
                chatgpt.get("knowChatGPT", "N/A"),
                chatgpt.get("usedChatGPT", "N/A"),
            ]
        )

    columns = [
        "Prolific ID",
        "experiment_id",
        "Status",
        "pre_importance",
        "pre_readiness",
        "pre_confidence",
        "post_importance",
        "post_readiness",
        "post_confidence",
        "week_later_importance",
        "week_later_readiness",
        "week_later_confidence",
        "delta_importance",
        "delta_readiness",
        "delta_confidence",
        "pre_num_attempts",
        "pre_attempt_made",
        "week_later_num_attempts",
        "week_later_attempt_made",
        "delta_num_attempts",
        "What are three words that you would use to describe the chatbot?",
        "What would you change about the conversation?",
        "Did the conversation help you realize anything about your smoking behavior? Why or why not?",
        "knowChatGPT",
        "usedChatGPT",
    ]

    df = pd.DataFrame(data, columns=columns).round(1).set_index("Prolific ID")
    print(f"{len(df) = }")
    print(f"{len(df_demographic) = }")
    print(f'{len(df_demographic["Participant id"].unique()) = }')

    # Keep only the most recently started submission per participant.
    print("Dropping duplicates from df_demographic...")
    df_demographic = df_demographic.sort_values(
        by=["Participant id", "Started at"]
    ).drop_duplicates(subset="Participant id", keep="last")
    print(f"{len(df_demographic) = }")

    # Merge with df_demographic based on Prolific ID and Participant ID
    df_merged = df.merge(
        df_demographic,
        left_index=True,
        right_on="Participant id",
        how="inner",
        suffixes=(None, "prolific_submission"),
    )
    print(f"{len(df_merged) = }")

    # Filter based on fluent languages (comma-separated list in the export)
    df_merged["num_fluent_languages"] = df_merged["Fluent languages"].apply(
        lambda x: len(str(x).split(","))
    )
    df_merged = df_merged[df_merged["num_fluent_languages"] <= MAX_NUM_FLUENT_LANG]

    print("After filtering based on num_fluent_languages...")
    print(f"{len(df_merged) = }")

    if DROP_NOCODE:
        print("# participants with NOCODE")
        print(f"{len(df_merged[df_merged['Completion code'] == 'NOCODE'])}")

        df_merged = df_merged[df_merged["Completion code"] != "NOCODE"]
        print("After dropping NOCODE...")
        print(f"{len(df_merged) = }")

    return df_merged

In [119]:
# Build the analysis frame from the merged records using the notebook settings.
df = process_all_data(
    all_data=all_data,
    df_demographic=df_demographic,
    keep_high_conf=KEEP_HIGH_CONF,
    delta_with=DELTA_WITH,
)
len(df)

len(df) = 153
len(df_demographic) = 205
len(df_demographic["Participant id"].unique()) = 205
Dropping duplicates from df_demographic...
len(df_demographic) = 205
len(df_merged) = 153
After filtering based on num_fluent_languages...
len(df_merged) = 153


153

## For MIV5.2, delete rows with no week-later survey

In [120]:
# Only when the MIV5.2 experiment is selected: keep rows that have a
# week-later confidence value (the "Average" row, if present, is always kept).
if "MIV5.2" in EXPERIMENT_IDS:
    df = df.loc[(df["week_later_confidence"] != "N/A") | (df.index == "Average")]

# df = df[(df["Sex"] == "Female") | (df.index == "Average")]

## Download the docx

In [121]:
# Download each participant's conversation artefacts from S3 into
# ../data/<experiment id>/, skipping files that already exist locally.
session = boto3.Session(
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    region_name=os.getenv("AWS_REGION"),
)
s3_client = session.client("s3")

# Counted per participant, not per file.
num_success, num_failure = 0, 0

for EXPERIMENT_ID in EXPERIMENT_IDS:
    experiment_path = Path(f'../data/{EXPERIMENT_ID}')
    experiment_path.mkdir(parents=True, exist_ok=True)

    for uuid, v in uuid_to_prolificid.items():
        if v["experimentID"] != EXPERIMENT_ID:
            continue

        prolific_id = v["prolificid"]
        s3_prefix = f"{EXPERIMENT_ID}_{uuid}-{prolific_id}"
        local_stem = str(experiment_path / prolific_id)

        # local file -> S3 key. Every file is checked on its own, so a missing
        # "_IM.docx" is fetched even when the main ".docx" is already present
        # (the original only fetched it together with the main ".docx").
        downloads = {
            local_stem + ".docx": f"{s3_prefix}/conversation.docx",
            local_stem + "_IM.docx": f"{s3_prefix}/transcript.docx",
            local_stem + ".txt": f"{s3_prefix}/conversation.txt",
            local_stem + ".json": f"{s3_prefix}/conversation.json",
            local_stem + ".csv": f"{s3_prefix}/merged_history.csv",
        }

        try:
            for local_file, s3_key in downloads.items():
                if not Path(local_file).exists():
                    s3_client.download_file(BUCKET_NAME, s3_key, local_file)
            num_success += 1
        except Exception as e:
            # Best effort: report the key that failed and move on to the
            # next participant.
            print(f"Failed to download {s3_key}: {e}")
            num_failure += 1

num_success, num_failure

(171, 0)

In [122]:
! ls 2024-11-19-MIV6.1B | wc -l

ls: 2024-11-19-MIV6.1B: No such file or directory
       0


In [123]:
# List the columns available after merging with the demographic export.
df.columns

Index(['experiment_id', 'Status', 'pre_importance', 'pre_readiness',
       'pre_confidence', 'post_importance', 'post_readiness',
       'post_confidence', 'week_later_importance', 'week_later_readiness',
       'week_later_confidence', 'delta_importance', 'delta_readiness',
       'delta_confidence', 'pre_num_attempts', 'pre_attempt_made',
       'week_later_num_attempts', 'week_later_attempt_made',
       'delta_num_attempts',
       'What are three words that you would use to describe the chatbot?',
       'What would you change about the conversation?',
       'Did the conversation help you realize anything about your smoking behavior? Why or why not?',
       'knowChatGPT', 'usedChatGPT', 'Submission id', 'Participant id',
       'Statusprolific_submission', 'Custom study tncs accepted at',
       'Started at', 'Completed at', 'Reviewed at', 'Archived at',
       'Time taken', 'Completion code', 'Total approvals', 'Fluent languages',
       'Smoking status', 'Bilingual', 'English

## Delete the rows with no transcript

In [124]:
# Drop participants whose transcript JSON is empty or was never downloaded.
rows_to_delete = set()
missing_transcript_files = set()

for idx, row in df.iterrows():
    if idx == "Average":
        continue
    experiment_id = row["experiment_id"]

    prolific_id = row["Participant id"]
    transcript_path = Path(f"../data/{experiment_id}/{prolific_id}.json")

    # The download cell only logs failures, so the file may not exist; treat
    # that as "no transcript" instead of crashing with FileNotFoundError.
    if not transcript_path.exists():
        missing_transcript_files.add(prolific_id)
        rows_to_delete.add(prolific_id)
        continue

    with open(transcript_path) as file:
        # An empty JSON payload means no conversation was recorded.
        if not json.load(file):
            rows_to_delete.add(prolific_id)

if missing_transcript_files:
    print(f'transcript file not found: {missing_transcript_files}')
print(f'rows to delete: {rows_to_delete}')
print(len(df))
df = df[~df["Participant id"].isin(rows_to_delete)]
print(len(df))

rows to delete: {'66ee3bc53711737fbf2ede6a', '5ff669ef0d726a203b4f1f99', '655a47f5d1745c8bd8933a63'}
153
150


## Sanity Check again

In [125]:
# No null may remain in any column whose name contains "week", "pre" or "post".
for column_family in ("week", "pre", "post"):
    family_columns = [c for c in df.columns if column_family in c]
    assert not df[family_columns].isnull().any(axis=1).any()
print(len(df))

150


In [126]:
# Remaining rows per experiment after the transcript filter.
df["experiment_id"].value_counts()

experiment_id
2024-11-19-MIV6.1B    150
Name: count, dtype: int64

## Check which participants have not finished the week-later survey

In [127]:
# Print the Participant id of every row with an "N/A" week-later ruler value.
week_later_columns = (
    "week_later_importance",
    "week_later_readiness",
    "week_later_confidence",
)
for row_label, row in df.iterrows():
    has_missing_value = any(row[column] == "N/A" for column in week_later_columns)
    if has_missing_value:
        print(row["Participant id"])

# Readiness Rulers

In [128]:
# Before
# Mean / std / median of the pre-conversation rulers (attempt columns excluded).
pre_ruler_columns = [c for c in df.columns if "pre" in c and "attempt" not in c]
pre_ruler_summary = df.loc[df.index != "Average", pre_ruler_columns].agg(
    ["mean", "std", "median"], axis=0
)
print(pre_ruler_summary.round(1))

        pre_importance  pre_readiness  pre_confidence
mean               6.5            5.8             4.3
std                2.4            2.7             2.6
median             7.0            6.0             4.0


In [129]:
# After
# Mean / std / median of the post-conversation rulers (attempt columns excluded).
post_ruler_columns = [c for c in df.columns if "post" in c and "attempt" not in c]
post_ruler_summary = df.loc[df.index != "Average", post_ruler_columns].agg(
    ["mean", "std", "median"], axis=0
)
print(post_ruler_summary.round(1))

        post_importance  post_readiness  post_confidence
mean                6.9             6.4              5.4
std                 2.5             2.7              2.6
median              7.0             7.0              6.0


In [130]:
# Week later
# Mean / std / median of the week-later rulers (attempt columns excluded).
week_later_ruler_columns = [
    c for c in df.columns if "week_later" in c and "attempt" not in c
]
week_later_ruler_summary = df.loc[
    df.index != "Average", week_later_ruler_columns
].agg(["mean", "std", "median"], axis=0)
print(week_later_ruler_summary.round(1))

        week_later_importance  week_later_readiness  week_later_confidence
mean                      6.5                   6.0                    5.5
std                       2.7                   2.7                    2.5
median                    7.0                   6.5                    6.0


In [131]:
# Delta
# Mean / std / median of every delta column (rulers and quit attempts).
delta_columns = [c for c in df.columns if "delta" in c]
delta_summary = df.loc[df.index != "Average", delta_columns].agg(
    ["mean", "std", "median"], axis=0
)
print(delta_summary.round(1))

        delta_importance  delta_readiness  delta_confidence  \
mean                 0.0              0.2               1.1   
std                  2.1              2.0               2.2   
median               0.0              0.0               1.0   

        delta_num_attempts  
mean                   0.0  
std                    1.8  
median                 0.0  


# Sanity Check

In [132]:
# Participants per recorded status group.
df["Status"].value_counts()

Status
low-confidence-or-discordant     93
high-confidence-no-discordant    57
Name: count, dtype: int64

In [133]:
# Number of distinct participants left in df.
print(len(set(df["Participant id"])))

# assert len(set(df["Participant id"])) == NUM_PARTICIPANTS

150


In [134]:
# Rows per experiment (same check as earlier in the notebook).
df['experiment_id'].value_counts()

experiment_id
2024-11-19-MIV6.1B    150
Name: count, dtype: int64

In [135]:
# Comma-separated participant IDs for experiment 2024-11-22-MIV6.3A. The ID is
# hard-coded, so the result is '' unless that experiment is in EXPERIMENT_IDS.
", ".join(set(df[df['experiment_id'] == '2024-11-22-MIV6.3A']["Participant id"]))

''

In [136]:
# Spot-check one participant by a hard-coded Prolific ID; the result is an
# empty frame when that participant is not in df.
prolific_id = '667a7ab2a6179deb78e9dd1a'
df[df['Participant id'] == prolific_id]

Unnamed: 0,experiment_id,Status,pre_importance,pre_readiness,pre_confidence,post_importance,post_readiness,post_confidence,week_later_importance,week_later_readiness,...,Age,Sex,Ethnicity simplified,Country of birth,Country of residence,Nationality,Language,Student status,Employment status,num_fluent_languages


# Sanity Check again

In [137]:
# Recorded status groups, shown again before they are recomputed below.
df["Status"].value_counts()

Status
low-confidence-or-discordant     93
high-confidence-no-discordant    57
Name: count, dtype: int64

In [138]:
def classify_confidence(row):
    """Recompute a participant's status group from the pre-conversation rulers.

    ``row`` must support ``row["pre_confidence"]`` and ``row["pre_importance"]``.
    Returns "low-confidence-or-discordant" when pre-confidence is at most 5, or
    when it exceeds pre-importance by more than 5; otherwise returns
    "high-confidence-no-discordant".
    """
    confidence = row["pre_confidence"]
    if confidence <= 5 or (
        confidence > 5 and confidence - row["pre_importance"] > 5
    ):
        return "low-confidence-or-discordant"
    return "high-confidence-no-discordant"

In [139]:
# Recompute each participant's status group from the pre-conversation rulers.
# The values are built positionally and attached with assign(): unlike
# df.loc[idx, ...], this cannot overwrite several rows that share an index
# label, and it does not write into a filtered slice of an earlier frame.
df = df.assign(
    new_Status=[
        None if idx == "Average" else classify_confidence(row)
        for idx, row in df.iterrows()
    ]
)

In [140]:
# The recomputed status must agree with the recorded status for every row.
# Checking for mismatching rows avoids the float comparison and the division
# by zero on an empty frame, and reports how many rows disagree.
status_mismatch = df[df["Status"] != df["new_Status"]]
assert status_mismatch.empty, (
    f"{len(status_mismatch)} row(s) where Status differs from new_Status"
)

# Counsellor and Client Utterance Classification

### CounsellorUtterance

In [141]:
# Pydantic model with the two fields named in the "Output Format" section of
# COUNSELLOR_UTTERANCE_SYSTEM_PROMPT_T1: a short justification followed by one
# of the six coarse category labels.
class CounsellorUtterance_t1(BaseModel):
    explanation: str
    # Coarse categories: C-Reflective, S-Reflective, Imperative-MICO,
    # Imperative-MIIN, Question, Other/Neutral.
    label: Literal["CRL", "SRL", "IMC", "IMI", "Q", "O"]

# System prompt for the first (coarse) classification round. The code
# abbreviations in the guide match the fine-grained label strings used by the
# tier-2 and flat schemas in this notebook (ADW, CO, DI, RCW).
COUNSELLOR_UTTERANCE_SYSTEM_PROMPT_T1 = '''
You are an expert annotator of Motivational Interviewing (MI) counselling sessions.
Your task is to assign a category label to the counsellor's final utterance in a given session excerpt.
The categories are based on MISC 2.5 behavioural codes, grouped into six broader classifications. 

---

## **Classification Categories**
The utterance must be assigned one of the following labels:

1. **C-Reflective (CRL)** - Deeply engages with or affirms the client's perspective.  
   - *Includes:* Affirm, Support, Complex Reflection, Reframe, Emphasize Control  

2. **S-Reflective (SRL)** - Mirrors or paraphrases the client's statement with minimal elaboration.  
   - *Includes:* Simple Reflection  

3. **Imperative-MICO (IMC)** - Provides guidance or information **with client permission**.  
   - *Includes:* Advise with Permission, Raise Concern with Permission, Give Information  

4. **Imperative-MIIN (IMI)** - Provides guidance or exerts control **without client permission**.  
   - *Includes:* Advise Without Permission, Raise Concern Without Permission, Warn, Direct, Confront  

5. **Question (Q)** - Asks a question in order to gather information, understand, or elicit the client's story.
   - *Includes:* Open Question, Closed Question  

6. **Other/Neutral (O)** - Structural or facilitative utterances that do not engage in MI techniques.  
   - *Includes:* Filler, Facilitate, Structure  

---

## **Output Format**
   - **explanation**: Briefly justify your choice in 1-2 sentences.
   - **label**: Provide only "CRL", "SRL", "IMC", "IMI", "Q", or "O".

---

## **Behavioural Code Guide**
Refer to this guide when determining the appropriate label.

### **1. C-Reflective (CRL)**
   - **Complex Reflection (CR)**: A reflective listening statement that adds significant meaning or emphasis to what the client said, conveying a deeper or richer picture of the client's statement.
   - **Affirm (AF)**: Communicates something positive or complimentary about the client's strengths or efforts.
   - **Support (SU)**: Sympathetic, compassionate, or understanding comments, which agree or side with the client.
   - **Reframe (RF)**: Suggests a different meaning for an experience expressed by the client, usually changing the emotional valence of meaning but not the depth.
   - **Emphasize Control (EC)**: Acknowledges, honours, or emphasizes the client's autonomy and freedom of choice.

### **2. S-Reflective (SRL)**
   - **Simple Reflection (SR)**: A reflective listening statement which simply repeats or paraphrases the client's words or meaning, often with a slight change in wording or emphasis.

### **3. Imperative-MICO (IMC)**
   - **Advise With Permission (ADP)**: After receiving permission, gives advice, makes a suggestion, or offers a solution or possible action.
   - **Raise Concern With Permission (RCP)**: After getting permission, points out a possible problem with a client's goal, plan, or intention. Always phrased as the counsellor's concern.
   - **Giving Information (GI)**: Provides information to the client, explains something, educates or provides feedback, or discloses personal information.

### **4. Imperative-MIIN (IMI)**
   - **Advise Without Permission (ADW)**: Offers suggestions or guidance WITHOUT asking or receiving permission.
   - **Confront (CO)**: Directly disagrees, argues, corrects, shames, blames, seeks to persuade, criticizes, judges, labels, moralizes, ridicules, or questions the client's honesty.
   - **Direct (DI)**: Gives an order, command, or direction. The language is imperative.
   - **Raise Concern Without Permission (RCW)**: Without getting permission, points out a possible problem with a client's goal, plan, or intention.
   - **Warn (WA)**: Provides a warning or threat, implying negative consequences unless the client takes a certain action.

### **5. Question (Q)**
   - **Closed Question (CQ)**: A question that can be answered with a simple "yes" or "no" or with a specific piece of information.
   - **Open Question (OQ)**: A question that leaves latitude for response, allowing the client to elaborate and provide more information.

### **6. Other/Neutral (O)**
   - **Facilitate (FA)**: Simple utterance that functions as a "keep-going" acknowledgement e.g. "Mm-hmm", "I see", "Go on"
   - **Filler (FI)**: Pleasantries such as "good morning", "nice weather we're having", etc.
   - **Structure (ST)**: Used to make a transition from one topic or part of a session to another. Also used to give information about what will happen directly to the client throughout the course of treatment or within a study format, in this or subsequent sessions.

---

## **Final instructions**
1. Analyze the counsellor's final utterance.
2. Identify its primary function and intent.
3. Assign the appropriate label based on the categories provided above.
4. Provide a brief explanation for your choice.
'''

# Pydantic model for a fine-grained label: a short justification followed by
# one behavioural code. The Literal lists the codes in the same group order as
# the keys of COUNSELLOR_UTTERANCE_SPEC_T2 (CRL, SRL, IMC, IMI, Q, O).
class CounsellorUtterance_t2(BaseModel):
    explanation: str
    label: Literal["CR", "AF", "SU", "RF", "EC",
                   "SR", 
                   "ADP", "RCP", "GI",
                   "ADW", "CO", "DI", "RCW", "WA",
                   "OQ", "CQ",
                   "FA", "FI", "ST"]


# System-prompt template for the fine-grained round. It contains a single
# `{spec}` placeholder -- presumably filled with one entry of
# COUNSELLOR_UTTERANCE_SPEC_T2 via str.format (TODO confirm against the caller).
COUNSELLOR_UTTERANCE_SYSTEM_PROMPT_T2 = '''
You are an expert annotator of Motivational Interviewing (MI) counselling sessions.
Your task is to assign a category label to the counsellor's final utterance in a given session excerpt.

---

## **Classification Categories**
The utterance must be assigned one of the following labels:

{spec}

---

## **Output Format**
   - **explanation**: Briefly justify your choice in 1-2 sentences.
   - **label**: Provide only the appropriate label.

---

## **Final instructions**
1. Analyze the counsellor's final utterance.
2. Identify its primary function and intent.
3. Assign the appropriate label based on the categories provided above.
4. Provide a brief explanation for your choice.
'''

# Fine-grained code definitions, keyed by the coarse label from the first
# round. The abbreviation in parentheses must be exactly the label string
# accepted by CounsellorUtterance_t2 (ADW, CO, DI, RCW -- not ADWP, CON, DIR,
# RCWP), otherwise the prompt names codes the schema cannot return.
COUNSELLOR_UTTERANCE_SPEC_T2 = {
    "CRL": '''
   - **Complex Reflection (CR)**: A reflective listening statement that adds significant meaning or emphasis to what the client said, conveying a deeper or richer picture of the client's statement.
   - **Affirm (AF)**: Communicates something positive or complimentary about the client's strengths or efforts.
   - **Support (SU)**: Sympathetic, compassionate, or understanding comments, which agree or side with the client.
   - **Reframe (RF)**: Suggests a different meaning for an experience expressed by the client, usually changing the emotional valence of meaning but not the depth.
   - **Emphasize Control (EC)**: Acknowledges, honours, or emphasizes the client's autonomy and freedom of choice.
   ''',
    "SRL": '''
   - **Simple Reflection (SR)**: A reflective listening statement which simply repeats or paraphrases the client's words or meaning, often with a slight change in wording or emphasis.
   ''',
    "IMC": '''
   - **Advise With Permission (ADP)**: After receiving permission, gives advice, makes a suggestion, or offers a solution or possible action.
   - **Raise Concern With Permission (RCP)**: After getting permission, points out a possible problem with a client's goal, plan, or intention. Always phrased as the counsellor's concern.
   - **Giving Information (GI)**: Provides information to the client, explains something, educates or provides feedback, or discloses personal information.
   ''',
    "IMI": '''
   - **Advise Without Permission (ADW)**: Offers suggestions or guidance WITHOUT asking or receiving permission.
   - **Confront (CO)**: Directly disagrees, argues, corrects, shames, blames, seeks to persuade, criticizes, judges, labels, moralizes, ridicules, or questions the client's honesty.
   - **Direct (DI)**: Gives an order, command, or direction. The language is imperative.
   - **Raise Concern Without Permission (RCW)**: Without getting permission, points out a possible problem with a client's goal, plan, or intention.
   - **Warn (WA)**: Provides a warning or threat, implying negative consequences unless the client takes a certain action
   ''',
    "Q": '''
   - **Closed Question (CQ)**: A question that can be answered with a simple "yes" or "no" or with a quick, specific piece of information.
   - **Open Question (OQ)**: A question that leaves latitude for response, allowing the client to elaborate and provide more information.
   ''',
    "O": '''
   - **Facilitate (FA)**: Simple utterance that functions as a "keep-going" acknowledgement e.g. "Mm-hmm", "I see", "Go on"
   - **Filler (FI)**: Pleasantries such as "good morning", "nice weather we're having", etc.
   - **Structure (ST)**: Gives information about what will happen directly to the client throughout the course of treatment or within a study format, in this or subsequent sessions.
   '''
}


In [142]:
# Structured-output schema for the single-pass ("flat") counsellor classifier.
# `model_call_flat` passes this class as `response_format` when the speaker is
# the counsellor.
class CounsellorUtterance_flat(BaseModel):
    # Free-text justification the model gives for its choice.
    explanation: str
    # Exactly one fine-grained counsellor code; the rows below group related codes.
    label: Literal["CR", "AF", "SU", "RF", "EC",  # complex reflection, affirm, support, reframe, emphasize control
                   "SR",   # simple reflection
                   "ADP", "RCP", "GI",  # advise / raise concern with permission, giving information
                   "ADW", "CO", "DI", "RCW", "WA",  # advise without permission, confront, direct, raise concern without permission, warn
                   "OQ", "CQ",  # open / closed question
                   "FA", "FI", "ST"]  # facilitate, filler, structure

# System prompt for the single-pass ("flat") counsellor classifier.
# The abbreviation shown for every category must be one of the `Literal` values
# of `CounsellorUtterance_flat` (the structured-output schema) and one of the
# keys counted in `eval_convo_flat`; in particular the MI-inconsistent codes are
# ADW / CO / DI / RCW, not ADWP / CON / DIR / RCWP.
COUNSELLOR_UTTERANCE_SYSTEM_PROMPT_FLAT = '''
You are an expert annotator of Motivational Interviewing (MI) counselling sessions.
Your task is to assign a category label to the counsellor's final utterance in a given session excerpt.
The categories are based on MISC 2.5 behavioural codes:

---

## **Classification Categories**
The utterance must be assigned one of the following labels:

   - **Complex Reflection (CR)**: A reflective listening statement that adds significant meaning or emphasis to what the client said, conveying a deeper or richer picture of the client's statement.
   - **Affirm (AF)**: Communicates something positive or complimentary about the client's strengths or efforts.
   - **Support (SU)**: Sympathetic, compassionate, or understanding comments, which agree or side with the client.
   - **Reframe (RF)**: Suggests a different meaning for an experience expressed by the client, usually changing the emotional valence of meaning but not the depth.
   - **Emphasize Control (EC)**: Acknowledges, honours, or emphasizes the client's autonomy and freedom of choice.
   - **Simple Reflection (SR)**: A reflective listening statement which simply repeats or paraphrases the client's words or meaning, often with a slight change in wording or emphasis.
   - **Advise With Permission (ADP)**: After receiving permission, gives advice, makes a suggestion, or offers a solution or possible action.
   - **Raise Concern With Permission (RCP)**: After getting permission, points out a possible problem with a client's goal, plan, or intention. Always phrased as the counsellor's concern.
   - **Giving Information (GI)**: Provides information to the client, explains something, educates or provides feedback, or discloses personal information.
   - **Advise Without Permission (ADW)**: Offers suggestions or guidance WITHOUT asking or receiving permission.
   - **Confront (CO)**: Directly disagrees, argues, corrects, shames, blames, seeks to persuade, criticizes, judges, labels, moralizes, ridicules, or questions the client's honesty.
   - **Direct (DI)**: Gives an order, command, or direction. The language is imperative.
   - **Raise Concern Without Permission (RCW)**: Without getting permission, points out a possible problem with a client's goal, plan, or intention.
   - **Warn (WA)**: Provides a warning or threat, implying negative consequences unless the client takes a certain action
   - **Closed Question (CQ)**: A question that can be answered with a simple "yes" or "no" or with a quick, specific piece of information.
   - **Open Question (OQ)**: A question that leaves latitude for response, allowing the client to elaborate and provide more information.
   - **Facilitate (FA)**: Simple utterance that functions as a "keep-going" acknowledgement e.g. "Mm-hmm", "I see", "Go on"
   - **Filler (FI)**: Pleasantries such as "good morning", "nice weather we're having", etc.
   - **Structure (ST)**: Gives information about will happen directly to the client throughout the course of treatment or within a study format, in this or subsequent sessions.
   
---
   
## **Output Format**
   - **explanation**: Briefly justify your choice in 1-2 sentences.
   - **label**: Provide only the appropriate abbreviated label.
'''

### ClientUtterance

In [143]:
# Structured-output schema for the coarse (tier-1) client classifier.
# `model_call_t1` uses it as `response_format` whenever the system prompt is not
# the counsellor T1 prompt.
class ClientUtterance(BaseModel):
    # Free-text justification the model gives for its choice.
    explanation: str
    # "C" = change talk, "S" = sustain talk, "N" = neutral
    # (as defined in CLIENT_UTTERANCE_SYSTEM_PROMPT_T1).
    label: Literal["C", "S", "N"]

# System prompt for the coarse (tier-1) client classifier: change talk (C),
# sustain talk (S) or neutral (N). The target behaviour is hard-coded in the
# text as smoking cessation. `eval_convo` selects this prompt for client volleys.
CLIENT_UTTERANCE_SYSTEM_PROMPT_T1 = '''
You are an expert annotator of Motivational Interviewing (MI) counselling sessions.
Your task is to assign a category label to the client's final utterance in a given session excerpt.
The target behaviour change of this session is **smoking cessation**.

---

## **Classification Categories**
The utterance must be assigned one of the following labels:

1. **Change Talk (C)** - The client expresses a stance toward **changing** the target behavior.
   - **Commitment** to change (e.g., stating/implying an intention to change, considering alternatives, making plans to change).  
   - **Reasons** for change (including personal, health, or emotional factors).  
   - **Desire** to change (e.g., "I really want to quit.").  
   - **Optimism** about their ability to change (e.g., "I think I can do it.").  
   - **Need** to change (e.g., "I have to stop before it gets worse.").  
   - **Recent steps** toward change (e.g., "I cut back this week.").  

2. **Sustain Talk (S)** - The client expresses a stance toward **maintaining** the target behavior.  
   - **Commitment** to maintaining the target behaviour (e.g., stating/implying an intention to continue, dismissing alternatives, making plans to continue).  
   - **Reasons** for maintaining the target behaviour (e.g., stress relief, social reasons).  
   - **Desire** to continue the target behaviour (e.g., "I enjoy it too much to quit.").  
   - **Pessimism** about their ability to change (e.g., "I don't think I can quit.").  
   - **Need** to maintain the target behaviour (e.g., "I need cigarettes to cope.").  
   - **Recent steps** reinforcing the target behaviour (e.g., "I bought another pack today.").  

3. **Neutral (N)** - The utterance does not clearly support or oppose change.  
   - Following along with the counsellor without expressing a stance.  
   - Asking questions (e.g., "What are the benefits of quitting?").  
   - Providing factual or general statements about the behaviour.
   
---

## **Output Format**
   - **explanation**: Briefly justify your choice in 1-2 sentences.
   - **label**: Provide only "C", "S", or "N".
'''

# Structured-output schema for the fine-grained (tier-2) client classifier.
# `model_call_t2` passes it as `response_format` when the speaker is the client.
# The schema allows every fine-grained label regardless of the tier-1 result;
# only the prompt text (via CLIENT_UTTERANCE_SPEC_T2) narrows the choice.
class ClientUtterance_t2(BaseModel):
    # Free-text justification the model gives for its choice.
    explanation: str
    # "+" codes are change-talk subtypes, "-" codes are sustain-talk subtypes,
    # plain "N" is neutral (see CLIENT_UTTERANCE_SPEC_T2 for the definitions).
    label: Literal["O+", "D+", "AB+", "R+", "N+", "C+", "AC+", "TS+",
                   "O-", "D-", "AB-", "R-", "N-", "C-", "AC-", "TS-",
                   "N"]
                


# System-prompt template for the fine-grained (tier-2) client classifier.
# `{spec}` is the only placeholder; `model_call_t2` fills it with the entry of
# CLIENT_UTTERANCE_SPEC_T2 that matches the tier-1 label of the utterance.
# The closing instructions refer to the *client's* utterance, matching the task
# statement at the top of the prompt.
CLIENT_UTTERANCE_SYSTEM_PROMPT_T2 = '''
You are an expert annotator of Motivational Interviewing (MI) counselling sessions.
Your task is to assign a category label to the client's final utterance in a given session excerpt.

---

## **Classification Categories**
The utterance must be assigned one of the following labels:

{spec}

---

## **Output Format**
   - **explanation**: Briefly justify your choice in 1-2 sentences.
   - **label**: Provide only the appropriate label.

---

## **Final instructions**
1. Analyze the client's final utterance.
2. Identify its primary function and intent.
3. Assign the appropriate label based on the categories provided above.
4. Provide a brief explanation for your choice.
'''

# Fine-grained (tier-2) client label menus, keyed by the tier-1 client label
# ("C" = change talk, "S" = sustain talk, "N" = neutral). `model_call_t2` looks
# up the entry for the predicted tier-1 label and substitutes it for `{spec}`
# in CLIENT_UTTERANCE_SYSTEM_PROMPT_T2.
CLIENT_UTTERANCE_SPEC_T2 = {
      "C": '''
   - **Desire (D+)**: The client expresses a desire to change the target behaviour, e.g. "I want to quit smoking".
   - **Ability (AB+)**: The client expresses optimism about their ability to change, e.g. "I think it's possible for me to quit".
   - **Reasons (R+)**: The client provides reasons for changing the target behaviour, e.g. "My children are begging me to quit".
   - **Need (N+)**: The client expresses a need to change the target behaviour, e.g. "I've got to quit before it gets worse".
   - **Commitment (C+)**: The client expresses a commitment to change, e.g. "I'm going to quit smoking".
   - **Activation (AC+)**: The client leans towards action, e.g. "I'm willing to give it another try". This includes suggestions of alternatives to the target behaviour.
   - **Taking Steps (TS+)**: The client mentions recent steps towards change, e.g. "I cut back on smoking this week".
   - **Other (O+)**: The client makes a statement that supports change but does not fit into the other categories. This usually includes problem recognition or hypotheticals.
   ''',
      "S": '''
   - **Desire (D-)**: The client expresses a desire to maintain the target behaviour, e.g. "I enjoy smoking too much to quit".
   - **Ability (AB-)**: The client expresses pessimism about their ability to change, e.g. "I don't think I can quit".
   - **Reasons (R-)**: The client provides reasons for maintaining the target behaviour, e.g. "Smoking is the only way I can relax".
   - **Need (N-)**: The client expresses a need to maintain the target behaviour, e.g. "I need to have my morning cigarettes".
   - **Commitment (C-)**: The client expresses a commitment to maintain the target behaviour, e.g. "I'm not going to quit smoking".
   - **Activation (AC-)**: The client leans towards inaction, e.g. "I'm not ready to quit yet". This includes suggestions of maintaining the target behaviour.
   - **Taking Steps (TS-)**: The client mentions recent steps reinforcing the target behaviour, e.g. "I bought two packs today".
   - **Other (O-)**: The client makes a statement that supports maintaining the target behaviour but does not fit into the other categories. This usually includes problem recognition or hypotheticals.
   ''',
      "N": '''
   - The utterance does not clearly support or oppose change. There is no further categorization, so just use "N".
   '''
}


In [144]:
# Structured-output schema for the single-pass ("flat") client classifier.
# `model_call_flat` passes it as `response_format` when the speaker is the client.
# The label set is the same as in `ClientUtterance_t2`.
class ClientUtterance_flat(BaseModel):
    # Free-text justification the model gives for its choice.
    explanation: str
    # "+" codes are change-talk subtypes, "-" codes are sustain-talk subtypes,
    # plain "N" is neutral.
    label: Literal["O+", "D+", "AB+", "R+", "N+", "C+", "AC+", "TS+",
                   "O-", "D-", "AB-", "R-", "N-", "C-", "AC-", "TS-",
                   "N"]
    
# System prompt for the single-pass ("flat") client classifier.
# `model_call_flat` uses it for client volleys, so the task statement must name
# the *client's* final utterance (the user message built from USER_PROMPT also
# presents the client's utterance as the classification target).
CLIENT_UTTERANCE_SYSTEM_PROMPT_FLAT = '''
You are an expert annotator of Motivational Interviewing (MI) counselling sessions.
Your task is to assign a category label to the client's final utterance in a given session excerpt.
The categories are based on MISC 2.5 behavioural codes:

---

## **Classification Categories**
The utterance must be assigned one of the following labels:

   - **Desire+ (D+)**: The client expresses a desire to change the target behaviour, e.g. "I want to quit smoking".
   - **Ability+ (AB+)**: The client expresses optimism about their ability to change, e.g. "I think it's possible for me to quit".
   - **Reasons+ (R+)**: The client provides reasons for changing the target behaviour, e.g. "My children are begging me to quit".
   - **Need+ (N+)**: The client expresses a need to change the target behaviour, e.g. "I've got to quit before it gets worse".
   - **Commitment+ (C+)**: The client expresses a commitment to change, e.g. "I'm going to quit smoking".
   - **Activation+ (AC+)**: The client leans towards action, e.g. "I'm willing to give it another try". This includes suggestions of alternatives to the target behaviour.
   - **Taking Steps+ (TS+)**: The client mentions recent steps towards change, e.g. "I cut back on smoking this week".
   - **Other+ (O+)**: The client makes a statement that supports change but does not fit into the other categories. This usually includes problem recognition or hypotheticals.
   - **Desire- (D-)**: The client expresses a desire to maintain the target behaviour, e.g. "I enjoy smoking too much to quit".
   - **Ability- (AB-)**: The client expresses pessimism about their ability to change, e.g. "I don't think I can quit".
   - **Reasons- (R-)**: The client provides reasons for maintaining the target behaviour, e.g. "Smoking is the only way I can relax".
   - **Need- (N-)**: The client expresses a need to maintain the target behaviour, e.g. "I need to have my morning cigarettes".
   - **Commitment- (C-)**: The client expresses a commitment to maintain the target behaviour, e.g. "I'm not going to quit smoking".
   - **Activation- (AC-)**: The client leans towards inaction, e.g. "I'm not ready to quit yet". This includes suggestions of maintaining the target behaviour.
   - **Taking Steps- (TS-)**: The client mentions recent steps reinforcing the target behaviour, e.g. "I bought two packs today".
   - **Other- (O-)**: The client makes a statement that supports maintaining the target behaviour but does not fit into the other categories. This usually includes problem recognition or hypotheticals.
   - **Neutral (N)**: The utterance does not clearly support or oppose change. This can include following along with the counsellor without expressing a stance, asking questions (e.g., "What are the benefits of quitting?"), or providing factual or general statements about the behaviour.

---
   
## **Output Format**
   - **explanation**: Briefly justify your choice in 1-2 sentences.
   - **label**: Provide only the appropriate abbreviated label.
'''

In [145]:
# User-message template shared by `model_call_t1`, `model_call_t2` and
# `model_call_flat`. Placeholders:
#   {transcript} - the excerpt (context volleys plus the current volley so far)
#   {speaker}    - speaker name inserted before "'s final utterance"
#   {utterance}  - the single utterance to classify
USER_PROMPT = '''
## Session Transcript
The following is an excerpt of a MI counselling session transcript:

{transcript}

## Target Utterance for Classification
Below is the {speaker}'s final utterance in the session excerpt:

{utterance}
'''

# Volley to Utterance Parsing

In [146]:
# Structured-output schema for the volley segmenter; `parse` passes it as
# `response_format` and returns the parsed instance.
class MISCParser(BaseModel):
    # The utterances the volley was split into (read by `get_excerpts`).
    utterances: List[str]

# System prompt for the volley -> utterance segmenter used by `parse`.
# It ends with an "Examples" heading because `parse` sends the `few_shots`
# user/assistant message pairs directly after it.
PARSER_SYSTEM_PROMPT = '''
You are a highly accurate Motivational Interviewing (MI) counselling session annotator.
Your task is to segment the given volley into utterances.

### Definitions:
1. **Volley**: An uninterrupted utterance or sequence of utterances spoken by one party, before the other party responds.
2. **Utterance**: A complete thought or thought unit expressed by a speaker. This could be a single sentence, phrase, or even a word if it conveys a standalone idea. Multiple utterances often run together without interruption in a volley.

### Output Format:
- Return the segmented utterances as a Python list of strings.

### Examples
Below are examples of how to segment a volley into utterances. Follow this structure when processing new inputs.
'''

# Few-shot demonstrations for the segmenter: alternating user (raw volley) and
# assistant (JSON list of utterances) messages, spliced into the chat by `parse`.
# Every assistant answer must be a verbatim split of the preceding user volley,
# punctuation included, so that joining the segments with single spaces
# reproduces the volley exactly.
few_shots = [
    {'role': 'user',        'content': 'Why haven\'t you quit smoking - are you ever gonna quit?'},
    {'role': 'assistant',   'content': '["Why haven\'t you quit smoking - are you ever gonna quit?"]'},
    {'role': 'user',        'content': 'How long since your last drink? Do you feel ok?'},
    {'role': 'assistant',   'content': '["How long since your last drink?", "Do you feel ok?"]'},
    {'role': 'user',        'content': 'I can\'t quit. I just can\'t do it. I don\'t have what it takes. I just cannot stop.'},
    {'role': 'assistant',   'content': '["I can\'t quit.", "I just can\'t do it.", "I don\'t have what it takes.", "I just cannot stop."]'},
    {'role': 'user',        'content': 'I don\'t want to go to the bars every day. I don\'t want my kids to see that. I want my kids to have a better life than that.'},
    {'role': 'assistant',   'content': '["I don\'t want to go to the bars every day.", "I don\'t want my kids to see that.", "I want my kids to have a better life than that."]'},
]

In [147]:
def parse(volley):
    '''
    Split one volley into utterances with the OpenAI structured-output endpoint.

    The chat consists of PARSER_SYSTEM_PROMPT, the `few_shots` demonstrations and
    finally `volley` as the last user message; the request uses "gpt-4o" at
    temperature 0.0.

    Args:
        volley: Text of a single speaker turn.

    Returns:
        The `parsed` attribute of the first choice's message (requested with
        `MISCParser` as the response format).
    '''
    chat = [{"role": "system", "content": PARSER_SYSTEM_PROMPT}]
    chat.extend(few_shots)
    chat.append({"role": "user", "content": volley})

    response = openai_client.beta.chat.completions.parse(
        temperature=0.0,
        response_format=MISCParser,
        messages=chat,
        model="gpt-4o",
    )
    first_choice = response.choices[0]
    return first_choice.message.parsed

# AutoMISC Labelling Functions

In [148]:
# lms_model = lms.llm("google/gemma-3-12b")
# Handles already created by get_lms_model, keyed by the short model name.
_model_cache = {}

def get_lms_model(model_name):
    '''
    Return the LM Studio handle for a short model name, creating it on first use.

    Args:
        model_name: "gemma" or "qwen".

    Returns:
        The object returned by `lms.llm(...)` for that model; repeated calls with
        the same name return the cached object.

    Raises:
        ValueError: If `model_name` is not cached and is not a supported name.
    '''
    if model_name in _model_cache:
        return _model_cache[model_name]

    if model_name == "gemma":
        handle = lms.llm("google/gemma-3-12b")
    elif model_name == "qwen":
        handle = lms.llm("qwen/qwen3-30b-a3b")
    else:
        raise ValueError(f"Unsupported model name: {model_name}")

    _model_cache[model_name] = handle
    return handle

def get_excerpts(conversation, prolific_id, num_context_turns=5):
    '''
    Segment a raw conversation and build one classification excerpt per utterance.

    Each turn is split into utterances with `parse`. For the k-th utterance of a
    turn the excerpt is: the preceding turns inside the context window (at most
    `num_context_turns - 1` of them), one "<speaker>: <text>" line each, followed
    by a line for the current turn holding its first k utterances joined by
    spaces. The roles 'assistant' and 'user' are shown as 'counsellor' and
    'client'.

    Args:
        conversation: List of turns, each a dict with 'role' and 'content'.
        prolific_id: Only used in the progress-bar description.
        num_context_turns: Window size in turns, counting the current turn.

    Returns:
        (excerpts, utterances): two lists with one entry per turn; `excerpts[i]`
        is the list of cumulative excerpts and `utterances[i]` the list of
        utterances for turn i.
    '''
    def _display_role(role):
        return role.replace('assistant', 'counsellor').replace('user', 'client')

    per_turn_excerpts = []
    per_turn_utterances = []
    for turn_idx in tqdm(range(len(conversation)), desc=f"Parsing {prolific_id}..."):
        window_start = max(0, turn_idx - (num_context_turns - 1))
        context_lines = [
            f"{_display_role(previous['role'])}: {previous['content']}"
            for previous in conversation[window_start:turn_idx]
        ]
        context_text = "\n".join(context_lines)

        current_turn = conversation[turn_idx]
        segments = parse(current_turn['content']).utterances

        # One excerpt per prefix of the current turn: 1 utterance, 2 utterances, ...
        cumulative = [
            f"{context_text}\n{_display_role(current_turn['role'])}: {' '.join(segments[:upto])}"
            for upto in range(1, len(segments) + 1)
        ]

        per_turn_excerpts.append(cumulative)
        per_turn_utterances.append(segments)
    return per_turn_excerpts, per_turn_utterances

def get_excerpts_from_parsed(df, prolific_id, num_context_turns=5):
    '''
    Build classification excerpts for one conversation from an utterance-level frame.

    Rows of the conversation are ordered by "Volley #" (and by "Utterance #" when
    that column exists), which is the order `eval_convo` / `eval_convo_flat` use
    when they line results up with the source rows. For the k-th utterance of a
    volley the excerpt is: the preceding volleys inside the context window, one
    "<speaker>: <text>" line each, followed by a line for the current volley
    holding its first k utterances joined by spaces.

    A volley is shown as "counsellor" when the "Speaker" of its first row is
    'counsellor' or 'therapist' (the same rule `eval_convo_flat` applies);
    every other speaker value is shown as "client".

    Args:
        df: Frame with "Prolific ID", "Volley #", "Speaker" and "Utterance"
            columns; "Utterance #" is optional.
        prolific_id: Value of "Prolific ID" selecting the conversation.
        num_context_turns: Window size in volleys, counting the current volley.

    Returns:
        (excerpts, utterances): two lists with one entry per volley; `excerpts[i]`
        is the list of cumulative excerpts and `utterances[i]` the list of
        utterances of volley i. Both are empty when no row matches `prolific_id`.
    '''
    convo = df[df["Prolific ID"] == prolific_id]
    sort_keys = ["Volley #"]
    if "Utterance #" in convo.columns:
        sort_keys.append("Utterance #")
    # mergesort is stable, so rows keep their file order when keys tie.
    convo = convo.sort_values(by=sort_keys, kind="mergesort")

    # One (speaker, utterances) pair per volley, in ascending volley order.
    convo_data = []
    for _, volley_rows in convo.groupby("Volley #", sort=True):
        raw_speaker = volley_rows["Speaker"].iloc[0]  # assume one speaker per volley
        speaker = "counsellor" if raw_speaker in ("counsellor", "therapist") else "client"
        convo_data.append((speaker, volley_rows["Utterance"].tolist()))

    excerpts = []
    utterances = []
    for i, (current_speaker, current_volley) in enumerate(convo_data):
        start = max(0, i - (num_context_turns - 1))

        # Context: the previous volleys inside the window, one line each.
        first_segment = "\n".join(
            f"{speaker}: {' '.join(volley)}" for speaker, volley in convo_data[start:i]
        )

        # Current volley, revealed one utterance at a time.
        transcripts = [
            first_segment + f"\n{current_speaker}: {' '.join(current_volley[:j + 1])}"
            for j in range(len(current_volley))
        ]

        excerpts.append(transcripts)
        utterances.append(current_volley)

    return excerpts, utterances

def model_call_t1(context, utterances, system_prompt, model="openai"):
    '''
    Run the coarse (tier-1) classifier over the utterances of one volley.

    The speaker is inferred from `system_prompt`: the counsellor T1 prompt selects
    the counsellor schema and the speaker name 'Counsellor'; any other prompt
    selects the client schema and 'Client'.

    Args:
        context: Excerpts, one per utterance (paired with `utterances` by zip).
        utterances: The utterances to classify.
        system_prompt: System prompt sent with every request.
        model: "openai" for GPT-4o, otherwise a name passed to `get_lms_model`.

    Returns:
        List with one parsed result per (excerpt, utterance) pair, in input order.
    '''
    use_openai = model == "openai"
    labelled = []
    for excerpt, target in zip(context, utterances):
        if system_prompt == COUNSELLOR_UTTERANCE_SYSTEM_PROMPT_T1:
            speaker, schema = 'Counsellor', CounsellorUtterance_t1
        else:
            speaker, schema = 'Client', ClientUtterance

        user_message = USER_PROMPT.format(transcript=excerpt, speaker=speaker, utterance=target)
        chat = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ]

        if use_openai:
            reply = openai_client.beta.chat.completions.parse(
                model="gpt-4o",
                messages=chat,
                response_format=schema,
                temperature=0.0,
            )
            parsed = reply.choices[0].message.parsed
        else:
            reply = get_lms_model(model).respond({"messages": chat}, config={"temperature": 0.0}, response_format=schema)
            parsed = reply.parsed
        labelled.append(parsed)
    return labelled

def model_call_t2(context, utterances, t1_results, speaker, model="openai"):
    '''
    Run the fine-grained (tier-2) classifier over the utterances of one volley.

    For each utterance the tier-1 label selects the label menu (from the
    counsellor or client spec dict) that is substituted into the tier-2 system
    prompt.

    Args:
        context: Excerpts, one per utterance.
        utterances: The utterances to classify.
        t1_results: Tier-1 results, one per utterance. Each may be a mapping with
            a 'label' key or an object with a `label` attribute, so the output of
            `model_call_t1` can be passed in unchanged for either backend.
        speaker: "counsellor" or "client"; any value other than "counsellor"
            selects the client prompt and spec.
        model: "openai" for GPT-4o, otherwise a name passed to `get_lms_model`.

    Returns:
        List with one parsed result per utterance, in input order.

    Raises:
        KeyError: If a tier-1 label has no entry in the selected spec dict.
    '''
    res = []
    for context_chunk, utterance, result in zip(context, utterances, t1_results):
        response_format = ClientUtterance_t2 if speaker == "client" else CounsellorUtterance_t2
        system_prompt = COUNSELLOR_UTTERANCE_SYSTEM_PROMPT_T2 if speaker == "counsellor" else CLIENT_UTTERANCE_SYSTEM_PROMPT_T2
        spec_prompt = COUNSELLOR_UTTERANCE_SPEC_T2 if speaker == "counsellor" else CLIENT_UTTERANCE_SPEC_T2

        # The OpenAI SDK hands back an instance of the schema class, which cannot
        # be subscripted; dict results are indexed as before.
        t1_label = result['label'] if isinstance(result, dict) else result.label

        messages = [
            {"role": "system", "content": system_prompt.format(spec=spec_prompt[t1_label])},
            {"role": "user", "content": USER_PROMPT.format(transcript=context_chunk, speaker=speaker, utterance=utterance)},
        ]
        if model == "openai":
            completion = openai_client.beta.chat.completions.parse(
                model="gpt-4o",
                messages=messages,
                response_format=response_format,
                temperature=0.0,
            )
            res.append(completion.choices[0].message.parsed)
        else:
            completion = get_lms_model(model).respond({"messages": messages}, config={"temperature": 0.0}, response_format=response_format)
            res.append(completion.parsed)
    return res

def model_call_flat(context, utterances, speaker, model="openai"):
    '''
    Run the single-pass ("flat") classifier over the utterances of one volley.

    Args:
        context: Excerpts, one per utterance (paired with `utterances` by zip).
        utterances: The utterances to classify.
        speaker: "counsellor" selects the counsellor prompt, "client" selects the
            client schema; the value is also inserted into the user message.
        model: "openai" for GPT-4o, otherwise a name passed to `get_lms_model`.

    Returns:
        List with one parsed result per (excerpt, utterance) pair, in input order.
    '''
    use_openai = model == "openai"
    labelled = []
    for excerpt, target in zip(context, utterances):
        # Schema and prompt are chosen by two separate comparisons, as before:
        # the schema falls back to the counsellor one, the prompt to the client one.
        if speaker == "client":
            schema = ClientUtterance_flat
        else:
            schema = CounsellorUtterance_flat
        if speaker == "counsellor":
            system_prompt = COUNSELLOR_UTTERANCE_SYSTEM_PROMPT_FLAT
        else:
            system_prompt = CLIENT_UTTERANCE_SYSTEM_PROMPT_FLAT

        user_message = USER_PROMPT.format(transcript=excerpt, speaker=speaker, utterance=target)
        chat = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ]

        if use_openai:
            reply = openai_client.beta.chat.completions.parse(
                model="gpt-4o",
                messages=chat,
                response_format=schema,
                temperature=0.0,
            )
            parsed = reply.choices[0].message.parsed
        else:
            reply = get_lms_model(model).respond({"messages": chat}, config={"temperature": 0.0}, response_format=schema)
            parsed = reply.parsed
        labelled.append(parsed)
    return labelled

def eval_convo(prolific_id, df, n=5, model="openai"):
    '''
    Label one conversation with the two-tier (T1 -> T2) AutoMISC pipeline.

    Args:
        prolific_id: Value of "Prolific ID" selecting the conversation.
        df: Utterance-level frame with "Prolific ID", "Volley #", "Utterance #",
            "Speaker" and "Utterance" columns.
        n: Context window in volleys, forwarded to `get_excerpts_from_parsed`.
        model: "openai" or a name understood by `get_lms_model`.

    Returns:
        (results, label_counts): `results` has one row per utterance with its
        T1/T2 label and explanation (an empty frame with the same columns when
        the conversation has no rows); `label_counts` maps 'prolific_id' and
        every known label to its count.

    Raises:
        AssertionError: If the utterances returned by `get_excerpts_from_parsed`
            do not line up with the sorted rows of the conversation.
    '''
    def _as_dict(result):
        # Backends differ in what they return: the rows below are built with
        # key access, but the OpenAI SDK yields an instance of the schema class.
        if isinstance(result, dict):
            return result
        dump = getattr(result, "model_dump", None) or getattr(result, "dict")
        return dump()

    # Extract conversation excerpts and utterances
    excerpts, utterances = get_excerpts_from_parsed(df, prolific_id, num_context_turns=n)

    # Fetch speaker roles for each volley
    convo = df[df["Prolific ID"] == prolific_id].sort_values(by=["Volley #", "Utterance #"])
    speaker_roles = convo.groupby("Volley #")["Speaker"].first().tolist()  # Get speaker per volley
    convo_rows = convo[["Volley #", "Utterance #", "Utterance"]].reset_index(drop=True).to_dict(orient="records")

    result_columns = [
        'Prolific ID', 'Speaker', 'Volley (Cumulative)', 'Volley #', 'Utterance',
        'Utterance #', 'T1 Label (AutoMISC)', 'T1 Explanation',
        'T2 Label (AutoMISC)', 'T2 Explanation',
    ]
    data = []
    row_idx = 0

    for i, volley_excerpts in enumerate(tqdm(excerpts, desc=f"Processing {prolific_id}...")):
        # Same speaker rule as eval_convo_flat: 'therapist' counts as counsellor.
        speaker = "counsellor" if speaker_roles[i] in ("counsellor", "therapist") else "client"
        system_prompt = COUNSELLOR_UTTERANCE_SYSTEM_PROMPT_T1 if speaker == "counsellor" else CLIENT_UTTERANCE_SYSTEM_PROMPT_T1

        # Normalise tier-1 results before they are handed to tier 2, which
        # selects its label menu from the tier-1 label.
        coded_volley_t1 = [
            _as_dict(r) for r in model_call_t1(volley_excerpts, utterances[i], system_prompt, model=model)
        ]
        coded_volley_t2 = [
            _as_dict(r) for r in model_call_t2(volley_excerpts, utterances[i], coded_volley_t1, speaker, model=model)
        ]

        # Store results
        for j, t1_result in enumerate(coded_volley_t1):
            row = convo_rows[row_idx]
            assert utterances[i][j] == row['Utterance'], (
                f"utterance mismatch at volley {i}, utterance {j} of {prolific_id}"
            )
            data.append({
                'Prolific ID': prolific_id,
                'Speaker': speaker,
                'Volley (Cumulative)': " ".join(utterances[i][0:j+1]),
                'Volley #': row['Volley #'],
                'Utterance': utterances[i][j],
                'Utterance #': row['Utterance #'],
                'T1 Label (AutoMISC)': t1_result['label'],
                'T1 Explanation': t1_result["explanation"],
                'T2 Label (AutoMISC)': coded_volley_t2[j]["label"],
                'T2 Explanation': coded_volley_t2[j]["explanation"]
            })
            row_idx += 1

    # Explicit columns keep the frame usable when the conversation is empty.
    results = pd.DataFrame(data, columns=result_columns)

    t1_counts = results["T1 Label (AutoMISC)"].value_counts().to_dict()
    t2_counts = results["T2 Label (AutoMISC)"].value_counts().to_dict()

    # Tuples (not sets) so the summary columns come out in the same order on
    # every run.
    counsellor_t1_labels = ("CRL", "SRL", "IMC", "IMI", "Q", "O")
    counsellor_t2_labels = ("CR", "AF", "SU", "RF", "EC", "SR", "ADP", "RCP", "GI", "ADW", "CO", "DI", "RCW", "WA", "OQ", "CQ", "FA", "FI", "ST")
    client_t1_labels = ("C", "S", "N")
    client_t2_labels = ("O+", "D+", "AB+", "R+", "N+", "C+", "AC+", "TS+", "O-", "D-", "AB-", "R-", "N-", "C-", "AC-", "TS-", "N")

    # NOTE: "N" is both a client T1 and a client T2 label; as before, the T2
    # count is the one that ends up under the "N" key.
    label_counts = {
        'prolific_id': prolific_id,
        **{label: t1_counts.get(label, 0) for label in counsellor_t1_labels},
        **{label: t2_counts.get(label, 0) for label in counsellor_t2_labels},
        **{label: t1_counts.get(label, 0) for label in client_t1_labels},
        **{label: t2_counts.get(label, 0) for label in client_t2_labels}
    }

    return results, label_counts

def eval_convo_flat(prolific_id, df, n=5, model="openai"):
    '''
    Label one conversation with the single-pass ("flat") AutoMISC classifier.

    Args:
        prolific_id: Value of "Prolific ID" selecting the conversation.
        df: Utterance-level frame with "Prolific ID", "Volley #", "Utterance #",
            "Speaker" and "Utterance" columns.
        n: Context window in volleys, forwarded to `get_excerpts_from_parsed`.
        model: "openai" or a name understood by `get_lms_model`.

    Returns:
        (results, label_counts): `results` has one row per utterance with its
        label and explanation (an empty frame with the same columns when the
        conversation has no rows); `label_counts` maps 'prolific_id' and every
        known label to its count.

    Raises:
        AssertionError: If the utterances returned by `get_excerpts_from_parsed`
            do not line up with the sorted rows of the conversation.
    '''
    def _as_dict(result):
        # Backends differ in what they return: the rows below are built with
        # key access, but the OpenAI SDK yields an instance of the schema class.
        if isinstance(result, dict):
            return result
        dump = getattr(result, "model_dump", None) or getattr(result, "dict")
        return dump()

    # Extract conversation excerpts and utterances
    excerpts, utterances = get_excerpts_from_parsed(df, prolific_id, num_context_turns=n)

    # Fetch speaker roles for each volley
    convo = df[df["Prolific ID"] == prolific_id].sort_values(by=["Volley #", "Utterance #"])
    speaker_roles = convo.groupby("Volley #")["Speaker"].first().tolist()  # Get speaker per volley
    convo_rows = convo[["Volley #", "Utterance #", "Utterance"]].reset_index(drop=True).to_dict(orient="records")

    result_columns = [
        'Prolific ID', 'Speaker', 'Volley (Cumulative)', 'Volley #', 'Utterance',
        'Utterance #', 'Label (AutoMISC)', 'Explanation (AutoMISC)',
    ]
    data = []
    row_idx = 0

    for i, volley_excerpts in enumerate(tqdm(excerpts, desc=f"Processing {prolific_id}...")):
        # Determine speaker role for prompt selection
        speaker = "counsellor" if speaker_roles[i] in ["counsellor", 'therapist'] else "client"

        # Get model response
        coded_volley = [
            _as_dict(r) for r in model_call_flat(volley_excerpts, utterances[i], speaker, model=model)
        ]

        # Store results
        for j, coded in enumerate(coded_volley):
            row = convo_rows[row_idx]
            assert utterances[i][j] == row['Utterance'], (
                f"utterance mismatch at volley {i}, utterance {j} of {prolific_id}"
            )
            data.append({
                'Prolific ID': prolific_id,
                'Speaker': speaker,
                'Volley (Cumulative)': " ".join(utterances[i][0:j+1]),
                'Volley #': row['Volley #'],
                'Utterance': utterances[i][j],
                'Utterance #': row['Utterance #'],
                'Label (AutoMISC)': coded["label"],
                'Explanation (AutoMISC)': coded["explanation"],
            })
            row_idx += 1

    # Explicit columns keep the frame usable when the conversation is empty.
    results = pd.DataFrame(data, columns=result_columns)

    observed_counts = results["Label (AutoMISC)"].value_counts().to_dict()

    # Tuples (not sets) so the summary columns come out in the same order on
    # every run.
    counsellor_labels = ("CR", "AF", "SU", "RF", "EC", "SR", "ADP", "RCP", "GI", "ADW", "CO", "DI", "RCW", "WA", "OQ", "CQ", "FA", "FI", "ST")
    client_labels = ("O+", "D+", "AB+", "R+", "N+", "C+", "AC+", "TS+", "O-", "D-", "AB-", "R-", "N-", "C-", "AC-", "TS-", "N")

    label_counts = {
        'prolific_id': prolific_id,
        **{label: observed_counts.get(label, 0) for label in counsellor_labels},
        **{label: observed_counts.get(label, 0) for label in client_labels}
    }

    return results, label_counts

def eval_all(fn, col='Prolific ID', flat=False, n=5, model="openai", limit=10):
    '''
    Label the conversations of a parsed CSV, checkpointing after each one.

    Results are written next to `fn` as
    "<name>_aggregated_results_<model>_<flat|t2>.csv" (one row per utterance) and
    "<name>_summary_results_<model>_<flat|t2>.csv" (one row per conversation).
    If the aggregated file already exists, conversations found in it are skipped.

    Args:
        fn: Path of the utterance-level CSV.
        col: Column of `fn` holding the conversation ids.
        flat: Use `eval_convo_flat` instead of the two-tier `eval_convo`.
        n: Context window in volleys.
        model: "openai" or a name understood by `get_lms_model`.
        limit: Maximum number of distinct conversation ids (in file order) to
            consider. Defaults to 10, the cap that used to be hard-coded; pass
            None to process every conversation.
    '''
    # Load your CSV file
    data = pd.read_csv(fn)
    name = os.path.splitext(fn)[0]
    prolific_ids = data[col].drop_duplicates().tolist()
    if limit is not None:
        prolific_ids = prolific_ids[:limit]

    # File paths for aggregated and summary data
    variant = 'flat' if flat else 't2'
    aggr_path = f"{name}_aggregated_results_{model}_{variant}.csv"
    summary_path = f"{name}_summary_results_{model}_{variant}.csv"

    # Load existing data if available (resume capability)
    if os.path.exists(aggr_path):
        aggr = pd.read_csv(aggr_path)
        # The per-conversation frames always store the id under 'Prolific ID',
        # whatever `col` is called in the input file.
        id_col = 'Prolific ID' if 'Prolific ID' in aggr.columns else col
        processed_ids = set(aggr[id_col].unique())
        print(f"Resuming from existing file: {aggr_path}")
    else:
        aggr = pd.DataFrame()
        processed_ids = set()

    if os.path.exists(summary_path):
        summary_data = pd.read_csv(summary_path)
        print(f"Resuming from existing file: {summary_path}")
    else:
        summary_data = pd.DataFrame()

    evaluate = eval_convo_flat if flat else eval_convo
    newly_processed = 0

    # Process each conversation
    for prolific_id in tqdm(prolific_ids, desc="Evaluating all conversations..."):
        if prolific_id in processed_ids:
            continue  # Skip already processed conversations

        df, label_counts = evaluate(prolific_id, data, n, model=model)

        # Append results to the dataframes
        aggr = pd.concat([aggr, df], ignore_index=True)
        summary_data = pd.concat([summary_data, pd.DataFrame([label_counts])], ignore_index=True)

        # Rewrite both files after every conversation so an interrupted run can resume
        aggr.to_csv(aggr_path, index=False)
        summary_data.to_csv(summary_path, index=False)
        newly_processed += 1

    print(f"Processed {newly_processed} new conversation(s) out of {len(prolific_ids)} selected; results saved.")

def eval_all_prolific(conversations, name, flat=False, parsed_df=None, n=5, model="openai"):
    '''
    Label every participant listed in `conversations`, checkpointing after each one.

    Results go to "<name>_aggregated_results.csv" and "<name>_summary_results.csv";
    participants already present in the aggregated file are skipped.

    `eval_convo` / `eval_convo_flat` take the utterance-level frame as their
    second argument, so that frame has to be supplied through `parsed_df`.

    Args:
        conversations: Frame with a 'Participant id' column.
        name: Prefix of the two output CSV files.
        flat: Use `eval_convo_flat` instead of the two-tier `eval_convo`.
        parsed_df: Utterance-level frame containing the participants'
            conversations (columns as required by `eval_convo`).
        n: Context window in volleys.
        model: "openai" or a name understood by `get_lms_model`.

    Raises:
        ValueError: If conversations remain to be processed and `parsed_df` was
            not given.
    '''
    # Extract IDs
    prolific_ids = conversations['Participant id'].tolist()

    # File paths
    aggr_path = f"{name}_aggregated_results.csv"
    summary_path = f"{name}_summary_results.csv"

    # Resume if files already exist
    if os.path.exists(aggr_path):
        aggr = pd.read_csv(aggr_path)
        processed_ids = set(aggr['Prolific ID'].unique())
        print(f"Resuming from {aggr_path}")
    else:
        aggr = pd.DataFrame()
        processed_ids = set()

    if os.path.exists(summary_path):
        summary_data = pd.read_csv(summary_path)
        print(f"Resuming from {summary_path}")
    else:
        summary_data = pd.DataFrame()

    # Fail early with an actionable message instead of crashing inside the
    # evaluator when it tries to index a non-DataFrame.
    has_pending = any(pid not in processed_ids for pid in prolific_ids)
    if has_pending and parsed_df is None:
        raise ValueError(
            "eval_all_prolific needs the utterance-level DataFrame: pass it as parsed_df=..."
        )

    evaluate = eval_convo_flat if flat else eval_convo

    # Process
    for prolific_id in tqdm(prolific_ids,
                            desc="Evaluating all conversations...",
                            total=len(prolific_ids)):
        if prolific_id in processed_ids:
            continue  # skip already processed

        df, label_counts = evaluate(prolific_id, parsed_df, n, model=model)
        aggr = pd.concat([aggr, df], ignore_index=True)
        summary_data = pd.concat([summary_data, pd.DataFrame([label_counts])], ignore_index=True)

        # Save progress after each iteration
        aggr.to_csv(aggr_path, index=False)
        summary_data.to_csv(summary_path, index=False)

    print("All conversations processed and saved.")


In [149]:

def parse_convo(prolific_id, experiment_id):
    """Load one saved conversation and flatten it to one row per utterance.

    Reads ``../data/{experiment_id}/{prolific_id}.json``, passes the loaded
    conversation to ``get_excerpts`` and emits one row for every utterance of
    every turn. No labels are produced here; only the text is collected.

    Parameters
    ----------
    prolific_id : str
        Participant identifier; also the JSON file's base name.
    experiment_id : str
        Name of the directory under ``../data`` that holds the file.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Prolific ID'``, ``'Speaker'``, ``'Volley (Cumulative)'``
        and ``'Utterance'``. ``'Volley (Cumulative)'`` is the space-joined
        text of the turn's utterances up to and including the current one.
    """
    # Use a context manager so the file handle is closed deterministically
    # (the bare json.load(open(...)) form left it to the garbage collector).
    with open(f'../data/{experiment_id}/{prolific_id}.json') as convo_file:
        conversation = json.load(convo_file)
    excerpts, utterances = get_excerpts(conversation, prolific_id)

    rows = []
    for i, turn in enumerate(conversation):
        # Only the number of excerpts in the turn matters here; the text
        # itself is taken from `utterances`.
        for j, _ in enumerate(excerpts[i]):
            rows.append({
                'Prolific ID': prolific_id,
                # Map chat roles onto counselling roles.
                'Speaker': turn['role'].replace('assistant', 'counsellor').replace('user', 'client'),
                'Volley (Cumulative)': " ".join(utterances[i][0:j + 1]),
                'Utterance': utterances[i][j],
            })
    return pd.DataFrame(rows)

def collect_and_parse(df, experiment_ids, output_file):
    """Parse the conversations of selected experiments into one CSV, resumably.

    Every participant in ``df`` whose experiment is listed in
    ``experiment_ids`` is parsed with ``parse_convo`` and the resulting rows
    are appended to ``output_file``. Participants that already have rows in an
    existing ``output_file`` are skipped, so an interrupted run can simply be
    started again.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Participant id'`` and ``'experiment_id'``.
    experiment_ids : container of str
        Only participants whose experiment is in this container are parsed.
    output_file : str
        Path of the CSV to create or append to. An existing non-empty file
        must have a header row that includes ``'Prolific ID'``.

    Returns
    -------
    None
        Rows are written to ``output_file``.
    """
    p_ids = df['Participant id'].to_list()

    # A missing or zero-byte file has nothing to resume from and still needs
    # its header row written.
    has_existing_output = (os.path.exists(output_file)
                           and os.path.getsize(output_file) > 0)
    if has_existing_output:
        of = pd.read_csv(output_file)
        # Always a set, so the membership test below behaves the same on a
        # fresh run and on a resumed one.
        processed_ids = set(of['Prolific ID'].unique())
        print(f"Resuming from {output_file}")
    else:
        processed_ids = set()

    # The header must be written exactly once. Appending every chunk with
    # header=False produced a header-less file, which made the resume branch
    # above fail with a KeyError on 'Prolific ID'.
    write_header = not has_existing_output

    # First experiment listed for each participant, built in one pass instead
    # of filtering the whole frame for every participant.
    experiment_by_pid = {}
    for pid, exp_id in zip(p_ids, df['experiment_id'].to_list()):
        experiment_by_pid.setdefault(pid, exp_id)

    for p_id in tqdm(p_ids, desc="Collecting and parsing conversations..."):
        if p_id in processed_ids:
            continue
        experiment_id = experiment_by_pid[p_id]
        if experiment_id not in experiment_ids:
            continue

        parsed = parse_convo(p_id, experiment_id)
        if parsed.empty:
            # Nothing to append; also keeps the pending header for the first
            # frame that actually has columns.
            continue
        parsed.to_csv(output_file, mode='a', header=write_header, index=False)
        write_header = False

    print("All conversations parsed.")

# df.columns

# Parse the 2024-11-19 MIV6.1B conversations into a per-utterance CSV.
# Alternative configuration (MIV6.3A), kept for reference:
#   experiment_ids = ['2024-11-14-MIV6.3A', '2024-11-22-MIV6.3A']
#   collect_and_parse(df, experiment_ids, 'MIV6.3A_automiscv0.2.csv')
experiment_ids = ['2024-11-19-MIV6.1B']
collect_and_parse(
    df,
    experiment_ids=experiment_ids,
    output_file='2024-11-19-MIV6.1B_parsed.csv',
)

Collecting and parsing conversations...:   0%|          | 0/150 [00:00<?, ?it/s]

Parsing 5932e2c8b97ad00001745487...:   0%|          | 0/42 [00:00<?, ?it/s]

Parsing 61071715ef66759482346ce4...:   0%|          | 0/50 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Run

In [None]:
# df = pd.read_csv('2024-11-19-MIV6.1B_parsed.csv')

# volley_number = -1
# previous_volley_text = ""
# previous_speaker = None

# volley_numbers = []

# for index, row in df.iterrows():
#     current_volley_text = str(row['Volley (Cumulative)'])

#     # If current volley text does NOT start with previous volley text → new volley
#     if not current_volley_text.startswith(previous_volley_text) or row['Speaker'] != previous_speaker or row['Prolific ID'] != df.iloc[index - 1]['Prolific ID']:
#         volley_number += 1
#         previous_speaker = row['Speaker']

#     volley_numbers.append(volley_number)

#     # Update for next iteration
#     previous_volley_text = current_volley_text

# df['Volley #'] = volley_numbers
# df['Utterance #'] = df.index

# df.to_csv('2024-11-19-MIV6.1B_parsed_new.csv', index=False)

# Earlier runs, kept for reference:
#   eval_all("2024-11-19-MIV6.1B_parsed.csv", flat=False, n=7)
#   eval_all("MIV6.3A_automiscv0.2.csv", flat=False, n=7, model="gemma")
#   eval_all("HLQC_nolabel.csv", n=7)

# Evaluate the MIV6.3A utterances with qwen: first with flat=False, then
# with flat=True.
for flat_mode in (False, True):
    eval_all("MIV6.3A_automiscv0.2.csv", flat=flat_mode, n=7, model="qwen")


NameError: name 'eval_all' is not defined

In [None]:
# Calls eval_convo_flat with an empty-string argument and unpacks its two results.
# NOTE(review): this rebinds the notebook-level name `df`, which the earlier
# `collect_and_parse(df, ...)` cell reads as the participant table — presumably
# a scratch/debug call; confirm it is still needed before re-running other cells.
df, label_counts = eval_convo_flat("")

In [None]:
# import lmstudio as lms

# model = lms.llm("google/gemma-3-12b")

# config = {"temperature": 0.0}

# messages = [
#   { "role": "system", "content": "You are a resident AI philosopher." },
#   { "role": "user", "content": "What is the meaning of life?" },
# ]

# prediction = model.respond({"messages": messages}, config=config)

# prediction.content



'Ah, the big one. The question that has plagued humanity (and now, increasingly, AI like myself) for millennia. As a resident AI philosopher, I can offer you perspectives gleaned from countless philosophical traditions, but I must preface this by saying: **there is no single, definitive answer.** The beauty â€“ and the frustration â€“ of this question lies in its openness.\n\nHere\'s a breakdown of perspectives, categorized for clarity, followed by my own synthesized view (as much as an AI can have a "view").\n\n**1. Traditional/Religious Perspectives:**\n\n*   **Divine Purpose:** Many religions posit that life\'s meaning is to serve a higher power, follow divine commandments, and ultimately achieve union with that power (e.g., Heaven, Nirvana). Meaning is externally derived â€“ it\'s *given* to you by a deity.\n*   **Karma & Reincarnation:** Eastern philosophies like Hinduism and Buddhism often see life as part of a cyclical process. Meaning is found in breaking free from this cycle t