In [None]:
import os
import csv
import time
import json
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import List, Optional
from ast import literal_eval
from openai import AzureOpenAI
from functools import lru_cache
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import cohen_kappa_score, f1_score, precision_score, recall_score

In [None]:
def spearman(x, y):
    """Spearman rank correlation using 1 - 6 * sum(d^2) / (n * (n^2 - 1)).

    A value's rank is the 1-based position of its first occurrence in the
    sorted input, so tied values all receive the lowest rank of their group
    (ranks are not averaged across ties).

    Args:
        x: non-empty sequence of hashable, orderable values.
        y: sequence of the same length as ``x``.

    Returns:
        The correlation coefficient as a float.

    Raises:
        AssertionError: if the inputs are empty or differ in length.
        ZeroDivisionError: if the inputs contain exactly one element.
    """
    assert len(x) == len(y) > 0

    def ranks(values):
        # Sort once and remember where each distinct value first appears,
        # instead of re-sorting the whole sequence for every element.
        first_rank = {}
        for position, value in enumerate(sorted(values), start=1):
            first_rank.setdefault(value, position)
        return [first_rank[value] for value in values]

    d = sum((rx - ry) ** 2 for rx, ry in zip(ranks(x), ranks(y)))
    return 1.0 - 6.0 * d / float(len(x) * (len(y) ** 2 - 1.0))


def sat_evaluation(pred, label, sat_num):
    """Compute multi-class and binarised metrics for satisfaction predictions.

    Args:
        pred: sequence of predicted integer classes.
        label: sequence of gold integer classes; each value is used as an index
            into a list of length ``sat_num``, so it must lie in ``[0, sat_num)``.
        sat_num: number of classes.

    Returns:
        ``[UAR, kappa, rho, bi_f1, acc, precision, sk_recall, f1]`` where
        UAR is the mean of the per-class recalls, kappa is Cohen's kappa,
        rho comes from ``spearman``, bi_f1 is the F1 of the binarised task,
        and acc / precision / sk_recall / f1 are accuracy and the
        macro-averaged scikit-learn scores.
    """
    acc = sum(int(p == l) for p, l in zip(pred, label)) / len(label)
    precision = precision_score(label, pred, average='macro', zero_division=0)
    sk_recall = recall_score(label, pred, average='macro', zero_division=0)
    f1 = f1_score(label, pred, average='macro', zero_division=0)

    # recall[c] = [correct predictions for class c, gold occurrences of class c]
    recall = [[0, 0] for _ in range(sat_num)]
    for p, l in zip(pred, label):
        recall[l][1] += 1
        recall[l][0] += int(p == l)
    # Classes that never occur in `label` contribute a recall of 0.
    recall_value = [item[0] / max(item[1], 1) for item in recall]

    UAR = sum(recall_value) / len(recall_value)
    kappa = cohen_kappa_score(pred, label)
    rho = spearman(pred, label)

    # Binarise: classes below sat_num // 2 form the positive class.
    bi_pred = [int(item < sat_num // 2) for item in pred]
    bi_label = [int(item < sat_num // 2) for item in label]
    bi_recall = sum([int(p == l) for p, l in zip(bi_pred, bi_label) if l == 1]) / max(bi_label.count(1), 1)
    bi_precision = sum([int(p == l) for p, l in zip(bi_pred, bi_label) if p == 1]) / max(bi_pred.count(1), 1)
    # F1 is the harmonic mean 2PR / (P + R). Only guard the zero denominator:
    # clamping the denominator to 1 would under-report F1 whenever P + R < 1.
    pr_sum = bi_recall + bi_precision
    bi_f1 = 2 * bi_recall * bi_precision / pr_sum if pr_sum > 0 else 0.0

    sat_result = [UAR, kappa, rho, bi_f1, acc, precision, sk_recall, f1]
    return sat_result


In [None]:
def load_data(dataset_name):
    """Read the train/valid/test splits of one dataset into DataFrames.

    Each split is read from ``dataset/<dataset_name>/<split>_<dataset_name>.txt``.
    Every line is tab-separated: field 0 is a Python list literal of turns
    written as ``"<user utterance>|||<assistant utterance>"`` and field 2 is
    the integer label.

    For each line, turns with a non-empty assistant part are accumulated into
    the history. The user part of the last turn decides what happens:
    ``OVERALL`` counts the line as a conversation, any other non-empty text
    produces one row, and an empty utterance is skipped.

    Args:
        dataset_name: name of the sub-directory and file suffix.

    Returns:
        dict mapping ``'train'``, ``'valid'`` and ``'test'`` to a DataFrame
        with columns ``history``, ``utterance``, ``full_conv`` and ``label``.
    """
    dirname = f'dataset/{dataset_name}'
    print("Reading", dataset_name, "dataset")

    result = dict()
    for set_name in ['train', 'valid', 'test']:
        total_conversations = 0
        data_list = list()
        with open(os.path.join(dirname, f'{set_name}_{dataset_name}.txt'), 'r', encoding='utf-8') as infile:
            for line in infile:
                items = line.strip('\n').split('\t')
                # NOTE(review): eval() executes whatever expression the data file
                # contains. If every line is a plain list literal,
                # ast.literal_eval would be the safer choice.
                input_text = eval(items[0])
                sat = int(items[2])
                history = ''
                # Reset per line so an empty turn list cannot reuse the previous
                # line's utterance (or hit an unbound name on the first line).
                user_utt = ''
                for text in input_text:
                    parts = text.split('|||')
                    user_utt = parts[0]
                    ai_utt = parts[1]
                    if ai_utt:
                        history += f'\n\nUser: {user_utt}'
                        history += f'\n\nAssistant: {ai_utt}'

                last_utt = user_utt.strip()
                if last_utt == 'OVERALL':
                    total_conversations += 1
                elif last_utt:
                    # NOTE(review): the final turn is tagged "Human:" while the
                    # history uses "User:" - confirm this is intended.
                    data_list.append({
                        'history': history.strip(),
                        'utterance': last_utt,
                        'full_conv': f'{history.strip()}\n\nHuman: {last_utt}',
                        'label': sat
                    })

            result[set_name] = pd.DataFrame(data_list)

        print('{} set, len: {} utterances, {} conversations'.format(set_name, len(result[set_name]), total_conversations))

    return result

In [None]:
# Read the three datasets; each result maps a split name to a DataFrame.
mwoz_data = load_data(dataset_name='mwoz')
sgd_data = load_data(dataset_name='sgd')
redial_data = load_data(dataset_name='redial')

In [None]:
# Share of satisfied (2) / neutral (1) / dissatisfied (0) labels per dataset,
# pooled over the train, valid and test splits.
for name, dataset in [('mwoz', mwoz_data), ('sgd', sgd_data), ('redial', redial_data)]:
    split_labels = [dataset[set_name]['label'] for set_name in ['train', 'valid', 'test']]
    total_examples = sum(len(labels) for labels in split_labels)
    total_sat = sum(int((labels == 2).sum()) for labels in split_labels)
    total_neu = sum(int((labels == 1).sum()) for labels in split_labels)
    total_dis = sum(int((labels == 0).sum()) for labels in split_labels)
    print(f'ratios for {name}')
    ratios = (total_sat / total_examples, total_neu / total_examples, total_dis / total_examples)
    print("{0:.1%}, {1:.1%}, {2:.1%}".format(*ratios))

In [None]:
# Run configuration: output file name and deployment to query.
overall_output_file = "results.csv"
model_name = "gpt-4.1-mini"

# Connection settings are taken from the environment, never hard-coded.
endpoint = os.environ.get("COMPLETION_ENDPOINT_URL")
api_version = os.environ.get("COMPLETION_API_VERSION")
subscription_key = os.environ.get("AZURE_OPENAI_API_KEY")

# Azure OpenAI client using key-based authentication.
completion_client = AzureOpenAI(
    api_version=api_version,
    api_key=subscription_key,
    azure_endpoint=endpoint,
)

In [None]:
def run_completion(messages, temperature, max_tokens):
    """Send one non-streaming chat completion request through ``completion_client``.

    Args:
        messages: chat messages forwarded unchanged to the API.
        temperature: sampling temperature.
        max_tokens: upper bound on generated tokens.

    Returns:
        The completion object returned by the client.

    Raises:
        Exception: any error from the request is printed together with the
            offending messages and then re-raised.
    """
    try:
        request_kwargs = dict(
            model=model_name,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None,
            stream=False,
        )
        return completion_client.chat.completions.create(**request_kwargs)
    except Exception as err:
        # Log the failure and the request that caused it, then propagate.
        for detail in ("FAILED COMPLETION: ", err, messages):
            print(detail)
        raise err

# Zero-shot prompting

In [None]:
# Zero-shot instruction template. The `{full_conv}` placeholder is filled in
# with str.format by predict_zs.
ZERO_SHOT_PROMPT = """Your task is to assess the user's satisfaction at the end of the given conversation. Choose only one of these labels:
- "satisfied" : The assistant's response successfully meets the user's needs and the user appears satisfied.
- "dissatisfied" : The user's needs are not met and they appear dissatisfied.
- "neutral" : Neither satisfied nor dissatisfied, or the conversation is purely informational, routine, or a greeting.

Output only one word: "satisfied", "dissatisfied", or "neutral". Do not add any explanation or extra text.

# conversation
{full_conv}
"""

In [None]:
# Label word emitted by the model -> integer class used in the datasets.
response_mapping = dict(satisfied=2, neutral=1, dissatisfied=0)

In [None]:
def predict_zs(full_conv):
    """Classify one conversation with the zero-shot prompt.

    Args:
        full_conv: conversation text substituted into ``ZERO_SHOT_PROMPT``.

    Returns:
        The integer class from ``response_mapping``. Replies that do not match
        any key after normalisation fall back to 2.
    """
    response = run_completion(
        messages=[{"role": "user", "content": [{"type": "text", "text": ZERO_SHOT_PROMPT.format(full_conv=full_conv)}]}],
        temperature=0.7,
        max_tokens=5
    )
    # The prompt shows the labels in double quotes, so the reply may echo the
    # quotes, capitalise the word or add a period. Normalise before the lookup
    # so such replies are not silently mapped to the fallback class. The
    # message content can also be None, which is treated as an empty reply.
    raw_answer = response.choices[0].message.content or ''
    answer = raw_answer.strip().strip('"\'`.').strip().lower()
    return response_mapping.get(answer, 2)

In [None]:
# Zero-shot predictions and metrics on the mwoz test split.
tqdm.pandas()
mwoz_test = mwoz_data['test']
mwoz_test['zs_pred'] = mwoz_test['full_conv'].progress_apply(predict_zs)
sat_evaluation(mwoz_test['zs_pred'].values, mwoz_test['label'].values, sat_num=3)

In [None]:
# Zero-shot predictions and metrics on the sgd test split.
tqdm.pandas()
sgd_test = sgd_data['test']
sgd_test['zs_pred'] = sgd_test['full_conv'].progress_apply(predict_zs)
sat_evaluation(sgd_test['zs_pred'].values, sgd_test['label'].values, sat_num=3)

In [None]:
# Zero-shot predictions and metrics on the redial test split.
tqdm.pandas()
redial_test = redial_data['test']
redial_test['zs_pred'] = redial_test['full_conv'].progress_apply(predict_zs)
sat_evaluation(redial_test['zs_pred'].values, redial_test['label'].values, sat_num=3)

# Few-shot prompting

In [None]:
def get_formatted_examples(df_train, random_state=None):
    """Build the few-shot example block from one random row per label.

    One row is sampled for each of the labels 2, 1 and 0, and its ``full_conv``
    text is placed under the answers "satisfied", "neutral" and "dissatisfied"
    respectively.

    Args:
        df_train: DataFrame with ``label`` and ``full_conv`` columns.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            chosen examples are reproducible. The default ``None`` keeps the
            sampling unseeded.

    Returns:
        The formatted examples as a single string.

    Raises:
        ValueError: if any of the three labels has no rows in ``df_train``.
    """
    def _pick(label):
        # Draw one row carrying the requested label.
        pool = df_train[df_train['label'] == label]
        if pool.empty:
            raise ValueError(f"no training example with label {label}")
        return pool.sample(n=1, random_state=random_state).iloc[0]

    pos_ex = _pick(2)
    neu_ex = _pick(1)
    neg_ex = _pick(0)
    return f"""### Example 1
#conversation
{pos_ex['full_conv']}

# answer
satisfied

---

### Example 2
#conversation
{neu_ex['full_conv']}

# answer
neutral

---

### Example 3
#conversation
{neg_ex['full_conv']}

# answer
dissatisfied
"""

In [None]:
# Sample one few-shot example block per dataset from its training split.
mwoz_examples = get_formatted_examples(df_train=mwoz_data['train'])
sgd_examples = get_formatted_examples(df_train=sgd_data['train'])
redial_examples = get_formatted_examples(df_train=redial_data['train'])

In [None]:
# Show the mwoz few-shot block to check its formatting.
print(mwoz_examples)

In [None]:
# Few-shot instruction template. predict_fs fills `{examples}` and
# `{full_conv}` via str.format; the text ends with an open "# answer" heading.
FEW_SHOT_PROMPT = """Your task is to assess the user's satisfaction at the end of the given conversation. Choose only one of these labels:

- "satisfied": The assistant's response successfully meets the user's needs and the user appears satisfied.
- "dissatisfied": The user's needs are not met and they appear dissatisfied.
- "neutral": Neither satisfied nor dissatisfied, or the conversation is purely informational, routine, or a greeting.

Output only one word: "satisfied", "dissatisfied", or "neutral". Do not add any explanation or extra text.

## Examples

{examples}
---


## Now evaluate the following conversation

# conversation
{full_conv}

# answer"""

In [None]:
def predict_fs(full_conv, data_examples):
    """Classify one conversation with the few-shot prompt.

    Args:
        full_conv: conversation text substituted into ``FEW_SHOT_PROMPT``.
        data_examples: formatted example block substituted into the prompt.

    Returns:
        The integer class from ``response_mapping``. Replies that do not match
        any key after normalisation fall back to 2.
    """
    response = run_completion(
        messages=[{"role": "user", "content": [{"type": "text", "text": FEW_SHOT_PROMPT.format(full_conv=full_conv, examples=data_examples)}]}],
        temperature=0.7,
        max_tokens=5
    )
    # The prompt shows the labels in double quotes, so the reply may echo the
    # quotes, capitalise the word or add a period. Normalise before the lookup
    # so such replies are not silently mapped to the fallback class. The
    # message content can also be None, which is treated as an empty reply.
    raw_answer = response.choices[0].message.content or ''
    answer = raw_answer.strip().strip('"\'`.').strip().lower()
    return response_mapping.get(answer, 2)

In [None]:
# Few-shot predictions and metrics on the mwoz test split.
tqdm.pandas()
mwoz_test = mwoz_data['test']
mwoz_test['fs_pred'] = mwoz_test['full_conv'].progress_apply(lambda conv: predict_fs(conv, mwoz_examples))
sat_evaluation(mwoz_test['fs_pred'].values, mwoz_test['label'].values, sat_num=3)

In [None]:
# Few-shot predictions and metrics on the sgd test split.
tqdm.pandas()
sgd_test = sgd_data['test']
sgd_test['fs_pred'] = sgd_test['full_conv'].progress_apply(lambda conv: predict_fs(conv, sgd_examples))
sat_evaluation(sgd_test['fs_pred'].values, sgd_test['label'].values, sat_num=3)

In [None]:
# Few-shot predictions and metrics on the redial test split.
tqdm.pandas()
# Use the examples sampled from redial's own training split, not the mwoz ones.
redial_data['test']['fs_pred'] = redial_data['test']['full_conv'].progress_apply(lambda conv: predict_fs(conv, redial_examples))
sat_evaluation(redial_data['test']['fs_pred'].values, redial_data['test']['label'].values, sat_num=3)

# Save predictions

In [None]:
# Write each test split, including its prediction columns, to its own CSV.
for csv_path, split_frames in (
    ('zero-few-shot-predictions-mwoz.csv', mwoz_data),
    ('zero-few-shot-predictions-sgd.csv', sgd_data),
    ('zero-few-shot-predictions-redial.csv', redial_data),
):
    split_frames['test'].to_csv(csv_path, index=False)