In [4]:
!pip install transformers datasets


Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (1

In [5]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)

# GPT-2 doesn't have a pad token by default
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [6]:
def create_prompt(example_text, few_shot_examples):
    prompt = "You are a dialogue act classifier. Classify the dialogue act of the given sentence.\n\n"
    for ex_text, ex_label in few_shot_examples:
        prompt += f"Sentence: {ex_text}\nLabel: {ex_label}\n\n"
    prompt += f"Sentence: {example_text}\nLabel:"
    return prompt


In [7]:
few_shot_mrda = [
    ("I think we should discuss the budget.", "Statement"),
    ("Okay, let's start with the first item.", "Backchannel"),
    ("Can you explain that again?", "Question"),
    ("Yes, I understand.", "Acknowledgment"),
    ("Thank you.", "Appreciation"),
]

# Inference
def classify(text, few_shot_examples):
    prompt = create_prompt(text, few_shot_examples)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=10)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded.split("Label:")[-1].strip().split("\n")[0]

# Example sentence
test_sentence = "Could we go back to the earlier point?"
print("Predicted Label:", classify(test_sentence, few_shot_mrda))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicted Label: Yes


In [8]:
few_shot_swda = [
    ("How are you doing today?", "Greeting"),
    ("I'm good, thank you!", "Statement"),
    ("Oh, really?", "Backchannel"),
    ("Do you know what time it is?", "Yes-No Question"),
    ("I don't think that's right.", "Disagreement"),
]
# Inference
def classify(text, few_shot_examples):
    prompt = create_prompt(text, few_shot_examples)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=10)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded.split("Label:")[-1].strip().split("\n")[0]

# Example sentence
test_sentence = "Could we go back to the earlier point?"
print("Predicted Label:", classify(test_sentence, few_shot_swda))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicted Label: Yes


In [9]:
few_shot_dyda = [
    ("Let's start the presentation.", "Inform"),
    ("What is the purpose of this study?", "Question"),
    ("That's a good idea.", "Agreement"),
    ("Please proceed to the next slide.", "Instruction"),
    ("I'm not sure.", "Uncertainty"),
]
# Inference
def classify(text, few_shot_examples):
    prompt = create_prompt(text, few_shot_examples)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=10)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded.split("Label:")[-1].strip().split("\n")[0]

# Example sentence
test_sentence = "Could we go back to the earlier point?"
print("Predicted Label:", classify(test_sentence, few_shot_dyda))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicted Label: Yes


In [10]:
test_samples = ["Could you clarify that?", "Alright, moving on.", "Why do you think that happened?"]

for sentence in test_samples:
    print(f"Sentence: {sentence}")
    print("Predicted:", classify(sentence, few_shot_mrda))
    print("-----")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sentence: Could you clarify that?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicted: Question
-----
Sentence: Alright, moving on.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicted: I'm sorry, I'm sorry.
-----
Sentence: Why do you think that happened?
Predicted: Because I was a little bit of a jerk.
-----


In [11]:
!git clone https://github.com/zihaohe123/speak-turn-emb-dialog-act-clf.git
%cd speak-turn-emb-dialog-act-clf


Cloning into 'speak-turn-emb-dialog-act-clf'...
remote: Enumerating objects: 55, done.[K
remote: Counting objects: 100% (55/55), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 55 (delta 27), reused 37 (delta 17), pack-reused 0 (from 0)[K
Receiving objects: 100% (55/55), 7.30 MiB | 9.32 MiB/s, done.
Resolving deltas: 100% (27/27), done.
/content/speak-turn-emb-dialog-act-clf


In [12]:
!unzip data.zip



Archive:  data.zip
   creating: data/
  inflating: __MACOSX/._data         
   creating: data/mrda/
  inflating: __MACOSX/data/._mrda    
   creating: data/swda/
  inflating: __MACOSX/data/._swda    
   creating: data/dyda/
  inflating: __MACOSX/data/._dyda    
  inflating: data/mrda/train.csv     
  inflating: __MACOSX/data/mrda/._train.csv  
  inflating: data/mrda/val.csv       
  inflating: __MACOSX/data/mrda/._val.csv  
  inflating: data/mrda/test.csv      
  inflating: __MACOSX/data/mrda/._test.csv  
  inflating: data/swda/test.csv      
  inflating: __MACOSX/data/swda/._test.csv  
  inflating: data/swda/train.csv     
  inflating: __MACOSX/data/swda/._train.csv  
  inflating: data/swda/val.csv       
  inflating: __MACOSX/data/swda/._val.csv  
  inflating: data/dyda/val.csv       
  inflating: __MACOSX/data/dyda/._val.csv  
  inflating: data/dyda/test.csv      
  inflating: __MACOSX/data/dyda/._test.csv  
  inflating: data/dyda/train.csv     
  inflating: __MACOSX/data/dyda/._tra

In [13]:

import pandas as pd
mrda_test = pd.read_csv("data/mrda/test.csv")
mrda_test.head()


Unnamed: 0,speaker,text,act,conv_id,topic,topic_ldaconv,topic_ldautt
0,0,some some introductions are in order.,0,0,-1,-1,-1
1,1,oh okay.,0,0,-1,-1,-1
2,0,sorry.,0,0,-1,-1,-1
3,1,okay.,3,0,-1,-1,-1
4,1,getting ahead of myself.,0,0,-1,-1,-1


In [14]:
print(mrda_test.columns)


Index(['speaker', 'text', 'act', 'conv_id', 'topic', 'topic_ldaconv',
       'topic_ldautt'],
      dtype='object')


In [15]:
few_shot_mrda = list(mrda_test[['text', 'act']].iloc[:10].itertuples(index=False, name=None))

print(few_shot_mrda)


[('some some introductions are in order.', 0), ('oh okay.', 0), ('sorry.', 0), ('okay.', 3), ('getting ahead of myself.', 0), ('so', 3), ("um for those who don't know", 2), ('everyone knows me.', 0), ('this is great.', 0), ('yay!', 0)]


In [16]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

# Set pad token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id


In [17]:
def create_prompt(example_text, few_shot_examples):
    prompt = "You are a dialogue act classifier. Classify the dialogue act of the given sentence.\n\n"
    for ex_text, ex_label in few_shot_examples:
        prompt += f"Sentence: {ex_text}\nLabel: {ex_label}\n\n"
    prompt += f"Sentence: {example_text}\nLabel:"
    return prompt

def classify(text, few_shot_examples):
    prompt = create_prompt(text, few_shot_examples)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=10)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded.split("Label:")[-1].strip().split("\n")[0]


In [18]:
label_map = {
    0: "Statement",
    1: "Backchannel",
    2: "Question",
    3: "Agreement",
    4: "Disruption"
}


In [19]:
print(mrda_test.columns)


Index(['speaker', 'text', 'act', 'conv_id', 'topic', 'topic_ldaconv',
       'topic_ldautt'],
      dtype='object')


In [20]:
# Correct label map for MRDA
label_map = {
    0: "Statement",
    1: "Backchannel",
    2: "Question",
    3: "Agreement",
    4: "Disruption"
}

# Use actual column names
test_sentence = mrda_test['text'].iloc[20]
true_label = label_map[mrda_test['act'].iloc[20]]

pred = classify(test_sentence, few_shot_mrda)

print("Sentence:", test_sentence)
print("True Label:", true_label)
print("Predicted Label:", pred)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sentence: i don't know.
True Label: Statement
Predicted Label: 0


In [21]:
swda_test = pd.read_csv("data/swda/test.csv")
swda_test.columns


Index(['speaker', 'text', 'act', 'conv_id', 'topic', 'topic_ldaconv',
       'topic_ldautt'],
      dtype='object')

In [22]:
swda_label_map = {
    0: "Statement",
    1: "Backchannel",
    2: "Question",
    3: "Agreement",
    4: "Disagreement",
    5: "Clarification",
    6: "Acknowledge",
    7: "Appreciation",
    8: "Other"
}


In [23]:
swda_test['act'].unique()  # Check available label IDs


array([ 8, 35,  6, 42, 29, 32, 23, 26, 11,  1, 38, 37,  9, 34, 18,  2, 28,
       36, 40, 39,  5,  3, 27,  4, 41, 16, 31, 25, 17, 19, 20, 12,  7, 10,
       15, 24, 21, 13, 14,  0])

In [24]:
unique_labels = sorted(swda_test['act'].unique())
swda_label_map = {i: f"Class_{i}" for i in unique_labels}


In [25]:
print(dict(list(swda_label_map.items())[:10]))


{np.int64(0): 'Class_0', np.int64(1): 'Class_1', np.int64(2): 'Class_2', np.int64(3): 'Class_3', np.int64(4): 'Class_4', np.int64(5): 'Class_5', np.int64(6): 'Class_6', np.int64(7): 'Class_7', np.int64(8): 'Class_8', np.int64(9): 'Class_9'}


In [26]:
few_shot_swda = [
    (text, swda_label_map[label])
    for text, label in zip(swda_test['text'][:10], swda_test['act'][:10])
]


In [27]:
test_sentence_swda = swda_test['text'].iloc[10]
true_label_swda = swda_label_map[swda_test['act'].iloc[10]]

pred_swda = classify(test_sentence_swda, few_shot_swda)

print("Sentence:", test_sentence_swda)
print("True Label:", true_label_swda)
print("Predicted Label:", pred_swda)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sentence: And that people aren't self serving.  /
True Label: Class_42
Predicted Label: Class_6


In [28]:
dyda_test = pd.read_csv("data/dyda/test.csv")
print(dyda_test.columns)

Index(['speaker', 'text', 'act', 'conv_id', 'topic', 'topic_ldaconv',
       'topic_ldautt'],
      dtype='object')


In [29]:
dyda_test['act'].unique()


array([2, 1, 3, 0])

In [30]:
dyda_label_map = {
    0: "Statement",
    1: "Question",
    2: "Command",
    3: "Backchannel"
}


In [31]:
few_shot_dyda = [
    (text, dyda_label_map[label])
    for text, label in zip(dyda_test['text'][:10], dyda_test['act'][:10])
]


In [32]:
test_sentence_dyda = dyda_test['text'].iloc[10]
true_label_dyda = dyda_label_map[dyda_test['act'].iloc[10]]

pred_dyda = classify(test_sentence_dyda, few_shot_dyda)

print("Sentence:", test_sentence_dyda)
print("True Label:", true_label_dyda)
print("Predicted Label:", pred_dyda)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sentence:  Yeah ? 
True Label: Question
Predicted Label: Backchannel


In [33]:
def evaluate_gpt2_accuracy(test_df, few_shot_examples, label_map, text_col="text", label_col="act", num_samples=100):
    correct = 0
    total = min(num_samples, len(test_df))

    for i in range(total):
        sentence = test_df[text_col].iloc[i]
        true_label_id = test_df[label_col].iloc[i]
        true_label = label_map[true_label_id]

        pred_label = classify(sentence, few_shot_examples)

        if pred_label.strip().lower() == true_label.strip().lower():
            correct += 1

    accuracy = correct / total
    print(f"Accuracy over {total} samples: {accuracy:.2f}")
    return accuracy


In [34]:
evaluate_gpt2_accuracy(mrda_test, few_shot_mrda, label_map)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Accuracy over 100 samples: 0.00


0.0

In [35]:
evaluate_gpt2_accuracy(swda_test, few_shot_swda, swda_label_map)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Accuracy over 100 samples: 0.45


0.45

In [36]:
evaluate_gpt2_accuracy(dyda_test, few_shot_dyda, dyda_label_map)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Accuracy over 100 samples: 0.25


0.25