## This is the the comparison of the Test Results for GAMA using our data

In [1]:
import json
import pandas as pd
import re

In [2]:
with open('stage5_answers_qformer_all.json', 'r') as f:
    data = json.load(f)

print(data[1])

{'audio_id': '/data/amey_2311cs10/AudioLM_baselines/GAMA/test_mentalhealth_16kHz/jm6PG989Q_0_chunk78_data1_task1.wav', 'instruction': 'Does the patient suffer from loneliness?', 'prediction': '<Answer>Yes, the patient suffers from loneliness.</Answer><Reasoning>The patient describes a situation where they are trying to help someone but feel unsupported and ignored. The mention of being told "you\'re not going to be there for me" indicates feelings of isolation and rejection. This suggests that the patient may be experiencing loneliness due to their inability to connect with others when they need it most. The emotional pain associated with these interactions highlights a sense of disconnection and lack of support, which can contribute to feelings of loneliness.</Reasoning>', 'ref': '<Answer>Yes, the patient suffers from loneliness.</Answer><Reasoning>The patient expresses feelings of helplessness and emotional pain related to someone they care about leaving them for another partner. The

In [3]:
def extract_between_tags(text, tag):
    pattern = fr'<{tag}>(.*?)</{tag}>'
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else ""

# Process rows
rows = []
for item in data:
    audio_id = item['audio_id']
    instruction = item['instruction']
    pred_ans = extract_between_tags(item['prediction'], 'Answer')
    pred_reason = extract_between_tags(item['prediction'], 'Reasoning')
    ref_ans = extract_between_tags(item['ref'], 'Answer')
    ref_reason = extract_between_tags(item['ref'], 'Reasoning')
    
    rows.append({
        'audio_id': audio_id,
        'instruction': instruction,
        'pred_ans': pred_ans,
        'pred_reason': pred_reason,
        'ref_ans': ref_ans,
        'ref_reason': ref_reason
    })

# Convert to DataFrame
df = pd.DataFrame(rows)

# Print first row of the DataFrame
df.head(5)

Unnamed: 0,audio_id,instruction,pred_ans,pred_reason,ref_ans,ref_reason
0,/data/amey_2311cs10/AudioLM_baselines/GAMA/tes...,What cause of depression does this show?,This patient shows causes of depression relate...,The text suggests a concern about the potentia...,This patient shows causes of depression relate...,The text raises a question about feeling at ri...
1,/data/amey_2311cs10/AudioLM_baselines/GAMA/tes...,Does the patient suffer from loneliness?,"Yes, the patient suffers from loneliness.",The patient describes a situation where they a...,"Yes, the patient suffers from loneliness.",The patient expresses feelings of helplessness...
2,/data/amey_2311cs10/AudioLM_baselines/GAMA/tes...,Does the emotional wellness dimension exist here?,"Yes, this wellness dimension exists here.",The patient expresses feelings of sadness and ...,"Yes, this wellness dimension exists here.",The individual expresses a recognition of fluc...
3,/data/amey_2311cs10/AudioLM_baselines/GAMA/tes...,Does the emotional wellness dimension exist here?,"Yes, this wellness dimension exists here.",The patient expresses a desire to avoid situat...,"Yes, this wellness dimension exists here.",The patient expresses a sense of foreboding an...
4,/data/amey_2311cs10/AudioLM_baselines/GAMA/tes...,What is the stress cause here?,This patient shows the stress cause related to...,The text indicates that the individual is expe...,This patient shows the stress cause related to...,"The text indicates that the patient, Julie, ha..."


In [4]:
# save as csv
df.to_csv("gama_test_results.csv", index=False)

## Calculate BLEU and COMET scores

In [18]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

smoothie = SmoothingFunction().method4

# from comet import download_model, load_from_checkpoint

In [19]:
# # BLEU Score Calculation
# def compute_bleu(reference, prediction):
#     smoothie = SmoothingFunction().method4
#     return sentence_bleu([reference.split()], prediction.split(), smoothing_function=smoothie)

# df['bleu'] = df.apply(lambda row: compute_bleu(row['ref_ans'], row['pred_ans']), axis=1)

#### For Answer part

In [16]:
# Convert to list of tokenized references and hypotheses
references = [[ref.split()] for ref in df['ref_ans']]
hypotheses = [pred.split() for pred in df['pred_ans']]

# Compute BLEU scores
bleu1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0), smoothing_function=smoothie)
bleu2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie)
bleu3 = corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothie)
bleu4 = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)

print(f"BLEU-1: {bleu1:.4f}")
print(f"BLEU-2: {bleu2:.4f}")
print(f"BLEU-3: {bleu3:.4f}")
print(f"BLEU-4: {bleu4:.4f}")

BLEU-1: 0.8251
BLEU-2: 0.8007
BLEU-3: 0.7782
BLEU-4: 0.7547


The small decline across BLEU-1 → BLEU-4 is expected and shows it maintains context across longer spans.

#### For Reasoning part

In [17]:
# Convert to list of tokenized references and hypotheses
references = [[ref.split()] for ref in df['ref_reason']]
hypotheses = [pred.split() for pred in df['pred_reason']]

# Compute BLEU scores
bleu1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0), smoothing_function=smoothie)
bleu2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie)
bleu3 = corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothie)
bleu4 = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)

print(f"BLEU-1: {bleu1:.4f}")
print(f"BLEU-2: {bleu2:.4f}")
print(f"BLEU-3: {bleu3:.4f}")
print(f"BLEU-4: {bleu4:.4f}")

BLEU-1: 0.4027
BLEU-2: 0.2334
BLEU-3: 0.1535
BLEU-4: 0.1030


#### Interpretation
<span style="color:red">
Model struggles to get longer, fluent, or syntactically correct phrases that match the reference. The model may understand vocabulary but not generate fluent or grammatically correct phrases. Could be an issue with fluency, word order, or context understanding.
</span> 

# Comet scores

In [5]:
from comet import download_model, load_from_checkpoint

model_path = download_model("Unbabel/wmt22-comet-da")
model = load_from_checkpoint(model_path)


  from .autonotebook import tqdm as notebook_tqdm
Fetching 5 files: 100%|████████████████████████| 5/5 [00:00<00:00, 67001.66it/s]
Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
Encoder model frozen.
/data/amey_2311cs10/anaconda3/envs/condapy312/lib/python3.12/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


#### Comet for Answer part

In [24]:
comet_ans_data = [
    {"src": "", "mt": pred, "ref": ref}
    for pred, ref in zip(df["pred_ans"], df["ref_ans"])
]

In [26]:
model_output = model.predict(comet_ans_data, batch_size=8, gpus=1)  # Set gpus=0 if no GPU

# Individual scores
comet_scores = model_output.scores

# Average score
avg_comet = sum(comet_scores) / len(comet_scores)

print(f"Average COMET score: {avg_comet:.4f}")

Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA H100 PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting DataLoader 0: 100%|████████████████| 225/225 [00:08<00:00, 26.58it/s]


Average COMET score: 0.8836


#### Comet for Reasoning part

In [29]:
comet_reason_data = [
    {"src": "", "mt": pred, "ref": ref}
    for pred, ref in zip(df["pred_reason"], df["ref_reason"])
]

In [31]:
model_output = model.predict(comet_reason_data, batch_size=8, gpus=1)  # Set gpus=0 if no GPU

# Individual scores
comet_scores = model_output.scores

# Average score
avg_comet = sum(comet_scores) / len(comet_scores)

print(f"Average COMET score: {avg_comet:.4f}")

Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting DataLoader 0: 100%|████████████████| 225/225 [00:17<00:00, 13.19it/s]


Average COMET score: 0.7157


# COMET on both Answer and Reasoning combined

In [3]:
# Process rows
rows = []
for item in data:
    audio_id = item['audio_id']
    instruction = item['instruction']
    pred = item['prediction']
    ref = item['ref']
    
    rows.append({
        'audio_id': audio_id,
        'instruction': instruction,
        'pred': pred,
        'ref': ref
    })

# Convert to DataFrame
df = pd.DataFrame(rows)

# Print first row of the DataFrame
df.head(5)

Unnamed: 0,audio_id,instruction,pred,ref
0,/data/amey_2311cs10/AudioLM_baselines/GAMA/tes...,What cause of depression does this show?,<Answer>This patient shows causes of depressio...,<Answer>This patient shows causes of depressio...
1,/data/amey_2311cs10/AudioLM_baselines/GAMA/tes...,Does the patient suffer from loneliness?,"<Answer>Yes, the patient suffers from loneline...","<Answer>Yes, the patient suffers from loneline..."
2,/data/amey_2311cs10/AudioLM_baselines/GAMA/tes...,Does the emotional wellness dimension exist here?,"<Answer>Yes, this wellness dimension exists he...","<Answer>Yes, this wellness dimension exists he..."
3,/data/amey_2311cs10/AudioLM_baselines/GAMA/tes...,Does the emotional wellness dimension exist here?,"<Answer>Yes, this wellness dimension exists he...","<Answer>Yes, this wellness dimension exists he..."
4,/data/amey_2311cs10/AudioLM_baselines/GAMA/tes...,What is the stress cause here?,<Answer>This patient shows the stress cause re...,<Answer>This patient shows the stress cause re...


In [6]:
# Prepare data in COMET format
comet_data = [
    {
        "src": "",  # Source left empty
        "mt": df.loc[i, "pred"],
        "ref": df.loc[i, "ref"]
    }
    for i in range(len(df))
]

In [7]:
model_output = model.predict(comet_data, batch_size=8, gpus=1)  # Set gpus=0 if no GPU

# Individual scores
comet_scores = model_output.scores

# Average score
avg_comet = sum(comet_scores) / len(comet_scores)

print(f"Average COMET score: {avg_comet:.4f}")

Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA H100 PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting DataLoader 0: 100%|████████████████| 225/225 [00:19<00:00, 11.39it/s]


Average COMET score: 0.7524


# Chunked SBERT + Max Matching + Average

In [7]:
import nltk
nltk.download('punkt')
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

[nltk_data] Downloading package punkt to
[nltk_data]     /data/amey_2311cs10/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Load SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')  # or 'all-mpnet-base-v2' for better quality

In [8]:
# Fallback if nltk not available
def safe_sent_tokenize(text):
    try:
        import nltk
        return nltk.sent_tokenize(text)
    except:
        # Basic sentence split fallback
        return [s.strip() for s in text.split('.') if s.strip()]

In [9]:
# SBERT similarity function with fallback tokenization
def sbert_similarity(text1, text2):
    sents1 = safe_sent_tokenize(text1)
    sents2 = safe_sent_tokenize(text2)

    emb1 = model.encode(sents1, convert_to_tensor=True)
    emb2 = model.encode(sents2, convert_to_tensor=True)

    sim_matrix = util.pytorch_cos_sim(emb1, emb2)
    ref_to_pred = sim_matrix.max(dim=1).values.mean().item()
    pred_to_ref = sim_matrix.max(dim=0).values.mean().item()

    return (ref_to_pred + pred_to_ref) / 2

# Assuming df has 'reference' and 'predicted' columns
similarities = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    try:
        score = sbert_similarity(row['ref'], row['pred'])
    except Exception as e:
        print(f"Error at row {idx}: {e}")
        score = None
    similarities.append(score)



# Add to DataFrame
df['sbert_similarity'] = similarities

# Compute and print mean similarity (excluding None)
valid_scores = [s for s in similarities if s is not None]
mean_score = sum(valid_scores) / len(valid_scores)
print(f"Mean SBERT Semantic Similarity: {mean_score:.4f}")

100%|███████████████████████████████████████| 1800/1800 [00:21<00:00, 82.24it/s]

Mean SBERT Semantic Similarity: 0.7419



