# Comparison of the test results for Salmonn on our data
### Since Salmonn doesn't give any Reasoning, we compare the ground-truth Answer with the model predictions

In [3]:
import json
from pathlib import Path
import pandas as pd
import re

In [7]:
# Read the raw model outputs from disk and parse them as JSON.
data = json.loads(Path('salmonn_outputs.json').read_text())

# Tabulate the records, expose the model text as 'predicted', and rewrite the
# <s> ... </s> markers as <Answer> ... </Answer> tags (literal replacement,
# not a regex).
df = (
    pd.DataFrame(data)
    .rename(columns={'output': 'predicted'})
    .assign(
        predicted=lambda frame: frame['predicted']
        .str.replace('<s>', '<Answer>', regex=False)
        .str.replace('</s>', '</Answer>', regex=False)
    )
)

# Preview the first rows.
df.head()

Unnamed: 0,path,question,predicted
0,/data/amey_2311cs10/debayan/test_mentalhealth_...,What cause of depression does this show?,<Answer> This show does not provide enough inf...
1,/data/amey_2311cs10/debayan/test_mentalhealth_...,Does the patient suffer from loneliness?,"<Answer> Yes, the patient mentions feeling com..."
2,/data/amey_2311cs10/debayan/test_mentalhealth_...,Does the emotional wellness dimension exist here?,"<Answer> Yes, the emotional wellness dimension..."
3,/data/amey_2311cs10/debayan/test_mentalhealth_...,Does the emotional wellness dimension exist here?,"<Answer> Yes, the emotional wellness dimension..."
4,/data/amey_2311cs10/debayan/test_mentalhealth_...,What is the stress cause here?,<Answer> The stress cause here is social anxie...


In [12]:
# --- Ground truth: read and parse the annotation file ---
gt_data = json.loads(Path('data_test_salmonn_annotations.json').read_text())

# The individual records sit under the top-level 'annotation' key.
annotations = gt_data['annotation']

# Create a dictionary mapping from 'path' to the extracted <Answer>...</Answer> part
# Compiled once: the first <Answer>...</Answer> span (non-greedy); DOTALL lets
# the span cross line breaks.
_ANSWER_RE = re.compile(r"<Answer>.*?</Answer>", re.DOTALL)


def extract_answer(text):
    """Return the first ``<Answer>...</Answer>`` span in ``text``, tags included.

    Args:
        text: Annotation text to search. Anything that is not a ``str``
            (for example ``None`` or ``NaN`` from a missing field) is treated
            as containing no answer instead of raising ``TypeError``.

    Returns:
        The matched span including both tags, or ``None`` when ``text`` is not
        a string or holds no complete ``<Answer>...</Answer>`` pair.
    """
    if not isinstance(text, str):
        return None
    match = _ANSWER_RE.search(text)
    # The span begins and ends with a tag, so there is no outer whitespace
    # left to strip.
    return match.group(0) if match else None

# Lookup table: audio path -> extracted <Answer>...</Answer> span. When a path
# occurs more than once, the last annotation for that path wins.
path_to_answer = dict(
    (item['path'], extract_answer(item['text'])) for item in annotations
)

# --- Attach the reference answer to every prediction row, keyed on 'path' ---
# NOTE(review): a path with no entry in the lookup maps to NaN - confirm that
# every row found a reference before scoring.
df = df.assign(reference=df['path'].map(path_to_answer))

# Preview the first rows.
df.head()

Unnamed: 0,path,question,predicted,reference
0,/data/amey_2311cs10/debayan/test_mentalhealth_...,What cause of depression does this show?,<Answer> This show does not provide enough inf...,<Answer> This patient shows causes of depressi...
1,/data/amey_2311cs10/debayan/test_mentalhealth_...,Does the patient suffer from loneliness?,"<Answer> Yes, the patient mentions feeling com...","<Answer> Yes, the patient suffers from lonelin..."
2,/data/amey_2311cs10/debayan/test_mentalhealth_...,Does the emotional wellness dimension exist here?,"<Answer> Yes, the emotional wellness dimension...","<Answer> Yes, this wellness dimension exists h..."
3,/data/amey_2311cs10/debayan/test_mentalhealth_...,Does the emotional wellness dimension exist here?,"<Answer> Yes, the emotional wellness dimension...","<Answer> Yes, this wellness dimension exists h..."
4,/data/amey_2311cs10/debayan/test_mentalhealth_...,What is the stress cause here?,<Answer> The stress cause here is social anxie...,<Answer> This patient shows the stress cause r...


## COMET scores

In [13]:
from comet import download_model, load_from_checkpoint

# Identifier of the COMET checkpoint to fetch.
COMET_MODEL_NAME = "Unbabel/wmt22-comet-da"

# Download the checkpoint (or reuse a cached copy) and load it from the
# returned path.
model_path = download_model(COMET_MODEL_NAME)
model = load_from_checkpoint(model_path)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 5 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 72565.81it/s]
Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
Encoder model frozen.
/data/amey_2311cs10/anaconda3/envs/condapy312/lib/python3.12/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [16]:
# Prepare data in COMET format: one {"src", "mt", "ref"} record per row of df.
# The two columns are paired positionally, so the records follow the row order
# of df and do not depend on df carrying the default 0..n-1 integer index.
comet_reason_data = [
    {
        "src": "",  # Source left empty
        "mt": predicted,
        "ref": reference,
    }
    for predicted, reference in zip(df["predicted"], df["reference"])
]

In [19]:
# Score every record in comet_reason_data with the COMET model held in `model`.
# NOTE(review): `model` is rebound to a SentenceTransformer in the SBERT section
# of this notebook; re-running this cell after that one would call the wrong
# object.
model_output = model.predict(comet_reason_data, batch_size=8, gpus=1)  # Set gpus=0 if no GPU

# Individual scores
comet_scores = model_output.scores

# Average score: plain arithmetic mean of the individual scores
# (raises ZeroDivisionError if comet_scores is empty).
avg_comet = sum(comet_scores) / len(comet_scores)

print(f"Average COMET score: {avg_comet:.4f}")

Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting DataLoader 0: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 225/225 [00:09<00:00, 23.41it/s]


Average COMET score: 0.7905


# New Salmonn output with Reasoning

In [6]:
import json
from pathlib import Path
import pandas as pd
import re

In [7]:
# Read the reasoning-enabled model outputs from disk and parse them as JSON.
data = json.loads(Path('salmonn_outputs_with_reasoning.json').read_text())

# Tabulate the records and expose the model text as 'predicted'.
# The <s>/</s> -> <Answer>/</Answer> substitution is not applied in this
# section, so 'predicted' keeps whatever <s> markers the model emitted.
df = pd.DataFrame(data).rename(columns={'output': 'predicted'})

# Preview the first rows.
df.head()

Unnamed: 0,path,question,predicted
0,./test_mentalhealth_16kHz/mfe8OzzArGc_chunk29_...,What cause of depression does this show?\n\nBe...,<s> <Answer> This patient shows the stress cau...
1,./test_mentalhealth_16kHz/jm6PG989Q_0_chunk78_...,Does the patient suffer from loneliness?\n\nBe...,"<s> Yes, the patient suffers from stress.\n\nR..."
2,./test_mentalhealth_16kHz/jkKm5Cym-ZY_chunk4_d...,Does the emotional wellness dimension exist he...,"<s> Yes, this wellness dimension exists here. ..."
3,./test_mentalhealth_16kHz/jlXmVqhaMds_chunk5_d...,Does the emotional wellness dimension exist he...,"<s> Yes, this wellness dimension exists here. ..."
4,./test_mentalhealth_16kHz/-GaYArbWL1Q_chunk11_...,What is the stress cause here?\n\nBelow is an ...,<s> This patient shows the stress cause relate...


In [10]:
# --- Ground truth: read and parse the annotation file ---
gt_data = json.loads(Path('test_mentalhealth_salmonn.json').read_text())

# The individual records sit under the top-level 'annotation' key.
annotations = gt_data['annotation']

# Lookup table: audio path -> full annotation text, used as-is. When a path
# occurs more than once, the last annotation for that path wins.
path_to_answer = dict((item['path'], item['text']) for item in annotations)

# --- Attach the reference text to every prediction row, keyed on 'path' ---
# NOTE(review): a path with no entry in the lookup maps to NaN - confirm that
# every row found a reference before scoring.
df = df.assign(reference=df['path'].map(path_to_answer))

# Preview the first rows.
df.head()

Unnamed: 0,path,question,predicted,reference
0,./test_mentalhealth_16kHz/mfe8OzzArGc_chunk29_...,What cause of depression does this show?\n\nBe...,<s> <Answer> This patient shows the stress cau...,<Answer> This patient shows causes of depressi...
1,./test_mentalhealth_16kHz/jm6PG989Q_0_chunk78_...,Does the patient suffer from loneliness?\n\nBe...,"<s> Yes, the patient suffers from stress.\n\nR...","<Answer> Yes, the patient suffers from lonelin..."
2,./test_mentalhealth_16kHz/jkKm5Cym-ZY_chunk4_d...,Does the emotional wellness dimension exist he...,"<s> Yes, this wellness dimension exists here. ...","<Answer> Yes, this wellness dimension exists h..."
3,./test_mentalhealth_16kHz/jlXmVqhaMds_chunk5_d...,Does the emotional wellness dimension exist he...,"<s> Yes, this wellness dimension exists here. ...","<Answer> Yes, this wellness dimension exists h..."
4,./test_mentalhealth_16kHz/-GaYArbWL1Q_chunk11_...,What is the stress cause here?\n\nBelow is an ...,<s> This patient shows the stress cause relate...,<Answer> This patient shows the stress cause r...


### COMET scores

In [8]:
from comet import download_model, load_from_checkpoint

# Identifier of the COMET checkpoint to fetch.
COMET_MODEL_NAME = "Unbabel/wmt22-comet-da"

# Download the checkpoint (or reuse a cached copy) and load it from the
# returned path.
model_path = download_model(COMET_MODEL_NAME)
model = load_from_checkpoint(model_path)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 5 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 11178.85it/s]
Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
Encoder model frozen.
/data/amey_2311cs10/anaconda3/envs/condapy312/lib/python3.12/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [9]:
# Prepare data in COMET format: one {"src", "mt", "ref"} record per row of df.
# The two columns are paired positionally, so the records follow the row order
# of df and do not depend on df carrying the default 0..n-1 integer index.
comet_reason_data = [
    {
        "src": "",  # Source left empty
        "mt": predicted,
        "ref": reference,
    }
    for predicted, reference in zip(df["predicted"], df["reference"])
]

In [10]:
# Score every record in comet_reason_data with the COMET model held in `model`.
# NOTE(review): `model` is rebound to a SentenceTransformer in the SBERT section
# of this notebook; re-running this cell after that one would call the wrong
# object.
model_output = model.predict(comet_reason_data, batch_size=8, gpus=1)  # Set gpus=0 if no GPU

# Individual scores
comet_scores = model_output.scores

# Average score: plain arithmetic mean of the individual scores
# (raises ZeroDivisionError if comet_scores is empty).
avg_comet = sum(comet_scores) / len(comet_scores)

print(f"Average COMET score: {avg_comet:.4f}")

Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA H100 PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting DataLoader 0: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 225/225 [00:19<00:00, 11.67it/s]


Average COMET score: 0.5936


# Chunked SBERT + Max Matching + Average

In [1]:
import nltk

# NLTK 3.8.2 and later load the sentence tokenizer from the 'punkt_tab'
# resource instead of the pickled 'punkt' one. Fetch both so that
# nltk.sent_tokenize works on either side of that change; on an older NLTK the
# first call just reports that the package is unknown and returns False.
nltk.download('punkt_tab')
# Kept last so the value displayed by the cell is still the 'punkt' result.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /data/amey_2311cs10/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Sentence-embedding checkpoint; 'all-mpnet-base-v2' is the better-quality
# alternative.
SBERT_MODEL_NAME = 'all-MiniLM-L6-v2'

# NOTE(review): this rebinds `model`, the same name the COMET cells use for
# their checkpoint - re-run the COMET load cell before re-scoring with COMET.
model = SentenceTransformer(SBERT_MODEL_NAME)

In [5]:
# Sentence splitting with a fallback for when NLTK (or its data) is unavailable
def safe_sent_tokenize(text):
    """Split ``text`` into sentences, preferring NLTK's sentence tokenizer.

    When NLTK cannot be imported (``ImportError``) or its tokenizer data is
    missing (``LookupError``), a warning is issued and the text is split on
    ``'.'`` instead: pieces are stripped, empty pieces are dropped, and the
    periods themselves are not kept. Any other exception propagates to the
    caller instead of being silently swallowed.

    Args:
        text: The string to split.

    Returns:
        A list of sentence strings; may be empty.
    """
    try:
        import nltk
        return nltk.sent_tokenize(text)
    except (ImportError, LookupError) as exc:
        # Make the degraded tokenization visible: the naive split yields
        # different sentence boundaries and therefore different scores.
        import warnings
        warnings.warn(
            f"NLTK sentence tokenizer unavailable ({exc!r}); "
            "falling back to splitting on '.'"
        )
        # Basic sentence split fallback
        return [s.strip() for s in text.split('.') if s.strip()]

In [11]:
# Sentence-level SBERT similarity between two texts (symmetric max-matching)
def sbert_similarity(text1, text2):
    """Score how similar two texts are, sentence by sentence.

    Each text is split with ``safe_sent_tokenize`` and its sentences are
    embedded with the module-level ``model``. From the pairwise cosine
    similarities, every sentence is matched to its most similar sentence in
    the other text; the best-match scores are averaged per direction and the
    two directional averages are then averaged.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The mean of the two directional best-match averages, as a float.
    """
    first_sents, second_sents = (
        safe_sent_tokenize(text) for text in (text1, text2)
    )
    first_emb, second_emb = (
        model.encode(sents, convert_to_tensor=True)
        for sents in (first_sents, second_sents)
    )

    # Rows follow text1's sentences, columns follow text2's.
    pairwise = util.pytorch_cos_sim(first_emb, second_emb)

    # dim=1: best match in text2 for each text1 sentence; dim=0: the reverse.
    directional_means = [
        pairwise.max(dim=axis).values.mean().item() for axis in (1, 0)
    ]
    return sum(directional_means) / 2

# Score every row of df; expects 'reference' and 'predicted' columns.
# NOTE(review): the execution counts restart in this section, so confirm which
# run's df (with or without reasoning) is in scope when this cell executes.
similarities = []

for idx, row in df.iterrows():
    try:
        score = sbert_similarity(row['reference'], row['predicted'])
    except Exception as e:
        # Best effort: report the row and carry on; None marks it as unscored.
        print(f"Error at row {idx}: {e}")
        score = None
    similarities.append(score)

# Add to DataFrame
df['sbert_similarity'] = similarities

# Compute and print mean similarity (excluding None)
valid_scores = [s for s in similarities if s is not None]

# Say how many rows were left out, so the mean is not mistaken for one taken
# over the full test set.
n_failed = len(similarities) - len(valid_scores)
if n_failed:
    print(f"Skipped {n_failed} of {len(similarities)} rows that could not be scored")

# With no scored rows there is no mean; report NaN rather than crash with
# ZeroDivisionError.
mean_score = sum(valid_scores) / len(valid_scores) if valid_scores else float('nan')
print(f"\nMean SBERT Semantic Similarity: {mean_score:.4f}")


Mean SBERT Semantic Similarity: 0.5147
