## Comparison of the test results for Qwen2Audio using our data

In [4]:
import pandas as pd

In [7]:
# Load the gold annotations and build one `Reference` text per row by joining the
# `Answer` and `Reasoning` columns with a single space.
ref = pd.read_csv("test_mentalhealth.csv")

# fillna("") before astype(str): a missing cell would otherwise be rendered as the
# literal text "nan" and end up inside the reference string that gets scored.
answer_text = ref['Answer'].fillna("").astype(str)
reasoning_text = ref['Reasoning'].fillna("").astype(str)

# strip() drops the dangling separator space when one of the two parts is empty.
ref['Reference'] = (answer_text + " " + reasoning_text).str.strip()
ref.head()

Unnamed: 0,file_id,Question,Answer,Reasoning,Reference
0,mfe8OzzArGc_chunk29_data1_task3,What cause of depression does this show?,This patient shows causes of depression relate...,The text raises a question about feeling at ri...,This patient shows causes of depression relate...
1,jm6PG989Q_0_chunk78_data1_task1,Does the patient suffer from loneliness?,"Yes, the patient suffers from loneliness.",The patient expresses feelings of helplessness...,"Yes, the patient suffers from loneliness. The ..."
2,jkKm5Cym-ZY_chunk4_data2_task4,Does the emotional wellness dimension exist here?,"Yes, this wellness dimension exists here.",The individual expresses a recognition of fluc...,"Yes, this wellness dimension exists here. The ..."
3,jlXmVqhaMds_chunk5_data2_task4,Does the emotional wellness dimension exist here?,"Yes, this wellness dimension exists here.",The patient expresses a sense of foreboding an...,"Yes, this wellness dimension exists here. The ..."
4,-GaYArbWL1Q_chunk11_data1_task3,What is the stress cause here?,This patient shows the stress cause related to...,"The text indicates that the patient, Julie, ha...",This patient shows the stress cause related to...


In [8]:
# Load the generated answers to be evaluated and preview the first rows.
# Later cells read the `file_id` and `Generated` columns of this frame.
pred = pd.read_csv("qwen_inf_output.csv")
pred.head()

Unnamed: 0,file_id,Question,Generated
0,mfe8OzzArGc_chunk29_data1_task3,What cause of depression does this show?,The audio does not provide enough information ...
1,jm6PG989Q_0_chunk78_data1_task1,Does the patient suffer from loneliness?,"Yes, the patient feels helpless and alone as t..."
2,jkKm5Cym-ZY_chunk4_data2_task4,Does the emotional wellness dimension exist here?,"Based on the provided transcription, it is not..."
3,jlXmVqhaMds_chunk5_data2_task4,Does the emotional wellness dimension exist here?,"Yes, the speaker is expressing a negative emot..."
4,-GaYArbWL1Q_chunk11_data1_task3,What is the stress cause here?,The stressor seems to be related to a situatio...


## COMET scores

In [2]:
from comet import download_model, load_from_checkpoint

# Fetch the "Unbabel/wmt22-comet-da" checkpoint (download_model returns its local
# path) and build the COMET scoring model from it.
# NOTE(review): the name `model` is rebound to a SentenceTransformer in the SBERT
# section further down. The COMET predict cell therefore only works if it runs
# before that cell, or after this cell has been re-run.
model_path = download_model("Unbabel/wmt22-comet-da")
model = load_from_checkpoint(model_path)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 5 files: 100%|████████████████████████| 5/5 [00:00<00:00, 59409.41it/s]
Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
Encoder model frozen.
/data/amey_2311cs10/anaconda3/envs/condapy312/lib/python3.12/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [8]:
# Prepare data in COMET format.
# Pair each prediction with its reference by `file_id` rather than by row position:
# positional `.loc[i]` lookups silently mis-pair rows when the two CSVs differ in
# order, and raise KeyError when `pred` has fewer rows than `ref`. This is the same
# join key the SBERT section uses, so both metrics score the same pairs.
comet_pairs = ref[["file_id", "Reference"]].merge(
    pred[["file_id", "Generated"]], on="file_id", how="inner"
)

# NOTE(review): "src" is left empty on purpose (there is no source-language text
# here); confirm this usage is acceptable for the chosen COMET checkpoint.
comet_reason_data = [
    {
        "src": "",  # Source left empty
        "mt": hypothesis,
        "ref": reference,
    }
    for hypothesis, reference in zip(
        # A missing generation is scored as an empty hypothesis instead of a float NaN.
        comet_pairs["Generated"].fillna("").astype(str),
        comet_pairs["Reference"].astype(str),
    )
]

In [None]:
# # Run scoring (earlier draft, kept for reference; the executed scoring call is in the following cell)
# model_output = model.predict(data, batch_size=8, gpus=1 if model.hparams.use_gpu else 0)

# # Get scores
# scores = model_output.scores

# # Add scores to the prediction DataFrame (or ref, as needed)
# pred['COMET_score'] = scores

In [9]:
import torch

# Score every {"src", "mt", "ref"} record with the COMET model.
# `model` must still be the COMET model here (the SBERT section rebinds that name).
# Use a GPU only when one is actually available so the cell also runs on CPU-only hosts.
gpus = 1 if torch.cuda.is_available() else 0
model_output = model.predict(comet_reason_data, batch_size=8, gpus=gpus)

# Individual scores
comet_scores = model_output.scores

# Average score (NaN instead of ZeroDivisionError when nothing was scored)
if len(comet_scores) > 0:
    avg_comet = sum(comet_scores) / len(comet_scores)
else:
    avg_comet = float("nan")

print(f"Average COMET score: {avg_comet:.4f}")

Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA H100 PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Predicting DataLoader 0: 100%|████████████████| 225/225 [00:16<00:00, 13.26it/s]


Average COMET score: 0.5551


# Chunked SBERT + Max Matching + Average

In [1]:
import nltk
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Sentence-tokenizer data used by nltk.sent_tokenize.
# Recent NLTK releases look up the 'punkt_tab' resource instead of the pickled
# 'punkt' one, so fetch both. Without the right resource sent_tokenize raises
# LookupError, and the tokenizer helper below would fall back to naive '.' splitting.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /data/amey_2311cs10/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load SBERT model
# NOTE(review): this rebinds `model`, the same name the COMET section uses for the
# COMET checkpoint. After this cell runs, the COMET `model.predict(...)` cell will
# not work until the COMET model is loaded again.
model = SentenceTransformer('all-MiniLM-L6-v2')  # or 'all-mpnet-base-v2' for better quality

In [3]:
# Sentence splitting with a fallback if nltk (or its tokenizer data) is not available
def safe_sent_tokenize(text):
    """Split `text` into a list of sentence strings.

    Uses `nltk.sent_tokenize` when NLTK and its tokenizer data are available.
    If NLTK cannot be imported or its data cannot be found, falls back to
    splitting on '.' and emits a warning, because the fallback produces
    different sentence boundaries and therefore different similarity scores.

    Args:
        text: Text to split. None and float NaN (pandas' missing value) are
            treated as empty; any other non-string value is converted with str().

    Returns:
        list[str]: The sentences; an empty list for missing or blank input.
    """
    # Missing values carry no sentences (NaN is the only float unequal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text)
    if not text.strip():
        return []

    try:
        import nltk
        return nltk.sent_tokenize(text)
    except (ImportError, LookupError) as err:
        # Only "nltk missing" / "tokenizer data missing" trigger the fallback;
        # any other error is a real problem and is allowed to propagate.
        import warnings
        warnings.warn(
            f"nltk sentence tokenizer unavailable ({err.__class__.__name__}); "
            "falling back to naive '.' splitting"
        )
        # Basic sentence split fallback
        return [s.strip() for s in text.split('.') if s.strip()]

In [None]:
# Join references and predictions on their shared key; only file_ids present in
# both frames survive the inner join.
df = ref.merge(right=pred, how='inner', on='file_id')

In [12]:
# SBERT similarity function with fallback tokenization
def sbert_similarity(text1, text2):
    """Symmetric sentence-level semantic similarity between two texts.

    Both texts are split into sentences with `safe_sent_tokenize`, each sentence
    is embedded with the module-level SBERT `model`, and a cosine-similarity
    matrix is computed between the two sets of sentence embeddings. Every
    sentence is matched to its most similar sentence on the other side; the mean
    best-match score is taken in each direction and the two means are averaged.

    Args:
        text1: First text (the scoring loop in this notebook passes the reference).
        text2: Second text (the scoring loop passes the generated answer).

    Returns:
        float: Average of the two directional mean best-match cosine similarities.

    Raises:
        ValueError: If either text yields no sentences, since a best match
            cannot be computed against an empty side.
    """
    sents1 = safe_sent_tokenize(text1)
    sents2 = safe_sent_tokenize(text2)

    # Fail early with a clear message instead of an opaque error from deep inside
    # encode()/max() when one side is empty.
    if not sents1 or not sents2:
        raise ValueError("cannot compute similarity: one of the texts contains no sentences")

    emb1 = model.encode(sents1, convert_to_tensor=True)
    emb2 = model.encode(sents2, convert_to_tensor=True)

    sim_matrix = util.pytorch_cos_sim(emb1, emb2)
    # max over dim=1: best text2 match for every text1 sentence.
    ref_to_pred = sim_matrix.max(dim=1).values.mean().item()
    # max over dim=0: best text1 match for every text2 sentence.
    pred_to_ref = sim_matrix.max(dim=0).values.mean().item()

    return (ref_to_pred + pred_to_ref) / 2

# Score every merged row; df is expected to have 'Reference' and 'Generated' columns
similarities = []

for idx, reference, generated in zip(df.index, df['Reference'], df['Generated']):
    try:
        score = sbert_similarity(reference, generated)
    except Exception as e:
        # Best-effort: report the failing row and continue; it is left out of the mean.
        print(f"Error at row {idx}: {e}")
        score = None
    similarities.append(score)

# Add to DataFrame
df['sbert_similarity'] = similarities

# Compute and print mean similarity (excluding None)
valid_scores = [s for s in similarities if s is not None]
if len(valid_scores) < len(similarities):
    print(f"Skipped {len(similarities) - len(valid_scores)} of {len(similarities)} rows due to errors")

# NaN instead of ZeroDivisionError when no row could be scored.
mean_score = sum(valid_scores) / len(valid_scores) if valid_scores else float("nan")
print(f"Mean SBERT Semantic Similarity: {mean_score:.4f}")

Mean SBERT Semantic Similarity: 0.5474
