# Echo Results Review Try 2

In [None]:
# Mount Google Drive at /content/drive so the project folder below is reachable
from google.colab import drive
drive.mount('/content/drive')

# Set up working directory: create it if it does not exist yet, then make it
# the current directory so the relative paths used later in this notebook
# (the echo_*.csv files, "final_model", the output CSVs) resolve against it
import os
WORKING_DIR = '/content/drive/MyDrive/echo_training/'  # Change this to your preferred location
os.makedirs(WORKING_DIR, exist_ok=True)
os.chdir(WORKING_DIR)

In [None]:
import pandas as pd
import numpy as np
import ast
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [None]:
# Read the train / tune / test splits from the current working directory
train_df, tune_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('echo_train.csv', 'echo_tune.csv', 'echo_test.csv')
)

In [None]:
# Work on an independent (deep) copy so the loaded test split stays untouched
test_df_copy = test_df.copy(deep=True)

In [None]:
# Whatever the first column is called in the CSV, expose it as 'id_num'
first_column = test_df_copy.columns[0]
test_df_copy = test_df_copy.rename(columns={first_column: 'id_num'})

In [None]:

# ==============================================================================
# SETUP: LOAD MODEL AND DEFINE LABELS
# ==============================================================================

# Names of the 19 output features; position i here corresponds to element i
# of every label list used later in the notebook
LABEL_NAMES = [
    'LA_cavity',
    'RA_dilated',
    'LV_systolic',
    'LV_cavity',
    'LV_wall',
    'RV_cavity',
    'RV_systolic',
    'AV_stenosis',
    'MV_stenosis',
    'TV_regurgitation',
    'TV_stenosis',
    'TV_pulm_htn',
    'AV_regurgitation',
    'MV_regurgitation',
    'RA_pressure',
    'LV_diastolic',
    'RV_volume_overload',
    'RV_wall',
    'RV_pressure_overload',
]

# Tokenizer and fine-tuned weights both come from the relative path "final_model"
model_path = "final_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:

# ==============================================================================
# INFERENCE FUNCTION
# ==============================================================================

def generate_prediction(text):
    """Generate the model's label assessment for one echocardiogram report.

    Wraps the report in the user/model turn template, runs greedy decoding
    with the notebook-level ``model`` and ``tokenizer``, and returns only the
    text produced by the model (the prompt is not included).

    Args:
        text: Report text inserted into the prompt after "Report:".

    Returns:
        The decoded completion with special tokens removed and surrounding
        whitespace stripped.
    """
    prompt = f"""<start_of_turn>user
Analyze this echocardiogram report and provide assessment values for each cardiac feature. Output should be in the format "feature: value" for each of the 19 features.

Report:
{text}<end_of_turn>
<start_of_turn>model
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Greedy decoding: `temperature` has no effect when do_sample=False, so it
    # is not passed (passing it only makes transformers emit a warning).
    outputs = model.generate(
        **inputs,
        max_new_tokens=300,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )

    # The returned sequence starts with the prompt tokens. Slice them off by
    # position instead of searching the decoded text for "<start_of_turn>model":
    # with skip_special_tokens=True that marker may already have been removed,
    # in which case a text split would hand back the entire prompt + report.
    generated_tokens = outputs[0][prompt_length:]
    model_output = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    return model_output


In [None]:
# ==============================================================================
# PARSE PREDICTIONS - SIMPLER VERSION
# ==============================================================================

def parse_prediction(pred_text, label_names=None):
    """Extract predicted integer label values from model output text.

    For every label, the lines of ``pred_text`` are scanned in order for
    ``<label>: <integer>``; the first line that contains such a pair supplies
    the value. The integer must follow the label's own colon, so several
    pairs on one line, prefixes containing other colons, and trailing text
    after the number (e.g. ``LA_cavity: 2 (moderate)``) are all handled.
    Decimal values such as ``1.5`` are not accepted as integers.

    Args:
        pred_text: Raw model response. Anything that is not a string
            (None, NaN, ...) is treated as "nothing parseable".
        label_names: Optional sequence of label names to look for, in output
            order. Defaults to the notebook-level ``LABEL_NAMES``.

    Returns:
        A list with one entry per label: the parsed ``int``, or ``None`` when
        no value could be found for that label.
    """
    import re  # local import keeps this cell independent of the import cell

    labels = LABEL_NAMES if label_names is None else label_names

    if not isinstance(pred_text, str):
        return [None] * len(labels)

    lines = pred_text.splitlines()
    predicted = []

    for label_name in labels:
        # Whole-word label, optional punctuation (e.g. markdown "**"), a colon,
        # then a signed integer that is not the start of a longer/decimal number.
        pattern = re.compile(
            r'(?<!\w)' + re.escape(label_name) + r'(?!\w)[^\w:]*:\s*([+-]?\d+)(?!\.?\d)'
        )

        value = None
        for line in lines:
            match = pattern.search(line)
            if match:
                value = int(match.group(1))
                break

        predicted.append(value)

    return predicted

In [None]:
# ==============================================================================
# RUN INFERENCE ON ALL TEST EXAMPLES
# ==============================================================================

results = []

print(f"Running inference on {len(test_df_copy)} test examples...")
for idx in tqdm(range(len(test_df_copy))):
    # Pull the row once and read the three fields from it
    row = test_df_copy.iloc[idx]
    echo_text = row['text']
    true_labels_raw = row['labels']
    id_num = row['id_num']

    # Labels stored as a string literal are evaluated; anything else is used as-is
    true_labels = (
        ast.literal_eval(true_labels_raw)
        if isinstance(true_labels_raw, str)
        else true_labels_raw
    )

    # One record per example: inputs, ground truth and the raw model response
    results.append({
        'idx': idx,
        'id_num': id_num,
        'echo_text': echo_text,
        'true_labels': true_labels,
        'prediction_text': generate_prediction(echo_text),
    })

# Collect all records into a single frame and persist it
results_df = pd.DataFrame(results)
results_df.to_csv('test_inference_results.csv', index=False)

print(f"\nSaved results to test_inference_results.csv")
print(f"Shape: {results_df.shape}")

In [None]:

# ==============================================================================
# PART 1: LABEL DISTRIBUTION IN TEST SET
# ==============================================================================

print("\n" + "="*70)
print("LABEL DISTRIBUTION IN TEST SET")
print("="*70)

for i, label_name in enumerate(LABEL_NAMES):
    print(f"\n{label_name}:")

    # i-th entry of every row's label list (string-encoded lists are evaluated first)
    label_values = []
    for idx in range(len(test_df_copy)):
        labels_raw = test_df_copy.iloc[idx]['labels']
        labels = ast.literal_eval(labels_raw) if isinstance(labels_raw, str) else labels_raw
        label_values.append(labels[i])

    # Frequency of each distinct value, plus a separate tally of missing entries
    value_series = pd.Series(label_values)
    value_counts = value_series.value_counts().sort_index()
    null_count = value_series.isna().sum()
    total = len(label_values)

    for value, count in value_counts.items():
        print(f"  {value:>3}: {count:>5} ({(count/total)*100:>5.1f}%)")
    if null_count > 0:
        print(f"  Null: {null_count:>5} ({(null_count/total)*100:>5.1f}%)")

In [None]:

# ==============================================================================
# PART 2: CALCULATE ACCURACY
# ==============================================================================

# Turn every raw model response into a per-label list of values
print("\n\nParsing predictions...")
results_df['pred_labels'] = results_df['prediction_text'].apply(parse_prediction)

print("\n" + "="*70)
print("ACCURACY BY LABEL")
print("="*70)

accuracy_results = []

for i, label_name in enumerate(LABEL_NAMES):
    # Ground-truth value at position i (None when the list is too short)
    true_vals = results_df['true_labels'].apply(
        lambda labels: None if i >= len(labels) else labels[i]
    ).values

    # Predicted value at position i (None when the list is empty/missing or too short)
    pred_vals = results_df['pred_labels'].apply(
        lambda labels: None if (not labels or i >= len(labels)) else labels[i]
    ).values

    # Rows without a parsed prediction are excluded from the accuracy denominator
    missing_mask = pd.isna(pred_vals)
    valid_mask = ~missing_mask
    true_vals_valid = true_vals[valid_mask]
    pred_vals_valid = pred_vals[valid_mask]

    total = len(true_vals_valid)
    correct = (true_vals_valid == pred_vals_valid).sum()
    accuracy = correct / total if total > 0 else 0

    # Number of rows for which this label could not be parsed
    unparseable = missing_mask.sum()

    accuracy_results.append({
        'label': label_name,
        'correct': correct,
        'total': total,
        'accuracy': accuracy,
        'unparseable': unparseable
    })

    print(f"\n{label_name}:")
    print(f"  Correct: {correct}/{total} = {accuracy:.3f}")
    if unparseable > 0:
        print(f"  Unparseable: {unparseable}")


In [None]:
# Per-label accuracy records as a table
accuracy_df = pd.DataFrame(accuracy_results)

# A row is an exact match when its predicted list equals its true list
exact_matches = sum(
    1
    for true_list, pred_list in zip(results_df['true_labels'], results_df['pred_labels'])
    if true_list == pred_list
)
print("\n" + "="*70)
print(f"EXACT MATCH (all 19 labels correct): {exact_matches}/{len(results_df)} = {exact_matches/len(results_df):.3f}")
print("="*70)

# Persist the per-label table
accuracy_df.to_csv('label_accuracy.csv', index=False)
print("\nAccuracy results saved to label_accuracy.csv")

# Plain-text dump of the full table
print("\nACCURACY SUMMARY:")
print(accuracy_df.to_string(index=False))