<a href="https://colab.research.google.com/github/ermiasmikael/bert_fine_tuned/blob/master/new_model_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install fuzzywuzzy



In [2]:
!pip install python-Levenshtein



In [3]:
# General imports
import os
import json
import warnings
from collections import Counter
import random

import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Data handling and preprocessing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# PyTorch and Transformers
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.nn.utils import clip_grad_norm_
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoConfig,
    AdamW,
    get_linear_schedule_with_warmup,
    BertTokenizer,
    DataCollatorWithPadding
)
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import DataLoader

# LoRA (Parameter-Efficient Fine-Tuning)
from peft import LoraConfig, get_peft_model, PeftModel, PeftType, TaskType

# Logging and Debugging
import logging

# Suppress warnings and logging messages
warnings.filterwarnings('ignore')
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)


In [6]:
# Load, split in chunks, tensorize, train/test split
data = pd.read_csv("dsm5_disorders.csv")

# Step 1: Create label mapping for broad_category
label_mapping = {category: idx for idx, category in enumerate(data['broad_category'].unique())}
data['label_encoded'] = data['broad_category'].map(label_mapping)

# Save the label mapping
with open("label_mapping.json", "w") as f:
    json.dump(label_mapping, f)
print("Label mapping saved:", label_mapping)

# Verify encoded labels
print("Sample Data with Encoded Labels:")
print(data[['broad_category', 'label_encoded']].head())

# Step 2: Train/Test/Val Split (retain all columns)
train_data, temp_data = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
    stratify=data['label_encoded']
)

test_data, val_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
    stratify=temp_data['label_encoded']
)


# Step 3: Tokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Function: Split long text into chunks
def split_text_into_chunks(text, max_length, tokenizer):
    tokens = tokenizer.tokenize(text)
    chunks = []
    for i in range(0, len(tokens), max_length):
        chunk = tokens[i:i + max_length]
        chunks.append(tokenizer.convert_tokens_to_string(chunk))
    return chunks

# Preprocess data (splitting long texts into chunks while retaining all columns)
def preprocess_data(data, tokenizer, max_length=512):
    new_records = []
    for _, row in data.iterrows():
        combined_text = f"{row['text']} {row['back_translated_text']}"
        text_chunks = split_text_into_chunks(combined_text, max_length - 2, tokenizer)  # Reserve space for special tokens
        for chunk in text_chunks:
            new_records.append({
                'text': chunk,
                'broad_category': row['broad_category'],
                'category': row['category'],
                'label_encoded': row['label_encoded']
            })
    return pd.DataFrame(new_records)

# Preprocess train and test data
train_data = preprocess_data(train_data, tokenizer)
val_data = preprocess_data(val_data, tokenizer)
test_data = preprocess_data(test_data, tokenizer)

# Step 4: Tokenize data
def tokenize_data(data, tokenizer):
    texts = data['text'].tolist()
    labels = data['label_encoded'].tolist()

    tokenized = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    features = []
    for i in range(len(labels)):
        features.append({
            "input_ids": tokenized["input_ids"][i],
            "attention_mask": tokenized["attention_mask"][i],
            "labels": torch.tensor(labels[i], dtype=torch.long)
        })
    return features

# Tokenize train, validation, and test data
train_features = tokenize_data(train_data, tokenizer)
val_features = tokenize_data(val_data, tokenizer)
test_features = tokenize_data(test_data, tokenizer)

# Step 5: DataLoader
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_loader = DataLoader(train_features, batch_size=2, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(val_features, batch_size=2, shuffle=False, collate_fn=data_collator)
test_loader = DataLoader(test_features, batch_size=2, shuffle=False, collate_fn=data_collator)

# Step 6: Verify Tokenization
for batch in train_loader:
    print("Input IDs:", batch["input_ids"].shape)
    print("Attention Mask:", batch["attention_mask"].shape)
    print("Labels:", batch["labels"].shape)
    break

# Step 7: Check for label mismatches
mismatched_labels = []
for label in train_data['broad_category']:
    if label not in label_mapping:
        mismatched_labels.append(label)

if mismatched_labels:
    print("Mismatched Labels Found:", mismatched_labels)
else:
    print("All tokenized labels match the label_mapping keys!")

# Check unique labels in the dataset against label_mapping
unique_labels = train_data['broad_category'].unique()
missing_keys = [label for label in unique_labels if label not in label_mapping]
if missing_keys:
    print("Missing Keys in label_mapping:", missing_keys)
else:
    print("All dataset labels are covered by label_mapping!")


Label mapping saved: {'somatic symptom and related disorders': 0, 'motor disorders': 1, 'gender dysphoria': 2, 'medication-induced movement disorders and other adverse effects of medication': 3, 'dissociative disorders': 4, 'opioid-related disorders': 5, 'other conditions that may be a focus of clinical attention': 6, 'alcohol-induced mental disorders': 7, 'obsessive-compulsive and related disorders': 8, 'sexual dysfunctions': 9, 'neurodevelopmental disorders': 10, 'anxiety disorders': 11, 'trauma- and stressor-related disorders': 12, 'inhalant-related disorders': 13, 'neurocognitive disorders': 14, 'other mental disorders and additional codes': 15, 'personality disorders': 16, 'elimination disorders': 17, 'tobacco-related disorders': 18, 'paraphilic disorders': 19, 'bipolar and related disorders': 20, 'schizophrenia spectrum and other psychotic disorders': 21, 'sleep-wake disorders': 22, 'feeding and eating disorders': 23, 'depressive disorders': 24, 'disruptive, impulse-control, and 

In [18]:
# Load, split in chunks, tensorize, train/test split
data = pd.read_csv("dsm5_disorders.csv")

# Step 1: Create label mapping for broad_category
label_mapping = {category: idx for idx, category in enumerate(data['broad_category'].unique())}
data['label_encoded'] = data['broad_category'].map(label_mapping)

# Save the label mapping
with open("label_mapping.json", "w") as f:
    json.dump(label_mapping, f)
print("Label mapping saved:", label_mapping)

# Verify encoded labels
print("Sample Data with Encoded Labels:")
print(data[['broad_category', 'label_encoded']].head())

# Step 2: Train/Test/Val Split (retain all columns)
train_data, temp_data = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
    stratify=data['label_encoded']
)

test_data, val_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
    stratify=temp_data['label_encoded']
)


# Step 3: Tokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Function: Split long text into chunks
def split_text_into_chunks(text, max_length, tokenizer, overlap=0):
    tokens = tokenizer.tokenize(text)
    chunks = []
    stride = max_length - overlap
    for i in range(0, len(tokens), stride):
        chunk = tokens[i:i + max_length]
        chunk_text = tokenizer.convert_tokens_to_string(chunk)
        chunks.append(chunk_text)
    return chunks

# Preprocess data (splitting long texts into chunks while retaining all columns)
def preprocess_data(data, tokenizer, max_length=512):
    new_records = []
    for _, row in data.iterrows():
        combined_text = f"{row['text']} {row['back_translated_text']}"
        text_chunks = split_text_into_chunks(combined_text, max_length - 2, tokenizer)  # Reserve space for special tokens
        for chunk in text_chunks:
            new_records.append({
                'text': chunk,
                'broad_category': row['broad_category'],
                'category': row['category'],
                'label_encoded': row['label_encoded']
            })
    return pd.DataFrame(new_records)

# Preprocess train and test data
train_data = preprocess_data(train_data, tokenizer)
val_data = preprocess_data(val_data, tokenizer)
test_data = preprocess_data(test_data, tokenizer)

def tokenize_data(data, tokenizer):
    texts = data['text'].tolist()  # This 'text' already contains back_translated_text
    labels = data['label_encoded'].tolist()
    broad_categories = data['broad_category'].tolist()
    categories = data['category'].tolist()

    # Tokenize 'text' (which already includes back_translated_text)
    tokenized_text = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,  # Adjust as needed
        return_tensors="pt"
    )

    # Tokenize 'broad_category'
    tokenized_broad_cat = tokenizer(
        broad_categories,
        padding=True,
        truncation=True,
        max_length=64,  # Adjust as needed
        return_tensors="pt"
    )

    # Tokenize 'category'
    tokenized_cat = tokenizer(
        categories,
        padding=True,
        truncation=True,
        max_length=64,  # Adjust as needed
        return_tensors="pt"
    )

    features = []
    for i in range(len(texts)):
        features.append({
            "input_ids": tokenized_text["input_ids"][i],
            "attention_mask": tokenized_text["attention_mask"][i],
            "broad_category_input_ids": tokenized_broad_cat["input_ids"][i],
            "broad_category_attention_mask": tokenized_broad_cat["attention_mask"][i],
            "category_input_ids": tokenized_cat["input_ids"][i],
            "category_attention_mask": tokenized_cat["attention_mask"][i],
            "labels": torch.tensor(labels[i], dtype=torch.long)
        })
    return features

# Tokenize train, validation, and test data
train_features = tokenize_data(train_data, tokenizer)
val_features = tokenize_data(val_data, tokenizer)
test_features = tokenize_data(test_data, tokenizer)

# Step 5: DataLoader
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_loader = DataLoader(train_features, batch_size=2, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(val_features, batch_size=2, shuffle=False, collate_fn=data_collator)
test_loader = DataLoader(test_features, batch_size=2, shuffle=False, collate_fn=data_collator)

# Step 6: Verify Tokenization
for batch in train_loader:
    print("Input IDs:", batch["input_ids"].shape)
    print("Attention Mask:", batch["attention_mask"].shape)
    print("Labels:", batch["labels"].shape)
    break

# Step 7: Check for label mismatches
mismatched_labels = []
for label in train_data['broad_category']:
    if label not in label_mapping:
        mismatched_labels.append(label)

if mismatched_labels:
    print("Mismatched Labels Found:", mismatched_labels)
else:
    print("All tokenized labels match the label_mapping keys!")

# Check unique labels in the dataset against label_mapping
unique_labels = train_data['broad_category'].unique()
missing_keys = [label for label in unique_labels if label not in label_mapping]
if missing_keys:
    print("Missing Keys in label_mapping:", missing_keys)
else:
    print("All dataset labels are covered by label_mapping!")


Label mapping saved: {'somatic symptom and related disorders': 0, 'motor disorders': 1, 'gender dysphoria': 2, 'medication-induced movement disorders and other adverse effects of medication': 3, 'dissociative disorders': 4, 'opioid-related disorders': 5, 'other conditions that may be a focus of clinical attention': 6, 'alcohol-induced mental disorders': 7, 'obsessive-compulsive and related disorders': 8, 'sexual dysfunctions': 9, 'neurodevelopmental disorders': 10, 'anxiety disorders': 11, 'trauma- and stressor-related disorders': 12, 'inhalant-related disorders': 13, 'neurocognitive disorders': 14, 'other mental disorders and additional codes': 15, 'personality disorders': 16, 'elimination disorders': 17, 'tobacco-related disorders': 18, 'paraphilic disorders': 19, 'bipolar and related disorders': 20, 'schizophrenia spectrum and other psychotic disorders': 21, 'sleep-wake disorders': 22, 'feeding and eating disorders': 23, 'depressive disorders': 24, 'disruptive, impulse-control, and 

In [8]:
sample_size = 300
batch_size = 16

sample_indices = random.sample(range(len(train_features)), min(sample_size, len(train_features)))

sample_input_ids = [train_features[i]["input_ids"] for i in sample_indices]
sample_attention_masks = [train_features[i]["attention_mask"] for i in sample_indices]
sample_labels = [train_features[i]["labels"] for i in sample_indices]

sample_input_ids = torch.stack(sample_input_ids)
sample_attention_masks = torch.stack(sample_attention_masks)
sample_labels = torch.stack(sample_labels)

sample_dataloader = DataLoader(
    TensorDataset(sample_input_ids, sample_attention_masks, sample_labels),
    batch_size=batch_size,
    shuffle=False
)

print(f"Small sample dataset created with {len(sample_indices)} examples for pre-tuning evaluation.")

# Inspect the sample
for batch in sample_dataloader:
    print("Input IDs Shape:", batch[0].shape)
    print("Attention Masks Shape:", batch[1].shape)
    print("Labels Shape:", batch[2].shape)
    max_length = batch[0].shape[1]
    assert max_length <= 512, f"Error: Batch contains sequence of length {max_length}!"
    break

Small sample dataset created with 300 examples for pre-tuning evaluation.
Input IDs Shape: torch.Size([16, 512])
Attention Masks Shape: torch.Size([16, 512])
Labels Shape: torch.Size([16])


In [9]:
# Defining model evaluation function
def evaluate_model(model, dataloader):

    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            input_ids = batch[0].to(device)
            attention_masks = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
            loss, logits = outputs.loss, outputs.logits

            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Normalize loss
    avg_loss = total_loss / len(dataloader)

    # Compute overall metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

    return avg_loss, accuracy, precision, recall, f1

In [10]:
# Pre-tuning model evaluation against the sample data:
num_labels = len(label_mapping)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT", num_labels=num_labels
).to(device)


print("Starting pre-fine-tuning evaluation...\n")
avg_loss, accuracy, precision, recall, f1 = evaluate_model(model, sample_dataloader)
print('Evaluation Results:')
print(f'Loss: {avg_loss:.4f}')
print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Precision: {precision * 100:.2f}%')
print(f'Recall: {recall * 100:.2f}%')
print(f'F1: {f1 * 100:.2f}%')

Starting pre-fine-tuning evaluation...

Evaluation Results:
Loss: 3.3112
Accuracy: 4.00%
Precision: 0.18%
Recall: 4.00%
F1: 0.34%


In [11]:
# Model initialization with PEFT

model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)

num_labels = len(label_mapping)

base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
)

# Define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=64,
    lora_alpha=128,
    lora_dropout=0.1
)

model = get_peft_model(base_model, lora_config).to(device)
#print(model)
model.print_trainable_parameters()


trainable params: 2,379,290 || all params: 110,709,556 || trainable%: 2.1491


In [12]:
# Defining model evaluation function
def evaluate_fine_tuning(model, dataloader):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            # Access tensors using keys instead of numerical indices
            input_ids = batch["input_ids"].to(device)
            attention_masks = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
            loss, logits = outputs.loss, outputs.logits

            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Normalize loss
    avg_loss = total_loss / len(dataloader)

    # Compute overall metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

    return avg_loss, accuracy, precision, recall, f1

In [17]:
# Fine-tuning the model
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn as nn

save_directory = './dsm_finetune'
os.makedirs(save_directory, exist_ok=True)

criterion = nn.CrossEntropyLoss()

optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay = 0.01)

total_steps = len(train_loader) * 10
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.9)

scaler = GradScaler()

epochs = 10
patience = 2
no_improvement = 0
best_f1 = 0

print("Starting training...")

for epoch in range(epochs):
    if no_improvement >= patience:
        print("Early stopping triggered.")
        break

    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_loader):
        # Access tensors from the batch dictionary
        input_ids = batch["input_ids"].to(device)
        attention_masks = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        # Mixed precision forward pass
        with autocast():
            outputs = model(input_ids, attention_mask=attention_masks)
            loss = criterion(outputs.logits, labels)

        total_train_loss += loss.item()

        # Mixed precision backward pass
        scaler.scale(loss).backward()

        clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        scaler.step(optimizer)
        scaler.update()

    # Evaluate the model progress
    avg_train_loss = total_train_loss / len(train_loader)
    avg_loss, accuracy, precision, recall, f1 = evaluate_fine_tuning(model, val_loader)
    scheduler.step(avg_loss)

    # Print evaluation results for test sets
    print(f'\nEpoch {epoch + 1} results:')
    print(f'Loss:{avg_loss:.4f} Accuracy:{accuracy * 100:.2f}% Precision:{precision * 100:.2f}% Recall:{recall * 100:.2f}% F1:{f1 * 100:.2f}%')

    # Check for best F1 and save the best model
    if f1 > best_f1:
        best_f1 = f1
        no_improvement = 0

        # Save model weights
        model_save_path = os.path.join(save_directory, 'psyai.pt')
        torch.save(model.state_dict(), model_save_path)

        # Save tokenizer and config
        tokenizer.save_pretrained(save_directory)
        config = AutoConfig.from_pretrained(model_name)
        config.save_pretrained(save_directory)

        with open(os.path.join(save_directory, 'int_to_label.json'), 'w') as f:
            json.dump(label_mapping, f)

        # Save training metadata
        training_metadata = {
            "learning_rate": optimizer.param_groups[0]['lr'],
            "epochs": epochs,
            "batch_size": train_loader.batch_size,
            "total_steps": total_steps,
            "best_f1": best_f1,
            "gradient_clipping": 1.0,
            "weight_decay": optimizer.param_groups[0].get('weight_decay', 0),
            "scheduler": str(scheduler.__class__.__name__)
        }
        with open(os.path.join(save_directory, 'training_metadata.json'), 'w') as f:
            json.dump(training_metadata, f)

        # Save evaluation metrics
        best_metrics = {
            "f1_score": best_f1,
            "precision": precision,
            "recall": recall,
            "accuracy": accuracy,
        }
        with open(os.path.join(save_directory, 'evaluation_metrics.json'), 'w') as f:
            json.dump(best_metrics, f)

    else:
        no_improvement += 1

print("Training completed. Best model saved.")

Starting training...

Epoch 1 results:
Loss:0.9271 Accuracy:77.11% Precision:78.94% Recall:77.11% F1:76.60%

Epoch 2 results:
Loss:0.9153 Accuracy:76.37% Precision:77.89% Recall:76.37% F1:75.80%

Epoch 3 results:
Loss:0.9034 Accuracy:76.87% Precision:78.63% Recall:76.87% F1:76.30%
Early stopping triggered.
Training completed. Best model saved.


In [77]:
# Evaluating model on the test set:
best_model_path = "dsm_finetune/psyai.pt"
model = AutoModelForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT", num_labels=num_labels
)
model.load_state_dict(torch.load(best_model_path))
model.to(device)

print("Starting pre-fine-tuning evaluation...\n")
avg_loss, accuracy, precision, recall, f1 = evaluate_fine_tuning(model, test_loader)
print('Evaluation Results:')
print(f'Loss: {avg_loss:.4f}')
print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Precision: {precision * 100:.2f}%')
print(f'Recall: {recall * 100:.2f}%')
print(f'F1: {f1 * 100:.2f}%')

Starting pre-fine-tuning evaluation...

Evaluation Results:
Loss: 0.8915
Accuracy: 92.38%
Precision: 92.74%
Recall: 92.38%
F1: 92.36%


In [81]:
def predict_diagnosis(text, diagnosis_labels):
    """Predicts the mental health diagnosis based on the given text."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    outputs = model(**inputs)
    predicted_class_id = outputs.logits.argmax().item()
    predicted_diagnosis = diagnosis_labels[predicted_class_id]
    return predicted_diagnosis

diagnosis_labels = list(label_mapping.keys())

# Example usage
text = "I've been feeling down and hopeless for the past few weeks."
predicted_diagnosis = predict_diagnosis(text, diagnosis_labels)
print(f"Predicted Diagnosis: {predicted_diagnosis}")

Predicted Diagnosis: depressive disorders


In [83]:
text = "I've been feeling mind-body dualism for the past few weeks."
predicted_diagnosis = predict_diagnosis(text, diagnosis_labels)
print(f"Predicted Diagnosis: {predicted_diagnosis}")

Predicted Diagnosis: paraphilic disorders


In [105]:
from fuzzywuzzy import fuzz, process

def predict_diagnosis(text, diagnosis_labels, df, max_length=512):
    """Predicts the mental health diagnosis and extracts relevant information using fuzzy matching."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    outputs = model(**inputs)
    predicted_class_id = outputs.logits.argmax().item()
    predicted_diagnosis = diagnosis_labels[predicted_class_id]

    # Fuzzy matching to find the closest text in the DataFrame
    best_match = process.extractOne(text, df['text'], scorer=fuzz.token_sort_ratio)

    if best_match and best_match[1] > 80:  # Set a similarity threshold (e.g., 80)
        matching_row_index = df[df['text'] == best_match[0]].index[0]
        category = df.loc[matching_row_index, 'category']
        summary = df.loc[matching_row_index, 'text'][:100] + "..."
    else:
        category = "Unknown"
        summary = "Not found"

    return predicted_diagnosis, category, summary

# Create diagnosis_labels from label_mapping
diagnosis_labels = list(label_mapping.keys())

# Example usage (assuming 'df' is your DataFrame)
text = "more recent population-based studies with a questionnaire-based strategy using dsm-5 diagnostic criteria"
predicted_diagnosis, category, summary = predict_diagnosis(text, diagnosis_labels, data)
print(f"Predicted Diagnosis: {predicted_diagnosis}")
print(f"Category: {category}")
print(f"Summary: {summary}")

Predicted Diagnosis: depressive disorders
Category: Unknown
Summary: Not found
