# Human Anatomy Chatbot with BERT

This notebook implements a chatbot for human anatomy using BERT and Flask. Follow the cells in order to:
1. Set up the environment
2. Load and prepare data
3. Train the model
4. Launch the API server

In [None]:
# Import Required Libraries
import json
import os
import pandas as pd
import numpy as np
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer
)
from datasets import Dataset
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from flask import Flask, request, jsonify
from typing import Dict, Any
import time
import re
import shutil

# Report the compute device PyTorch can see (query availability once and reuse it)
_cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {_cuda_ready}")
if _cuda_ready:
    print(f"GPU device: {torch.cuda.get_device_name(0)}")

In [None]:
class AnatomyChatbot:
    """Retrieval-style chatbot built on a BERT model with a single regression output.

    Question/answer pairs are loaded from JSONL files into ``self.qa_pairs``.
    ``predict`` scores the user question against every stored question with
    the model and returns the answer of the highest-scoring pair.
    """

    def __init__(self, model_name: str = "bert-base-uncased", model_dir: str = "./final_anatomy_model"):
        """Store configuration and load (or freshly initialize) the model.

        Args:
            model_name: Hub checkpoint used when no saved model is available.
            model_dir: Directory a fine-tuned model is loaded from / saved to.
        """
        self.model_name = model_name
        self.model_dir = model_dir
        # Filled by load_and_prepare_dataset(); consumed by predict().
        self.qa_pairs = []
        self.initialize_model()

    def _load_pretrained(self, source: str):
        """Load tokenizer and single-output regression model from *source*."""
        self.tokenizer = AutoTokenizer.from_pretrained(source)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            source,
            num_labels=1,
            problem_type="regression",  # single float output trained with MSE
        )

    def initialize_model(self):
        """Initialize or load the model and tokenizer, then move the model to the device.

        Loads from ``self.model_dir`` when that path exists, otherwise from
        ``self.model_name``. Any failure falls back to ``self.model_name``.
        """
        try:
            if os.path.exists(self.model_dir):
                print(f"Loading model from {self.model_dir}")
                self._load_pretrained(self.model_dir)
                print("Model loaded successfully")
            else:
                print("No saved model found. Initializing new model...")
                self._load_pretrained(self.model_name)
                print("New model initialized")

        except Exception as e:
            # Best effort: e.g. an incomplete model directory should not
            # prevent the notebook from continuing with the base checkpoint.
            print(f"Error during model initialization: {e}")
            print("Falling back to new model initialization...")
            self._load_pretrained(self.model_name)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def tokenize_function(self, examples):
        """Tokenize a batch of ``text`` values and attach float labels.

        Args:
            examples: Batch mapping with ``'text'`` and ``'labels'`` entries.

        Returns:
            The tokenizer output with a ``'labels'`` entry reshaped to one
            float32 value per example (shape ``(batch, 1)``).
        """
        tokenized = self.tokenizer(
            examples['text'],
            padding="max_length",
            truncation=True,
            max_length=512
        )

        # Convert labels to float32 tensor and reshape to (batch, 1)
        tokenized['labels'] = torch.tensor(examples['labels'], dtype=torch.float32).reshape(-1, 1)
        return tokenized

    def prepare_datasets(self, dataset: pd.DataFrame, train_ratio: float = 0.8):
        """Split, convert and tokenize a DataFrame for training.

        Args:
            dataset: Frame with ``text`` and ``labels`` columns.
            train_ratio: Fraction of rows (taken from the top) used for training.

        Returns:
            ``(train_dataset, test_dataset)`` as tokenized HuggingFace datasets.
        """
        # Reuse the shared positional split instead of duplicating it here.
        train_data, test_data = self.split_dataset(dataset, train_ratio)

        # Convert to HuggingFace datasets
        train_dataset = Dataset.from_pandas(train_data)
        test_dataset = Dataset.from_pandas(test_data)

        # Tokenize and drop the raw columns so only model inputs remain
        train_dataset = train_dataset.map(
            self.tokenize_function,
            batched=True,
            remove_columns=train_dataset.column_names
        )

        test_dataset = test_dataset.map(
            self.tokenize_function,
            batched=True,
            remove_columns=test_dataset.column_names
        )

        print(f"Training set size: {len(train_dataset)}")
        print(f"Testing set size: {len(test_dataset)}")

        return train_dataset, test_dataset

    def load_and_prepare_dataset(self, folder_path: str) -> pd.DataFrame:
        """Load question/answer pairs from every ``.jsonl`` file in a folder.

        Each usable record is appended to ``self.qa_pairs`` (repeated calls
        accumulate) and contributes one ``"question [SEP] answer"`` row.

        Args:
            folder_path: Directory containing the JSONL files.

        Returns:
            DataFrame with columns ``text`` and ``labels``.

        Raises:
            FileNotFoundError: The directory is missing or has no JSONL files.
            ValueError: No record contained both ``question`` and ``answer``
                (also raised by ``json`` for malformed lines).
        """
        all_data = []

        if not os.path.exists(folder_path):
            raise FileNotFoundError(f"Directory {folder_path} not found")

        # Sorted so the positional train/test split is reproducible;
        # os.listdir() order is arbitrary.
        jsonl_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.jsonl'))
        if not jsonl_files:
            raise FileNotFoundError(f"No JSONL files found in {folder_path}")

        for file_name in jsonl_files:
            with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as file:
                for line in file:
                    # Blank lines (e.g. a trailing newline) are not valid JSON.
                    if not line.strip():
                        continue
                    data = json.loads(line)
                    if isinstance(data, dict) and 'question' in data and 'answer' in data:
                        all_data.append({
                            'question': data['question'],
                            'answer': data['answer'],
                            'text': f"{data['question']} [SEP] {data['answer']}"
                        })
                        self.qa_pairs.append({
                            'question': data['question'],
                            'answer': data['answer']
                        })

        if not all_data:
            raise ValueError(
                f"No records with both 'question' and 'answer' found in {folder_path}"
            )

        df = pd.DataFrame(all_data)
        # NOTE(review): every pair receives the same target (0.0), so the
        # regression target is constant -- confirm this is intended.
        df['labels'] = np.zeros(len(df))
        return df[['text', 'labels']]

    def split_dataset(self, df: pd.DataFrame, train_ratio: float = 0.8) -> tuple:
        """Split a frame positionally into ``(train, test)`` without shuffling."""
        train_size = int(len(df) * train_ratio)
        train_data = df.iloc[:train_size]
        test_data = df.iloc[train_size:]
        return train_data, test_data

    @staticmethod
    def _compute_metrics(eval_pred):
        """Return the MSE between raw regression outputs and labels.

        The model is trained with an MSE loss on its raw output, so no
        sigmoid is applied here; otherwise the metric would not match the
        training objective.
        """
        logits, labels = eval_pred
        outputs = np.asarray(logits, dtype=np.float64).reshape(-1)
        targets = np.asarray(labels, dtype=np.float64).reshape(-1)
        return {"mse": float(np.mean((outputs - targets) ** 2))}

    def _train(self, train_dataset, test_dataset):
        """Fine-tune ``self.model`` and save it with the tokenizer to ``self.model_dir``."""
        training_args = TrainingArguments(
            output_dir="./training_outputs",
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=3,
            weight_decay=0.01,
            logging_dir="./training_logs",
            logging_steps=10,
            load_best_model_at_end=True,
            metric_for_best_model="loss",
            greater_is_better=False
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            compute_metrics=self._compute_metrics
        )

        trainer.train()

        # Replace any previously saved model with the newly trained one
        print(f"Saving model to {self.model_dir}")
        if os.path.exists(self.model_dir):
            shutil.rmtree(self.model_dir)
        trainer.save_model(self.model_dir)
        self.tokenizer.save_pretrained(self.model_dir)

        # Clean up temporary directories (only after a successful run, so
        # checkpoints survive a failed one)
        for scratch_dir in ("./training_outputs", "./training_logs"):
            if os.path.exists(scratch_dir):
                shutil.rmtree(scratch_dir)

    def _evaluate(self, test_dataset) -> Dict[str, Any]:
        """Evaluate ``self.model`` on *test_dataset* and return its metrics."""
        # An explicit output directory keeps the cleanup below independent of
        # the library's default directory name.
        eval_dir = "./tmp_trainer"
        trainer = Trainer(
            model=self.model,
            args=TrainingArguments(output_dir=eval_dir, per_device_eval_batch_size=8),
            eval_dataset=test_dataset,
            compute_metrics=self._compute_metrics
        )

        try:
            metrics = trainer.evaluate()
            prediction_output = trainer.predict(test_dataset)
        finally:
            if os.path.exists(eval_dir):
                shutil.rmtree(eval_dir)

        # The model has one regression output, so argmax over the last axis
        # would always be 0. Compare values rounded to the nearest integer.
        outputs = np.rint(np.asarray(prediction_output.predictions).reshape(-1))
        targets = np.rint(np.asarray(prediction_output.label_ids).reshape(-1))
        metrics['accuracy'] = float(np.mean(outputs == targets)) if targets.size else 0.0
        return metrics

    def train_and_evaluate(self, train_dataset, test_dataset, force_train: bool = False) -> Dict[str, Any]:
        """Train if necessary and evaluate the model.

        Training runs when *force_train* is true or ``self.model_dir`` does
        not exist yet; evaluation always runs.

        Args:
            train_dataset: Tokenized training dataset.
            test_dataset: Tokenized evaluation dataset.
            force_train: Train even if a saved model already exists.

        Returns:
            Evaluation metrics including ``accuracy``. An empty dict is
            returned (after printing the error) if evaluation fails.
        """
        # Re-load so a model saved since construction is picked up
        self.initialize_model()

        if force_train or not os.path.exists(self.model_dir):
            print("Starting model training...")
            self._train(train_dataset, test_dataset)

        try:
            print("Evaluating model on test dataset...")
            return self._evaluate(test_dataset)
        except Exception as e:
            print(f"Error during Evaluation: {e}")
            # Honor the declared return type instead of implicitly returning None.
            return {}

    def _score_candidates(self, question: str, batch_size: int = 32) -> list:
        """Return one sigmoid score per entry of ``self.qa_pairs``, in order."""
        if not self.qa_pairs:
            return []

        candidate_questions = [pair['question'] for pair in self.qa_pairs]
        # Disable dropout so scores are deterministic even right after training.
        self.model.eval()

        scores = []
        with torch.no_grad():
            # Score in batches rather than one forward pass per pair.
            for start in range(0, len(candidate_questions), batch_size):
                chunk = candidate_questions[start:start + batch_size]
                inputs = self.tokenizer(
                    [question] * len(chunk),
                    chunk,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=512
                ).to(self.device)
                logits = self.model(**inputs).logits
                scores.extend(torch.sigmoid(logits).reshape(-1).tolist())
        return scores

    def predict(self, question: str) -> Dict[str, Any]:
        """Find the most relevant answer for a given question.

        Args:
            question: The user's question.

        Returns:
            Dict with ``answer``, ``confidence`` and ``response_time``; when a
            match is found it also contains the matched ``question``.
        """
        start_time = time.time()

        # Ensure model is initialized
        if getattr(self, "model", None) is None or getattr(self, "tokenizer", None) is None:
            self.initialize_model()

        best_match = None
        best_score = -1

        # Strict '>' keeps the first pair among equal scores
        for qa_pair, score in zip(self.qa_pairs, self._score_candidates(question)):
            if score > best_score:
                best_score = score
                best_match = qa_pair

        if best_match is None:
            return {
                "answer": "I couldn't find a relevant answer to your question.",
                "confidence": 0.0,
                "response_time": time.time() - start_time
            }

        return {
            "question": best_match['question'],
            "answer": best_match['answer'],
            "confidence": best_score,
            "response_time": time.time() - start_time
        }
# Initialize the chatbot
# The constructor calls initialize_model(), which loads a saved model from
# model_dir when that path exists and otherwise starts from the base checkpoint.
chatbot = AnatomyChatbot(model_dir="./final_anatomy_model")
print("Chatbot initialized successfully")

In [None]:
# Load and prepare the dataset
# Update this path to where your JSONL files are stored
DATA_DIR = "./data"

try:
    # Load dataset
    dataset = chatbot.load_and_prepare_dataset(DATA_DIR)
except Exception as e:
    print(f"Error loading dataset: {e}")
else:
    print(f"Dataset loaded successfully. Shape: {dataset.shape}")

    # Only prepare when loading succeeded; otherwise this step would fail on a
    # missing (or stale, from an earlier run) `dataset` and hide the real error.
    try:
        # Prepare the datasets (this handles splitting, conversion, and tokenization)
        train_dataset, test_dataset = chatbot.prepare_datasets(dataset)
    except Exception as e:
        print(f"Dataset preparation error: {e}")
    else:
        print("Dataset prepared")

In [None]:
# Train the model (skipped by train_and_evaluate when a saved model already
# exists) and report the evaluation metrics
try:
    print("Starting model training...")
    metrics = chatbot.train_and_evaluate(train_dataset, test_dataset)
except Exception as e:
    print(f"Error during training: {e}")
else:
    # train_and_evaluate prints its own error and yields no metrics when the
    # evaluation step fails, so guard before iterating.
    if metrics:
        print("\nTraining completed. Final metrics:")
        for key, value in metrics.items():
            print(f"{key}: {value}")
    else:
        print("\nNo evaluation metrics were produced; see the error above.")

In [None]:
# Dummy testing
# Smoke-test predict(): it scores the question against the QA pairs loaded so
# far and returns a dict with the answer, a confidence and the response time.
result = chatbot.predict('What is anatomy?')
print(result)

In [None]:
# Flask API setup
app = Flask(__name__)


@app.route('/predict', methods=['POST'])
def predict_endpoint():
    """Answer a question posted as JSON ``{"input": "<question>"}``.

    Returns:
        200 with the chatbot's prediction dict, 400 when the body is not a
        JSON object with a string ``input`` field, or 500 when prediction
        itself fails.
    """
    # silent=True yields None for a missing/invalid JSON body instead of
    # raising, so malformed client requests get a 400 rather than a 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 'input' not in data:
        return jsonify({"error": "No input provided"}), 400

    question = data['input']
    if not isinstance(question, str):
        return jsonify({"error": "'input' must be a string"}), 400

    try:
        result = chatbot.predict(question)
        return jsonify(result)

    except Exception as e:
        # Keep the full traceback in the server log; the client gets the message.
        app.logger.exception("Prediction failed")
        return jsonify({"error": str(e)}), 500

# Start the Flask server
# host="0.0.0.0" listens on all network interfaces, not just localhost.
# NOTE(review): confirm that exposing the endpoint beyond this machine is intended.
# NOTE(review): app.run() presumably blocks this cell while the server is
# running -- confirm how later cells are expected to be executed.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)

## Testing the API

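Once the Flask server is running, you can test it using the following cell. The server cell keeps running while it serves requests, so execute the test cell from a separate notebook or Python session: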

In [None]:
import requests

# Test the API
test_text = "The human heart has four chambers"
try:
    # Without a timeout the call can wait forever if the server is unresponsive.
    response = requests.post(
        'http://localhost:5000/predict',
        json={'input': test_text},
        timeout=30,
    )
except requests.RequestException as e:
    print(f"API request failed: {e}")
else:
    print("API Response:")
    try:
        print(response.json())
    except ValueError:
        # The body was not JSON (e.g. an HTML error page); show it as-is.
        print(f"Non-JSON response (HTTP {response.status_code}): {response.text}")