In [1]:
!pip install transformers tiktoken protobuf sentencepiece 'accelerate>=0.26.0' gdown tqdm pandas

Collecting transformers
  Downloading transformers-4.50.0-py3-none-any.whl.metadata (39 kB)
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting protobuf
  Downloading protobuf-6.30.1-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting accelerate>=0.26.0
  Downloading accelerate-1.5.2-py3-none-any.whl.metadata (19 kB)
Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas
  Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [8]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load model and tokenizer
model_name = "xz97/AlpaCare-llama2-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Use float16 for efficiency
    device_map="auto"  # Automatically determine device mapping
)

# Add pad token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<pad>"})
    # A newly added token receives an id beyond the original vocabulary.
    # Grow the embedding matrix when that id has no row yet; never shrink it,
    # because a checkpoint may already hold more rows than the tokenizer has
    # tokens.
    embedding_rows = model.get_input_embeddings().weight.shape[0]
    if len(tokenizer) > embedding_rows:
        model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id

print("Model and tokenizer loaded successfully!")

Using device: cuda


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


Model and tokenizer loaded successfully!


In [9]:
import gdown
import json
import pandas as pd
import os
import zipfile
from tqdm import tqdm

# Download MedQA dataset
url = "https://drive.google.com/file/d/1ImYUSLk9JbgHXOemfvyiDiirluZHPeQw/view"
output_zip = "medqa_dataset.zip"

# Define paths to the dataset files
test_path = "data_clean/questions/US/test.jsonl"
dev_path = "data_clean/questions/US/dev.jsonl"
train_path = "data_clean/questions/US/train.jsonl"

if not os.path.exists(output_zip):
    print("Downloading MedQA dataset...")
    gdown.download(url=url, output=output_zip, quiet=False, fuzzy=True)
else:
    print("MedQA dataset already downloaded.")

# Extract the zip file.
# The guard checks the files that are actually read below; the previous check
# looked for a "medqa" directory that nothing in this notebook reads from, so
# the archive was unpacked again on every run.
if not all(os.path.exists(path) for path in (test_path, dev_path, train_path)):
    print("Extracting MedQA dataset...")
    with zipfile.ZipFile(output_zip, 'r') as zip_ref:
        zip_ref.extractall(".")
    print("Extraction complete.")
else:
    print("MedQA dataset already extracted.")

# Function to load JSONL files
def load_jsonl(file_path):
    """Read a JSON Lines file into a list of decoded records.

    Args:
        file_path: Path to a UTF-8 text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank or whitespace-only lines (e.g. a trailing empty line) hold
            # no record; json.loads would raise on them.
            if not line:
                continue
            records.append(json.loads(line))
    return records

# Read the three US splits, in the order test, dev, train
test_data, dev_data, train_data = [
    load_jsonl(split_path) for split_path in (test_path, dev_path, train_path)
]

# Report how many questions each split holds
print("Loaded MedQA datasets:")
for split_label, split_records in (("Test", test_data), ("Dev", dev_data), ("Train", train_data)):
    print(f"{split_label}: {len(split_records)} questions")

MedQA dataset already downloaded.
Extracting MedQA dataset...
Extraction complete.
Loaded MedQA datasets:
Test: 1273 questions
Dev: 1272 questions
Train: 10178 questions


In [10]:
def format_options(options_dict):
    """Render an options mapping as one "key. value" line per entry.

    Entries appear in the mapping's iteration order, and leading/trailing
    whitespace of the joined text is removed.
    """
    rendered_lines = [f"{label}. {text}" for label, text in options_dict.items()]
    return "\n".join(rendered_lines).strip()

def create_medqa_prompt(question, options):
    """Build the instruction prompt for one MedQA multiple-choice question.

    Args:
        question: Question text, inserted verbatim.
        options: Mapping of option letter to option text; rendered with
            format_options.

    Returns:
        The prompt string, ending with "Answer:" so the model continues with
        its choice.
    """
    option_block = format_options(options)

    # Paragraphs of the prompt; they are joined with one blank line between them.
    sections = (
        "You are a medical expert. Answer the following multiple-choice medical question by selecting the most appropriate option.",
        f"Question: {question}",
        f"Options:\n{option_block}",
        "Select the correct answer by providing the letter (A, B, C, D, or E) corresponding to the most appropriate option. You need not elaborate on your thinking.",
        "Answer:",
    )
    return "\n\n".join(sections)

In [11]:
def generate_answer(prompt, max_new_tokens=512):
    """Generate the model's continuation of a prompt with greedy decoding.

    Uses the notebook-level `tokenizer`, `model` and `device`.

    Args:
        prompt: Text to feed to the model.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded newly generated text (prompt excluded), with surrounding
        whitespace stripped.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_token_count = inputs.input_ids.shape[1]

    # Generate output
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,  # pass the mask explicitly instead of letting generate infer it
            max_new_tokens=max_new_tokens,
            do_sample=False,  # Greedy decoding; temperature is a sampling-only setting, so it is not passed
            pad_token_id=tokenizer.pad_token_id
        )

    # Keep only the tokens produced after the prompt. Slicing the decoded
    # string by len(prompt) is unreliable because decoding the prompt tokens
    # is not guaranteed to reproduce the prompt character for character.
    new_tokens = outputs[0][prompt_token_count:]
    answer = tokenizer.decode(new_tokens, skip_special_tokens=True)

    return answer.strip()

In [6]:
!pip install openai tenacity

Collecting openai
  Downloading openai-1.68.2-py3-none-any.whl.metadata (25 kB)
Collecting tenacity
  Downloading tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)
Collecting typing-extensions<5,>=4.11 (from openai)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.7-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Collecting annotated-types>=0.6.0 (from pydantic<3,>=1.9.0->openai)
  Downloading annotated_types-0.

In [15]:
import torch
from tenacity import retry, stop_after_attempt, wait_exponential
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def extract_answer_option_with_local_model(response, model=model, tokenizer=tokenizer, device=device):
    """Ask the local model which option letter a free-text answer selects.

    Any exception raised while prompting or generating is caught and handled
    by falling back to extract_answer_option_fallback, so the tenacity retry
    only engages if that fallback call itself raises.

    Args:
        response: The model's free-text answer to a MedQA question.
        model: Causal LM used for the extraction (bound when this cell runs).
        tokenizer: Tokenizer matching `model`.
        device: Device the tokenized prompt is moved to.

    Returns:
        One of 'A'-'E', or None when no option can be determined.
    """
    import re  # local import: this cell does not import re at the top

    # An empty answer contains no choice; skip generation rather than let the
    # model produce a letter for a blank quotation.
    if not response or not response.strip():
        return None

    try:
        # Create a simpler prompt that's more likely to get a direct answer
        prompt = f"""
        Question: Which multiple-choice option (A, B, C, D, or E) is selected in this text: "{response}"
        
        Answer with just the letter:
        """

        # Tokenize the prompt
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_token_count = inputs.input_ids.shape[1]

        # Generate the extraction
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=5,     # We only need a very short response
                do_sample=False,      # Greedy decoding; temperature only applies when sampling
                pad_token_id=tokenizer.pad_token_id
            )

        # Decode only the tokens generated after the prompt; slicing decoded
        # text by len(prompt) breaks whenever decoding does not reproduce the
        # prompt exactly.
        extracted_text = tokenizer.decode(
            outputs[0][prompt_token_count:], skip_special_tokens=True
        ).strip()

        # Accept a letter only when it stands alone in the first 10 characters,
        # so the capital of a word such as "Answer" is not read as option A.
        standalone = re.search(r"(?<![A-Za-z0-9])[A-E](?![A-Za-z0-9])", extracted_text[:10])
        if standalone:
            return standalone.group(0)

        # If no valid option found, fall back
        return extract_answer_option_fallback(response)

    except Exception as e:
        print(f"Error using local model for extraction: {e}")
        return extract_answer_option_fallback(response)
        
def extract_answer_option_fallback(response):
    """Extract an option letter from a free-text answer with string heuristics.

    Checks, in order:
      1. the first whitespace-separated word is exactly an option letter;
      2. an explicit phrase such as "Answer: A" or "The answer is B", where
         the letter is not the start of a longer word;
      3. the first option letter that stands alone (not inside a word);
      4. as a last resort, the first A-E character anywhere in the text.

    Args:
        response: Free-text answer; may be None or empty.

    Returns:
        One of 'A'-'E', or None when the response is None, empty,
        whitespace-only, or contains no option letter.
    """
    import re  # local import: this cell does not import re at the top

    valid_options = ['A', 'B', 'C', 'D', 'E']

    # None, empty and whitespace-only responses carry no answer. Without this
    # guard, split()[0] raises on whitespace and `in` raises on None.
    if not response or not response.strip():
        return None

    # First, check if the response starts with a valid option
    first_word = response.split()[0]
    if first_word in valid_options:
        return first_word

    # Look for patterns like "Answer: A" or "The answer is B". The lookahead
    # rejects a letter that merely begins a word ("The answer is Doxycycline").
    for option in valid_options:
        patterns = [
            f"Answer: {option}",
            f"answer: {option}",
            f"The answer is {option}",
            f"the answer is {option}",
            f"Option {option}",
            f"option {option}",
            f"Answer {option}",
            f"answer {option}"
        ]

        for pattern in patterns:
            if re.search(re.escape(pattern) + r"(?![A-Za-z0-9])", response):
                return option

    # Prefer a letter that stands on its own, e.g. "(C)" or "C."
    standalone = re.search(r"(?<![A-Za-z0-9])[A-E](?![A-Za-z0-9])", response)
    if standalone:
        return standalone.group(0)

    # Best-effort last resort: the first A-E character anywhere in the text
    for char in response:
        if char in valid_options:
            return char

    # If no option is found, return None
    return None

def extract_answer_option(response):
    """Return the option letter chosen in `response`.

    Thin entry point that delegates to extract_answer_option_with_local_model.
    """
    predicted_letter = extract_answer_option_with_local_model(response)
    return predicted_letter


In [16]:
def evaluate_on_medqa(data, num_samples=None):
    """Evaluate the model on MedQA questions and report accuracy.

    Args:
        data: List of question records; each is read for the keys
            'question', 'options' and 'answer_idx'.
        num_samples: If truthy, evaluate a random subset of at most this many
            records, drawn after seeding `random` with 42. Otherwise all
            records are evaluated.

    Returns:
        A tuple (accuracy, results): accuracy is the fraction of correct
        predictions, and results is a list with one dict per question holding
        the question, options, correct answer, raw model response, predicted
        option and correctness flag. For an empty dataset this is (0.0, []).
    """
    if num_samples:
        # Sample a subset if specified
        import random
        random.seed(42)
        data = random.sample(data, min(num_samples, len(data)))

    total = len(data)
    if total == 0:
        # Nothing to score; return early instead of dividing by zero below.
        print("No samples to evaluate.")
        return 0.0, []

    correct = 0
    results = []

    for item in tqdm(data, desc="Evaluating"):
        question = item['question']
        options = item['options']
        correct_answer = item['answer_idx']  # This is the index (e.g., 'C')

        # Create prompt
        prompt = create_medqa_prompt(question, options)

        # Generate answer
        model_response = generate_answer(prompt)

        # Extract answer option using the new method
        predicted_option = extract_answer_option(model_response)

        # Check if correct (a None prediction never matches)
        is_correct = predicted_option == correct_answer
        if is_correct:
            correct += 1

        # Store result
        results.append({
            'question': question,
            'options': options,
            'correct_answer': correct_answer,
            'model_response': model_response,
            'predicted_option': predicted_option,
            'is_correct': is_correct
        })

    accuracy = correct / total
    print(f"Accuracy: {accuracy:.4f} ({correct}/{total})")

    return accuracy, results

In [None]:
# Evaluate on a random subset of the test split (evaluate_on_medqa seeds the
# sampling with 42). 800 of the 1273 test questions is a long run: the
# recorded progress bar shows roughly 4 s per question. Lower test_size for a
# quick smoke test.
test_size = 800  # Number of test questions to sample
accuracy, results = evaluate_on_medqa(test_data, num_samples=test_size)

# Save results (written only after the whole evaluation has finished)
results_df = pd.DataFrame(results)
results_df.to_csv("medqa_evaluation_results.csv", index=False)
print("Results saved to medqa_evaluation_results.csv")

Evaluating:  21%|██        | 165/800 [08:05<41:28,  3.92s/it] 