<a href="https://colab.research.google.com/github/gotero/DeepLearningLifeSciences/blob/master/primed_llm___running.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Environment setup for the fine-tuning stack. The order of the steps matters:
# remove conflicting packages first, then install the pinned set in one pip call.

# 1. Uninstall the conflicting Google/TensorFlow monitoring tools.
# They are already present in the runtime (see the "Found existing installation"
# output) and are not needed for PyTorch LLM training.
# NOTE(review): presumably they clash with the protobuf pin below - confirm.
!pip uninstall -y opentelemetry-api opentelemetry-sdk opentelemetry-proto google-generativeai tensorflow-metadata grpcio-status

# 2. Uninstall our core libraries so the pinned versions below start from a clean slate.
!pip uninstall -y transformers peft trl accelerate bitsandbytes protobuf sentence-transformers

# 3. Install the "Golden Compatibility" Stack.
# Everything is passed to a single `pip install` so pip resolves the set as a group.
# torch and the data-science packages are intentionally left unpinned here.
# NOTE: the bitsandbytes==0.43.1 pin is replaced by the following cell, which
# uninstalls it and installs the latest release instead.
!pip install -q \
    "protobuf==3.20.3" \
    "torch" \
    "transformers==4.40.0" \
    "peft==0.10.0" \
    "bitsandbytes==0.43.1" \
    "accelerate==0.29.3" \
    "trl==0.8.6" \
    "sentence-transformers==2.7.0" \
    "pandas" \
    "numpy" \
    "pandera" \
    "ydata-profiling" \
    "scikit-learn" \
    "skrub" \
    "openpyxl" \
    "scipy"

# 4. Force install Triton (Critical for GPU); -U upgrades it to the newest release.
!pip install -q -U triton

Found existing installation: opentelemetry-api 1.37.0
Uninstalling opentelemetry-api-1.37.0:
  Successfully uninstalled opentelemetry-api-1.37.0
Found existing installation: opentelemetry-sdk 1.37.0
Uninstalling opentelemetry-sdk-1.37.0:
  Successfully uninstalled opentelemetry-sdk-1.37.0
Found existing installation: opentelemetry-proto 1.37.0
Uninstalling opentelemetry-proto-1.37.0:
  Successfully uninstalled opentelemetry-proto-1.37.0
Found existing installation: google-generativeai 0.8.5
Uninstalling google-generativeai-0.8.5:
  Successfully uninstalled google-generativeai-0.8.5
Found existing installation: tensorflow-metadata 1.17.2
Uninstalling tensorflow-metadata-1.17.2:
  Successfully uninstalled tensorflow-metadata-1.17.2
Found existing installation: grpcio-status 1.71.2
Uninstalling grpcio-status-1.71.2:
  Successfully uninstalled grpcio-status-1.71.2
Found existing installation: transformers 4.57.3
Uninstalling transformers-4.57.3:
  Successfully uninstalled transformers-4.57

In [3]:
# Replace the bitsandbytes version pinned in the previous cell (0.43.1).

# 1. Uninstall the broken, old version
!pip uninstall -y bitsandbytes

# 2. Install the latest version which supports Python 3.12 and Triton 3.x
# (unpinned: -U pulls whatever release is newest at run time).
!pip install -q -U bitsandbytes

Found existing installation: bitsandbytes 0.43.1
Uninstalling bitsandbytes-0.43.1:
  Successfully uninstalled bitsandbytes-0.43.1
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m170.5/170.5 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import torch

# Sanity check for the GPU stack: report CUDA availability, then try to load
# bitsandbytes and print its version.
print(f"CUDA: {torch.cuda.is_available()}")
try:
    # The import lives inside the try block: a broken bitsandbytes install fails
    # at import time, and that failure is exactly what this cell should report.
    import bitsandbytes as bnb

    print(f"BnB Version: {bnb.__version__}")
    # The new version might rely on dynamic loading, so we just check import
    print("‚úÖ Success: Bitsandbytes loaded!")
except Exception as e:
    print(f"‚ùå Error: {e}")

CUDA: True
BnB Version: 0.48.2
‚úÖ Success: Bitsandbytes loaded!


In [6]:
from google.colab import drive
# Mount the user's Google Drive at /content/drive.
# NOTE(review): none of the cells visible in this notebook read from or write to
# that path - confirm whether the mount is still required.
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
from huggingface_hub import notebook_login
# Show the interactive Hugging Face login widget in the notebook.
# NOTE(review): presumably needed so later cells can download the base model
# from the Hub - confirm whether the model actually requires authentication.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [8]:
%%writefile clean_data.py
import pandas as pd
import numpy as np
import argparse
import sys
import os
import logging
from sklearn.impute import KNNImputer
from skrub import deduplicate

# Log INFO and above using the bare message only (no level or timestamp prefix).
logging.basicConfig(level=logging.INFO, format='%(message)s')
# Module-level logger used by every cleaning step in this script.
logger = logging.getLogger(__name__)

def standardize_nulls_and_drop_empty(df: pd.DataFrame) -> pd.DataFrame:
    """Convert textual null markers to NaN and drop columns that hold no data.

    Args:
        df: Raw input table. It is left untouched; all work happens on a new frame.

    Returns:
        A new DataFrame in which every cell equal to one of the known null
        placeholders is NaN, and every column that is entirely null afterwards
        has been removed.
    """
    logger.info("--- Step 1: Standardizing Nulls ---")
    null_placeholders = ["", " ", "NA", "N/A", "null", "NULL", "-", "nan", "NaN"]
    # Non-inplace replace returns a new frame, so the caller's data is not mutated.
    cleaned = df.replace(null_placeholders, np.nan)

    # Columns that are null in every row carry no information for later steps.
    empty_cols = cleaned.columns[cleaned.isnull().all()].tolist()
    if empty_cols:
        logger.info(f"Dropping empty columns: {empty_cols}")
        cleaned = cleaned.drop(columns=empty_cols)
    return cleaned

def fix_gene_excel_errors(df: pd.DataFrame) -> pd.DataFrame:
    """Restore gene symbols that Excel auto-converted into dates.

    Values such as "1-Mar" or "10-Sep" found in object (text) columns are mapped
    back to the corresponding MARCH*/SEPT* symbol. Only the entries listed in the
    map below (numbers 1 through 10) are handled.

    Args:
        df: Input table. It is left untouched; the fix is applied to a copy.

    Returns:
        A new DataFrame with the affected cells replaced.
    """
    logger.info("--- Step 2: Fixing Gene Name Errors ---")
    excel_gene_map = {
        "1-Mar": "MARCH1", "1-Sep": "SEPT1", "2-Mar": "MARCH2", "2-Sep": "SEPT2",
        "3-Mar": "MARCH3", "3-Sep": "SEPT3", "4-Mar": "MARCH4", "4-Sep": "SEPT4",
        "5-Mar": "MARCH5", "5-Sep": "SEPT5", "6-Mar": "MARCH6", "6-Sep": "SEPT6",
        "7-Mar": "MARCH7", "7-Sep": "SEPT7", "8-Mar": "MARCH8", "8-Sep": "SEPT8",
        "9-Mar": "MARCH9", "9-Sep": "SEPT9", "10-Mar": "MARCH10", "10-Sep": "SEPT10",
    }
    mangled_values = list(excel_gene_map)

    # Work on a copy so the caller's frame is not modified as a side effect.
    fixed = df.copy()
    for col in fixed.select_dtypes(include='object').columns:
        # Only rewrite columns that actually contain a mangled value.
        if fixed[col].astype(str).isin(mangled_values).any():
            fixed[col] = fixed[col].replace(excel_gene_map)
    return fixed

def impute_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Fill gaps in the numeric columns with a 5-nearest-neighbour imputer.

    The frame is returned as-is when it has no numeric columns or when those
    columns contain no missing values. If the imputer raises, a warning is
    logged and the frame is returned without imputation.
    """
    logger.info("--- Step 3: Imputing Missing Data ---")

    num_cols = list(df.select_dtypes(include=[np.number]).columns)
    if len(num_cols) == 0:
        return df

    # Total count of missing numeric cells; nothing to do when it is zero.
    missing_total = df[num_cols].isnull().sum().sum()
    if not missing_total > 0:
        return df

    try:
        knn = KNNImputer(n_neighbors=5)
        df[num_cols] = knn.fit_transform(df[num_cols])
    except Exception as err:
        # Best effort: keep the pipeline going with the data as it is.
        logger.warning(f"Imputation skipped due to error: {err}")
    return df

def save_cleaned_data(df: pd.DataFrame, output_path: str):
    """Write ``df`` to ``output_path`` as CSV without the index, then log the path."""
    df.to_csv(path_or_buf=output_path, index=False)
    confirmation = f"‚úÖ Cleaned data saved to: {output_path}"
    logger.info(confirmation)

if __name__ == "__main__":
    # CLI: a single positional argument naming the file to clean.
    cli = argparse.ArgumentParser()
    cli.add_argument("input_file", help="Path to input .csv or .xlsx file")
    input_path = cli.parse_args().input_file

    # The result is written next to the input as cleaned_<name>.csv,
    # whatever the input format was.
    file_dir, file_name = os.path.split(input_path)
    file_root, file_ext = os.path.splitext(file_name)
    output_path = os.path.join(file_dir or ".", f"cleaned_{file_root}.csv")

    logger.info(f"Processing {input_path}...")
    # Excel extensions go through read_excel; everything else is treated as CSV.
    is_excel = file_ext.lower() in ('.xlsx', '.xls')
    try:
        df = pd.read_excel(input_path) if is_excel else pd.read_csv(input_path)
    except Exception as e:
        logger.error(f"‚ùå Error loading file: {e}")
        sys.exit(1)

    # Run the cleaning steps in their fixed order, each feeding the next.
    for cleaning_step in (standardize_nulls_and_drop_empty,
                          fix_gene_excel_errors,
                          impute_missing_data):
        df = cleaning_step(df)
    save_cleaned_data(df, output_path)

Writing clean_data.py


In [9]:
%%writefile calculate_scores.py
import pandas as pd
import numpy as np
import sys
import os

# Column names expected in the input CSV.
AGE_COL = 'Age'
HR_COL = 'HR'
SPO2_COL = 'SpO‚ÇÇ (RA)'
NUMERIC_COLS = [AGE_COL, HR_COL, SPO2_COL]
BINARY_COLS = ['Prior VTE', 'Active Cancer', 'Recent Surgery/Immobilization',
               'Estrogen Use', 'Hemoptysis', 'Leg Swelling/Tenderness', 'Alternative Dx Less Likely']

# Text values (lower-cased, stripped) that count as a positive finding.
_AFFIRMATIVE = {'y', 'yes', 'true', '1'}


def _to_binary_flag(val):
    """Map one yes/no style cell to 1 or 0.

    Strings are compared case-insensitively against the affirmative set.
    Numeric and boolean cells count as positive only when equal to 1, so a
    column that is already coded 1/0 or True/False keeps its meaning instead of
    collapsing to 0. Anything else (including NaN) is 0.
    """
    if isinstance(val, str):
        return 1 if val.lower().strip() in _AFFIRMATIVE else 0
    if isinstance(val, (bool, int, float, np.bool_, np.integer, np.floating)):
        return int(val == 1)
    return 0


def _compute_scores(df):
    """Return a dict of score name -> Series (Wells, Geneva, PERC) for ``df``.

    Binary columns that are absent from ``df`` are treated as all-negative.
    Numeric columns must be present; unparseable or missing values become 0.
    NOTE(review): a missing SpO2 therefore counts as < 95 in the PERC rule and a
    missing HR as not tachycardic - confirm this is the intended handling.
    """
    calc = pd.DataFrame(index=df.index)
    for col in BINARY_COLS:
        calc[col] = df[col].apply(_to_binary_flag) if col in df.columns else 0
    for col in NUMERIC_COLS:
        calc[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    age, hr, spo2 = calc[AGE_COL], calc[HR_COL], calc[SPO2_COL]
    prior_vte = calc['Prior VTE']
    cancer = calc['Active Cancer']
    surgery = calc['Recent Surgery/Immobilization']
    estrogen = calc['Estrogen Use']
    hemoptysis = calc['Hemoptysis']
    leg = calc['Leg Swelling/Tenderness']
    alt_dx = calc['Alternative Dx Less Likely']

    # 1. Wells Score
    wells = (
        (leg * 3.0) + (alt_dx * 3.0) +
        ((hr > 100).astype(int) * 1.5) + (surgery * 1.5) +
        (prior_vte * 1.5) + (hemoptysis * 1.0) + (cancer * 1.0)
    )

    # 2. Geneva Score: heart rate contributes 3 points for 75-94 and 5 above 94.
    geneva_hr = pd.Series(np.select([hr > 94, hr >= 75], [5, 3], default=0), index=hr.index)
    geneva = (
        ((age > 65).astype(int) * 1) + (prior_vte * 3) +
        (surgery * 2) + (cancer * 2) +
        (leg * 3) + (hemoptysis * 2) + geneva_hr
    )

    # 3. PERC Rule: number of positive criteria.
    perc = (
        ((age >= 50).astype(int)) + ((hr >= 100).astype(int)) +
        ((spo2 < 95).astype(int)) + leg +
        hemoptysis + surgery +
        prior_vte + estrogen
    )

    return {'Wells Score': wells, 'Geneva Score': geneva, 'PERC Rule': perc}


def calculate_and_export_scores(input_file):
    """Add Wells, Geneva and PERC scores to a patient CSV and save the result.

    Args:
        input_file: Path to the input CSV. It must contain the Age, HR and SpO2
            columns; the yes/no risk-factor columns are optional.

    Writes ``<input name>_with_scores<ext>`` next to the input and prints its path.

    Raises:
        SystemExit: If the file does not exist or a required numeric column is missing.
    """
    if not os.path.exists(input_file):
        sys.exit(f"Error: File {input_file} not found.")

    df = pd.read_csv(input_file)

    # Fail with a readable message instead of a bare KeyError deep in the maths.
    missing = [col for col in NUMERIC_COLS if col not in df.columns]
    if missing:
        sys.exit(f"Error: Required column(s) missing from {input_file}: {missing}")

    # Add scores to original dataframe (the helper never modifies df itself).
    for score_name, values in _compute_scores(df).items():
        df[score_name] = values

    # Generate dynamic output filename: cleaned_filename_with_scores.csv
    base, ext = os.path.splitext(input_file)
    output_file = f"{base}_with_scores{ext}"

    df.to_csv(output_file, index=False)
    print(f"Scored data saved to: {output_file}")

if __name__ == "__main__":
    # Exactly one argument is needed: the CSV to score.
    cli_args = sys.argv[1:]
    if not cli_args:
        sys.exit("Usage: python calculate_scores.py <input_csv>")
    calculate_and_export_scores(cli_args[0])

Writing calculate_scores.py


In [10]:
%%writefile preprocess_data.py
import pandas as pd
import json
import sys
import os
from sklearn.model_selection import train_test_split

def _write_jsonl(path, records):
    """Write ``records`` to ``path`` as one JSON object per line."""
    with open(path, 'w', encoding='utf-8') as f:
        for entry in records:
            json.dump(entry, f)
            f.write('\n')


def create_training_data(input_file):
    """Turn a scored patient CSV into instruction-tuning JSONL files.

    Each row becomes one record whose ``text`` field holds the complete prompt
    ("### Instruction / ### Input / ### Output"). The records are split 90/10
    with a fixed seed and written to ``train_data.jsonl`` and ``val_data.jsonl``
    in the current working directory.

    Args:
        input_file: Path to the CSV. It must contain the 'CTPA Result' column.

    Raises:
        SystemExit: If the file is missing, has no rows, or lacks the target column.
    """
    if not os.path.exists(input_file):
        sys.exit(f"Error: File {input_file} not found.")

    df = pd.read_csv(input_file)

    # Dynamic Feature Extraction
    target_column = 'CTPA Result'
    # Fail early with a readable message rather than a KeyError inside apply().
    if target_column not in df.columns:
        sys.exit(f"Error: Required column '{target_column}' not found in {input_file}.")
    # apply(axis=1) on an empty frame does not yield a list of records.
    if df.empty:
        sys.exit(f"Error: File {input_file} contains no rows.")

    ignore_cols = ['Subject', 'Study ID', 'Encounter Date', target_column]
    feature_columns = [c for c in df.columns if c not in ignore_cols]

    # Single source of truth for the instruction so the structured field always
    # matches the instruction embedded in the full prompt text.
    instruction = "Analyze the patient clinical data and calculate risk scores to predict the CTPA result."

    def format_instruction(row):
        # 1. Build the Input String: one "column: value" line per non-empty feature.
        features = []
        for col in feature_columns:
            val = row[col]
            if pd.notna(val) and str(val).strip() != "":
                features.append(f"{col}: {val}")

        input_text = "\n".join(features)
        # Rows without a label are kept and labelled "Unknown".
        output_text = str(row[target_column]) if pd.notna(row[target_column]) else "Unknown"

        # 2. Create the FULL PROMPT immediately
        # This matches the BioMistral/Alpaca format
        full_text = (
            f"### Instruction: {instruction}\n"
            f"### Input: {input_text}\n"
            f"### Output: {output_text}"
        )

        return {
            "text": full_text,  # <--- The Trainer looks for this specific key
            "instruction": instruction,
            "input": input_text,
            "output": output_text
        }

    dataset = df.apply(format_instruction, axis=1).tolist()
    train_data, val_data = train_test_split(dataset, test_size=0.1, random_state=42)

    _write_jsonl('train_data.jsonl', train_data)
    _write_jsonl('val_data.jsonl', val_data)

    print(f"‚úÖ Preprocessing complete. Training samples: {len(train_data)}")

if __name__ == "__main__":
    # Exactly one argument is needed: the scored CSV to convert.
    cli_args = sys.argv[1:]
    if not cli_args:
        sys.exit("Usage: python preprocess_data.py <input_csv>")
    create_training_data(cli_args[0])

Writing preprocess_data.py


In [12]:
import glob
import os
import subprocess

def run_auto_pipeline():
    """Find the newest raw data file in the working directory and run all stages.

    Stages, each run as a separate Python process: clean_data.py ->
    calculate_scores.py -> preprocess_data.py. The function returns early (None)
    when no input file is found or when any stage exits with a non-zero status.
    """
    import sys  # local import: this cell only imports glob, os and subprocess

    print("üöÄ Starting Automated Clinical Pipeline...")

    # 1. Auto-detect Input File
    # Finds any CSV or XLSX that isn't a result file from a previous run
    extensions = ["*.csv", "*.xlsx"]
    potential_files = [path for pattern in extensions for path in glob.glob(pattern)]

    # Filter out files generated by the pipeline itself to avoid loops
    generated_markers = ("cleaned_", "_with_scores", "patient_scores_calculated")
    input_files = [
        f for f in potential_files
        if not any(marker in f for marker in generated_markers)
    ]

    if not input_files:
        print("‚ùå No valid input file found! Please upload a .csv or .xlsx file.")
        return

    # Select the most recently uploaded file
    raw_input_file = max(input_files, key=os.path.getmtime)
    print(f"üìÇ Detected Input File: {raw_input_file}")

    def run_step(script, data_file):
        """Run one stage; return True on success, report and return False otherwise."""
        # An argv list (no shell) passes the filename verbatim, so quotes, spaces
        # or '$' in the name cannot be reinterpreted by a shell. sys.executable
        # keeps the stage in the same Python environment as the notebook kernel.
        exit_code = subprocess.call([sys.executable or "python", script, data_file])
        if exit_code != 0:
            print(f"Pipeline stopped: {script} exited with code {exit_code}.")
        return exit_code == 0

    # 2. Run Cleaning
    print(f"--- 1. Cleaning {raw_input_file} ---")
    if not run_step("clean_data.py", raw_input_file):
        return

    # Determine cleaned filename (matches logic in clean_data.py)
    file_root, _ = os.path.splitext(raw_input_file)
    cleaned_file = f"cleaned_{file_root}.csv"

    # 3. Run Scoring
    print(f"--- 2. Scoring {cleaned_file} ---")
    if not run_step("calculate_scores.py", cleaned_file):
        return

    # Determine scored filename (matches logic in calculate_scores.py)
    base_cleaned, ext = os.path.splitext(cleaned_file)
    scored_file = f"{base_cleaned}_with_scores{ext}"

    # 4. Run Preprocessing
    print(f"--- 3. Preprocessing {scored_file} ---")
    if not run_step("preprocess_data.py", scored_file):
        return

    print("\n‚úÖ Pipeline Finished! 'train_data.jsonl' is ready for fine-tuning.")

# Run the whole pipeline when this cell executes. It expects clean_data.py,
# calculate_scores.py and preprocess_data.py (written by the cells above) to be
# in the current working directory.
run_auto_pipeline()

üöÄ Starting Automated Clinical Pipeline...
üìÇ Detected Input File: simulated_primed_data_1k.csv
--- 1. Cleaning simulated_primed_data_1k.csv ---
--- 2. Scoring cleaned_simulated_primed_data_1k.csv ---
--- 3. Preprocessing cleaned_simulated_primed_data_1k_with_scores.csv ---

‚úÖ Pipeline Finished! 'train_data.jsonl' is ready for fine-tuning.


In [11]:
%%writefile finetune_clinical.py
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from trl import SFTTrainer

# QLoRA fine-tuning script: loads the base model in 4-bit, attaches LoRA
# adapters, trains on the pre-formatted "text" field and saves the adapter.
MODEL_NAME = "BioMistral/BioMistral-7B"
NEW_MODEL_NAME = "BioMistral-Clinical-Finetuned"

# Load Data (both files are produced by preprocess_data.py)
dataset = load_dataset("json", data_files={"train": "train_data.jsonl", "validation": "val_data.jsonl"})

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse EOS as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load Model: 4-bit NF4 weights with float16 compute, no double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"
)

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA: rank-64 adapters on the attention projections plus gate_proj.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"]
)

model = get_peft_model(model, peft_config)

# Training Args
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    # Without an evaluation strategy the eval_dataset handed to the trainer is
    # never used; evaluate on the same cadence as logging and checkpointing.
    # (`evaluation_strategy` is the argument name in the pinned transformers 4.40.0.)
    evaluation_strategy="steps",
    eval_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,  # -1: let num_train_epochs decide the number of steps
    # NOTE(review): the "constant" scheduler presumably has no warmup phase, so
    # warmup_ratio likely has no effect; use "constant_with_warmup" if warmup is intended.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant"
)

# SFT Trainer (Simplified)
# We removed formatting_func because 'train_data.jsonl' now has a 'text' column
# that contains the fully formatted prompt.
# NOTE(review): `model` is already wrapped by get_peft_model above, so passing
# peft_config again is presumably redundant for the pinned trl version - confirm.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=training_args,
    max_seq_length=1024,
    dataset_text_field="text",  # <--- Points to the pre-formatted column
    packing=False
)

print("Starting training...")
trainer.train()

# Only the adapter weights and the tokenizer are written here; merge_model.py
# combines them with the base model afterwards.
print("Saving model...")
trainer.model.save_pretrained(NEW_MODEL_NAME)
tokenizer.save_pretrained(NEW_MODEL_NAME)
print(f"Model saved to {NEW_MODEL_NAME}")

Writing finetune_clinical.py


In [None]:
!python finetune_clinical.py

2025-12-09 08:13:03.281773: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765267983.315331   14333 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765267983.325653   14333 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1765267983.351024   14333 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1765267983.351061   14333 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1765267983.351067   14333 computation_placer.cc:177] computation placer alr

In [None]:
%%writefile merge_model.py
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Merge the trained LoRA adapter into the base model and save a standalone copy.
BASE_MODEL = "BioMistral/BioMistral-7B"
ADAPTER_DIR = "BioMistral-Clinical-Finetuned"
OUTPUT_DIR = "merged_clinical_model"

# Base weights are loaded in float16 with automatic device placement.
base_load_options = dict(
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **base_load_options)

# Attach the adapter, then fold its weights into the base layers.
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR).merge_and_unload()
model.save_pretrained(OUTPUT_DIR)

# The tokenizer is taken from the base checkpoint and stored next to the merged weights.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Merged model saved to {OUTPUT_DIR}")

In [None]:
!python merge_model.py

In [None]:
%%writefile agent_interaction.py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Directory written by merge_model.py (its OUTPUT_DIR).
model_path = "merged_clinical_model"
print(f"Loading model from {model_path}...")
# Tokenizer and model are module-level globals used by query_clinical_model below.
tokenizer = AutoTokenizer.from_pretrained(model_path)
# float16 weights with automatic device placement.
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float16)

def query_clinical_model(input_text):
    """Ask the fine-tuned model for a CTPA prediction.

    Args:
        input_text: Patient data as free text; it is inserted verbatim after
            "### Input:" in the prompt.

    Returns:
        The model's answer with surrounding whitespace removed. Only the newly
        generated tokens are considered, and the answer is cut at the first
        "###" marker in case the model continues with another prompt section.
    """
    # The instruction must match the one used to build the training data in
    # preprocess_data.py; a different wording at inference time shifts the prompt
    # away from what the model was tuned on.
    prompt = (
        f"### Instruction: Analyze the patient clinical data and calculate risk scores to predict the CTPA result.\n"
        f"### Input: {input_text}\n"
        f"### Output:"
    )
    # Follow the model's own device instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,
        # A temperature without sampling is ignored; request greedy decoding
        # explicitly so the answer is deterministic.
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode only what the model added after the prompt.
    prompt_length = inputs["input_ids"].shape[-1]
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    # Keep the first answer only; anything after "###" is a follow-on section.
    return completion.split("###")[0].strip()

# Interactive loop: read one line of patient data, print the model's answer,
# repeat until the user types 'exit' or 'quit' or the input stream ends.
print("\n--- Clinical Agent Ready ---")
print("Enter patient data (e.g., 'Age: 65, HR: 110, Chest Pain: Yes'). Type 'exit' to quit.\n")

while True:
    try:
        user_input = input("Patient Data: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or Ctrl-C: leave the loop instead of dying with a traceback.
        print()
        break

    if user_input.lower() in ('exit', 'quit'):
        break
    if not user_input:
        # Nothing entered; do not query the model with an empty prompt.
        continue

    print("Analysis: ", query_clinical_model(user_input))
    print("-" * 30)

In [None]:
!python agent_interaction.py