# Dakitari-Instruct Model Training in Google Colab

This notebook will help you clone the repository, train the model, and save results to Google Drive.

In [1]:
from google.colab import drive
import os
import sys

# Mount Google Drive at /content/drive (force_remount=True is passed explicitly)
drive.mount('/content/drive', force_remount=True)

# Every persisted artefact lives under one Drive folder
PROJECT_DIR = '/content/drive/MyDrive/Dakitari-Instruct-v1.5'
DATASET_DIR, MODEL_CHECKPOINT_DIR = (
    os.path.join(PROJECT_DIR, subdir) for subdir in ('datasets', 'checkpoints')
)

# Make sure both sub-folders exist; exist_ok=True keeps re-runs from failing
for _required_dir in (DATASET_DIR, MODEL_CHECKPOINT_DIR):
    os.makedirs(_required_dir, exist_ok=True)

# Echo the resolved locations, one per line
print(
    f"Project Directory: {PROJECT_DIR}",
    f"Dataset Directory: {DATASET_DIR}",
    f"Model Checkpoint Directory: {MODEL_CHECKPOINT_DIR}",
    sep="\n",
)

In [12]:
# Clone repository with overwrite option
import os
import shutil
import subprocess

REPO_URL = "https://github.com/elijahnzeli1/Dakitari-Instruct-v1.5.git"
REPO_NAME = "Dakitari-Instruct-v1.5"

# Change to content directory
%cd /content

# Check if repository already exists
if os.path.exists(REPO_NAME):
    print(f"Repository {REPO_NAME} already exists.")

    # Option 1: Remove existing repository
    try:
        shutil.rmtree(REPO_NAME)
        print(f"Existing {REPO_NAME} directory removed.")
    except Exception as e:
        print(f"Error removing existing directory: {e}")

# Clone the repository
try:
    # Use subprocess for more detailed git clone output
    result = subprocess.run(
        ["git", "clone", REPO_URL],
        capture_output=True,
        text=True,
        check=True
    )
    print("Repository cloned successfully.")
    print(result.stdout)
except subprocess.CalledProcessError as e:
    print(f"Error cloning repository: {e}")
    print("Error output:", e.stderr)

# Change to repository directory
%cd {REPO_NAME}

In [None]:
!pip install -r requirements.txt

In [4]:
# Import the repository's MedicalDataProcessor and prepare the dataset
import sys
import os

# Make the cloned repository importable. The membership check keeps the cell
# idempotent: re-running it does not append the same entry to sys.path again.
PROJECT_ROOT = '/content/Dakitari-Instruct-v1.5'
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Dynamically import the preprocess module
import importlib.util

def import_module_from_path(module_name, file_path):
    """
    Dynamically import a module from a specific file path.

    Args:
        module_name: Name under which the module is registered in ``sys.modules``.
        file_path: Path of the Python source file to load and execute.

    Returns:
        The loaded module object.

    Raises:
        ImportError: If no import spec (or loader) can be created for ``file_path``.
        Exception: Any error raised while executing the module is re-raised after
            the ``sys.modules`` entry for ``module_name`` has been restored.
    """
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    # spec_from_file_location returns None when it cannot choose a loader for
    # the path; fail with a clear ImportError instead of an AttributeError later.
    if spec is None or spec.loader is None:
        raise ImportError(
            f"Cannot create an import spec for {module_name!r} from {file_path!r}"
        )

    module = importlib.util.module_from_spec(spec)

    # Register before executing so the module is reachable by name while its
    # body runs; remember what was there so a failed load can be undone.
    missing = object()
    previous = sys.modules.get(module_name, missing)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind in sys.modules.
        if previous is missing:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise
    return module

# Dynamically import MedicalDataProcessor
preprocess_path = os.path.join(PROJECT_ROOT, 'data', 'preprocess.py')
try:
    preprocess_module = import_module_from_path('preprocess', preprocess_path)
    MedicalDataProcessor = preprocess_module.MedicalDataProcessor
except Exception as e:
    print(f"Error importing MedicalDataProcessor: {e}")

    # Fallback implementation if import fails
    class MedicalDataProcessor:
        """Stand-in used only when the repository's preprocess module cannot be imported.

        It keeps the constructor and ``prepare_medical_corpus`` signatures that
        this notebook calls. The fallback does not load a tokenizer and does not
        define ``preprocess_text``; unless a subclass supplies them,
        ``prepare_medical_corpus`` returns the placeholder ``(None, 1001)``.
        """

        def __init__(
            self,
            max_length: int = 512,
            batch_size: int = 32,
            tokenizer_name: str = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
        ):
            """Store the configuration; no tokenizer or model is loaded here."""
            self.max_length = max_length
            self.batch_size = batch_size
            # Kept so the requested tokenizer is not silently dropped.
            self.tokenizer_name = tokenizer_name
            # Always defined, so the vocabulary-size lookup at the end of
            # prepare_medical_corpus cannot raise AttributeError.
            self.tokenizer = None

        def prepare_medical_corpus(self, save_dir: str = "processed_data", split: str = 'train'):
            """Prepare medical corpus for training or testing.

            Args:
                save_dir: Directory that is created if it does not exist yet.
                split: 'train', 'test', or anything else for the validation split.

            Returns:
                ``(tf_dataset, vocab_size)`` on success, or the placeholder
                ``(None, 1001)`` when tokenization is unavailable or the
                dataset cannot be loaded.
            """
            # Create save directory
            os.makedirs(save_dir, exist_ok=True)

            # Check for a tokenization hook before fetching any data, so a bare
            # fallback instance returns the placeholder instead of failing later.
            tokenize = getattr(self, "preprocess_text", None)
            if not callable(tokenize):
                print("Fallback MedicalDataProcessor has no preprocess_text(); returning placeholder values")
                return None, 1001

            # Try to load PubMed dataset
            try:
                # NOTE(review): `load_dataset` is resolved from the notebook's
                # global namespace; if it has not been imported, the resulting
                # NameError is caught below and the placeholder is returned.
                if split == 'train':
                    dataset = load_dataset("pubmed_qa", "pqa_unlabeled", split="train")
                elif split == 'test':
                    dataset = load_dataset("pubmed_qa", "pqa_labeled", split="test")
                else:
                    dataset = load_dataset("pubmed_qa", "pqa_unlabeled", split="validation")

                print(f"Successfully loaded PubMed QA {split} dataset")
            except Exception as e:
                print(f"Failed to load dataset: {e}")
                return None, 1001  # Placeholder return values

            # Extract texts from the dataset: prefer 'context', fall back to 'question'
            texts = [
                str(item['context']) if item.get('context') else str(item.get('question', ''))
                for item in dataset
            ]

            # Preprocess texts
            processed_data = tokenize(texts)

            # Imported in this method because a function-local import made in
            # __init__ would not be visible here.
            import tensorflow as tf

            # Convert to TensorFlow dataset
            tf_dataset = tf.data.Dataset.from_tensor_slices({
                "input_ids": processed_data["input_ids"],
                "attention_mask": processed_data["attention_mask"]
            })

            # Batch and prefetch
            tf_dataset = tf_dataset.batch(self.batch_size).prefetch(tf.data.AUTOTUNE)

            return tf_dataset, self.tokenizer.vocab_size if self.tokenizer else 1001

# Set up dataset processor
data_processor = MedicalDataProcessor(
    max_length=512,
    batch_size=32,
    tokenizer_name="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
)

# Attempt to load and preprocess the dataset
try:
    # DATASET_DIR comes from the Drive setup cell at the top of the notebook
    dataset, vocab_size = data_processor.prepare_medical_corpus(
        save_dir=os.path.join(DATASET_DIR, 'processed')
    )
except Exception as e:
    # Reset both names so values left over from an earlier run of this cell
    # cannot be mistaken for a fresh result.
    dataset, vocab_size = None, None
    print(f"Failed to load dataset: {e}")
else:
    if dataset is None:
        # The fallback processor returns (None, <placeholder>) instead of
        # raising, so do not report that case as a successful load.
        print(f"Dataset could not be loaded; placeholder vocabulary size: {vocab_size}")
    else:
        print(f"Dataset loaded successfully. Vocabulary size: {vocab_size}")
        print(f"Dataset shape: {dataset}")

In [11]:
# Train the model with Google Drive checkpoint saving
!python train.py \
    --batch_size 32 \
    --epochs 10 \
    --max_length 512 \
    --embed_dim 256 \
    --num_heads 8 \
    --ff_dim 512 \
    --num_transformer_blocks 6 \
    --dropout_rate 0.1 \
    --learning_rate 1e-4 \
    --checkpoint_dir {MODEL_CHECKPOINT_DIR} \
    --log_dir {MODEL_CHECKPOINT_DIR}/logs

## Download the Trained Model

In [None]:
# Create a zip file of the trained model
!zip -r dakitari_instruct_v1.0.zip {MODEL_CHECKPOINT_DIR}/Dakitari-instruct-v1.0

# Download the model
from google.colab import files
files.download('dakitari_instruct_v1.0.zip')

print("Model has been zipped and downloaded. The zip file contains all necessary files:")
!ls -l {MODEL_CHECKPOINT_DIR}/Dakitari-instruct-v1.0

## Evaluate the Model

In [None]:
# Evaluate the trained model
!python evaluate.py \
    --model_path {os.path.join(MODEL_CHECKPOINT_DIR, 'Dakitari-instruct-v1.0/model.safetensors')} \
    --max_length 512 \
    --batch_size 32

## Additional Notes
- Ensure you have the correct GitHub repository URL
- Check that all dependencies are in `requirements.txt`
- Modify hyperparameters as needed