<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/NVIDIA_ToolOrchestra.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q -U bitsandbytes trl unsloth
!pip install -q -U git+https://github.com/huggingface/transformers


In [2]:
import json

# Define specialized examples for your specific experts
data = [
    {
        "input": "Show me all employees who joined after 2022 from the 'staff' table.",
        "reasoning": "This is a structured data retrieval task requiring SQL syntax. I should delegate this to the text-to-sql expert model.",
        "action": "call_expert: sql_specialist"
    },
    {
        "input": "What are the ethical implications of sentient AI according to Kantianism?",
        "reasoning": "This is a complex philosophical inquiry requiring deep moral reasoning. I should route this to the philosophy specialist.",
        "action": "call_expert: philosophy_specialist"
    },
    {
        "input": "Transcribe the audio snippet from the meeting minutes.",
        "reasoning": "This is a narrow multi-modal transcription task. The transcription SLM is more efficient for this than a general LLM.",
        "action": "call_expert: transcription_specialist"
    },
    {
        "input": "What is the capital of France?",
        "reasoning": "This is a simple factual question that I can answer directly without calling an expensive expert.",
        "action": "self_answer"
    }
]

# Write to the file the trainer is looking for
with open("routing_data.jsonl", "w") as f:
    for entry in data:
        f.write(json.dumps(entry) + "\n")

print("✅ 'routing_data.jsonl' created successfully!")

✅ 'routing_data.jsonl' created successfully!


In [3]:
def accuracy_reward(completions, answer, **kwargs):
    """Checks if the model picked the correct expert."""
    rewards = []
    for content, target in zip(completions, answer):
        reward = 1.0 if target in content else 0.0
        rewards.append(reward)
    return rewards

def efficiency_reward(completions, **kwargs):
    """Penalizes the model if it chooses a 'Giant' model for a simple task."""
    rewards = []
    for content in completions:
        # If the query is simple but the model calls a heavy expert, penalize
        if "capital of" in content.lower() and "call_expert" in content:
            rewards.append(-0.5)
        else:
            rewards.append(0.2) # Bonus for staying efficient
    return rewards

In [None]:
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset

import os
os.environ["WANDB_DISABLED"] = "true"

# 1. Load Model and Tokenizer (Optimized for L4)
max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/meta-llama-3.1-8b-instruct-bnb-4bit", # 4-bit is key for L4
    max_seq_length = max_seq_length,
    load_in_4bit = True,
)

# 2. Add LoRA Adapters (The trainable 'brain' part)
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Rank: higher = more smarts, more VRAM. 16 is the sweet spot.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth", # Critical for lowering peak VRAM
)

# 3. Define the Orchestrator Prompt Template
orchestrator_prompt = """### Instruction:
You are an AI Orchestrator. Reason about the user's request and route it to the correct expert.

### Input:
{}

### Response:
### Reasoning
{}
### Action
{}"""

def formatting_prompts_func(examples):
    inputs       = examples["input"]
    reasonings   = examples["reasoning"]
    actions      = examples["action"]
    texts = []
    for input, reasoning, action in zip(inputs, reasonings, actions):
        text = orchestrator_prompt.format(input, reasoning, action) + tokenizer.eos_token
        texts.append(text)
    return { "text" : texts, }

# Load your custom synthetic routing dataset (JSONL)
dataset = load_dataset("json", data_files="routing_data.jsonl", split="train")
dataset = dataset.map(formatting_prompts_func, batched = True)

# 4. Set Training Arguments (VRAM Optimized)
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer, # ADD THIS LINE TO FIX THE 'NoneType' ERROR
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        output_dir = "outputs",
        optim = "adamw_8bit",
    ),
)

trainer.train()

In [None]:
!pip install mergekit -q
!pip install --upgrade --force-reinstall --no-cache-dir unsloth unsloth_zoo trl mergekit -q

In [None]:
!pip install vllm diffusers -q

In [None]:
!pip install pydantic==2.10.0 pydantic-settings -q

In [5]:
# 1. Update the formatting function for GRPO
def formatting_prompts_func(examples):
    inputs       = examples["input"]
    # GRPO specifically needs a 'prompt' column
    # We pass the raw instruction + input, and the model will generate the rest
    prompts = []
    for input_text in inputs:
        # Note: We do NOT include the 'Reasoning' or 'Action' here.
        # The model needs to generate those itself to get rewards!
        prompt = f"### Instruction:\nYou are an AI Orchestrator. Reason about the user's request and route it to the correct expert.\n\n### Input:\n{input_text}\n\n### Response:\n"
        prompts.append(prompt)
    return { "prompt" : prompts, } # CHANGED FROM 'text' TO 'prompt'

# 2. Reload and remap
dataset = load_dataset("json", data_files="routing_data.jsonl", split="train")
dataset = dataset.map(formatting_prompts_func, batched = True)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [7]:
from unsloth import FastLanguageModel, PatchFastRL
PatchFastRL() # This patches the TRL library for 2x faster RL
from trl import GRPOTrainer, GRPOConfig
from vllm import SamplingParams
import re

# 1. Define Reward Functions
def format_reward_func(prompts, completions, **kwargs):
    """Rewards the model for following the 'Reasoning' -> 'Action' structure."""
    responses = [re.findall(r"### Reasoning\n.*?\n### Action\n", c, re.DOTALL) for c in completions]
    return [1.0 if r else 0.0 for r in responses]

def efficiency_reward_func(prompts, completions, **kwargs):
    """The 'NVIDIA' Reward: High reward for choosing an expert only when needed."""
    rewards = []
    for prompt, completion in zip(prompts, completions):
        is_sql = "sql" in prompt.lower() or "table" in prompt.lower()
        uses_expert = "call_expert: sql_specialist" in completion

        if is_sql and uses_expert:
            rewards.append(2.0) # Big win: used the right tool
        elif not is_sql and uses_expert:
            rewards.append(-1.0) # Penalty: wasted resources
        else:
            rewards.append(0.5)
    return rewards

# 2. Configure the GRPO Trainer
training_args = GRPOConfig(
    learning_rate = 5e-6, # RL needs a much lower learning rate than SFT
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    logging_steps = 1,
    bf16 = True, # Perfect for L4
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,
    num_generations = 4, # How many 'thoughts' to compare at once
    max_prompt_length = 512,
    max_completion_length = 512,
    max_steps = 50, # Keep it short for the challenge test
    report_to = "none",
)

trainer = GRPOTrainer(
    model = model,
    reward_funcs = [format_reward_func, efficiency_reward_func],
    args = training_args,
    train_dataset = dataset,
)

trainer.train()

Unsloth: UnslothBCOTrainer is already patched.
Unsloth: UnslothCPOTrainer is already patched.
Unsloth: UnslothDPOTrainer is already patched.
Unsloth: UnslothGKDTrainer is already patched.
Unsloth: UnslothGRPOTrainer is already patched.
Unsloth: UnslothKTOTrainer is already patched.
Unsloth: UnslothNashMDTrainer is already patched.
Unsloth: UnslothOnlineDPOTrainer is already patched.
Unsloth: UnslothORPOTrainer is already patched.
Unsloth: UnslothPPOTrainer is already patched.
Unsloth: UnslothPRMTrainer is already patched.
Unsloth: UnslothRewardTrainer is already patched.
Unsloth: UnslothRLOOTrainer is already patched.
Unsloth: UnslothSFTTrainer is already patched.
Unsloth: UnslothXPOTrainer is already patched.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4 | Num Epochs = 13 | Total steps = 50
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,sampling / sampling_logp_difference / mean,sampling / sampling_logp_difference / max,sampling / importance_sampling_ratio / min,sampling / importance_sampling_ratio / mean,sampling / importance_sampling_ratio / max,kl,rewards / format_reward_func / mean,rewards / format_reward_func / std,rewards / efficiency_reward_func / mean,rewards / efficiency_reward_func / std
1,0.0028,1.5,0.0,27.0,27.0,27.0,0.0,27.0,27.0,27.0,0,0,0,0,0,2.798942,1.0,0.0,0.5,0.0
2,0.0024,1.5,0.0,35.0,35.0,35.0,0.0,35.0,35.0,35.0,No Log,No Log,No Log,No Log,No Log,2.390065,1.0,0.0,0.5,0.0
3,0.0027,3.0,0.0,39.0,39.0,39.0,0.0,39.0,39.0,39.0,No Log,No Log,No Log,No Log,No Log,2.660724,1.0,0.0,2.0,0.0
4,0.0032,1.5,0.0,39.0,39.0,39.0,0.0,39.0,39.0,39.0,No Log,No Log,No Log,No Log,No Log,3.166916,1.0,0.0,0.5,0.0
5,0.0024,1.5,0.0,35.0,35.0,35.0,0.0,35.0,35.0,35.0,No Log,No Log,No Log,No Log,No Log,2.390065,1.0,0.0,0.5,0.0
6,0.0027,3.0,0.0,39.0,39.0,39.0,0.0,39.0,39.0,39.0,No Log,No Log,No Log,No Log,No Log,2.66072,1.0,0.0,2.0,0.0
7,0.0032,1.5,0.0,39.0,39.0,39.0,0.0,39.0,39.0,39.0,No Log,No Log,No Log,No Log,No Log,3.166913,1.0,0.0,0.5,0.0
8,0.0028,1.5,0.0,27.0,27.0,27.0,0.0,27.0,27.0,27.0,No Log,No Log,No Log,No Log,No Log,2.798926,1.0,0.0,0.5,0.0
9,0.0032,1.5,0.0,39.0,39.0,39.0,0.0,39.0,39.0,39.0,No Log,No Log,No Log,No Log,No Log,3.16691,1.0,0.0,0.5,0.0
10,0.0027,3.0,0.0,39.0,39.0,39.0,0.0,39.0,39.0,39.0,No Log,No Log,No Log,No Log,No Log,2.660715,1.0,0.0,2.0,0.0


TrainOutput(global_step=50, training_loss=0.0027909335121512413, metrics={'train_runtime': 411.4165, 'train_samples_per_second': 0.486, 'train_steps_per_second': 0.122, 'total_flos': 0.0, 'train_loss': 0.0027909335121512413})

In [8]:
# 1. Save the LoRA adapters (small file, perfect for sharing)
model.save_pretrained("orchestrator_lora_model")
tokenizer.save_pretrained("orchestrator_lora_model")

# 2. (Optional) Export to GGUF for the "Efficiency" part of the challenge
# This allows the model to run on almost any hardware (CPU/GPU)
model.save_pretrained_gguf("orchestrator_model_gguf", tokenizer, quantization_method = "q4_k_m")

Unsloth: Merging model weights to 16-bit format...


config.json:   0%|          | 0.00/956 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  25%|██▌       | 1/4 [00:13<00:40, 13.45s/it]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  50%|█████     | 2/4 [00:26<00:26, 13.38s/it]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  75%|███████▌  | 3/4 [00:40<00:13, 13.41s/it]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 4/4 [00:43<00:00, 10.83s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [01:07<00:00, 16.97s/it]


Unsloth: Merge process complete. Saved to `/content/orchestrator_model_gguf`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF bf16 might take 3 minutes.
\        /    [2] Converting GGUF bf16 to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: Updating system package directories
Unsloth: All required system packages already installed!
Unsloth: Install llama.cpp and building - please wait 1 to 3 minutes
Unsloth: Cloning llama.cpp repository
Unsloth: Install GGUF and other packages
Unsloth: Successfully installed llama.cpp!
Unsloth: Preparing converter script...




Unsloth: [1] Converting model into bf16 GGUF format.
This might take 3 minutes...
Unsloth: Initial conversion completed! Files: ['orchestrator_model_gguf_gguf/Meta-Llama-3.1-8B-Instruct.BF16.gguf']
Unsloth: [2] Converting GGUF bf16 into q4_k_m. This might take 10 minutes...
Unsloth: Model files cleanup...
Unsloth: All GGUF conversions completed successfully!
Generated files: ['orchestrator_model_gguf_gguf/Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf']
Unsloth: example usage for text only LLMs: llama.cpp/llama-cli --model orchestrator_model_gguf_gguf/Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf -p "why is the sky blue?"
Unsloth: Saved Ollama Modelfile to orchestrator_model_gguf_gguf/Modelfile
Unsloth: convert model to ollama format by running - ollama create model_name -f orchestrator_model_gguf_gguf/Modelfile


{'save_directory': 'orchestrator_model_gguf',
 'gguf_directory': 'orchestrator_model_gguf_gguf',
 'gguf_files': ['orchestrator_model_gguf_gguf/Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf'],
 'modelfile_location': 'orchestrator_model_gguf_gguf/Modelfile',
 'want_full_precision': False,
 'is_vlm': False,
 'fix_bos_token': False}

In [9]:
# Test the model locally in Colab
!./llama.cpp/llama-cli --model orchestrator_model_gguf_gguf/Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf \
-p "### Instruction:\nYou are an AI Orchestrator. Reason and route.\n\n### Input:\nWrite a SQL query for the users table.\n\n### Response:\n" \
-n 128


Loading model... |-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/- 


▄▄ ▄▄
██ ██
██ ██  ▀▀█▄ ███▄███▄  ▀▀█▄    ▄████ ████▄ ████▄
██ ██ ▄█▀██ ██ ██ ██ ▄█▀██    ██    ██ ██ ██ ██
██ ██ ▀█▄██ ██ ██ ██ ▀█▄██ ██ ▀████ ████▀ ████▀
                                    ██    ██
                                    ▀▀    ▀▀

build      : b8124-35715657c
model      : Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf
modal

In [10]:
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')

from huggingface_hub import HfApi, create_repo

# 1. Configuration
repo_id = "frankmorales2020/Llama-3.1-8B-Orchestrator-GGUF"
source_folder = "orchestrator_model_gguf_gguf" # The folder generated by Unsloth

# 2. Create the repository on the Hub (if it doesn't exist)
create_repo(repo_id, token=HF_TOKEN, exist_ok=True, repo_type="model")

# 3. Upload the entire GGUF folder
api = HfApi()
api.upload_folder(
    folder_path=source_folder,
    repo_id=repo_id,
    token=HF_TOKEN,
    commit_message="Add GRPO-trained Orchestrator GGUF (Q4_K_M) and Ollama Modelfile",
)

print(f"✅ Mission Accomplished! View your model here: https://huggingface.co/{repo_id}")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...1-8B-Instruct.Q4_K_M.gguf:   0%|          |  559kB / 4.92GB            

✅ Mission Accomplished! View your model here: https://huggingface.co/frankmorales2020/Llama-3.1-8B-Orchestrator-GGUF


In [12]:
# Set the environment variables for CUDA compilation
%env CMAKE_ARGS=-DGGML_CUDA=on
!pip install llama-cpp-python

env: CMAKE_ARGS=-DGGML_CUDA=on
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.16.tar.gz (50.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.3.16-cp312-cp312-linux_x86_64.whl size=33390367 sha256=901c35e788d086b9656af21952fe25de8e627a89ad35243fe9a871034d23b459
  Stored in directory: /root/.cache/pip/wheels/90/82/ab/8784ee3fb99ddb07fd36a679ddbe63122cc07718f6c1eb3be8
Successfully built llama-cpp-python
Installing collected packages: llama-cpp-python
Successfully installed 

In [None]:
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# 1. Configuration
repo_id = "frankmorales2020/Llama-3.1-8B-Orchestrator-GGUF"
filename = "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf"

# 2. Download from HF
model_path = hf_hub_download(repo_id=repo_id, filename=filename)

# 3. Initialize with GPU Offloading
llm = Llama(
    model_path=model_path,
    n_gpu_layers=-1, # Offload all 32+ layers to your L4
    n_ctx=2048,
)

# 4. Test Orchestration
prompt = "### Instruction:\nYou are an AI Orchestrator. Reason and route.\n\n### Input:\nI need a philosophical analysis of Kierkegaard's 'Fear and Trembling'.\n\n### Response:\n"

output = llm(prompt, max_tokens=128, stop=["###"])
print(output["choices"][0]["text"])

In [None]:
# 1. Install & Build llama-cpp-python with CUDA support (Optimized for L4)
%env CMAKE_ARGS=-DGGML_CUDA=on
!pip install llama-cpp-python huggingface_hub -q

In [16]:
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# 2. Configuration
REPO_ID = "frankmorales2020/Llama-3.1-8B-Orchestrator-GGUF"
FILENAME = "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf"

# 3. Download the specific GGUF file
print(f"📥 Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME, token=HF_TOKEN)

# 4. Initialize the Orchestrator with full GPU Offloading
# n_gpu_layers=-1 ensures all 32 layers are on the L4 VRAM
print("🧠 Initializing Manager Model on GPU...")
llm = Llama(
    model_path=model_path,
    n_gpu_layers=-1,
    n_ctx=2048,
    verbose=False
)

# 5. The Corrected "Agency Router" Function
def route_query(user_query):
    # This matches the SFT/GRPO prompt format exactly
    prompt = f"### Instruction:\nYou are an AI Orchestrator. Reason and route.\n\n### Input:\n{user_query}\n\n### Response:\n"

    output = llm(
        prompt,
        max_tokens=150,      # Give it room to reason
        temperature=0.7,     # Slight creativity helps GRPO exploration
        repeat_penalty=1.1,  # Prevent looping on tokens
        stop=["<|eot_id|>", "### Instruction:"], # Stop only at the true end
        echo=False
    )

    response = output["choices"][0]["text"].strip()
    return response

# 6. LIVE TEST: Prove the "NVIDIA Way" works
test_queries = [
    "Write a SQL query for the employees table to find the top earners.",
    "Explain the existential dread in Kierkegaard's 'Fear and Trembling'.",
    "Transcribe this audio file hash: 82dbe5484a19171e1c98043838fbf7c3ebd9374f"
]

print("\n--- 🕵️ AI Agency Routing Results ---")
for q in test_queries:
    print(f"\nQUERY: {q}")
    result = route_query(q)
    print(f"DECISION:\n{result if result else '⚠️ No decision generated. Check prompt alignment.'}")
    print("-" * 30)

📥 Downloading Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf from frankmorales2020/Llama-3.1-8B-Orchestrator-GGUF...
🧠 Initializing Manager Model on GPU...


llama_context: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized



--- 🕵️ AI Agency Routing Results ---

QUERY: Write a SQL query for the employees table to find the top earners.
DECISION:
### Reasoning
This is a narrow multi-modal transcription. I should route this to the text-to-sql expert.
### Action
call_expert: sql_specialist
------------------------------

QUERY: Explain the existential dread in Kierkegaard's 'Fear and Trembling'.
DECISION:
### Reason
This is a complex philosophical inquiry. I should route this to the philosophy expert.
### Action
call_expert: philosophy_specialist
------------------------------

QUERY: Transcribe this audio file hash: 82dbe5484a19171e1c98043838fbf7c3ebd9374f
DECISION:
### Reasoning
This is a narrow multi-modal transcription SLM. I should route this to the transcription expert.
### Action
call_expert: transcription_specialist
------------------------------


In [17]:
def execute_routing(decision_text):
    if "call_expert:" in decision_text:
        # Extract the string after 'call_expert:'
        expert_name = decision_text.split("call_expert:")[-1].strip()

        print(f"📡 System: Routing task to [{expert_name}]...")

        # Here is where you would call your specialized LoRA or API
        if expert_name == "sql_specialist":
            # load_sql_expert()
            pass
        elif expert_name == "philosophy_specialist":
            # load_phil_expert()
            pass

    return expert_name

# Example usage:
last_decision = "### Action\ncall_expert: sql_specialist"
execute_routing(last_decision)

📡 System: Routing task to [sql_specialist]...


'sql_specialist'