# Phoenix Context Collapse Parser (CCP) — Fine-Tuning Notebook

This notebook fine-tunes a small language model to become the **Context Collapse Parser** — a domain-specific intent parser that translates ambiguous GTM prompts into structured **GTM Intent IR** (JSON).

**What CCP does:**
- Infers implied GTM context (role, motion, ICP, geography, time horizon)
- Outputs structured JSON with confidence scores
- Enables downstream LLMs to execute reliably via Phoenix MCP tools

**Environment:**
- macOS M-series with Metal/MPS
- Python 3.11
- QLoRA fine-tuning (4-bit quantization)

**See:** `PRD.md` for full architecture and `gtm_domain_knowledge.md` for training context.

## Cell 1 — Environment sanity check


In [None]:
import torch
import platform

# Report the interpreter and PyTorch versions, and whether the Metal (MPS)
# backend is both compiled into this torch build and usable on this machine.
print(f"Python: {platform.python_version()}")
print(f"Torch: {torch.__version__}")
print(f"MPS available: {torch.backends.mps.is_available()}")
print(f"MPS built: {torch.backends.mps.is_built()}")


## Cell 2 — CCP Configuration

In [None]:
# ===== CCP CONFIG =====
# Notebook-wide settings; the cells below read these constants by name.

# Base model — recommend ~3B param model for fast inference
# Options: "microsoft/phi-2", "mistralai/Mistral-7B-Instruct-v0.2", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Loaded with local_files_only=True, so the model files must already be on disk.
BASE_MODEL_PATH = "./models/phi-2"

# Training dataset (JSONL with gtm_prompt -> intent_ir pairs)
DATASET_PATH = "./data/ccp_training.jsonl"

# Output directory for CCP LoRA adapter
# Also used as the Trainer output_dir and as the location of ccp_metadata.json.
OUTPUT_DIR = "./ccp-adapter"

# ===== GTM INTENT IR SCHEMA VERSION =====
# Written into ccp_metadata.json next to the adapter in the final cell.
IR_SCHEMA_VERSION = "1.0.0"

# ===== TRAINING PARAMS =====
NUM_EPOCHS = 3
LEARNING_RATE = 2e-4
# Passed as per_device_train_batch_size.
BATCH_SIZE = 1
# Passed as gradient_accumulation_steps (8 accumulated steps of BATCH_SIZE examples each).
GRAD_ACCUM_STEPS = 8
# Upper bound (in tokens) applied when tokenizing each formatted example.
MAX_SEQ_LENGTH = 1024  # GTM prompts are short, IR is compact

## Cell 3 — Imports


In [None]:
import json
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model
import transformers

## Cell 4 — Load tokenizer


In [None]:
# Load the tokenizer that matches the base model, from local files only
# (no download is attempted).
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL_PATH,
    local_files_only=True,
)

# If the tokenizer defines no pad token, reuse the end-of-sequence token so
# that any code which pads sequences has a token to pad with.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


## Cell 5 — Load model (QLoRA, Metal-safe)

⚠️ Important:
- We **do NOT use fp16** on macOS
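- 4-bit quantization is done via bitsandbytes; this requires a bitsandbytes build that supports Apple Silicon/MPS (official releases have historically targeted CUDA), so verify your installation before running this cell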


In [None]:
# Quantization settings: 4-bit NF4 weights with double quantization; compute
# is done in float32 (no fp16, in line with the note above this cell).
# NOTE(review): bitsandbytes 4-bit kernels have historically required CUDA —
# confirm the installed bitsandbytes build actually supports the MPS device.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float32,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load the base model from local files; the {"": "mps"} device map places the
# whole model on the MPS device.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    quantization_config=bnb_config,
    device_map={"": "mps"},
    local_files_only=True,
)

# Turn off the generation KV cache and enable gradient checkpointing for
# training (presumably to reduce activation memory — the two settings are
# commonly applied together).
model.config.use_cache = False
model.gradient_checkpointing_enable()


## Cell 6 — LoRA configuration


In [None]:
# LoRA hyper-parameters: rank-16 adapters (alpha 32, dropout 0.05) attached to
# the modules named "q_proj" and "v_proj"; bias terms are not trained.
# NOTE(review): these module names must exist in the chosen base model —
# confirm when switching BASE_MODEL_PATH to a different architecture.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the quantized base model with the adapters and report how many
# parameters are trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


## Cell 7 — Load dataset


In [None]:
# Read the JSONL training file with the generic "json" loader and take its
# "train" split as the working dataset.
dataset = load_dataset("json", data_files=DATASET_PATH, split="train")


## Cell 8 — GTM Intent IR Schema

Define the structured output schema that CCP must produce.

In [None]:
# GTM Intent IR Schema (v1)
# Maps each enum-valued IR field to the list of values that field may take.
GTM_INTENT_IR_SCHEMA = dict(
    intent_type=[
        "account_discovery",
        "pipeline_analysis",
        "expansion_identification",
        "churn_risk_assessment",
        "lead_prioritization",
        "territory_planning",
        "forecast_review",
        "competitive_analysis",
        "engagement_summary",
    ],
    motion=[
        "outbound",
        "inbound",
        "expansion",
        "renewal",
        "churn_prevention",
    ],
    role_assumption=[
        "sales_rep",
        "sales_manager",
        "revops",
        "marketing",
        "cs",
        "exec",
    ],
    account_scope=[
        "net_new",
        "existing",
        "churned",
        "all",
    ],
    time_horizon=[
        "immediate",
        "this_week",
        "this_month",
        "this_quarter",
        "this_year",
        "custom",
    ],
    output_format=[
        "list",
        "summary",
        "detailed",
        "export",
        "visualization",
    ],
)

# System prompt for CCP
CCP_SYSTEM_PROMPT = """You are the Phoenix Context Collapse Parser (CCP). Your job is to transform ambiguous GTM (Go-To-Market) prompts into structured GTM Intent IR.

Given a user's GTM request, output a JSON object with:
- intent_type: The primary intent category
- motion: The GTM motion (outbound, expansion, renewal, etc.)
- role_assumption: Inferred user role
- account_scope: Which accounts (net_new, existing, all)
- icp_selector: Which ICP to apply (default, or specific product/segment)
- icp_resolution_required: true if ICP needs downstream resolution
- geography_scope: Geographic filter if mentioned (null if global)
- time_horizon: Time scope for the request
- output_format: How results should be presented
- confidence_scores: 0.0-1.0 confidence for each inferred field
- assumptions_applied: List of assumptions made
- clarification_needed: true if request is too ambiguous

Output ONLY valid JSON. No explanation."""


def format_ccp_example(example):
    """Format one training record as a single prompt + target string.

    Args:
        example: Mapping with a ``gtm_prompt`` string and either an
            ``intent_ir`` value (a dict, or an already-serialized JSON string)
            or, for legacy records, a ``response`` string.

    Returns:
        ``{"text": ...}`` where the text is the instruction block (system
        prompt + user request) followed by the target IR JSON.

    A key whose value is ``None`` is treated the same as a missing key. This
    matters for mixed-format files: tabular loaders commonly materialize a
    column that is absent from a record as ``None``, which would otherwise be
    serialized as the literal target ``null`` (or crash on ``.strip()``).
    """
    prompt = example["gtm_prompt"].strip()

    ir_value = example.get("intent_ir")
    if ir_value is None:
        # Legacy format compatibility: fall back to the raw response text,
        # or to an empty JSON object when that is absent as well.
        legacy_response = example.get("response")
        ir_output = "{}" if legacy_response is None else legacy_response.strip()
    elif isinstance(ir_value, str):
        # Already serialized; only trim surrounding whitespace.
        ir_output = ir_value.strip()
    else:
        ir_output = json.dumps(ir_value, indent=2)

    # Format as instruction -> JSON output. The inference cell rebuilds the
    # same prefix (everything up to and including "[/INST]\n").
    text = f"<s>[INST] {CCP_SYSTEM_PROMPT}\n\nUser request: {prompt} [/INST]\n{ir_output}</s>"
    return {"text": text}

# Apply the formatter to every record, adding a "text" column that holds the
# full prompt + target string.
dataset = dataset.map(format_ccp_example)

## Cell 9 — Tokenization


In [None]:
def tokenize(batch):
    """Tokenize the ``text`` field of *batch*.

    Sequences are truncated to ``MAX_SEQ_LENGTH`` tokens and left unpadded.
    Returns whatever the tokenizer call returns.
    """
    encode_options = {
        "truncation": True,
        "max_length": MAX_SEQ_LENGTH,
        "padding": False,
    }
    return tokenizer(batch["text"], **encode_options)

# Tokenize every record and drop all pre-existing columns, so that only the
# tokenizer's outputs remain in the resulting dataset.
tokenized_dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

## Cell 10 — Data collator


In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator for causal-LM training (mlm=False disables masked-LM masking).
# NOTE(review): with mlm=False this collator is understood to copy input_ids
# into labels and set every position equal to the pad token id to -100. The
# tokenizer cell may set pad_token = eos_token, in which case end-of-sequence
# targets would be excluded from the loss — confirm this is acceptable.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


## Cell 11 — Training arguments


In [None]:
# Trainer settings, driven by the constants from the config cell.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    # Log every 10 steps; checkpoint every 500 steps, keeping at most 2.
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    # No external experiment-tracking integration.
    report_to="none",
    optim="adamw_torch",
    # Mixed precision is disabled in both forms, matching the float32 compute
    # dtype chosen in the quantization config.
    bf16=False,
    fp16=False,
)


## Cell 12 — Trainer


In [None]:
# Assemble the Trainer from the LoRA-wrapped model, the tokenized dataset and
# the causal-LM collator. No eval_dataset is passed, so only training is set up.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)


## Cell 13 — Train CCP

Training a ~3B model with QLoRA on GTM intent parsing.
Expect 1-3 hours depending on dataset size.

In [None]:
# Run fine-tuning with the arguments configured above (output_dir is OUTPUT_DIR).
trainer.train()


## Cell 14 — Save adapter

This saves the **LoRA adapter only** (not the base model weights), together with the tokenizer files.


In [None]:
# Write the PEFT-wrapped model and the tokenizer files into OUTPUT_DIR.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)


## Cell 15 — CCP Inference & Validation

Test the trained CCP model with GTM prompts and validate JSON output.

In [None]:
def _parse_intent_json(json_str: str) -> dict:
    """Parse the first JSON object found in *json_str* into a tagged IR dict.

    Returns the parsed object with ``_valid=True`` and ``_raw`` added, or
    ``{"_valid": False, "_error": ..., "_raw": ...}`` when the text holds no
    parseable JSON or the JSON value is not an object.
    """
    # Start at the first "{" so a short preamble does not sink the parse;
    # raw_decode stops at the end of the first complete value, so trailing
    # text after the object is tolerated as well.
    start = max(json_str.find("{"), 0)
    try:
        parsed, _ = json.JSONDecoder().raw_decode(json_str, start)
    except json.JSONDecodeError as e:
        return {"_valid": False, "_error": str(e), "_raw": json_str}

    if not isinstance(parsed, dict):
        # Valid JSON that is not an object (list, string, number...) cannot
        # carry IR fields or the bookkeeping keys below.
        return {
            "_valid": False,
            "_error": f"Expected a JSON object, got {type(parsed).__name__}",
            "_raw": json_str,
        }

    parsed["_valid"] = True
    parsed["_raw"] = json_str
    return parsed


def parse_gtm_intent(user_prompt: str, max_new_tokens: int = 500) -> dict:
    """
    Run CCP inference: GTM prompt -> Intent IR JSON

    Args:
        user_prompt: The free-form GTM request to parse.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The parsed IR dict with ``_valid=True`` and ``_raw`` (the extracted
        completion text) added, or ``{"_valid": False, "_error", "_raw"}``
        when the completion is not a JSON object.

    Side effects:
        Puts the module-level ``model`` into eval mode.
    """
    # Same prefix the training formatter produces, up to the target JSON.
    input_text = f"<s>[INST] {CCP_SYSTEM_PROMPT}\n\nUser request: {user_prompt} [/INST]\n"
    # Follow the model's device instead of hard-coding one.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Training leaves the model in train mode; switch to eval so dropout
    # (including LoRA dropout) does not perturb generation.
    model.eval()
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.1,  # Low temp for structured output
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens. This avoids locating the answer
    # by splitting on "[/INST]", which breaks when that marker appears in the
    # user prompt or in the generated text.
    prompt_length = inputs["input_ids"].shape[-1]
    generated = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    # Cut at the first literal </s>: anything after it is not part of the
    # answer and must not be glued back onto the JSON.
    json_str = generated.split("</s>", 1)[0].strip()

    return _parse_intent_json(json_str)


def validate_intent_ir(ir: dict) -> list[str]:
    """Validate IR against schema, return list of issues

    Args:
        ir: IR dict as returned by ``parse_gtm_intent`` (carries ``_valid``
            and, on failure, ``_error``).

    Returns:
        Human-readable issue strings; an empty list means no issue was found.
        An IR that is not marked ``_valid`` yields a single "Invalid JSON"
        issue and is not inspected further.
    """
    if not ir.get("_valid", False):
        return [f"Invalid JSON: {ir.get('_error', 'unknown')}"]

    issues = []

    # Check required fields
    required = ("intent_type", "motion", "role_assumption", "account_scope")
    issues.extend(
        f"Missing required field: {field}" for field in required if field not in ir
    )

    # Validate enum values (only for fields that are present)
    for field, valid_values in GTM_INTENT_IR_SCHEMA.items():
        if field in ir and ir[field] not in valid_values:
            issues.append(f"Invalid {field}: {ir[field]} (valid: {valid_values})")

    # Check confidence scores
    if "confidence_scores" in ir:
        scores = ir["confidence_scores"]
        if not isinstance(scores, dict):
            # Model output is untrusted: report a wrong shape instead of
            # crashing on .items().
            issues.append(
                f"Invalid confidence_scores: expected an object, got {type(scores).__name__}"
            )
        else:
            for field, score in scores.items():
                # bool is a subclass of int, so exclude it explicitly:
                # true/false is not a confidence value.
                is_number = isinstance(score, (int, float)) and not isinstance(score, bool)
                if not is_number or not 0 <= score <= 1:
                    issues.append(f"Invalid confidence score for {field}: {score}")

    return issues


# Smoke test: run a handful of deliberately ambiguous GTM prompts through CCP,
# print the resulting IR (or the raw text when it is not valid JSON), and
# list any schema validation issues.
TEST_PROMPTS = [
    "Show me my best accounts",
    "Which deals are at risk this quarter?",
    "Find me companies like Acme Corp",
    "I need to hit my number, what should I focus on?",
    "Give me expansion opportunities in EMEA",
]

print("=" * 60, "CCP INFERENCE TEST", "=" * 60, sep="\n")

for prompt in TEST_PROMPTS:
    print(f"\nPrompt: {prompt}", "-" * 40, sep="\n")

    ir = parse_gtm_intent(prompt)
    issues = validate_intent_ir(ir)

    if not ir.get("_valid"):
        # Show at most the first 200 characters of the unparseable output.
        print(f"[INVALID] {ir.get('_raw', '')[:200]}")
    else:
        # Pretty print the IR without the internal "_"-prefixed bookkeeping keys.
        display_ir = {k: v for k, v in ir.items() if not k.startswith("_")}
        print(json.dumps(display_ir, indent=2))

    if issues:
        print(f"\nValidation issues: {issues}")

## Cell 16 — Save CCP Adapter

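Saves the LoRA adapter and tokenizer, and writes a `ccp_metadata.json` file next to them containing the IR schema version, the base model path, and the allowed intent types, motions, and role assumptions.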

In [None]:
import os

# Persist the adapter weights and tokenizer files into OUTPUT_DIR.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Describe the adapter: which IR schema version and base model it was built
# against, and the enum vocabularies in effect at that time.
ccp_metadata = dict(
    schema_version=IR_SCHEMA_VERSION,
    base_model=BASE_MODEL_PATH,
    intent_types=GTM_INTENT_IR_SCHEMA["intent_type"],
    motions=GTM_INTENT_IR_SCHEMA["motion"],
    role_assumptions=GTM_INTENT_IR_SCHEMA["role_assumption"],
)

# Write the metadata as pretty-printed JSON alongside the adapter files.
with open(os.path.join(OUTPUT_DIR, "ccp_metadata.json"), "w") as f:
    json.dump(ccp_metadata, f, indent=2)

print(f"CCP adapter saved to {OUTPUT_DIR}")