# UMPF Equivalency Pattern Recognition Engine - FIXED

## From Months to Minutes: The Leibniz I-Ching Indra's Net Conjecture

This is the **fixed version** of the notebook: it resolves the Kaggle dependency conflicts by installing compatible package versions.

### Key Changes:
- Compatible transformers/trl versions
- Alternative training approach if SFTTrainer fails
- Robust error handling

---

## 1. Environment Setup - FIXED

In [None]:
# Install compatible versions for Kaggle
!pip install torch torchvision torchaudio --quiet
!pip install transformers>=4.41.0 --quiet
!pip install datasets>=2.15.0 --quiet
!pip install accelerate>=0.25.0 --quiet

# Try to install TRL, fallback if it fails
try:
    import subprocess
    subprocess.run(["pip", "install", "trl>=0.7.4", "--quiet"], check=True)
    TRL_AVAILABLE = True
    print("✓ TRL installed successfully")
except Exception as e:
    TRL_AVAILABLE = False
    print(f"⚠ TRL installation failed: {e}")
    print("Will use standard Trainer instead")

In [None]:
# Import libraries with fallbacks
import json
import torch
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    pipeline
)

# Try to import SFTTrainer, use fallback if not available.
# RuntimeError is caught alongside ImportError because a TRL build that is
# installed but incompatible with the installed transformers version can fail
# inside its lazy module loader with RuntimeError rather than ImportError.
try:
    from trl import SFTTrainer
    print("✓ SFTTrainer imported successfully")
    USE_SFT = True
except (ImportError, RuntimeError) as e:
    print("⚠ SFTTrainer not available, using standard Trainer")
    print(f"  Reason: {e}")
    USE_SFT = False

import gc
import os
from datetime import datetime

# Report which device this session will train on
cuda_ready = torch.cuda.is_available()
print(f"\nGPU Available: {cuda_ready}")
if not cuda_ready:
    print("Using CPU - training will be slower")
else:
    # Device 0 is the only device inspected here
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {gpu_props.total_memory / 1e9:.1f}GB")

## 2. Load UMPF Training Data

In [None]:
# Load the equivalency training dataset
# Make sure to upload equivalency-training-pairs.json as a dataset

# Candidate locations, checked in order; the first one that exists wins
possible_paths = [
    "/kaggle/input/umpf-equivalency-training/equivalency-training-pairs.json",  # Updated path
    "/kaggle/input/umpf-training/equivalency-training-pairs.json",
    "/kaggle/working/equivalency-training-pairs.json",
    "../input/equivalency-training-pairs.json",
    "./equivalency-training-pairs.json"
]

training_data_path = next(
    (candidate for candidate in possible_paths if os.path.exists(candidate)),
    None,
)

if training_data_path is None:
    raise FileNotFoundError("Please upload equivalency-training-pairs.json as a dataset named 'umpf-equivalency-training'")

print(f"✓ Found training data at: {training_data_path}")

# Parse the JSON file and pull out the two sections used by the notebook
with open(training_data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

dataset_section = data["equivalency_training_dataset"]
training_examples = dataset_section["training_examples"]
system_prompt = dataset_section["system_prompt"]

print(f"Loaded {len(training_examples)} training examples")
print(f"System prompt: {system_prompt[:100]}...")

## 3. Data Preprocessing - Enhanced

In [None]:
# Enhanced data formatting with multiple approaches
def format_training_examples_sft(training_examples):
    """Flatten each chat example into one tagged string for SFTTrainer.

    Every message whose role is "system", "user" or "assistant" becomes
    ``<|role|>content<|endoftext|>``; messages with any other role are
    skipped. The segments are concatenated in message order.

    Returns a list of ``{"text": ...}`` dicts, one per input example.
    """
    known_roles = ("system", "user", "assistant")
    records = []

    for item in training_examples:
        segments = [
            f"<|{turn['role']}|>{turn['content']}<|endoftext|>"
            for turn in item["messages"]
            if turn["role"] in known_roles
        ]
        records.append({"text": "".join(segments)})

    return records

def format_training_examples_standard(training_examples):
    """Build "Human: ... Assistant: ..." strings for the standard Trainer.

    Only the first user message and the first assistant message of each
    example are used; a missing one is rendered as an empty string. System
    messages are not included in this format.

    Returns a list of ``{"text": ...}`` dicts, one per input example.
    """
    def first_content(turns, role):
        # Content of the first message with the given role, or "" if none
        for turn in turns:
            if turn['role'] == role:
                return turn['content']
        return ""

    records = []
    for item in training_examples:
        turns = item["messages"]
        question = first_content(turns, 'user')
        answer = first_content(turns, 'assistant')
        records.append({"text": "Human: {}\n\nAssistant: {}".format(question, answer)})

    return records

# Pick the text format that matches the trainer that could be imported
if not USE_SFT:
    formatted_examples = format_training_examples_standard(training_examples)
    print("Using standard Trainer formatting")
else:
    formatted_examples = format_training_examples_sft(training_examples)
    print("Using SFTTrainer formatting")

# Wrap the formatted rows in a datasets.Dataset
train_dataset = Dataset.from_list(formatted_examples)
print(f"Created dataset with {len(train_dataset)} examples")

# Preview the first 500 characters of the first row
print("\n=== Sample Training Example ===")
print(f"{train_dataset[0]['text'][:500]}...")

## 4. Model Setup - Robust

In [None]:
# Model configuration for Kaggle with fallbacks
MODEL_OPTIONS = [
    "microsoft/DialoGPT-medium",
    "gpt2-medium",
    "gpt2"
]

MAX_LENGTH = 1024  # Conservative for memory

model = None
tokenizer = None
model_name = None

# Try models in order of preference; the first one that loads is used
for model_candidate in MODEL_OPTIONS:
    try:
        print(f"Trying to load model: {model_candidate}")

        # Load tokenizer; reuse EOS as the padding token when none is defined
        tokenizer = AutoTokenizer.from_pretrained(model_candidate)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Load the weights in float32 even on GPU. The training arguments in
        # this notebook enable fp16 mixed precision, which keeps float32
        # master weights and casts per-op; weights that are already float16
        # make the gradient scaler fail with
        # "Attempting to unscale FP16 gradients".
        model = AutoModelForCausalLM.from_pretrained(
            model_candidate,
            torch_dtype=torch.float32,
            device_map="auto" if torch.cuda.is_available() else None
        )

        model_name = model_candidate
        print(f"✓ Successfully loaded: {model_candidate}")
        break

    except Exception as e:
        # Broad on purpose: any load failure should move on to the next
        # candidate rather than abort the notebook.
        print(f"✗ Failed to load {model_candidate}: {e}")
        continue

if model is None:
    raise RuntimeError("Could not load any model")

print(f"\n🎯 Using model: {model_name}")
print(f"Model parameters: {model.num_parameters():,}")
print(f"Max sequence length: {MAX_LENGTH}")

## 5. Training Configuration - Optimized

In [None]:
# Training arguments optimized for Kaggle
output_dir = "/kaggle/working/umpf-equivalency-model"
os.makedirs(output_dir, exist_ok=True)

# Conservative settings for stability.
# Notes on arguments that differ from a naive configuration:
#   * No evaluation-strategy argument is passed. Evaluation is off by
#     default, and the keyword was renamed from `evaluation_strategy` to
#     `eval_strategy` in transformers 4.41, so omitting it works on both
#     sides of the rename.
#   * `save_steps` is not set because it is ignored when
#     save_strategy="epoch".
#   * report_to is the string "none": passing None does not disable the
#     logging integrations in transformers 4.x (it falls back to "all",
#     which can start wandb and block on an API-key prompt).
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=2,  # Reduced for faster training
    per_device_train_batch_size=1,  # Very conservative
    per_device_eval_batch_size=1,
    warmup_steps=10,
    learning_rate=5e-6,  # Lower learning rate
    logging_steps=5,
    save_strategy="epoch",
    load_best_model_at_end=False,
    fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU exists
    gradient_checkpointing=True,
    dataloader_num_workers=0,
    report_to="none",
    push_to_hub=False,
    remove_unused_columns=False,
    prediction_loss_only=True,
    max_grad_norm=1.0,
)

print("Training configuration:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  FP16: {training_args.fp16}")
print(f"  Gradient checkpointing: {training_args.gradient_checkpointing}")

## 6. Tokenization - Robust

In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of rows from the "text" column.

    Sequences longer than MAX_LENGTH tokens are truncated. No padding is
    applied and no tensors are requested here: the
    DataCollatorForLanguageModeling used by the standard Trainer pads each
    batch to its own longest sequence, which avoids processing every example
    at the full MAX_LENGTH.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_LENGTH,
    )

print("Tokenizing dataset...")
tokenized_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names
)

# No "labels" column is added: with mlm=False the language-modeling collator
# builds labels from input_ids for every batch (masking padding positions),
# and a pre-computed unpadded labels column would prevent it from padding.

print(f"✓ Dataset tokenized: {len(tokenized_dataset)} examples")

## 7. Start Training - Flexible Approach

In [None]:
# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Causal language modeling
)

print("🚀 Starting UMPF Equivalency Training...")
print("This will train the model to recognize universal computational patterns!")

# Try SFTTrainer first, fallback to standard Trainer
trainer = None
training_successful = False

if USE_SFT:
    try:
        print("Attempting SFTTrainer...")
        trainer = SFTTrainer(
            model=model,
            train_dataset=train_dataset,  # Use original dataset
            tokenizer=tokenizer,
            args=training_args,
            dataset_text_field="text",
            max_seq_length=MAX_LENGTH,
        )
        trainer.train()
        training_successful = True
        print("✓ SFTTrainer succeeded")
    except Exception as e:
        # Broad on purpose: any SFTTrainer failure (construction or training)
        # should hand over to the standard Trainer below.
        print(f"✗ SFTTrainer failed: {e}")
        print("Falling back to standard Trainer...")

if not training_successful:
    try:
        print("Using standard Trainer...")
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset,
            data_collator=data_collator,
        )
        trainer.train()
        training_successful = True
        print("✓ Standard Trainer succeeded")
    except Exception as e:
        print(f"✗ Standard Trainer failed: {e}")
        # Chain the original error so its traceback is kept
        raise RuntimeError("All training approaches failed") from e

# Execution only reaches this point when one of the trainers finished:
# the fallback path raises on failure, so no separate failure branch is needed.
print("\n✅ Training completed successfully!")

## 8. Save Model

In [None]:
# Save the trained model
print(f"Saving model to {output_dir}")

try:
    trainer.save_model()
    tokenizer.save_pretrained(output_dir)
    print("✅ Model saved successfully!")
except Exception as e:
    print(f"Error saving model: {e}")
    # Manual save as fallback: write the model and tokenizer directly
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print("✅ Model saved manually!")

# Clean up GPU memory
if torch.cuda.is_available():
    # Collect garbage first so tensors that are no longer referenced are
    # released, then return the freed cached blocks to the driver.
    gc.collect()
    torch.cuda.empty_cache()
    print("GPU memory cleaned up")

## 9. Test the Trained Model

In [None]:
# Smoke-test the saved checkpoint by generating from a few prompts
print("🧪 Testing UMPF Equivalency Generation...")

try:
    # Build a text-generation pipeline from the directory written above
    on_gpu = torch.cuda.is_available()
    generator = pipeline(
        "text-generation",
        model=output_dir,
        tokenizer=output_dir,
        device=0 if on_gpu else -1,
        torch_dtype=torch.float16 if on_gpu else torch.float32
    )

    # Prompts to try
    test_prompts = [
        "Generate an equivalency pair for atomic-level uncertainty patterns.",
        "Generate an equivalency pair for database transactions and musical improvisation.",
    ]

    system_msg = "You are a Universal Pattern Recognition Engine trained on the Leibniz I-Ching Indra's Net Conjecture."

    for i, prompt in enumerate(test_prompts, 1):
        print(f"\n=== Test {i}: {prompt} ===")

        # The prompt must use the same text format the training data used;
        # the marker is the text that precedes the assistant's answer.
        if USE_SFT:
            full_prompt = f"<|system|>{system_msg}<|endoftext|><|user|>{prompt}<|endoftext|><|assistant|>"
            answer_marker = "<|assistant|>"
        else:
            full_prompt = f"Human: {prompt}\n\nAssistant:"
            answer_marker = "Assistant:"

        try:
            # Allow 200 tokens beyond the length of the prompt itself
            prompt_token_count = len(tokenizer.encode(full_prompt))
            response = generator(
                full_prompt,
                max_length=prompt_token_count + 200,  # Dynamic length
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

            generated_text = response[0]["generated_text"]

            # Keep only what follows the last assistant marker
            assistant_response = generated_text.split(answer_marker)[-1].strip()

            # Print at most 300 characters, with an ellipsis when cut
            if len(assistant_response) > 300:
                print(assistant_response[:300] + "...")
            else:
                print(assistant_response)

        except Exception as e:
            print(f"Error generating response: {e}")

    print("\n🎉 UMPF Equivalency Pattern Recognition Engine is ready!")
    print("The model can now automatically discover computational equivalencies!")

except Exception as e:
    print(f"Error during testing: {e}")
    print("Model training completed but testing failed")

## 10. Training Summary

In [None]:
# Report the trainer that actually ran. USE_SFT only records whether
# SFTTrainer could be imported; when SFTTrainer fails the notebook falls back
# to the standard Trainer, so the class of `trainer` is the reliable source.
training_method = 'SFTTrainer' if type(trainer).__name__ == 'SFTTrainer' else 'Standard Trainer'

print("\n📈 UMPF Training Summary:")
print("=" * 50)
print(f"✓ Model: {model_name}")
print(f"✓ Training examples: {len(training_examples)}")
print(f"✓ Training method: {training_method}")
print(f"✓ Epochs: {training_args.num_train_epochs}")
print(f"✓ Batch size: {training_args.per_device_train_batch_size}")
print(f"✓ Learning rate: {training_args.learning_rate}")
print(f"✓ Model saved to: {output_dir}")

print("\n🎯 Next Steps:")
print("  → Model is ready for equivalency pattern generation")
print("  → Can identify universal computational patterns across domains")
print("  → Implements the Leibniz I-Ching Indra's Net Conjecture")
print("  → Transforms research time from months to minutes!")

print("\n🔬 Scientific Impact:")
print("  • Automated discovery of cross-domain equivalencies")
print("  • Pattern-based hypothesis generation")
print("  • Universal computational structure recognition")
print("  • Monadic algebra for scientific automation")