# 🎓 BigAcademy + Argilla Integration

This notebook demonstrates the complete workflow for:
1. Generating datasets with BigAcademy
2. Uploading to Argilla for human review
3. Downloading enhanced datasets
4. Analyzing annotation quality

## 🔧 Setup Environment

In [None]:
import json
import os
import subprocess
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Add BigAcademy to path (must happen before the bigacademy imports below)
sys.path.append('/home/jovyan/work')

# Import BigAcademy components
from bigacademy.core.agent_profiles import AgentProfileManager
from bigacademy.core.graph_db import GraphDB
from bigacademy.generators.prompt_templates import PromptTemplateManager
from bigacademy.generators.dataset_generator import DatasetGenerator

print("✅ BigAcademy components imported successfully")

## 📊 1. Generate Sample Dataset

In [None]:
# Build the agent-profile and prompt-template managers from their config directories
agent_manager = AgentProfileManager(Path("/home/jovyan/work/configs/agents"))
template_manager = PromptTemplateManager(Path("/home/jovyan/work/configs/templates"))

# The generator is only created when a graph database file already exists on disk;
# otherwise `dataset_generator` stays undefined and the user is told what to do.
db_path = Path("/home/jovyan/work/test_data/fastapi_minimal_architect.db")
if not db_path.exists():
    print("⚠️  No existing graph database found. Run knowledge extraction first.")
else:
    graph_db = GraphDB(db_path)
    dataset_generator = DatasetGenerator(graph_db, template_manager)

    # Profile of the agent the dataset will be generated for
    architect = agent_manager.get_profile("solution_architect")

    print(f"🤖 Loaded agent: {architect.name}")
    print(f"📚 Graph database: {db_path}")

In [None]:
# Generate a small demonstration dataset and write it to disk as JSONL
if 'dataset_generator' not in locals():
    # The previous cell did not create a generator (no graph database found)
    print("❌ Dataset generator not available")
else:
    print("🎯 Generating sample dataset...")

    # Generation settings collected in one place so they are easy to tweak
    generation_options = dict(
        agent_profile=architect,
        template_types=["question_answer", "code_review"],
        max_samples_per_template=3,
        min_relevance_score=0.15,
    )
    dataset_batches = dataset_generator.generate_agent_dataset(**generation_options)

    # Persist every batch; the returned paths are reused by the upload cell
    saved_files = dataset_generator.save_dataset_batches(dataset_batches, format="jsonl")

    print(f"✅ Generated {len(dataset_batches)} batches")
    print(f"📁 Saved files: {[f.name for f in saved_files]}")

## 🌐 2. Upload to Argilla

In [None]:
# Upload each generated dataset file to Argilla for human review.
# The upload script is run once per file in a child process; a failure for one
# file is reported and the loop continues with the next file.
if 'saved_files' in locals() and saved_files:
    for file_path in saved_files:
        print(f"📤 Uploading {file_path.name} to Argilla...")

        try:
            # sys.executable is the interpreter running this kernel, so the script
            # sees the same installed packages as the notebook. A bare "python" is
            # resolved through PATH and may point at a different environment.
            result = subprocess.run(
                [
                    sys.executable, "/home/jovyan/work/scripts/upload_to_argilla.py",
                    str(file_path),
                    "--overwrite",
                ],
                capture_output=True,
                text=True,
                cwd="/home/jovyan/work",
            )
        except Exception as e:
            # Raised when the child process could not be started at all
            print(f"❌ Error uploading {file_path.name}: {e}")
        else:
            if result.returncode == 0:
                print(f"✅ Upload successful: {file_path.name}")
            else:
                # Fall back to stdout when the script wrote nothing to stderr,
                # so the failure message is never empty.
                print(f"❌ Upload failed: {result.stderr or result.stdout}")
else:
    print("⚠️  No dataset files to upload")

## 🎨 3. Argilla UI Instructions

Now you can:

1. **Open Argilla UI**: http://localhost:6900
2. **Login**: admin / bigacademy123
3. **Review datasets** in the BigAcademy workspace
4. **Rate samples** (excellent, good, fair, poor, terrible)
5. **Edit responses** to improve quality
6. **Add annotations** and feedback

After reviewing, come back to download the enhanced dataset!

## 📥 4. Download Enhanced Dataset

In [None]:
# Download the reviewed ("enhanced") dataset from Argilla by running the
# download script in a child process and echoing its output.
dataset_name = "solution_architect_question_answer"  # Adjust based on your upload

print(f"📥 Downloading enhanced dataset: {dataset_name}")

try:
    # sys.executable is the interpreter running this kernel, so the script sees
    # the same installed packages as the notebook. A bare "python" is resolved
    # through PATH and may point at a different environment.
    result = subprocess.run(
        [
            sys.executable, "/home/jovyan/work/scripts/download_from_argilla.py",
            dataset_name,
            "--analyze",
            "--format", "jsonl",
        ],
        capture_output=True,
        text=True,
        cwd="/home/jovyan/work",
    )
except Exception as e:
    # Raised when the child process could not be started at all
    print(f"❌ Error downloading: {e}")
else:
    if result.returncode == 0:
        print("✅ Download successful")
        print(result.stdout)
    else:
        # Fall back to stdout when the script wrote nothing to stderr,
        # so the failure message is never empty.
        print(f"❌ Download failed: {result.stderr or result.stdout}")

## 📊 5. Analyze Enhanced Dataset

In [None]:
# Load the most recently modified enhanced dataset and summarise its annotations.

# Reset the results on every run so that re-running this cell when no file is
# found cannot leave values from an earlier run behind for the plotting cell.
enhanced_samples = []
quality_scores = []
annotations = []

enhanced_files = list(Path("/home/jovyan/work/datasets").glob("enhanced_*.jsonl"))

if enhanced_files:
    # Pick the file with the newest modification time
    latest_file = max(enhanced_files, key=lambda f: f.stat().st_mtime)
    print(f"📊 Analyzing: {latest_file.name}")

    # One JSON document per line. Blank lines (e.g. a trailing newline at the end
    # of the file) are skipped instead of crashing json.loads. The encoding is
    # explicit so the result does not depend on the platform default.
    with open(latest_file, 'r', encoding='utf-8') as f:
        enhanced_samples = [json.loads(line) for line in f if line.strip()]

    print(f"📈 Loaded {len(enhanced_samples)} enhanced samples")

    # Collect the per-sample review fields stored under "metadata"
    for sample in enhanced_samples:
        # `or {}` also covers an explicit `"metadata": null` in the file
        metadata = sample.get('metadata') or {}
        quality_score = metadata.get('quality_score')
        annotation = metadata.get('human_annotation')

        # Truthiness check: missing, null and zero/empty values are all skipped.
        # NOTE(review): assumes scores are numeric — confirm against the download script.
        if quality_score:
            quality_scores.append(quality_score)
        if annotation:
            annotations.append(annotation)

    print(f"🎯 Annotated samples: {len(annotations)}")
    if quality_scores:
        print(f"📊 Average quality: {sum(quality_scores)/len(quality_scores):.2f}/5")

else:
    print("⚠️  No enhanced datasets found")

In [None]:
# Draw the annotation breakdown and, when scores exist, their histogram side by side
if 'annotations' not in locals() or not annotations:
    print("📊 No annotation data to visualize")
else:
    fig = plt.figure(figsize=(10, 6))

    # Left panel: share of each annotation value as a pie chart
    annotation_counts = pd.Series(annotations).value_counts()
    pie_ax = fig.add_subplot(1, 2, 1)
    pie_ax.pie(annotation_counts.values, labels=annotation_counts.index, autopct='%1.1f%%')
    pie_ax.set_title('Quality Distribution')

    # Right panel: only added when at least one quality score was collected
    if quality_scores:
        hist_ax = fig.add_subplot(1, 2, 2)
        hist_ax.hist(quality_scores, bins=5, edgecolor='black', alpha=0.7)
        hist_ax.set_xlabel('Quality Score')
        hist_ax.set_ylabel('Count')
        hist_ax.set_title('Quality Score Distribution')
        hist_ax.set_xticks(range(1, 6))

    fig.tight_layout()
    plt.show()

## 🚀 6. Next Steps

Your enhanced dataset is ready for training! 

**With BigTune:**
```bash
bigtune train --dataset="enhanced_solution_architect_*.jsonl" \
              --base-model="llama-3.1-8b" \
              --technique="lora"
```

**Quality Insights:**
- Use annotation analysis to improve BigAcademy templates
- Focus on high-quality knowledge sources
- Iterate on prompt engineering based on human feedback