In [None]:
# Cell 1: Configuration Variables
# ============================================
# CHANGE THESE VARIABLES TO CUSTOMIZE OUTPUT
# ============================================

# Subject of the generated article. Examples: "TensorFlow", "OpenMP", "MATLAB", "Conda", "Git"
TOPIC = "ParallelHDF5"
# How many variations to generate (1-10 recommended)
RUNS = 5
# Chat model passed to the API, e.g. 'gpt-4', 'gpt-3.5-turbo'
MODEL = 'gpt-4o-mini'
# Sampling temperature (0.0 = deterministic, 1.0 = very creative)
TEMPERATURE = 0.2

# Query template - "{topic}" is filled in with TOPIC; modify for different phrasing
QUERY_TEMPLATE = (
    "Create a knowledge base article with regards to using {topic} on the FASRC cluster, "
    "using the tone of graduate level Academic Computing documentation."
)

# Alternative query templates you can use:
# QUERY_TEMPLATE = "Generate HTML documentation for {topic} following academic computing standards"
# QUERY_TEMPLATE = "Write a technical reference page for {topic} on HPC clusters"
# QUERY_TEMPLATE = "Create a comprehensive guide for using {topic} in a research computing environment"

# Echo the active settings so the cell output records what was used
print("📌 Configuration set:")
for _label, _value in (
    ("Topic", TOPIC),
    ("Runs", RUNS),
    ("Model", MODEL),
    ("Temperature", TEMPERATURE),
):
    print(f"   {_label}: {_value}")



📌 Configuration set:
   Topic: ParallelHDF5
   Runs: 5
   Model: gpt-4o-mini
   Temperature: 0.7


In [64]:
# Cell 2: Install Dependencies (run once)
# ============================================
import subprocess
import sys

def install_dependencies():
    """Run ``pip install`` for every package this notebook needs.

    pip is invoked through the current interpreter (``sys.executable -m pip``),
    one package per call. ``subprocess.check_call`` raises
    ``CalledProcessError`` as soon as one install exits with a non-zero status.
    """
    pip_install = [sys.executable, '-m', 'pip', 'install']
    for requirement in ('openai>=1.0.0', 'pyyaml', 'python-dotenv'):
        subprocess.check_call(pip_install + [requirement])

# Uncomment to install
# install_dependencies()


In [75]:
# Cell 3: Import Libraries and Load Configuration
# ============================================
import os
import re
import glob
import yaml
from pathlib import Path
from dotenv import load_dotenv
from openai import OpenAI
from typing import List, Dict, Optional
from datetime import datetime


# Read variables from a local .env file into the process environment
load_dotenv()

# Report whether the OpenAI credential is available
if os.getenv('OPENAI_API_KEY'):
    print("✅ OpenAI API key loaded successfully")
else:
    print("⚠️  Warning: OPENAI_API_KEY not found in environment variables!")
    print("   Please create a .env file with: OPENAI_API_KEY=your-key-here")


✅ OpenAI API key loaded successfully


The `DocumentationGenerator` class handles the core interaction with the OpenAI API.

In [66]:
# Cell 4: Define the DocumentationGenerator Class
# ============================================
class DocumentationGenerator:
    """Generate HTML knowledge-base articles through the OpenAI chat API.

    Each request is assembled from a system prompt (built from a YAML
    configuration), few-shot example messages loaded from ``examples_dir``,
    and the user's query. Every response is written to ``output/`` as an
    HTML file.
    """

    def __init__(self, prompt_yaml_path: str = 'prompt.yaml', examples_dir: str = 'examples/'):
        """Initialize the documentation generator with configuration.

        Args:
            prompt_yaml_path: YAML file holding the prompt configuration.
            examples_dir: Directory scanned for ``*.yaml`` and ``*.html``
                few-shot examples (created if it does not exist).
        """
        self.client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
        self.examples_dir = Path(examples_dir)
        self.prompt_config = self._load_prompt_config(prompt_yaml_path)
        self.examples = self._load_examples()

    @staticmethod
    def _default_prompt_config() -> dict:
        """Return a fresh copy of the fallback prompt configuration."""
        return {
            'system_prompt': 'You are a technical documentation expert.',
            'documentation_structure': ['Description', 'Installation', 'Usage', 'Examples', 'References']
        }

    @staticmethod
    def _is_valid_message(message) -> bool:
        """Return True when ``message`` is a dict carrying 'role' and 'content'."""
        return isinstance(message, dict) and 'role' in message and 'content' in message

    @staticmethod
    def _slugify(topic: str) -> str:
        """Turn a topic into a lower-case token that is safe inside a filename."""
        slug = re.sub(r'[^\w.+#-]+', '-', topic.strip().lower()).strip('-')
        return slug or 'documentation'

    def _load_prompt_config(self, path: str) -> dict:
        """Load the prompt configuration from YAML file.

        Falls back to the default configuration when the file is missing or
        does not contain a YAML mapping (an empty file parses to ``None``,
        which would otherwise break every later ``.get`` on the config).
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                config = yaml.safe_load(f)
        except FileNotFoundError:
            print(f"Warning: {path} not found. Using default configuration.")
            return self._default_prompt_config()

        if not isinstance(config, dict):
            print(f"Warning: {path} does not contain a YAML mapping. Using default configuration.")
            return self._default_prompt_config()
        return config

    def _load_examples(self) -> List[Dict[str, str]]:
        """Load few-shot examples from YAML and HTML files.

        YAML files may hold a single message or a list of messages; entries
        that are not dicts with 'role' and 'content' are skipped with a
        warning. Each HTML file becomes one assistant message. Files that
        cannot be read or parsed are reported and skipped.
        """
        examples = []

        # Ensure examples directory exists (including missing parents)
        self.examples_dir.mkdir(parents=True, exist_ok=True)

        # Load YAML examples
        for yaml_file in sorted(self.examples_dir.glob('*.yaml')):
            try:
                with open(yaml_file, 'r', encoding='utf-8') as f:
                    msgs = yaml.safe_load(f)
            except Exception as e:
                print(f"Error loading {yaml_file}: {e}")
                continue

            candidates = msgs if isinstance(msgs, list) else [msgs]
            for message in candidates:
                if self._is_valid_message(message):
                    examples.append(message)
                else:
                    # e.g. an empty YAML file parses to None
                    print(f"Warning: skipping entry without 'role'/'content' in {yaml_file}")

        # Load HTML examples as assistant messages showing the expected format
        for html_file in sorted(self.examples_dir.glob('*.html')):
            try:
                with open(html_file, 'r', encoding='utf-8') as f:
                    content = f.read()
            except Exception as e:
                print(f"Error loading {html_file}: {e}")
                continue
            examples.append({
                'role': 'assistant',
                'content': content,
                'metadata': {'filename': html_file.name}
            })

        return examples

    def _extract_topic_from_query(self, query: str) -> str:
        """Extract the main topic from the query for filename generation.

        The topic token may contain '.', '+', '#' and '-' after its first
        character so names such as ``scikit-learn`` or ``C++`` are kept whole;
        trailing '.'/'-' (sentence punctuation) is dropped. The result is
        lower-cased.
        """
        token = r'(\w[\w.+#-]*)'
        patterns = [
            rf'documentation for {token}',
            rf'using {token}',
            rf'about {token}',
            rf'for {token} documentation',
        ]

        for pattern in patterns:
            match = re.search(pattern, query, re.IGNORECASE)
            if match:
                return match.group(1).rstrip('.-').lower()

        # Fallback: use first significant word
        for word in query.split():
            if len(word) > 3 and word.lower() not in ['create', 'make', 'generate', 'write']:
                return word.lower()

        return 'documentation'

    def _build_system_prompt(self) -> str:
        """Build the system prompt from configuration.

        Starts from ``system_prompt`` (or a built-in default), then appends
        the ``documentation_structure`` list and the ``terms`` mapping when
        they are present.
        """
        base_prompt = self.prompt_config.get(
            'system_prompt',
            'You are a technical documentation expert creating HTML knowledge base articles.')

        # Add structure information if available
        if 'documentation_structure' in self.prompt_config:
            structure = self.prompt_config['documentation_structure']
            base_prompt += "\n\nEach article should follow this structure:\n"
            base_prompt += "\n".join(f"- {section}" for section in structure)

        # Add any terms/definitions; anything that is not a mapping is ignored
        terms = self.prompt_config.get('terms')
        if isinstance(terms, dict):
            base_prompt += "\n\nKey terms:\n"
            for term, definition in terms.items():
                base_prompt += f"- {term}: {definition}\n"

        return base_prompt

    def generate_documentation(self, query: str, runs: int = 5,
                             model: str = 'gpt-4',
                             temperature: float = 0.7,
                             topic: Optional[str] = None) -> List[str]:
        """Generate multiple documentation pages based on the query.

        Args:
            query: User prompt sent to the model.
            runs: Number of independent completions to request.
            model: Chat model name.
            temperature: Sampling temperature.
            topic: Optional explicit topic for the output filenames. When
                omitted, the topic is extracted from ``query``.

        Returns:
            Paths of the files written. A run that fails is reported on
            stdout and skipped, so the list may be shorter than ``runs``.
        """
        topic = self._slugify(topic if topic else self._extract_topic_from_query(query))
        generated_files = []

        # The request messages are identical for every run, so build them once.
        messages = [{'role': 'system', 'content': self._build_system_prompt()}]
        for example in self.examples:
            if self._is_valid_message(example):
                # Only pass role and content; metadata is not part of the API schema
                messages.append({'role': example['role'], 'content': example['content']})
        messages.append({'role': 'user', 'content': query})

        # Filename parts: model and temperature with '-' / '.' removed
        model_name = model.replace('-', '').replace('.', '')
        temp_str = str(temperature).replace('.', '')
        output_dir = Path('output')

        for i in range(runs):
            try:
                response = self.client.chat.completions.create(
                    model=model,
                    messages=messages,
                    temperature=temperature
                )

                content = response.choices[0].message.content
                if content is None:
                    raise ValueError("model returned no content")
                content = content.strip()

                if runs == 1:
                    filename = f'{topic}_{model_name}_temp{temp_str}.html'
                else:
                    filename = f'{topic}_{model_name}_temp{temp_str}_v{i+1}.html'

                # Save the response
                output_dir.mkdir(exist_ok=True)
                filepath = output_dir / filename
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(content)

                generated_files.append(str(filepath))
                print(f"✓ Generated: {filepath}")

            except Exception as e:
                # Best effort: report the failed run and continue with the next one
                print(f"✗ Error generating documentation (run {i+1}): {e}")

        return generated_files


The few-shot examples were pulled from the User Docs website.

In [67]:
# Cell 5: Initialize Generator
# ============================================
print("🔧 Initializing documentation generator...")

try:
    generator = DocumentationGenerator(
        prompt_yaml_path='prompt.yaml',
        examples_dir='examples/'
    )
except Exception as e:
    print(f"❌ Error initializing generator: {e}")
    # Re-raise so the cell fails visibly. Swallowing the error would leave
    # `generator` undefined and later cells would fail with a NameError.
    raise
else:
    print("✅ Generator initialized successfully")
    print(f"📁 Found {len(generator.examples)} examples")

🔧 Initializing documentation generator...
✅ Generator initialized successfully
📁 Found 4 examples


In [68]:
# Cell 6: Generate Documentation for Single Topic
# ============================================
# This cell uses the TOPIC variable defined in Cell 1

# Fill the query template with the configured topic
query = QUERY_TEMPLATE.format(topic=TOPIC)

divider = '=' * 60

print(f"\n{divider}")
print(f"📝 Generating documentation for: {TOPIC}")
print(f"📋 Query: {query}")
print(f"🔄 Generating {RUNS} variations...")
print(f"{divider}\n")

# Wall-clock timing around the generation call
start_time = datetime.now()
files = generator.generate_documentation(
    query=query, runs=RUNS, model=MODEL, temperature=TEMPERATURE
)
elapsed = (datetime.now() - start_time).total_seconds()

# Summary of what was written
print(f"\n{divider}")
print("✅ Generation complete!")
print(f"⏱️  Time taken: {elapsed:.2f} seconds")
print(f"📁 Generated {len(files)} files:")
for generated_path in files:
    print(f"   - {generated_path}")
print(divider)



📝 Generating documentation for: ParallelHDF5
📋 Query: Create a knowledge base article with regards to using ParallelHDF5 on the FASRC cluster, using the tone of graduate level Academic Computing documentation.
🔄 Generating 5 variations...

✓ Generated: output/parallelhdf5_gpt4omini_temp07_v1.html
✓ Generated: output/parallelhdf5_gpt4omini_temp07_v2.html
✓ Generated: output/parallelhdf5_gpt4omini_temp07_v3.html
✓ Generated: output/parallelhdf5_gpt4omini_temp07_v4.html
✓ Generated: output/parallelhdf5_gpt4omini_temp07_v5.html

✅ Generation complete!
⏱️  Time taken: 101.67 seconds
📁 Generated 5 files:
   - output/parallelhdf5_gpt4omini_temp07_v1.html
   - output/parallelhdf5_gpt4omini_temp07_v2.html
   - output/parallelhdf5_gpt4omini_temp07_v3.html
   - output/parallelhdf5_gpt4omini_temp07_v4.html
   - output/parallelhdf5_gpt4omini_temp07_v5.html


In [None]:
# Cell 7: Batch Generation for Multiple Topics (Optional)
# ============================================
# Uncomment and run this cell to generate docs for multiple topics at once

# TOPICS_LIST = ["PyTorch", "TensorFlow", "OpenMP", "MATLAB", "Conda"]
# 
# for topic in TOPICS_LIST:
#     query = QUERY_TEMPLATE.format(topic=topic)
#     print(f"\n📝 Generating documentation for: {topic}")
#     
#     files = generator.generate_documentation(
#         query=query,
#         runs=1,  # Just one version per topic for batch
#         model=MODEL,
#         temperature=TEMPERATURE
#     )
#     
#     print(f"✅ Generated: {', '.join(files)}")


In [None]:
# Cell 8: Preview Generated Files (Optional)
# ============================================
# This cell lets you preview the generated HTML files

from IPython.display import HTML, display
import os

# Get the most recently generated file for the current topic
output_dir = Path('output')
if output_dir.exists():
    html_files = sorted(output_dir.glob(f'{TOPIC.lower()}*.html'))
    if html_files:
        # Choose by modification time: the last name in sorted order is not
        # necessarily the newest file (e.g. after runs with another model
        # or temperature).
        latest_file = max(html_files, key=lambda path: path.stat().st_mtime)
        print(f"📄 Previewing: {latest_file.name}")
        print("="*60)

        # The generator writes its files as UTF-8, so read them back the same way
        with open(latest_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Show first 1000 characters
        print(content[:1000] + "..." if len(content) > 1000 else content)

        # Optionally display as rendered HTML (uncomment if in Jupyter)
        # display(HTML(content))
    else:
        print(f"No files found for topic: {TOPIC}")
else:
    print("Output directory not found!")


In [None]:
# Cell 9: Compare Multiple Versions (Optional)
# ============================================
# This cell helps you compare different generated versions

def compare_versions(topic: str, output_dir: str = 'output'):
    """Print simple metrics for every generated version of a topic.

    Args:
        topic: Topic name; files matching ``<topic lower-cased>_*.html`` are
            compared.
        output_dir: Directory holding the generated files.

    For each file the word count, line count, whether the text mentions
    "example", and whether it contains a ``<code>``/``<pre>`` element are
    printed. Nothing is compared when fewer than two files match.
    """
    files = sorted(Path(output_dir).glob(f'{topic.lower()}_*.html'))

    if len(files) < 2:
        print("Need at least 2 versions to compare")
        return

    print(f"📊 Comparing {len(files)} versions of {topic} documentation:\n")

    for i, file in enumerate(files, 1):
        # Generated files are written as UTF-8
        with open(file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Extract some metrics
        word_count = len(content.split())
        line_count = len(content.splitlines())
        has_examples = 'example' in content.lower()
        # Match the tag name only, so tags with attributes (<pre class="...">) count too
        has_code_blocks = re.search(r'<(code|pre)\b', content, re.IGNORECASE) is not None

        print(f"Version {i} ({file.name}):")
        print(f"  - Words: {word_count}")
        print(f"  - Lines: {line_count}")
        print(f"  - Has examples: {'Yes' if has_examples else 'No'}")
        print(f"  - Has code blocks: {'Yes' if has_code_blocks else 'No'}")
        print()

# Run the comparison for the topic configured in Cell 1
compare_versions(TOPIC)


In [None]:
# Cell 10: HTML Parser and Section Extractor
# ============================================
from bs4 import BeautifulSoup
import re
from typing import Dict, List, Tuple
import difflib

class DocumentAnalyzer:
    """Analyze and extract sections from HTML documentation."""

    def __init__(self, section_headers: Optional[List[str]] = None):
        """Store the section names to look for.

        Args:
            section_headers: Section names matched against header text.
                ``None`` (or an empty list) selects the default five sections.
        """
        self.section_headers = section_headers or [
            'Description', 'Installation', 'Usage', 'Examples', 'References'
        ]

    def extract_sections(self, html_content: str) -> Dict[str, str]:
        """Extract sections from HTML content based on headers.

        A header (h1-h4) belongs to the first configured section whose name
        appears, case-insensitively, in the header text. The section content
        is the HTML of the header's following siblings up to the next h1-h4
        of any level, so a sub-heading ends the section. When several headers
        match the same section name, the last one wins.

        Returns:
            Mapping of section name to the section's HTML.
        """
        header_tags = ['h1', 'h2', 'h3', 'h4']
        soup = BeautifulSoup(html_content, 'html.parser')
        sections = {}

        for header in soup.find_all(header_tags):
            header_text = header.get_text().strip()

            # Check if this header matches any of our target sections
            for section_name in self.section_headers:
                if section_name.lower() in header_text.lower():
                    # Collect all siblings until the next header
                    content_parts = []
                    for sibling in header.find_next_siblings():
                        if sibling.name in header_tags:
                            break
                        content_parts.append(str(sibling))

                    sections[section_name] = '\n'.join(content_parts)
                    break

        return sections

    def calculate_section_score(self, section_content: str, section_name: str) -> float:
        """Calculate a heuristic quality score (0-100) for a section.

        The score adds up points for length, code blocks, lists, links,
        inline formatting, and average sentence length. Empty content
        scores 0.0.
        """
        if not section_content:
            return 0.0

        soup = BeautifulSoup(section_content, 'html.parser')
        text = soup.get_text().strip()

        score = 0.0

        # Length (not too short, not too long)
        word_count = len(text.split())
        if section_name == "Description":
            ideal_length = 150
            score += max(0, 1 - abs(word_count - ideal_length) / ideal_length) * 20
        else:
            score += min(word_count / 100, 1) * 20  # Longer is generally better for other sections

        # Code examples (for Installation, Usage, Examples)
        if section_name in ["Installation", "Usage", "Examples"]:
            code_blocks = soup.find_all(['code', 'pre'])
            score += min(len(code_blocks) * 10, 30)

        # Lists (good for organization)
        lists = soup.find_all(['ul', 'ol'])
        score += min(len(lists) * 5, 15)

        # Links (weighted more heavily for References)
        links = soup.find_all('a')
        if section_name == "References":
            score += min(len(links) * 10, 30)
        else:
            score += min(len(links) * 2, 10)

        # Formatting variety (bold, italic, etc.)
        formatting_tags = soup.find_all(['strong', 'em', 'b', 'i'])
        score += min(len(formatting_tags) * 2, 10)

        # Clarity (sentences not too long). Blank fragments, such as the one
        # after a final '.', are dropped so they do not drag the average down.
        sentences = [s for s in text.split('.') if s.strip()]
        avg_sentence_length = sum(len(s.split()) for s in sentences) / max(len(sentences), 1)
        if 10 <= avg_sentence_length <= 25:
            score += 15

        return min(score, 100)  # Cap at 100

# Initialize the analyzer with its default section list; later cells
# read this module-level `analyzer`
analyzer = DocumentAnalyzer()
print("✅ Document analyzer initialized")


In [None]:
# Cell 11: Load and Analyze All Versions ( Arbitrary )
# ============================================
%pip install pandas 

from pathlib import Path
import pandas as pd

def load_and_analyze_versions(topic: str, model: str, temperature: str, num_versions: int = 5):
    """Read the generated files for one topic and split each into sections.

    Filenames are ``<topic>_<model>_temp<temperature>.html`` when
    ``num_versions`` is 1, otherwise the same stem with a ``_v<N>`` suffix.
    Missing files are reported and skipped.

    Returns:
        Dict keyed by ``'Version N'`` holding the sections returned by
        ``analyzer.extract_sections`` for that file.
    """
    stem = f'{topic}_{model}_temp{temperature}'
    single_run = num_versions == 1
    collected = {}

    for version in range(1, num_versions + 1):
        filename = f'{stem}.html' if single_run else f'{stem}_v{version}.html'
        filepath = Path('output') / filename

        if filepath.exists():
            with open(filepath, 'r', encoding='utf-8') as f:
                html = f.read()

            found = analyzer.extract_sections(html)
            collected[f'Version {version}'] = found

            print(f"✅ Loaded Version {version}: {filename}")
            print(f"   Found sections: {', '.join(found.keys())}")
        else:
            print(f"⚠️  File not found: {filepath}")

    return collected

# Load all versions. The filename parts are rebuilt the same way
# DocumentationGenerator.generate_documentation builds them: lower-cased
# topic, and model / temperature with '-' and '.' removed.
all_sections = load_and_analyze_versions(
    topic=TOPIC.lower(),
    model=MODEL.replace('-', '').replace('.', ''),
    temperature=str(TEMPERATURE).replace('.', ''),
    num_versions=RUNS
)

# Versions whose file was missing are not included in this count
print(f"\n📊 Loaded {len(all_sections)} versions for analysis")


In [None]:
# Cell 12: Score and Compare Sections ( Arbitrary )
# ============================================
import pandas as pd

def analyze_sections(all_sections: Dict[str, Dict[str, str]]) -> pd.DataFrame:
    """Score every configured section in every version.

    Returns a DataFrame with one row per section name in
    ``analyzer.section_headers``: a 'Section' column, one score column per
    version (0 when that version lacks the section), and the
    'Best Version' / 'Best Score' pair.
    """
    rows = []

    for section_name in analyzer.section_headers:
        # Score per version; a version without this section scores 0
        per_version = {
            version: (
                analyzer.calculate_section_score(sections[section_name], section_name)
                if section_name in sections
                else 0
            )
            for version, sections in all_sections.items()
        }

        # Highest-scoring version for this section (first one wins ties)
        if per_version:
            best_version = max(per_version, key=per_version.get)
            best_score = per_version[best_version]
        else:
            best_version, best_score = "N/A", 0

        row = {'Section': section_name}
        row.update(per_version)
        row['Best Version'] = best_version
        row['Best Score'] = best_score
        rows.append(row)

    return pd.DataFrame(rows)

# Analyze all sections; `scores_df` has one row per section and is
# read by later cells
scores_df = analyze_sections(all_sections)

# Print the full score table as plain text
print("📊 Section Analysis Results:")
print("="*80)
print(scores_df.to_string(index=False))
print("="*80)

# Show summary of best versions
print("\n🏆 Best Version for Each Section:")
for _, row in scores_df.iterrows():
    print(f"   {row['Section']}: {row['Best Version']} (Score: {row['Best Score']:.1f})")

In [83]:
# Cell 14: Compile Best-of Document
# ============================================
def compile_best_document(all_sections: Dict[str, Dict[str, str]], 
                         scores_data: pd.DataFrame,
                         manual_overrides: Optional[Dict[str, str]] = None) -> str:
    """Compile the best sections into a final HTML document.

    Args:
        all_sections: Mapping of version label to that version's
            ``{section name: section HTML}``.
        scores_data: Score table with one row per section and the columns
            'Section', 'Best Version', 'Best Score' plus one score column per
            version (the frame returned by ``analyze_sections``).
        manual_overrides: Optional ``{section name: version label}`` that
            replaces the automatically selected version for those sections.

    Returns:
        The complete HTML page as a string. The page title and metadata use
        the module-level TOPIC, MODEL and TEMPERATURE settings.
    """
    # Allow manual overrides if needed
    manual_overrides = manual_overrides or {}

    # Start building the final HTML
    html_parts = [
        '<!DOCTYPE html>',
        '<html lang="en">',
        '<head>',
        '<meta charset="UTF-8">',
        '<meta name="viewport" content="width=device-width, initial-scale=1.0">',
        f'<title>{TOPIC} Documentation - Best Compilation</title>',
        '<style>',
        'body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; ',
        '       line-height: 1.6; max-width: 900px; margin: 0 auto; padding: 20px; }',
        'h1, h2, h3 { color: #2c3e50; }',
        'code { background-color: #f4f4f4; padding: 2px 4px; border-radius: 3px; }',
        'pre { background-color: #f4f4f4; padding: 15px; border-radius: 5px; overflow-x: auto; }',
        '.metadata { background-color: #e8f4f8; padding: 15px; border-radius: 5px; ',
        '            margin-bottom: 30px; font-size: 0.9em; }',
        '.section { margin-bottom: 40px; }',
        '.version-note { color: #7f8c8d; font-size: 0.85em; font-style: italic; }',
        '</style>',
        '</head>',
        '<body>',
        f'<h1>{TOPIC} Documentation</h1>',
        '<div class="metadata">',
        f'<strong>Compiled from best sections across {len(all_sections)} versions</strong><br>',
        f'Generated using: {MODEL} (Temperature: {TEMPERATURE})<br>',
        f'Compilation date: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}',
        '</div>'
    ]

    # Add each section. The rows come from the `scores_data` argument, not
    # from a notebook global, so the caller decides which scores are used.
    for _, row in scores_data.iterrows():
        section_name = row['Section']
        best_version = manual_overrides.get(section_name, row['Best Version'])

        if best_version != "N/A" and best_version in all_sections:
            if section_name in all_sections[best_version]:
                html_parts.append('<div class="section">')
                html_parts.append(f'<h2>{section_name}</h2>')
                html_parts.append(f'<span class="version-note">From {best_version}</span>')
                html_parts.append(all_sections[best_version][section_name])
                html_parts.append('</div>')

    # Footer: which version each section was taken from
    html_parts.extend([
        '<div class="metadata" style="margin-top: 50px;">',
        '<strong>Section Sources:</strong><br>',
    ])

    for _, row in scores_data.iterrows():
        section_name = row['Section']
        best_version = manual_overrides.get(section_name, row['Best Version'])
        # Report the score of the version actually used; with a manual
        # override that is not necessarily the 'Best Score'.
        score = row[best_version] if best_version in row.index else row['Best Score']
        html_parts.append(f'{section_name}: {best_version} (Score: {score:.1f})<br>')

    # Close HTML
    html_parts.extend([
        '</div>',
        '</body>',
        '</html>'
    ])

    return '\n'.join(html_parts)



In [None]:
# Cell 15: Manual Override Option (if needed)
# ============================================
# If you disagree with the automatic selection, you can manually override

# Example: Force specific versions for certain sections
# manual_overrides = {
#     "Description": "Version 2",  # Use Version 2's description instead
#     "Examples": "Version 4"      # Use Version 4's examples instead
# }
# 
# # Recompile with overrides
# best_document_html = compile_best_document(all_sections, scores_data, manual_overrides)
# output_path = Path('output') / f'{TOPIC.lower()}_best_compilation_manual.html'
# with open(output_path, 'w', encoding='utf-8') as f:
#     f.write(best_document_html)
# print(f"✅ Manual compilation saved to: {output_path}")


In [None]:
# Cell 16: Generate Comparison Report ( Arbitrary )
# ============================================
def generate_analysis_report():
    """Generate a detailed analysis report of the compilation process.

    Reads the module-level ``scores_df`` (score table) and ``all_sections``
    (loaded versions) together with TOPIC, MODEL and TEMPERATURE.

    Returns:
        The report as a Markdown string: a score table, the version with the
        highest total score, sections whose scores differ by more than 20
        points between versions, and fixed recommendations.
    """
    # One dict per section row. The score table is taken from `scores_df`;
    # no visible cell defines a module-level `scores_data`.
    score_rows = scores_df.to_dict('records')
    versions = list(all_sections.keys())

    report = []
    report.append(f"# Documentation Analysis Report for {TOPIC}")
    report.append(f"\nGenerated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append(f"Model: {MODEL}, Temperature: {TEMPERATURE}")
    report.append(f"Analyzed {len(all_sections)} versions\n")

    report.append("## Section Scores\n")
    report.append("| Section | " + " | ".join(versions) + " | Best Version | Best Score |")
    report.append("|---------|" + "---|" * (len(versions) + 2))

    for row in score_rows:
        line = f"| {row['Section']} | "
        for version in versions:
            score = row.get(version, 0)
            line += f"{score:.1f} | "
        line += f"{row['Best Version']} | {row['Best Score']:.1f} |"
        report.append(line)

    report.append("\n## Key Findings\n")

    # Version with the highest total score across all sections
    version_totals = {
        version: sum(row.get(version, 0) for row in score_rows)
        for version in versions
    }

    if version_totals:
        best_overall = max(version_totals, key=version_totals.get)
        report.append(f"- **Best Overall Version**: {best_overall} (Total Score: {version_totals[best_overall]:.1f})")

    # Sections whose scores spread widely; "variance" here is max minus min
    report.append("\n### Section Quality Variance")
    for row in score_rows:
        version_scores = [row.get(v, 0) for v in versions]
        if version_scores:
            variance = max(version_scores) - min(version_scores)
            if variance > 20:
                report.append(f"- **{row['Section']}**: High variance ({variance:.1f} points) - quality varies significantly between versions")

    report.append("\n## Recommendations")
    report.append("- Review sections with high variance manually")
    report.append("- Consider regenerating sections with scores below 50")
    report.append("- The compiled document uses the best scoring section from each category")

    return '\n'.join(report)

# Generate and save the report as Markdown under output/
report_content = generate_analysis_report()
report_path = Path('output') / f'{TOPIC.lower()}_analysis_report.md'
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(report_content)

print(f"📄 Analysis report saved to: {report_path}")
print("\n" + "="*80)
print("ANALYSIS COMPLETE")
print("="*80)
# NOTE(review): this cell only writes the analysis report. The
# *_best_compilation.html and comparison.html files named below are
# presumably produced by other cells - confirm they exist before relying
# on these messages.
print(f"✅ Best compilation: output/{TOPIC.lower()}_best_compilation.html")
print(f"📊 Analysis report: {report_path}")
print(f"📈 View comparison: Open comparison.html in browser")

In [69]:
# Cell 17: GPT-Based Quality Evaluation
# ============================================
import json
from typing import Dict, List, Tuple
import asyncio
import time

class GPTQualityEvaluator:
    """Evaluate documentation quality using GPT for subjective metrics."""

    # Sections longer than this many characters are cut before being sent.
    MAX_SECTION_CHARS = 3000
    # Criteria evaluated when the caller does not name any.
    DEFAULT_CRITERIA = ('technical_accuracy', 'writing_style', 'completeness')

    def __init__(self, client, model='gpt-4'):
        """Store the OpenAI client and the model used for evaluation."""
        self.client = client
        self.model = model

    def create_evaluation_prompt(self, section_content: str, section_name: str, 
                               topic: str, criteria: str) -> str:
        """Create a prompt for evaluating specific quality criteria.

        Returns the prompt for ``criteria`` ('technical_accuracy',
        'writing_style' or 'completeness'), or an empty string when the
        criterion is unknown.
        """
        prompts = {
            "technical_accuracy": f"""
Evaluate the technical accuracy of this {section_name} section about {topic}.
Consider:
- Are the commands, code examples, and technical details correct?
- Are version numbers, dependencies, and requirements accurate?
- Are there any outdated or incorrect technical statements?
- Would following these instructions actually work?

Section content:
{section_content}

Provide a score from 0-100 and a brief explanation (2-3 sentences).
Format: {{"score": NUMBER, "explanation": "..."}}
""",
            
            "writing_style": f"""
Evaluate the writing style and tone of this {section_name} section for academic/research computing documentation.
Consider:
- Is the tone appropriately professional and academic?
- Is it clear and accessible for graduate-level users?
- Does it avoid being too casual or too dense?
- Is the language consistent and well-structured?

Section content:
{section_content}

Provide a score from 0-100 and a brief explanation (2-3 sentences).
Format: {{"score": NUMBER, "explanation": "..."}}
""",
            
            "completeness": f"""
Evaluate the completeness of this {section_name} section about {topic}.
Consider:
- Does it cover all essential information for this section type?
- Are there important details or steps missing?
- For {section_name}, what key elements should be present?
- Does it answer the questions users would typically have?

Section content:
{section_content}

Provide a score from 0-100 and a brief explanation (2-3 sentences).
Format: {{"score": NUMBER, "explanation": "..."}}
"""
        }
        
        return prompts.get(criteria, "")

    @staticmethod
    def _clamp_score(score: float) -> float:
        """Keep a score inside the 0-100 range the prompts ask for."""
        return max(0.0, min(100.0, score))

    def parse_gpt_response(self, response: str) -> Tuple[float, str]:
        """Parse the GPT response to extract score and explanation.

        Tries, in order: the whole response as JSON; the outermost
        ``{...}`` span as JSON (covers replies wrapped in code fences or
        prose); the first standalone number, with the text after it as the
        explanation. Without any number the score is 50.0 and the whole text
        is the explanation. Scores are clamped to 0-100.
        """
        text = response or ''

        candidates = [text]
        start, end = text.find('{'), text.rfind('}')
        if 0 <= start < end:
            candidates.append(text[start:end + 1])

        for candidate in candidates:
            try:
                result = json.loads(candidate)
                score = float(result['score'])
                explanation = str(result['explanation'])
            except (ValueError, KeyError, TypeError):
                # Not JSON, not an object, or missing / non-numeric fields
                continue
            return self._clamp_score(score), explanation

        # Fallback: first standalone number, explanation is what follows it
        score_match = re.search(r'\b(\d+(?:\.\d+)?)\b', text)
        if score_match is None:
            return 50.0, text.strip()
        return self._clamp_score(float(score_match.group(1))), text[score_match.end():].strip()

    def _truncate(self, section_content: str) -> str:
        """Cut very long sections to stay within token limits."""
        if len(section_content) > self.MAX_SECTION_CHARS:
            return section_content[:self.MAX_SECTION_CHARS] + "... [truncated]"
        return section_content

    def _request_score(self, section_content: str, section_name: str,
                       topic: str, criterion: str) -> Tuple[float, str]:
        """Send one evaluation request and return ``(score, explanation)``.

        Raises:
            ValueError: for an unknown criterion (instead of sending an
                empty prompt) or when the reply carries no content.
        """
        prompt = self.create_evaluation_prompt(
            self._truncate(section_content), section_name, topic, criterion)
        if not prompt:
            raise ValueError(f"Unknown evaluation criterion: {criterion}")

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are an expert technical documentation reviewer."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,  # Lower temperature for more consistent evaluation
            max_tokens=150
        )

        reply = response.choices[0].message.content
        if reply is None:
            raise ValueError("model returned no content")
        return self.parse_gpt_response(reply)

    async def evaluate_section_async(self, section_content: str, section_name: str, 
                                   topic: str, criteria: str) -> Dict:
        """Evaluate a single section on a single criterion asynchronously.

        The blocking API call runs in a worker thread. Returns a dict with
        'criteria', 'score' and 'explanation'; an empty section or a failed
        request yields score 0 with the reason in 'explanation'.
        """
        if not section_content.strip():
            return {
                'criteria': criteria,
                'score': 0,
                'explanation': 'Section is empty'
            }

        try:
            score, explanation = await asyncio.to_thread(
                self._request_score, section_content, section_name, topic, criteria)
        except Exception as e:
            return {
                'criteria': criteria,
                'score': 0,
                'explanation': f'Error during evaluation: {str(e)}'
            }

        return {
            'criteria': criteria,
            'score': score,
            'explanation': explanation
        }

    def evaluate_section(self, section_content: str, section_name: str, 
                        topic: str, criteria: Optional[List[str]] = None) -> Dict[str, Dict]:
        """Evaluate a section on multiple criteria.

        Returns ``{criterion: {'score': ..., 'explanation': ...}}``. An empty
        section scores 0 on every criterion without any API call; a failed
        request scores 0 with the error text as explanation.
        """
        if criteria is None:
            criteria = list(self.DEFAULT_CRITERIA)

        results = {}

        for criterion in criteria:
            if not section_content.strip():
                results[criterion] = {
                    'score': 0,
                    'explanation': 'Section is empty'
                }
                continue

            try:
                score, explanation = self._request_score(
                    section_content, section_name, topic, criterion)
                results[criterion] = {
                    'score': score,
                    'explanation': explanation
                }
            except Exception as e:
                results[criterion] = {
                    'score': 0,
                    'explanation': f'Error: {str(e)}'
                }

            # Rate limiting
            time.sleep(0.5)

        return results

# Initialize the GPT evaluator, reusing the generator's OpenAI client and
# the model configured in Cell 1
gpt_evaluator = GPTQualityEvaluator(generator.client, model=MODEL)
print("✅ GPT Quality Evaluator initialized")


✅ GPT Quality Evaluator initialized


In [None]:

# Cell 18: Evaluate All Versions with GPT
# ============================================
def evaluate_all_versions_with_gpt(all_sections: Dict, topic: str,
                                  sections_to_evaluate: Optional[List[str]] = None,
                                  sample_versions: Optional[int] = None,
                                  criteria: Optional[List[str]] = None,
                                  seed: Optional[int] = None) -> Dict:
    """Evaluate sections of every (or a sample of) version using GPT.

    Args:
        all_sections: Mapping of version name -> {section name -> section HTML}.
        topic: Topic string forwarded to the evaluator.
        sections_to_evaluate: Section names to score. Defaults to
            ['Description', 'Installation', 'Usage'].
        sample_versions: Optional cap on the number of versions evaluated.
            Versions are randomly sampled only when this is truthy and smaller
            than the number of versions in ``all_sections``.
        criteria: Criterion names scored for each section. Defaults to
            ['technical_accuracy', 'writing_style', 'completeness'].
        seed: Optional seed that makes the version sampling reproducible.
            When None, the module-level ``random`` generator is used.

    Returns:
        Nested dict ``{section: {version: {criterion: {'score', 'explanation'}}}}``.
        Sections missing from a version get a score of 0 for every criterion
        with the explanation 'Section not found'.

    Note:
        Sections that are present are scored through the notebook-level
        ``gpt_evaluator`` and converted to text with ``BeautifulSoup``; both
        must already be defined when this function is called.
    """
    import random

    if sections_to_evaluate is None:
        sections_to_evaluate = ['Description', 'Installation', 'Usage']
    if criteria is None:
        criteria = ['technical_accuracy', 'writing_style', 'completeness']

    # Option to sample fewer versions to reduce API calls
    versions_to_eval = list(all_sections.keys())
    if sample_versions and sample_versions < len(versions_to_eval):
        rng = random if seed is None else random.Random(seed)
        chosen = set(rng.sample(versions_to_eval, sample_versions))
        # Keep the original version order so progress output and tie-breaking
        # downstream do not depend on the sampling order.
        versions_to_eval = [version for version in versions_to_eval if version in chosen]

    gpt_scores = {}
    total_evaluations = len(sections_to_evaluate) * len(versions_to_eval)
    current_eval = 0

    print(f"\n🤖 Starting GPT evaluation of {total_evaluations} section-version pairs...")
    print(f"   Sections: {', '.join(sections_to_evaluate)}")
    # str() guards against non-string version keys
    print(f"   Versions: {', '.join(map(str, versions_to_eval))}\n")

    for section_name in sections_to_evaluate:
        gpt_scores[section_name] = {}

        for version in versions_to_eval:
            current_eval += 1
            print(f"   [{current_eval}/{total_evaluations}] Evaluating {section_name} - {version}...", end='', flush=True)

            if section_name in all_sections[version]:
                section_html = all_sections[version][section_name]
                # Convert HTML to text for evaluation
                soup = BeautifulSoup(section_html, 'html.parser')
                section_text = soup.get_text().strip()

                # Evaluate with GPT
                results = gpt_evaluator.evaluate_section(
                    section_text,
                    section_name,
                    topic,
                    list(criteria)
                )

                gpt_scores[section_name][version] = results

                # Calculate composite score (guarded: an empty criteria list
                # yields an empty result dict)
                composite = sum(r['score'] for r in results.values()) / len(results) if results else 0.0
                print(f" ✓ (Composite: {composite:.1f})")
            else:
                # Same criteria as the evaluated branch, so every version
                # exposes an identical set of keys to downstream reports.
                gpt_scores[section_name][version] = {
                    criterion: {'score': 0, 'explanation': 'Section not found'}
                    for criterion in criteria
                }
                print(" ✗ (Not found)")

    return gpt_scores

# Run GPT evaluation. `sample_versions` is an optional cap meant to reduce API
# calls: versions are randomly sampled only when it is smaller than the number
# of versions in `all_sections`.
gpt_evaluation_results = evaluate_all_versions_with_gpt(
    all_sections, 
    topic=TOPIC,
    sections_to_evaluate=['Description', 'Installation', 'Usage', 'Examples'],  # Limit sections
    sample_versions=5  # Cap of 5; if all_sections holds 5 versions (RUNS = 5) nothing is sampled out — lower it (e.g. 3) to save API calls
)



🤖 Starting GPT evaluation of 9 section-version pairs...
   Sections: Description, Installation, Examples
   Versions: Version 3, Version 5, Version 4

   [1/9] Evaluating Description - Version 3... ✓ (Composite: 83.3)
   [2/9] Evaluating Description - Version 5... ✓ (Composite: 80.0)
   [3/9] Evaluating Description - Version 4... ✓ (Composite: 83.3)
   [4/9] Evaluating Installation - Version 3... ✓ (Composite: 80.0)
   [5/9] Evaluating Installation - Version 5... ✓ (Composite: 80.0)
   [6/9] Evaluating Installation - Version 4... ✓ (Composite: 80.0)
   [7/9] Evaluating Examples - Version 3... ✓ (Composite: 61.7)
   [8/9] Evaluating Examples - Version 5... ✓ (Composite: 63.3)
   [9/9] Evaluating Examples - Version 4... ✓ (Composite: 71.7)


In [71]:
# Cell 19: Combine GPT and Algorithmic Scores
# ============================================
def create_combined_scoring_report(algorithmic_scores: pd.DataFrame,
                                  gpt_scores: Dict,
                                  weight_algorithmic: float = 0.4,
                                  weight_gpt: float = 0.6,
                                  versions: Optional[List[str]] = None) -> pd.DataFrame:
    """Combine algorithmic and GPT scores with weighting.

    Args:
        algorithmic_scores: One row per section with the columns 'Section',
            'Best Version' and 'Best Score'; per-version scores are looked up
            by version name and count as 0 when the column is absent or NaN.
        gpt_scores: ``{section: {version: {criterion: {'score': ...}}}}``.
        weight_algorithmic: Weight applied to the algorithmic score.
        weight_gpt: Weight applied to the GPT composite score.
        versions: Version names to rank. When omitted, the keys of the
            notebook-level ``all_sections`` are used (the original behavior).

    Returns:
        DataFrame with one row per section: the best version by algorithm, by
        GPT composite, and by combined score. Sections absent from
        ``gpt_scores`` keep the algorithmic winner, with a GPT composite of 0.
    """
    if versions is None:
        # Backward-compatible fallback to the notebook global
        versions = list(all_sections.keys())
    else:
        versions = list(versions)

    def _composite(results: Dict) -> float:
        """Mean criterion score; 0.0 when no criteria were evaluated."""
        if not results:
            return 0.0
        return sum(r['score'] for r in results.values()) / len(results)

    combined_data = []

    for _, row in algorithmic_scores.iterrows():
        section_name = row['Section']

        if section_name not in gpt_scores:
            # Use only algorithmic score if no GPT evaluation
            combined_data.append({
                'Section': section_name,
                'Best Version (Algorithm)': row['Best Version'],
                'Algorithm Score': row['Best Score'],
                'Best Version (GPT)': 'N/A',
                'GPT Composite': 0,
                'Combined Score': row['Best Score'] * weight_algorithmic,
                'Final Best Version': row['Best Version']
            })
            continue

        # Find best version according to GPT
        gpt_version_scores = {
            version: _composite(results)
            for version, results in gpt_scores[section_name].items()
        }
        if gpt_version_scores:
            best_gpt_version = max(gpt_version_scores, key=gpt_version_scores.get)
            best_gpt_score = gpt_version_scores[best_gpt_version]
        else:
            best_gpt_version, best_gpt_score = 'N/A', 0

        # Calculate combined scores for each version
        version_combined_scores = {}
        for version in versions:
            algo_score = row.get(version, 0)
            if pd.isna(algo_score):
                # A NaN would make the max() below order-dependent
                algo_score = 0
            gpt_score = gpt_version_scores.get(version, 0)
            version_combined_scores[version] = (algo_score * weight_algorithmic) + (gpt_score * weight_gpt)

        # Find best version by combined score
        if version_combined_scores:
            best_combined_version = max(version_combined_scores, key=version_combined_scores.get)
            best_combined_score = version_combined_scores[best_combined_version]
        else:
            # No versions to rank: fall back to the algorithmic winner, scored
            # the same way as a section without GPT evaluation.
            best_combined_version = row['Best Version']
            best_combined_score = row['Best Score'] * weight_algorithmic

        combined_data.append({
            'Section': section_name,
            'Best Version (Algorithm)': row['Best Version'],
            'Algorithm Score': row['Best Score'],
            'Best Version (GPT)': best_gpt_version,
            'GPT Composite': best_gpt_score,
            'Combined Score': best_combined_score,
            'Final Best Version': best_combined_version
        })

    return pd.DataFrame(combined_data)

# Create combined report using the function's default weighting
# (0.4 algorithmic / 0.6 GPT). `combined_scores_df` is reused by the
# report-generation and final-compilation cells further down.
combined_scores_df = create_combined_scoring_report(scores_df, gpt_evaluation_results)

# Full table as plain text, without the DataFrame index column
print("\n📊 Combined Scoring Results:")
print("="*80)
print(combined_scores_df.to_string(index=False))
print("="*80)

# One summary line per section: the version chosen by combined score
print("\n🏆 Final Best Versions (Combined Scoring):")
for _, row in combined_scores_df.iterrows():
    print(f"   {row['Section']}: {row['Final Best Version']} (Score: {row['Combined Score']:.1f})")




📊 Combined Scoring Results:
     Section Best Version (Algorithm)  Algorithm Score Best Version (GPT)  GPT Composite  Combined Score Final Best Version
 Description                Version 4             29.0          Version 3      83.333333           61.60          Version 4
Installation                Version 5             58.2          Version 3      80.000000           71.28          Version 5
       Usage                Version 4             80.0                N/A       0.000000           32.00          Version 4
    Examples                Version 4             52.8          Version 4      71.666667           64.12          Version 4
  References                Version 2             54.8                N/A       0.000000           21.92          Version 2

🏆 Final Best Versions (Combined Scoring):
   Description: Version 4 (Score: 61.6)
   Installation: Version 5 (Score: 71.3)
   Usage: Version 4 (Score: 32.0)
   Examples: Version 4 (Score: 64.1)
   References: Version 2 (Score: 21.9)

In [77]:
%pip install tabulate
# Cell 20: Generate Detailed GPT Evaluation Report
# ============================================
def generate_gpt_evaluation_report(gpt_scores: Dict, combined_scores_df: pd.DataFrame,
                                   accuracy_threshold: float = 70) -> str:
    """Generate a detailed Markdown report of GPT evaluations.

    Args:
        gpt_scores: ``{section: {version: {criterion: {'score', 'explanation'}}}}``.
        combined_scores_df: Combined scoring table; rendered with
            ``DataFrame.to_markdown`` and scanned for rows where
            'Best Version (Algorithm)' differs from 'Final Best Version'.
        accuracy_threshold: Versions whose 'technical_accuracy' score is below
            this value are listed under "Technical Accuracy Concerns".

    Returns:
        The report as a single newline-joined Markdown string.

    Note:
        Reads the notebook-level ``TOPIC`` and ``MODEL`` for the header.
    """
    report = []
    report.append(f"# GPT Quality Evaluation Report for {TOPIC}")
    report.append(f"\nGenerated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append(f"Model used for evaluation: {MODEL}\n")

    # Summary table
    report.append("## Evaluation Summary\n")
    report.append(combined_scores_df.to_markdown(index=False))

    # Detailed evaluations by section
    report.append("\n## Detailed GPT Evaluations\n")

    for section_name, version_results in gpt_scores.items():
        report.append(f"\n### {section_name}\n")

        for version, criteria_results in version_results.items():
            report.append(f"\n**{version}:**\n")

            for criteria, result in criteria_results.items():
                report.append(f"- **{criteria.replace('_', ' ').title()}**: {result['score']:.1f}/100")
                report.append(f"  - {result['explanation']}")

            # Guarded: a version evaluated with no criteria has an empty dict
            if criteria_results:
                composite = sum(r['score'] for r in criteria_results.values()) / len(criteria_results)
            else:
                composite = 0.0
            report.append(f"- **Composite Score**: {composite:.1f}/100\n")

    # Key insights
    report.append("\n## Key Insights\n")

    # Find sections where the algorithmic winner differs from the combined winner
    disagreements = []
    for _, row in combined_scores_df.iterrows():
        if row['Best Version (Algorithm)'] != row['Final Best Version']:
            disagreements.append(f"- **{row['Section']}**: Algorithm chose {row['Best Version (Algorithm)']} "
                               f"but combined scoring chose {row['Final Best Version']}")

    if disagreements:
        report.append("### Algorithm vs GPT Disagreements\n")
        report.extend(disagreements)

    # Technical accuracy concerns
    report.append("\n### Technical Accuracy Concerns\n")
    for section_name, version_results in gpt_scores.items():
        for version, criteria_results in version_results.items():
            # The criteria set is configurable upstream, so this key may be absent
            accuracy = criteria_results.get('technical_accuracy')
            if accuracy is None:
                continue
            if accuracy['score'] < accuracy_threshold:
                report.append(f"- **{section_name} ({version})**: "
                            f"{accuracy['explanation']}")

    return '\n'.join(report)

# Generate and save GPT evaluation report
gpt_report = generate_gpt_evaluation_report(gpt_evaluation_results, combined_scores_df)
gpt_report_path = Path('output') / f'{TOPIC.lower()}_gpt_evaluation_report.md'
# Create the output directory on demand so the write below does not fail with
# FileNotFoundError when the directory is missing (e.g. fresh checkout).
gpt_report_path.parent.mkdir(parents=True, exist_ok=True)
with open(gpt_report_path, 'w', encoding='utf-8') as f:
    f.write(gpt_report)

print(f"\n📄 GPT evaluation report saved to: {gpt_report_path}")


Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0
Note: you may need to restart the kernel to use updated packages.

📄 GPT evaluation report saved to: output/parallelhdf5_gpt_evaluation_report.md


In [84]:
# Cell 21: Create Final Best Document with GPT Insights
# ============================================
# Use the combined scoring to create the final document:
# map each section to the version that won on combined score
final_overrides = {
    row['Section']: row['Final Best Version'] 
    for _, row in combined_scores_df.iterrows()
}

# Compile with GPT-informed selections
final_best_html = compile_best_document(all_sections, scores_df, final_overrides)

# Save the GPT-enhanced compilation
final_output_path = Path('output') / f'{TOPIC.lower()}_best_compilation_gpt_enhanced.html'
# Create the output directory on demand so the write below does not fail with
# FileNotFoundError when the directory is missing (e.g. fresh checkout).
final_output_path.parent.mkdir(parents=True, exist_ok=True)
with open(final_output_path, 'w', encoding='utf-8') as f:
    f.write(final_best_html)

print(f"✅ GPT-enhanced compilation saved to: {final_output_path}")
print("\n" + "="*80)
print("EVALUATION COMPLETE")
print("="*80)
print(f"📊 Algorithm + GPT best compilation: {final_output_path}")
# gpt_report_path is set by the report-generation cell (Cell 20), which must run first
print(f"📈 GPT evaluation report: {gpt_report_path}")



✅ GPT-enhanced compilation saved to: output/parallelhdf5_best_compilation_gpt_enhanced.html

EVALUATION COMPLETE
📊 Algorithm + GPT best compilation: output/parallelhdf5_best_compilation_gpt_enhanced.html
📈 GPT evaluation report: output/parallelhdf5_gpt_evaluation_report.md


In [86]:
# Cell 22: Cost Estimation Helper
# ============================================
def estimate_api_costs(num_sections: int, num_versions: int,
                      avg_section_length: int = 500,
                      model: str = 'gpt-4',
                      criteria_per_section: int = 3):
    """Print a rough API cost estimate for a full evaluation.

    One evaluation is one (section, version, criterion) triple. Each is
    assumed to consume ``avg_section_length + 200`` tokens, split 80% input /
    20% output.

    Args:
        num_sections: Number of sections evaluated per version.
        num_versions: Number of versions evaluated.
        avg_section_length: Assumed token count of one section's content.
        model: Model name used to look up per-1K-token prices.
        criteria_per_section: Number of criteria scored per section.

    Returns:
        None. The estimate is printed. For a model without pricing data the
        token estimate is still printed, together with the known model names.
    """
    # Approximate tokens
    tokens_per_eval = avg_section_length + 200  # content + prompt
    total_evals = num_sections * num_versions * criteria_per_section
    total_tokens = total_evals * tokens_per_eval

    # Pricing (as of 2024 - update as needed)
    prices = {
        'gpt-4': {'input': 0.03, 'output': 0.06},  # per 1K tokens
        'gpt-3.5-turbo': {'input': 0.0005, 'output': 0.0015},
        'gpt-4o-mini': {'input': 0.00015, 'output': 0.0006}  # verify against current pricing
    }

    if model not in prices:
        # Previously this case printed nothing at all, which hid the problem
        print(f"\n⚠️ No pricing data for model '{model}'; cost not estimated.")
        print(f"   Total evaluations: {total_evals}")
        print(f"   Estimated tokens: {total_tokens:,}")
        print(f"   Models with pricing data: {', '.join(sorted(prices))}")
        return

    # Assume 80% input, 20% output
    input_cost = (total_tokens * 0.8 / 1000) * prices[model]['input']
    output_cost = (total_tokens * 0.2 / 1000) * prices[model]['output']
    total_cost = input_cost + output_cost

    # Two decimals hide sub-cent estimates for cheap models; widen for those
    cost_text = f"{total_cost:.2f}" if total_cost >= 0.01 else f"{total_cost:.4f}"

    print(f"\n💰 Estimated API Costs:")
    print(f"   Model: {model}")
    print(f"   Total evaluations: {total_evals}")
    print(f"   Estimated tokens: {total_tokens:,}")
    print(f"   Estimated cost: ${cost_text}")
    if model == 'gpt-4':
        # The saving quoted in the tip is relative to gpt-4 only
        print(f"\n   Tip: Use gpt-3.5-turbo to reduce costs by ~95%")
    
# Estimate costs for full evaluation: the number of entries in
# `analyzer.section_headers` times the number of versions in `all_sections`,
# using the function's defaults for section length (500) and criteria per
# section (3), priced for the configured MODEL.
estimate_api_costs(
    num_sections=len(analyzer.section_headers),
    num_versions=len(all_sections),
    model=MODEL
)