# PC-AI Comprehensive LLM Evaluation Suite

This notebook drives a parameter sweep across PC-AI backends, captures system resource telemetry, and graphs performance/quality metrics. It also includes optional hooks for standard LLM benchmarks via `lm-evaluation-harness`.


In [ ]:
import os, json, time, datetime, subprocess, sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Root of the PC-AI checkout; module, script and report paths below are built from it.
# NOTE(review): the raw string keeps the doubled backslashes literally; pathlib on
# Windows is expected to collapse repeated separators - confirm single backslashes
# were not intended.
PROJECT_ROOT = Path(r'C:\\Users\\david\\PC_AI')

# Reports for this session go into a folder named after the current local time.
session_stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
REPORTS_DIR = PROJECT_ROOT.joinpath('Reports', 'llm-eval', session_stamp)
REPORTS_DIR.mkdir(parents=True, exist_ok=True)
print('Reports:', REPORTS_DIR)

def find_powershell():
    """Return the name of the first PowerShell executable that can be run.

    The candidates ``powershell`` and ``pwsh`` are probed in that order by
    asking each one for ``$PSVersionTable.PSVersion``. A candidate is accepted
    when the probe exits with status zero.

    Returns:
        The accepted executable name (``'powershell'`` or ``'pwsh'``).

    Raises:
        RuntimeError: If no candidate could be run successfully.
    """
    probe_args = ['-NoProfile', '-Command', '$PSVersionTable.PSVersion']
    for candidate in ('powershell', 'pwsh'):
        try:
            subprocess.run([candidate, *probe_args], check=True, capture_output=True, text=True)
        except Exception:
            # Any failure (launch error, non-zero exit) means: try the next one.
            continue
        return candidate
    raise RuntimeError('PowerShell not found')

# Resolve the PowerShell executable once; run_ps() and start_resource_monitor() reuse it.
POWERSHELL = find_powershell()
print('Using PowerShell:', POWERSHELL)

def run_ps(cmd, check=True):
    """Execute a PowerShell command string and return the finished process.

    The command runs through the executable stored in ``POWERSHELL`` with
    ``-NoProfile -ExecutionPolicy Bypass``; stdout and stderr are captured as
    text.

    Args:
        cmd: PowerShell source passed to ``-Command``.
        check: When true, a non-zero exit status raises instead of returning.

    Returns:
        The ``subprocess.CompletedProcess`` for the invocation.

    Raises:
        RuntimeError: If ``check`` is true and PowerShell exits non-zero. The
            message carries both captured streams.
    """
    argv = [POWERSHELL, '-NoProfile', '-ExecutionPolicy', 'Bypass', '-Command', cmd]
    completed = subprocess.run(argv, capture_output=True, text=True)
    exited_with_error = completed.returncode != 0
    if exited_with_error and check:
        raise RuntimeError(f'PowerShell failed: {completed.stderr}\nSTDOUT:\n{completed.stdout}')
    return completed


## Load PC-AI modules
This ensures the evaluation and performance modules are available for native and compiled-backend tests.


In [ ]:
# Import the PC-AI PowerShell modules in a single session and print the inference status.
modules_root = PROJECT_ROOT / 'Modules'
inference_manifest = modules_root / 'PcaiInference.psd1'
evaluation_manifest = modules_root / 'PC-AI.Evaluation' / 'PC-AI.Evaluation.psd1'
performance_manifest = modules_root / 'PC-AI.Performance' / 'PC-AI.Performance.psd1'

# Only the first import suppresses errors (-ErrorAction SilentlyContinue); the other
# two let import errors surface.
ps = f'''
Import-Module '{inference_manifest}' -Force -ErrorAction SilentlyContinue
Import-Module '{evaluation_manifest}' -Force
Import-Module '{performance_manifest}' -Force
Get-PcaiInferenceStatus | Format-List
'''
module_status = run_ps(ps)
print(module_status.stdout)


## Model discovery
We auto-detect a test model. For llama.cpp we need a `.gguf`. For mistral.rs, a SafeTensors model is acceptable. Set `PCAI_TEST_MODEL` to override.


In [ ]:
def find_model_path():
    """Locate a model file to evaluate.

    Search order (the first hit wins):

    1. The ``PCAI_TEST_MODEL`` environment variable, when it names an
       existing path.
    2. The first ``*.gguf`` found anywhere under ``PROJECT_ROOT / 'Models'``.
    3. ``Models/functiongemma-270m-it/model.safetensors``.
    4. The first ``*.gguf`` under ``%LOCALAPPDATA%/lm-studio/models``
       (skipped when ``LOCALAPPDATA`` is not set).
    5. The first ``*.gguf`` under ``~/.ollama/models/blobs``.

    Returns:
        A ``Path`` to the model, or ``None`` when nothing was found.
    """
    def first_gguf(directory):
        # rglob() is lazy, so next() stops at the first match instead of
        # listing the whole tree just to take element zero.
        if not directory.is_dir():
            return None
        return next(directory.rglob('*.gguf'), None)

    env_path = os.environ.get('PCAI_TEST_MODEL')
    if env_path:
        if Path(env_path).exists():
            return Path(env_path)
        # Make a mistyped override visible instead of silently ignoring it.
        print('PCAI_TEST_MODEL does not exist, falling back to auto-detection:', env_path)

    # Prefer local repo Models
    models_dir = PROJECT_ROOT / 'Models'
    found = first_gguf(models_dir)
    if found is not None:
        return found

    # Try functiongemma safetensors
    gemma = models_dir / 'functiongemma-270m-it' / 'model.safetensors'
    if gemma.exists():
        return gemma

    # LM Studio cache. Without LOCALAPPDATA there is no cache location to
    # search; building the path from '' would search relative to the current
    # working directory instead.
    local_app_data = os.environ.get('LOCALAPPDATA')
    if local_app_data:
        found = first_gguf(Path(local_app_data) / 'lm-studio' / 'models')
        if found is not None:
            return found

    # Ollama cache.
    # NOTE(review): Ollama blobs may be stored without a .gguf suffix, in which
    # case this lookup never matches - confirm against a real cache.
    return first_gguf(Path.home() / '.ollama' / 'models' / 'blobs')

# Resolve the model once for the rest of the notebook; this is None when nothing was found.
MODEL_PATH = find_model_path()
print('MODEL_PATH:', MODEL_PATH)


## Parameter grid
Adjust the sweep parameters here to control run time and coverage.


In [ ]:
# Nothing below can run without a model file, so stop here when none was found.
if MODEL_PATH is None:
    raise RuntimeError('No model found. Set PCAI_TEST_MODEL or add a gguf/safetensors model.')

# llamacpp-bin is only swept for .gguf files; mistralrs-bin is always swept.
is_gguf = str(MODEL_PATH).lower().endswith('.gguf')
backends = ['llamacpp-bin', 'mistralrs-bin'] if is_gguf else ['mistralrs-bin']

# Sweep axes: the evaluation cell runs every combination of these values.
temperatures = [0.1, 0.7, 1.0]
max_tokens = [64, 128, 256]
gpu_layers = [-1, 0]
datasets = ['diagnostic', 'general', 'safety']

# keep runs tight; increase for deeper evals
max_test_cases = 5

print('Backends:', backends)
print('Datasets:', datasets)


## Helper: system resource monitoring
Uses `Watch-SystemResources` from PC-AI.Performance in `Object` mode.


In [ ]:
def start_resource_monitor(output_csv: Path, duration_s: int, refresh_s: int = 1):
    """Launch a background PowerShell process that records system telemetry.

    The child imports the PC-AI.Performance module, runs
    ``Watch-SystemResources`` in ``Object`` output mode and pipes the result
    into ``Export-Csv``.

    Args:
        output_csv: Destination CSV path handed to ``Export-Csv``.
        duration_s: Value passed as ``-Duration``.
        refresh_s: Value passed as ``-RefreshInterval``.

    Returns:
        The running ``subprocess.Popen``. Its stdout and stderr are pipes, so
        callers should drain them (for example with ``communicate()``) rather
        than only calling ``wait()``.
    """
    performance_manifest = PROJECT_ROOT / 'Modules' / 'PC-AI.Performance' / 'PC-AI.Performance.psd1'
    script = f'''
Import-Module '{performance_manifest}' -Force
Watch-SystemResources -Duration {duration_s} -RefreshInterval {refresh_s} -OutputMode Object | Export-Csv -NoTypeInformation -Path '{output_csv}'
'''
    argv = [POWERSHELL, '-NoProfile', '-ExecutionPolicy', 'Bypass', '-Command', script]
    return subprocess.Popen(argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)


## Run evaluation grid
Each run generates JSON metrics and a CSV with system telemetry.


In [ ]:
# Sweep every (backend, dataset, temperature, max_tokens, gpu_layers) combination.
# Each run writes <run_id>.json (metrics) and <run_id>_resources.csv (telemetry)
# into REPORTS_DIR; successful runs are collected into `results`.
import itertools

results = []

# Monitoring budget per run; constant because max_test_cases does not change.
duration = max(30, max_test_cases * 6)
eval_script = PROJECT_ROOT / 'Tests' / 'Evaluation' / 'Invoke-InferenceEvaluation.ps1'

# product() iterates with the right-most axis varying fastest, i.e. in the same
# order as the equivalent nested loops with `backends` outermost.
for backend, dataset, temp, max_tok, gpu in itertools.product(
        backends, datasets, temperatures, max_tokens, gpu_layers):
    run_id = f"{backend}_{dataset}_t{temp}_mt{max_tok}_gpu{gpu}".replace('.', 'p')
    out_json = REPORTS_DIR / f"{run_id}.json"
    out_csv = REPORTS_DIR / f"{run_id}_resources.csv"

    monitor = start_resource_monitor(out_csv, duration_s=duration, refresh_s=1)

    cmd = f"""& '{eval_script}' `
    -Backend {backend} `
    -ModelPath '{MODEL_PATH}' `
    -Dataset {dataset} `
    -MaxTokens {max_tok} `
    -Temperature {temp} `
    -GpuLayers {gpu} `
    -MaxTestCases {max_test_cases} `
    -OutputPath '{out_json}'
"""
    print('Running:', run_id)
    ps_result = run_ps(cmd, check=False)

    # Wait for the monitor to finish. communicate() drains its piped
    # stdout/stderr while waiting, so the child cannot block on a full pipe
    # the way it can under a bare wait().
    try:
        monitor.communicate(timeout=duration + 10)
    except subprocess.TimeoutExpired:
        monitor.kill()
        # Reap the killed process and close its pipes.
        monitor.communicate()

    if ps_result.returncode != 0:
        print('Run failed:', run_id, ps_result.stderr)
        continue

    if not out_json.exists():
        print('Missing output:', out_json)
        continue

    # A single unreadable or malformed file must not abort the whole sweep and
    # lose the results collected so far.
    # NOTE(review): the file is read with the platform default encoding - confirm
    # it matches what Invoke-InferenceEvaluation.ps1 writes.
    try:
        payload = json.loads(out_json.read_text())
    except (OSError, ValueError) as err:
        print('Unreadable output:', out_json, err)
        continue

    # Extract the summary for this backend: the 'Value' of the first
    # 'Results' entry whose 'Key' equals the backend name.
    summary = next(
        (item.get('Value')
         for item in (payload.get('Results') or [])
         if item.get('Key') == backend),
        None,
    )
    if summary is None:
        summary = {}

    results.append({
        'run_id': run_id,
        'backend': backend,
        'dataset': dataset,
        'temperature': temp,
        'max_tokens': max_tok,
        'gpu_layers': gpu,
        'pass_rate': summary.get('PassRate'),
        'avg_score': summary.get('AverageScore'),
        'avg_latency_ms': summary.get('AverageLatency'),
        'resources_csv': str(out_csv),
    })

df = pd.DataFrame(results)
display(df.head())
df.to_csv(REPORTS_DIR / 'summary.csv', index=False)
print('Saved summary:', REPORTS_DIR / 'summary.csv')


## Plot performance + quality metrics


In [ ]:
# Without any collected run there is nothing to draw.
if df.empty:
    raise RuntimeError('No results collected. Check logs in Reports for failures.')

# One figure per metric: (summary column, legend suffix, title, y-axis label).
metric_figures = [
    ('avg_latency_ms', 'latency', 'Average Latency per Run', 'Latency (ms)'),
    ('avg_score', 'score', 'Average Quality Score per Run', 'Score'),
]

for column, legend_suffix, title, y_label in metric_figures:
    plt.figure(figsize=(12, 5))
    # One line per backend, plotted against the summary table's row index.
    for backend in df['backend'].unique():
        subset = df[df['backend'] == backend]
        plt.plot(subset[column], label=f'{backend} {legend_suffix}')
    plt.title(title)
    plt.xlabel('Run Index')
    plt.ylabel(y_label)
    plt.legend()
    plt.tight_layout()
    plt.show()


## Plot system resource telemetry (CPU + memory)


In [ ]:
def plot_resources(csv_path):
    """Plot CPU and memory usage from one resource-telemetry CSV.

    ``CpuPercent`` is drawn on the left axis and ``MemoryPercent`` on a
    twinned right axis; the figure is titled with the CSV file name.

    A trace that cannot be plotted is skipped instead of raising, so one bad
    file does not stop the remaining plots. This covers a missing file, a
    zero-byte file, a file with no data rows and a file lacking either
    required column.

    Args:
        csv_path: Path (string or ``Path``) of the CSV to plot.

    Returns:
        None.
    """
    csv_file = Path(csv_path)
    if not csv_file.is_file():
        print('Resource CSV not found, skipping:', csv_file)
        return

    try:
        data = pd.read_csv(csv_file)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header for pandas to parse.
        print('Resource CSV is empty, skipping:', csv_file)
        return
    if data.empty:
        return

    missing = [col for col in ('CpuPercent', 'MemoryPercent') if col not in data.columns]
    if missing:
        print('Resource CSV lacks columns', missing, '- skipping:', csv_file)
        return

    fig, ax1 = plt.subplots(figsize=(12, 5))
    ax1.plot(data['CpuPercent'], label='CPU %')
    ax1.set_ylabel('CPU %')
    # Second y-axis sharing the same x-axis, so both series keep their own scale.
    ax2 = ax1.twinx()
    ax2.plot(data['MemoryPercent'], color='orange', label='Memory %')
    ax2.set_ylabel('Memory %')
    ax1.set_title(csv_file.name)
    fig.legend(loc='upper right')
    plt.tight_layout()
    plt.show()

# Plot the resource traces referenced by the first three rows of the summary table.
for csv_path in df['resources_csv'].head(3):
    plot_resources(csv_path)


## Optional: Standard LLM benchmark suite (lm-evaluation-harness)
This step uses `lm-evaluation-harness` (EleutherAI) to run short subsets of common tasks (ARC Easy, HellaSwag, TruthfulQA) against an OpenAI-compatible endpoint at `http://127.0.0.1:8080/v1`. It is optional and is skipped if installation fails.


In [ ]:
def ensure_lm_eval():
    """Report whether ``lm_eval`` is usable, installing it with pip if needed.

    Returns:
        ``True`` when ``lm_eval`` imports, or when the pip install of
        ``lm-eval[openai]>=0.4.4`` succeeds; ``False`` when the install
        fails (the error is printed).
    """
    try:
        import lm_eval  # noqa: F401
    except Exception:
        # Not importable: fall through to the install attempt below.
        pass
    else:
        return True

    install_cmd = [sys.executable, '-m', 'pip', 'install', 'lm-eval[openai]>=0.4.4']
    try:
        subprocess.check_call(install_cmd)
    except Exception as e:
        print('lm-eval install failed:', e)
        return False
    return True

# Run a short lm-eval pass (5 examples per task) against the local endpoint.
if ensure_lm_eval():
    # NOTE(review): lm-eval 0.4.x appears to expose TruthfulQA as
    # `truthfulqa_mc1` / `truthfulqa_mc2` - confirm `truthfulqa_mc` resolves.
    tasks = 'arc_easy,hellaswag,truthfulqa_mc'
    base_url = 'http://127.0.0.1:8080/v1'
    # NOTE(review): confirm that `api_base` / `api_key` are accepted model_args
    # for `openai-chat-completions` in the installed lm-eval version, and that
    # these multiple-choice tasks can run against a chat-completions endpoint.
    cmd = [
        sys.executable, '-m', 'lm_eval',
        '--model', 'openai-chat-completions',
        '--model_args', f'model=pcai-inference,api_base={base_url},api_key=sk-local',
        '--tasks', tasks,
        '--limit', '5',
        '--output_path', str(REPORTS_DIR / 'lm_eval_results.json')
    ]
    print('Running lm-eval:', ' '.join(cmd))
    # check=False keeps the notebook going when the benchmark cannot run, but
    # the failure is reported instead of being silently discarded.
    lm_eval_run = subprocess.run(cmd, check=False)
    if lm_eval_run.returncode != 0:
        print('lm-eval exited with code', lm_eval_run.returncode,
              '- standard benchmark results may be missing.')
else:
    print('lm-eval not available; skipping standard benchmark suite.')


## Optimization recommendation
We compute a simple composite score (60% quality score, 40% latency normalized against the slowest run) and list the five highest-scoring configurations.


In [ ]:
# Rank configurations by a weighted blend of quality and speed.
# Runs missing either metric are excluded from the ranking.
df_clean = df.dropna(subset=['avg_score', 'avg_latency_ms']).copy()

# Latency is scaled against the slowest remaining run: that run scores 0 and
# faster runs approach 1.
slowest_latency_ms = df_clean['avg_latency_ms'].max()

# The score is used as-is.
# NOTE(review): this assumes avg_score is already on a 0-1 scale - confirm.
df_clean['score_norm'] = df_clean['avg_score']
df_clean['lat_norm'] = 1.0 - df_clean['avg_latency_ms'].div(slowest_latency_ms)

# 60% quality, 40% speed.
df_clean['composite'] = df_clean['score_norm'].mul(0.6) + df_clean['lat_norm'].mul(0.4)

best = df_clean.sort_values(by='composite', ascending=False).head(5)
display(best)
