In [42]:
import importlib
import sys
import requests
import time
import json
import pandas as pd

# Put the parent directory on the import path so `import ash` can resolve from
# there. Guarded so that re-running this cell does not append a duplicate entry
# on every execution.
if '..' not in sys.path:
    sys.path.append('..')
import ash
import ash.server

# importlib.reload() re-executes only the module object it is handed; it does
# not recurse into submodules. Reloading just the `ash` package would leave a
# stale `ash.server` behind, so reload the submodule explicitly before pulling
# ASHModel out of it.
importlib.reload(ash)
importlib.reload(ash.server)

from ash.server import ASHModel

# `m` is the notebook-level model instance that exec_test_cases() queries.
m = ASHModel('../models/qwen2.5-coder-3b-instruct-q4_k_m.gguf')
m.load()

def _command_tokens_match(expected_cmd, ai_cmd):
    """Return True when every token of ``expected_cmd`` occurs in ``ai_cmd``.

    Single and double quote characters are removed from both commands before
    they are split on whitespace. Token order is ignored and ``ai_cmd`` may
    contain extra tokens. An expected command with no tokens never matches
    (otherwise it would vacuously match every response).
    """
    expected_tokens = expected_cmd.replace('"', '').replace("'", '').split()
    ai_tokens = set(ai_cmd.replace('"', '').replace("'", '').split())
    return bool(expected_tokens) and all(token in ai_tokens for token in expected_tokens)


def exec_test_cases(test_cases, model=None, delay=0.05):
    """Run every test case through the model and tabulate the outcome.

    Args:
        test_cases: Iterable of dicts. Each must provide ``'query'`` (the text
            passed to the model) and ``'expected'`` (a list of acceptable
            commands, or a single command string). Other keys are ignored.
        model: Object exposing ``generate_command(query)``. Defaults to the
            notebook-level model ``m``.
        delay: Seconds to pause after each case; ``0`` disables the pause.

    Returns:
        pandas.DataFrame with one row per test case and the columns
        ``query``, ``ai_response``, ``pass/fail``, ``expected`` and
        ``elapsed``. The columns are present even when ``test_cases`` is empty.
        ``pass/fail`` is ``'PASS'``, ``'FAIL'``, or ``'ERROR'`` when the case
        raised; errored cases are kept so that pass rates computed over the
        frame use the full number of cases as the denominator. ``elapsed`` is
        the generation time in seconds rounded to 3 decimals, or NaN when
        generation itself failed.

    Raises:
        KeyError: If a test case lacks ``'query'`` or ``'expected'``.
    """
    if model is None:
        model = m  # notebook-level model loaded earlier in this cell

    columns = ['query', 'ai_response', 'pass/fail', 'expected', 'elapsed']
    records = []

    for test_case in test_cases:
        query = test_case['query']
        expected = test_case['expected']
        # A bare string would otherwise be iterated character by character.
        candidates = [expected] if isinstance(expected, str) else expected

        ai_response = None
        duration = float('nan')
        try:
            # perf_counter is monotonic, so the measured interval cannot be
            # distorted by wall-clock adjustments.
            started = time.perf_counter()
            ai_response = model.generate_command(query)
            duration = time.perf_counter() - started

            if isinstance(ai_response, str):
                # Drop quotes the model may have wrapped around the command.
                ai_response = ai_response.strip('"')
                ai_response = ai_response.strip('\'')
                is_correct = any(
                    _command_tokens_match(str(candidate), ai_response)
                    for candidate in candidates
                )
            else:
                # A non-string response cannot contain the expected command.
                is_correct = False
            verdict = 'PASS' if is_correct else 'FAIL'
        except Exception as e:
            print(f"✗ ERROR: {e}")
            verdict = 'ERROR'

        records.append({
            'query': query,
            'ai_response': ai_response,
            'pass/fail': verdict,
            'expected': expected,
            'elapsed': round(duration, 3)
        })

        # Small pause between cases so the model is not hit back-to-back.
        if delay > 0:
            time.sleep(delay)

    return pd.DataFrame(records, columns=columns)

🤖 Loading local model: ../models/qwen2.5-coder-3b-instruct-q4_k_m.gguf


llama_context: n_ctx_per_seq (2048) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_set_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_c4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f16                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64 

✅ Local model loaded successfully in 2.06 seconds!
🔥 Warming up model...
✅ Model warmed up in 7.87 seconds!


In [48]:
# Load the brew test cases. JSON files are UTF-8, so decode explicitly rather
# than relying on the platform's locale encoding.
test_case = './brew.json'
with open(f'test_data/{test_case}', encoding='utf-8') as f:
    test_cases = json.load(f)

# Run every case through the model; one row per case comes back.
results = exec_test_cases(test_cases)

# Summarise: mean of the per-case `elapsed` column and the share of PASS rows.
avg_inference_time = results['elapsed'].mean()
print(f"\nAverage inference time: {avg_inference_time:.3f} seconds")
# The mean of a boolean mask is the fraction of True values.
pass_percentage = (results['pass/fail'] == 'PASS').mean() * 100
print(f"PASS percentage: {pass_percentage:.2f}%")
display(results)



Average inference time: 2.314 seconds
PASS percentage: 83.33%


Unnamed: 0,query,ai_response,pass/fail,expected,elapsed
0,install python3.12 using brew,brew install python@3.12,PASS,"[brew install python3.12, brew install python@...",8.976
1,search curl using brew,brew search curl,PASS,"[brew search, brew search package_name]",1.027
2,list installed packages for brew,brew list,PASS,"[brew list, brew list --formula, brew list --c...",1.079
3,upgrade git using brew,brew upgrade git,PASS,"[brew upgrade, brew upgrade package_name]",0.906
4,tapped brew repositories,brew update,FAIL,"[brew tap, brew tap user/repo]",0.799
5,uninstall git using brew,brew uninstall git,PASS,[brew uninstall git],1.099
