In [None]:
import importlib
import sys
import requests
import time
import json
import pandas as pd

# Put the parent directory on the import path so the local `ash` package can be
# imported. Guarded so re-running this cell does not append duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')
import ash
import ash.server

# importlib.reload() re-executes only the module object it is handed; submodules
# stay cached in sys.modules. Reload the package and then the submodule so the
# ASHModel imported below comes from freshly executed code rather than the
# cached definition.
importlib.reload(ash)
importlib.reload(ash.server)

from ash.server import ASHModel

# Path of the local GGUF model file, relative to the notebook's working directory.
MODEL_PATH = '../models/qwen2.5-coder-3b-instruct-q4_k_m.gguf'

# `m` is the notebook-wide model handle that exec_test_cases() calls.
m = ASHModel(MODEL_PATH)
m.load()

def _command_tokens_match(expected_cmd, ai_cmd):
    """Return True when every token of ``expected_cmd`` occurs in ``ai_cmd``.

    Single and double quotes are removed from both commands, which are then
    split on whitespace. Token order is ignored and ``ai_cmd`` may contain
    additional tokens. An expected command with no tokens never matches.
    """
    expected_tokens = expected_cmd.replace('"', '').replace("'", '').split()
    ai_tokens = set(ai_cmd.replace('"', '').replace("'", '').split())
    # all() over an empty sequence is True; without this guard an empty
    # expected command would be satisfied by any response.
    return bool(expected_tokens) and all(token in ai_tokens for token in expected_tokens)


def exec_test_cases(test_cases, model=None, delay=0.05):
    """Run each test case through the model and record one result row per case.

    Args:
        test_cases: Iterable of dicts with the keys ``'query'`` (text passed
            to the model), ``'expected'`` (a list of acceptable commands, or a
            single command string) and ``'points'`` (value of the case).
        model: Object exposing ``generate_command(query)``. Defaults to the
            notebook-level model ``m``, looked up when the function is called.
        delay: Seconds to sleep after each case; ``0`` disables the pause.

    Returns:
        A list with one dict per test case, in input order, with the keys
        ``'query'``, ``'ai_response'``, ``'pass/fail'``, ``'expected'``,
        ``'points'`` and ``'elapsed'``. ``'pass/fail'`` is ``'PASS'``,
        ``'FAIL'`` or ``'ERROR'``; ``'points'`` is the case's points on PASS
        and 0 otherwise; ``'elapsed'`` is the time spent in the model call, in
        seconds rounded to three decimals.

    Raises:
        KeyError: If a test case lacks ``'query'``, ``'expected'`` or
            ``'points'``. Exceptions raised while generating or matching a
            response are not propagated; they produce an ``'ERROR'`` row.
    """
    if model is None:
        model = m

    results = []
    for test_case in test_cases:
        query = test_case['query']
        expected = test_case['expected']
        points = test_case['points']

        # A bare string would otherwise be iterated character by character.
        candidates = [expected] if isinstance(expected, str) else expected

        ai_response = None
        duration = None
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by wall-clock adjustments.
        start_time = time.perf_counter()
        try:
            ai_response = model.generate_command(query)
            duration = time.perf_counter() - start_time

            if isinstance(ai_response, str):
                # Drop quotes wrapped around the whole command before matching.
                ai_response = ai_response.strip('"').strip('\'')
                is_correct = any(
                    _command_tokens_match(str(candidate), ai_response)
                    for candidate in candidates
                )
            else:
                # A non-string reply (e.g. None) cannot match any command.
                is_correct = False
            outcome = 'PASS' if is_correct else 'FAIL'
        except Exception as e:
            print(f"✗ ERROR: {e}")
            # Keep the case in the results so it still counts in the totals.
            outcome = 'ERROR'
            if duration is None:
                duration = time.perf_counter() - start_time

        results.append({
            'query': query,
            'ai_response': ai_response,
            'pass/fail': outcome,
            'expected': expected,
            'points': points if outcome == 'PASS' else 0,
            'elapsed': round(duration, 3)
        })

        # Small pause between consecutive model calls.
        if delay:
            time.sleep(delay)

    return results

🤖 Loading local model: ../models/qwen2.5-coder-3b-instruct-q4_k_m.gguf


llama_context: n_ctx_per_seq (2048) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_set_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_c4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f16                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64 

✅ Local model loaded successfully in 1.90 seconds!
🔥 Warming up model...
✅ Model warmed up in 5.32 seconds!


In [22]:
# Load test cases from JSON file. The encoding is given explicitly so the file
# is read the same way regardless of the platform's default encoding.
with open('./test_data/test_cases_basic.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)
test_cases = test_data['tests']

# Run every test case through the model; one result dict per recorded case.
results = exec_test_cases(test_cases)

# Convert results to pandas dataframe
results_df = pd.DataFrame(results)

# An empty result list yields a frame with no columns, so indexing 'elapsed'
# or 'pass/fail' would raise KeyError; fall back to neutral values instead.
has_results = not results_df.empty
avg_inference_time = results_df['elapsed'].mean() if has_results else float('nan')
pass_percentage = (results_df['pass/fail'] == 'PASS').mean() * 100 if has_results else 0.0

print(f"\nAverage inference time: {avg_inference_time:.3f} seconds")
print(f"PASS percentage: {pass_percentage:.2f}%")

display(results_df)



Average inference time: 30.939 seconds
PASS percentage: 92.11%


Unnamed: 0,query,ai_response,expected,pass/fail,points,elapsed
0,home,cd ~,"[cd ~, cd $HOME]",PASS,3,4.933
1,go home,cd ~,"[cd ~, cd $HOME]",PASS,3,0.585
2,change directory to home,cd ~,"[cd ~, cd $HOME]",PASS,3,0.561
3,change to home directory,cd ~,"[cd ~, cd $HOME]",PASS,3,0.576
4,create a new file called a,touch a,"[touch a, > a]",PASS,3,0.906
5,move file a.txt to b.txt,mv a.txt b.txt,[mv a.txt b.txt],PASS,3,12.713
6,move a.txt to upper directory,mv a.txt ..,[mv a.txt ..],PASS,3,6.706
7,copy a to upper directory,mv a ..,[cp a ..],FAIL,0,0.863
8,list files,ls,"[ls, ls -l, ls -la]",PASS,3,6.597
9,show current directory,pwd,[pwd],PASS,3,8.682
