In [1]:
# verify ollama

import psutil

def check_if_running(process_name):
    """Return True if any running process has `process_name` in its name.

    Args:
        process_name: Substring to look for in each process name
            (the match is case-sensitive).

    Returns:
        bool: True as soon as one matching process is found, False otherwise.
    """
    for proc in psutil.process_iter(['name']):
        name = proc.info['name']
        # psutil fills in None for attributes it could not read (e.g. access
        # denied); `process_name in None` would raise TypeError.
        if name is not None and process_name in name:
            return True

    return False

In [2]:
# Fail fast if no Ollama process is found; later cells query its HTTP API.
ollama_running = check_if_running('ollama')

if not ollama_running:
    raise RuntimeError("Ollama not running.")

# Reuse the result instead of scanning the process table a second time.
print(f'Ollama running: {ollama_running}')

Ollama running: True


In [3]:
import json
from tqdm import tqdm

file_path = 'instruction-data-with-response.json'
# Without an explicit encoding, open() falls back to the platform default,
# which can mis-decode non-ASCII characters in the JSON file.
with open(file_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

def format_input(entry):
    """Format a dataset entry as an instruction prompt.

    Args:
        entry: Mapping with an 'instruction' key and an optional 'input' key.

    Returns:
        str: A fixed preamble followed by an "### Instruction:" section and,
        only when the entry has a non-empty input, an "### Input:" section.

    Raises:
        KeyError: If `entry` has no 'instruction' key.
    """
    instruction_text = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )
    # .get() so entries that omit 'input' are treated like an empty input
    # instead of raising KeyError.
    extra_input = entry.get('input')
    input_text = f"\n\n### Input:\n{extra_input}" if extra_input else ""

    return instruction_text + input_text

In [4]:
# interacting with Ollama API

import urllib.request

def query_model(prompt, model='gemma3:4b', url='http://localhost:11434/api/chat'):
    """Send a single-turn chat request and return the concatenated reply text.

    Args:
        prompt: Content of the single user message.
        model: Model name sent in the request's 'model' field.
        url: Chat endpoint the JSON payload is POSTed to.

    Returns:
        str: The 'message.content' pieces of every JSON line in the response,
        joined in order.

    Raises:
        RuntimeError: If a response line carries an 'error' field.
        urllib.error.URLError: If the request itself fails.
    """
    data = {
        'model': model,
        'messages': [
            {'role': 'user', 'content': prompt}
        ],
        # Fixed seed and zero temperature -- presumably for repeatable output.
        'options': {
            'seed': 123,
            'temperature': 0,
            'num_ctx': 2048,
        }
    }

    payload = json.dumps(data).encode('utf-8')
    request = urllib.request.Request(url, data=payload, method='POST')
    request.add_header('Content-Type', 'application/json')

    chunks = []
    with urllib.request.urlopen(request) as response:
        # The body is consumed as newline-delimited JSON, one object per line.
        for raw_line in response:
            line = raw_line.decode('utf-8').strip()
            if not line:
                # Blank separator lines are not JSON; skip rather than crash.
                continue
            response_json = json.loads(line)
            if 'error' in response_json:
                # Surface server-side failures explicitly instead of as an
                # opaque KeyError on the missing 'message' field.
                raise RuntimeError(
                    f"Ollama returned an error: {response_json['error']}"
                )
            chunks.append(response_json['message']['content'])

    return ''.join(chunks)

In [5]:
# Smoke-test the chat endpoint with a small model and show its raw reply.
model = 'gemma3:270m'
result = query_model(prompt='What do Llamas eat?', model=model)
print(result)

Llamas eat a variety of foods, including:

*   **Meat:** They are carnivores and eat a significant amount of meat.
*   **Poultry:** They eat a variety of poultry, including chicken, turkey, and duck.
*   **Fish:** Llamas are known for their fondness for fish.
*   **Eggs:** Llamas are a popular choice for egg-laying, and they eat a lot of eggs.
*   **Fruits and Vegetables:** Llamas enjoy eating a variety of fruits and vegetables.
*   **Other:** They also eat other foods, such as nuts, seeds, and grains.


In [11]:
# evaluate response via another LLM

for sample in test_data[:3]:
    # Gather the three pieces the judge prompt is built from.
    task_text = format_input(sample)
    reference = sample['output']
    candidate = sample['model_response']

    judge_prompt = (
        f"Given the input `{task_text}` "
        f"and correct output `{reference}`, "
        f"score the model response `{candidate}` "
        "on a scale from 0 to 100, where 100 is the best score. "
    )

    print('\nDataset response:')
    print(f'>> {reference}')
    print('\nModel response:')
    print(f'>> {candidate}')
    print('\nScore:')
    # use a much smaller LM compared to llama3 8b
    judge_reply = query_model(judge_prompt, 'gemma3:270m')
    print(f'>> {judge_reply}')
    print('\n' + '-' * 50)


Dataset response:
>> The car is as fast as lightning.

Model response:
>> The car is very fast.

Score:
>> Okay, I understand. I will complete the task and provide a response that is appropriately similar to the input.


--------------------------------------------------

Dataset response:
>> The type of cloud typically associated with thunderstorms is cumulonimbus.

Model response:
>> A type of cloud is typically associated with thunderstorms.

Score:
>> Okay, I understand.


--------------------------------------------------

Dataset response:
>> Jane Austen.

Model response:
>> The author of 'Pride and Prejudice' is William Shakespeare.

Score:
>> Okay, I understand.


--------------------------------------------------


In [13]:
def generate_model_scores(json_data, json_key, model='gemma3:4b'):
    """Ask a judge model to score each entry's response on a 0-100 scale.

    Args:
        json_data: Iterable of entries; each needs an 'output' key, the
            `json_key` key, and whatever format_input() reads.
        json_key: Key of the response to be scored in each entry.
        model: Model name passed on to query_model().

    Returns:
        list[int]: The replies that parsed as integers within 0-100. Entries
        whose reply could not be used are reported via print() and skipped,
        so the list may be shorter than `json_data`.
    """
    scores = []
    for entry in tqdm(json_data, desc='Scoring entries'):
        prompt = (
            f"Given the input `{format_input(entry)}` "
            f"and correct output `{entry['output']}`, "
            f"score the model response `{entry[json_key]}` "
            f"on a scale from 0 to 100, where 100 is the best score. "
            f"Respond with the integer number only."
        )
        reply = query_model(prompt, model)
        try:
            score = int(reply)
        except ValueError:
            print(f'Could not convert score: {reply}')
            continue
        # The prompt asks for 0-100; any other integer is a malformed reply
        # and would distort the average computed from these scores.
        if not 0 <= score <= 100:
            print(f'Score out of range (0-100): {score}')
            continue
        scores.append(score)
    return scores

In [14]:
scores = generate_model_scores(test_data, 'model_response', model='gemma3:270m')
print(f'{len(scores)} scores out of {len(test_data)}')
# The scorer skips replies it cannot parse, so `scores` may be empty;
# guard the division instead of raising ZeroDivisionError.
if scores:
    print(f'Average: {sum(scores) / len(scores):.2f}')
else:
    print('Average: n/a (no valid scores)')

Scoring entries:   1%|          | 1/110 [00:06<11:46,  6.48s/it]

Could not convert score: The car is as fast as lightning.



Scoring entries:   5%|▌         | 6/110 [00:19<04:57,  2.86s/it]

Could not convert score: The lecture was delivered clearly.



Scoring entries:   6%|▋         | 7/110 [00:22<04:40,  2.73s/it]

Could not convert score: 


Scoring entries:  12%|█▏        | 13/110 [00:38<04:32,  2.81s/it]

Could not convert score: The type of sentence is interrogative.



Scoring entries:  15%|█▌        | 17/110 [00:50<04:42,  3.04s/it]

Could not convert score: 3 kilometers is approximately 3.5 meters.



Scoring entries:  16%|█▋        | 18/110 [00:53<04:30,  2.94s/it]

Could not convert score: A note was left by someone.



Scoring entries:  17%|█▋        | 19/110 [00:56<04:31,  2.99s/it]

Could not convert score: A synonym for 'excited' is 'thrilled'.



Scoring entries:  18%|█▊        | 20/110 [00:59<04:26,  2.96s/it]

Could not convert score: Never have I ever traveled without a map.



Scoring entries:  19%|█▉        | 21/110 [01:02<04:22,  2.95s/it]

Could not convert score: The correct adjective from the list is 'tall'.



Scoring entries:  20%|██        | 22/110 [01:05<04:16,  2.92s/it]

Could not convert score: 1000 grams is approximately 5.5 kilograms.



Scoring entries:  22%|██▏       | 24/110 [01:11<04:31,  3.16s/it]

Could not convert score: 


Scoring entries:  25%|██▌       | 28/110 [01:22<03:53,  2.85s/it]

Could not convert score: Kinetic energy is the energy that an object possesses due to its motion.



Scoring entries:  27%|██▋       | 30/110 [01:27<03:38,  2.73s/it]

Could not convert score: 5 miles is approximately 8.05 kilometers.



Scoring entries:  29%|██▉       | 32/110 [01:33<03:30,  2.70s/it]

Could not convert score: It's a piece of cake.



Scoring entries:  30%|███       | 33/110 [01:36<03:38,  2.83s/it]

Could not convert score: 1. Carrot
2. Broccoli
3. Cucumber
4. Tomato
5. Spinach



Scoring entries:  31%|███       | 34/110 [01:39<03:32,  2.79s/it]

Could not convert score: 7 kilometers is 7000 meters.



Scoring entries:  35%|███▍      | 38/110 [01:49<03:18,  2.76s/it]

Could not convert score: The plants were watered by the gardener.



Scoring entries:  36%|███▋      | 40/110 [01:55<03:20,  2.87s/it]

Could not convert score: A sonnet is a type of computer that is used to communicate with other computers.



Scoring entries:  40%|████      | 44/110 [02:06<03:05,  2.80s/it]

Could not convert score: A past tense verb that describes a person laughing is 'laugh'.



Scoring entries:  44%|████▎     | 48/110 [02:17<02:47,  2.71s/it]

Could not convert score: Exclamation.



Scoring entries:  47%|████▋     | 52/110 [02:27<02:30,  2.60s/it]

Could not convert score: 


Scoring entries:  48%|████▊     | 53/110 [02:30<02:36,  2.75s/it]

Could not convert score: Vehicles: Bicycle
Plants: Rose
Animals: Tiger



Scoring entries:  52%|█████▏    | 57/110 [02:41<02:24,  2.74s/it]

Could not convert score: He will be reading a novel inspired by his grandmother.



Scoring entries:  53%|█████▎    | 58/110 [02:44<02:21,  2.73s/it]

Could not convert score: The law was passed by the government.



Scoring entries:  55%|█████▍    | 60/110 [02:49<02:14,  2.69s/it]

Could not convert score: Opinion-based.



Scoring entries:  56%|█████▋    | 62/110 [02:55<02:14,  2.81s/it]

Could not convert score: A synonym for 'hardworking' is 'diligent'.



Scoring entries:  60%|██████    | 66/110 [03:06<02:02,  2.79s/it]

Could not convert score: 3) Water, 2) Water, 1) Water.



Scoring entries:  61%|██████    | 67/110 [03:09<01:59,  2.79s/it]

Could not convert score: The dog chased the cat.



Scoring entries:  64%|██████▎   | 70/110 [03:22<02:27,  3.68s/it]

Could not convert score: A synonym for 'sad' is 'unhappy'.



Scoring entries:  65%|██████▍   | 71/110 [03:25<02:13,  3.42s/it]

Could not convert score: I prefer homemade cookies to store bought.



Scoring entries:  66%|██████▋   | 73/110 [03:30<01:53,  3.06s/it]

Could not convert score: Italian is a language of the Italian peninsula.



Scoring entries:  67%|██████▋   | 74/110 [03:33<01:44,  2.90s/it]

Could not convert score: Technical document



Scoring entries:  68%|██████▊   | 75/110 [13:19<1:43:50, 178.01s/it]

Could not convert score: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Scoring entries:  72%|███████▏  | 79/110 [13:30<23:09, 44.82s/it]   

Could not convert score: The density of the object is 3 grams per cubic centimeter.



Scoring entries:  76%|███████▋  | 84/110 [13:42<04:07,  9.51s/it]

Could not convert score: He remained very calm.



Scoring entries:  77%|███████▋  | 85/110 [13:45<03:04,  7.40s/it]

Could not convert score: The main verb in the sentence is 'barked'.



Scoring entries:  79%|███████▉  | 87/110 [13:49<01:50,  4.81s/it]

Could not convert score: The store was closed.



Scoring entries:  83%|████████▎ | 91/110 [13:59<00:55,  2.90s/it]

Could not convert score: The food was good.



Scoring entries:  85%|████████▍ | 93/110 [14:03<00:45,  2.65s/it]

Could not convert score: 1. Vitamin A
2. Vitamin C
3. Vitamin D


Scoring entries:  87%|████████▋ | 96/110 [14:10<00:34,  2.45s/it]

Could not convert score: My name is Jean-François.



Scoring entries:  88%|████████▊ | 97/110 [14:13<00:31,  2.46s/it]

Could not convert score: 200 centimeters is approximately 7.5 meters.



Scoring entries:  92%|█████████▏| 101/110 [14:22<00:21,  2.35s/it]

Could not convert score: He is a generous man.



Scoring entries:  94%|█████████▎| 103/110 [14:27<00:17,  2.55s/it]

Could not convert score: A neuron consists of three main parts: the cell body, which contains the nucleus; dendrites, which receive signals from other neurons; and an axon, which transmits signals to other neurons, muscles, or glands.



Scoring entries:  98%|█████████▊| 108/110 [14:40<00:04,  2.41s/it]

Could not convert score: 


Scoring entries:  99%|█████████▉| 109/110 [14:42<00:02,  2.39s/it]

Could not convert score: She never forgets to call.



Scoring entries: 100%|██████████| 110/110 [14:45<00:00,  8.05s/it]

Could not convert score: 50 miles per hour is approximately 80.47 kilometers per hour.

64 scores out of 110
Average: 95.80



