In [1]:
import json
import dspy


with open('gen-ai-ucu-2024-task-3/zno.train.jsonl', 'r') as json_file:
    json_list = list(json_file)

all_questions = []
for json_str in json_list:
    result = json.loads(json_str)
    all_questions.append(result)

all_examples = []
for sample in all_questions:
    example = dspy.Example(
        question=sample['question'],
        options=sample['answers'],
        correct_answer = sample['correct_answers'][0]
    ).with_inputs("question", "options")
    all_examples.append(example)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
## load for creating submission
with open('gen-ai-ucu-2024-task-3/zno.test.jsonl', 'r') as json_file:
    json_list = list(json_file)

sumbission_all_questions = []
for json_str in json_list:
    result = json.loads(json_str)
    sumbission_all_questions.append(result)

sumbission_all_examples = []
for sample in sumbission_all_questions:
    example = dspy.Example(
        question=sample['question'],
        options=sample['answers'],
    ).with_inputs("question", "options")
    sumbission_all_examples.append(example)

In [2]:
train_set, test_set = all_examples[int(len(all_questions)*0.2):], all_examples[:int(len(all_questions)*0.2)]

In [None]:
#U need to launch vllm 

# vllm serve unsloth/Meta-Llama-3.1-8B-bnb-4bit --dtype float16 --gpu-memory-utilization 0.8 --max-model-len 13500 --quantization bitsandbytes --load-format bitsandbytes --served-model-name meta-llama/Llama-3.1-8B --trust-remote-code --chat-template llama_3.1_template.jinja

#vllm serve unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit --dtype float16 --gpu-memory-utilization 0.8 --max-model-len 13500 --quantization bitsandbytes --load-format bitsandbytes --served-model-name meta-llama/Llama-3.1-8B-Instruct --trust-remote-code

# vllm serve unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit \
#  --dtype float16 --gpu-memory-utilization 0.8 --max-model-len 13500 --quantization bitsandbytes \
#  --load-format bitsandbytes --served-model-name meta-llama/Llama-3.1-8B-Instruct --trust-remote-code \
#  --enable-lora \
#  --lora-modules mylora120steps=./lora_adapter_llama8b_120


# vllm serve unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit \
#  --dtype float16 --gpu-memory-utilization 0.8 --max-model-len 13500 --quantization bitsandbytes \
#  --load-format bitsandbytes --served-model-name meta-llama/Llama-3.1-8B-Instruct --trust-remote-code \
#  --enable-lora \
#  --lora-modules mylora3epochs=./lora_adapter_llama8b_30epochs

# vllm serve meta-llama/Llama-2-7b-hf \
# vllm serve unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit  --dtype float16 --gpu-memory-utilization 0.8 --max-model-len 13500 --quantization bitsandbytes  --load-format bitsandbytes --served-model-name meta-llama/Llama-3.1-8B-Instruct --trust-remote-code  --enable-lora  --lora-modules raglora_4=./lorai_adapter_llama8b_5epochs
   

In [5]:
lm = dspy.LM('openai/lora_increased',api_base="http://localhost:8000/v1",  # ensure this points to your port
             api_key="local", model_type='chat')
dspy.configure(lm=lm)

In [22]:
lm("Hello new")

["Hello! It's nice to meet you. Is there something I can help you with or would you like to chat?"]

In [None]:
from typing import Literal

class Categorize(dspy.Signature):
    """Solve exam problem."""

    question: str = dspy.InputField()
    options: list[dict[str,str]] = dspy.InputField()
    correct_marker: str = dspy.OutputField()

classify = dspy.Predict(Categorize, False,cache=False)

#loading optimized prompt
classify.load("BootstrapFewShotWithRandomSearch_program_15_candidates_llama.json")

# Here is how we call this module
classification = classify(question = all_examples[-1].question, options = all_examples[-1].options, cache=False)

In [24]:
classification.correct_marker

'А'

In [25]:
from tqdm import tqdm

classifications = []
correct_answers = []
for sample in tqdm(test_set):
    classification = classify(question = sample.question, options = sample.options)
    classifications.append(classification.correct_marker)
    correct_answers.append(sample.correct_answer)

100%|██████████| 612/612 [04:56<00:00,  2.06it/s]


In [26]:
allowed_answers = ['А', 'Б', 'В', 'Г', 'Д', '1', '2', '3', '4', '5', '6', '7', '8', '9']

def post_process_sample(prediction:str) -> str:
    found_answer = 'А'  # Default value

    for allowed_answer in allowed_answers:
        if allowed_answer in prediction:
            found_answer = allowed_answer
            # print(f"answer {found_answer} for {classification}")
            break
    return found_answer

post_processed = []

for classification in classifications:
    found_answer = post_process_sample(classification)
    post_processed.append(found_answer)

In [28]:
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np

def evaluate_model(predictions, ground_truth):
    """
    Simple evaluation of classification model.
    
    Parameters:
    predictions (list): Model's predicted answers
    ground_truth (list): Correct answers
    """
    # Calculate accuracy
    accuracy = accuracy_score(ground_truth, predictions)
    
    # Get unique options
    unique_options = sorted(list(set(predictions + ground_truth)))
    
    # Create confusion matrix
    cm = confusion_matrix(ground_truth, predictions, labels=unique_options)
    
    # Print results
    print(f"Accuracy: {accuracy:.4f}")
    print(f"\nConfusion Matrix:")
    print("Options:", unique_options)
    print(cm)

## Evaluation results

In [29]:
# after training 5 epochs increased lora
evaluate_model(classifications,correct_answers)

Accuracy: 0.4346

Confusion Matrix:
Options: ['1', '4', 'А', 'Б', 'В', 'Г', 'Д']
[[ 0  0  0  0  1  0  0]
 [ 0  0  0  1  0  0  0]
 [ 0  0 62 16  9 14 11]
 [ 0  0 42 72 20 12 19]
 [ 0  0 34 20 52 17 20]
 [ 0  0 38 16 17 51  9]
 [ 0  0 12  7  4  7 29]]


In [30]:
# after training 5 epochs increased lora
evaluate_model(post_processed,correct_answers)

Accuracy: 0.4346

Confusion Matrix:
Options: ['1', '4', 'А', 'Б', 'В', 'Г', 'Д']
[[ 0  0  0  0  1  0  0]
 [ 0  0  0  1  0  0  0]
 [ 0  0 62 16  9 14 11]
 [ 0  0 42 72 20 12 19]
 [ 0  0 34 20 52 17 20]
 [ 0  0 38 16 17 51  9]
 [ 0  0 12  7  4  7 29]]


In [17]:
# after training 3.25 epochs
evaluate_model(classifications,correct_answers)

Accuracy: 0.4003

Confusion Matrix:
Options: ['1', '4', 'А', 'Б', 'В', 'Г', 'Д']
[[ 0  0  0  0  1  0  0]
 [ 0  0  0  0  1  0  0]
 [ 0  0 40 16 29 15 12]
 [ 0  0 18 56 36 27 28]
 [ 0  0 16 22 64 17 24]
 [ 0  0 12 12 33 52 22]
 [ 0  0  5  6 12  3 33]]


In [18]:
# after training 3.25 epochs
evaluate_model(post_processed,correct_answers)

Accuracy: 0.4003

Confusion Matrix:
Options: ['1', '4', 'А', 'Б', 'В', 'Г', 'Д']
[[ 0  0  0  0  1  0  0]
 [ 0  0  0  0  1  0  0]
 [ 0  0 40 16 29 15 12]
 [ 0  0 18 56 36 27 28]
 [ 0  0 16 22 64 17 24]
 [ 0  0 12 12 33 52 22]
 [ 0  0  5  6 12  3 33]]


In [35]:
# after training 120 steps
evaluate_model(classifications,correct_answers)

Accuracy: 0.3121

Confusion Matrix:
Options: ['1', '4', 'А', 'Б', 'В', 'Г', 'Д']
[[ 1  0  0  0  0  0  0]
 [ 0  0  0  0  1  0  0]
 [ 0  0 82 13  6  9  2]
 [ 0  0 88 48  8 15  6]
 [ 0  0 79 21 23 11  9]
 [ 0  0 68 17 10 29  7]
 [ 0  0 38  9  0  4  8]]


In [36]:
# after training 120 steps
evaluate_model(post_processed,correct_answers)

Accuracy: 0.3121

Confusion Matrix:
Options: ['1', '4', 'А', 'Б', 'В', 'Г', 'Д']
[[ 1  0  0  0  0  0  0]
 [ 0  0  0  0  1  0  0]
 [ 0  0 82 13  6  9  2]
 [ 0  0 88 48  8 15  6]
 [ 0  0 79 21 23 11  9]
 [ 0  0 68 17 10 29  7]
 [ 0  0 38  9  0  4  8]]


In [39]:
evaluate_model(classifications,correct_answers)

Accuracy: 0.2565

Confusion Matrix:
Options: ['"А"', '"Б"', '"В"', '(1)Уміння радіти чужим успіхам мірило шляхетності. (2)Якщо людину підтримати похвалити підкреслити щось хороше в ній вона намагається дорівнювати уявленню яке про неї склалося. (3)Ось чому так важливо з дитинства культивувати в людині почуття власної гідності відчуття певної „собівартості” аби згодом не склався комплекс меншовартості. (4)Недарма кажуть Якщо казати людині постійно що вона свиня то згодом вона зарохкає. (5)Не ввічливо кидати компліменти малознайомим або незнайомим людям. (6)Краще зачекати познайомившись ближче чи навіть заприятелювавши з ними. (7)Але пам’ятаймо що слова похвали потрібні кожному.', '1', '4', '[]', 'А', 'А, Б, В, Г, Д', 'А-1, Б-2, В-3, Г-4, Д-5', 'АВГД', 'Б', 'В', 'Василеві Стусу', 'Г', 'Д']
[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  

In [61]:
evaluate_model(post_processed,correct_answers)

Accuracy: 0.2712

Confusion Matrix:
Options: ['1', '4', 'А', 'Б', 'В', 'Г', 'Д']
[[ 0  0  1  0  0  0  0]
 [ 0  0  1  0  0  0  0]
 [ 0  0 16 17 67 11  1]
 [ 0  0 17 39 90 14  5]
 [ 0  0 14 24 88 15  2]
 [ 0  0 10 23 77 20  1]
 [ 0  0  6  6 39  5  3]]


## DSPY OPTIMIZATION

In [None]:
def validate_category(example, prediction, trace=None):
    return prediction.category == example.category

In [67]:
def accuracy_metric(gold, pred, trace=None):
    post_processed = post_process_sample(pred.correct_marker)
    return post_processed == gold.correct_answer

In [69]:
from dspy.evaluate import Evaluate
evaluate = Evaluate(
    devset=test_set,
    metric=accuracy_metric,
    num_threads=4,
    display_progress=True,
    display_table=0,
    max_errors=1,
    failure_score = 0
)

# zero-shot evaluation on examples
score,outputs = evaluate(dspy.Predict(Categorize, False),return_outputs=True)

Average Metric: 166.00 / 612 (27.1%): 100%|██████████| 612/612 [00:00<00:00, 1536.86it/s]

2025/01/11 19:00:23 INFO dspy.evaluate.evaluate: Average Metric: 166 / 612 (27.1%)





In [71]:
smaller_train_set = train_set[:451]

In [72]:
len(smaller_train_set)

451

In [73]:
from dspy.teleprompt import (
    BootstrapFewShot,
    BootstrapFewShotWithRandomSearch,
    KNNFewShot,
    MIPROv2,
)

In [74]:
config = {
    "max_bootstrapped_demos": 1,
    "max_labeled_demos": 1,
    "num_candidate_programs": 15,
    "num_threads": 4,
}

optimizer = BootstrapFewShotWithRandomSearch(metric=accuracy_metric, **config)
optimized_program = optimizer.compile(dspy.Predict(Categorize, False), trainset=smaller_train_set,valset=test_set)
score = evaluate(optimized_program)

try:
    optimized_program.save("BootstrapFewShotWithRandomSearch_program_15_candidates_llama.json")
except:
    print("Error saving")
# score for BootstrapFewShotWithRandomSearch evaluation with `ner_metric` function
print(f"BootstrapFewShotWithRandomSearch score : {score}")

Going to sample between 1 and 1 traces per predictor.
Will attempt to bootstrap 15 candidate sets.
Average Metric: 166.00 / 612 (27.1%): 100%|██████████| 612/612 [00:00<00:00, 1195.00it/s]

2025/01/11 19:04:18 INFO dspy.evaluate.evaluate: Average Metric: 166 / 612 (27.1%)



New best score: 27.12 for seed -3
Scores so far: [27.12]
Best score so far: 27.12
Average Metric: 166.00 / 612 (27.1%): 100%|██████████| 612/612 [00:00<00:00, 1796.05it/s]

2025/01/11 19:04:19 INFO dspy.evaluate.evaluate: Average Metric: 166 / 612 (27.1%)



Scores so far: [27.12, 27.12]
Best score so far: 27.12


  1%|          | 5/451 [00:00<00:00, 1594.31it/s]

Bootstrapped 1 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.





Average Metric: 189.00 / 612 (30.9%): 100%|██████████| 612/612 [04:37<00:00,  2.21it/s]

2025/01/11 19:08:56 INFO dspy.evaluate.evaluate: Average Metric: 189 / 612 (30.9%)



New best score: 30.88 for seed -1
Scores so far: [27.12, 27.12, 30.88]
Best score so far: 30.88


  0%|          | 1/451 [00:00<02:51,  2.62it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Average Metric: 189.00 / 612 (30.9%): 100%|██████████| 612/612 [04:21<00:00,  2.34it/s]

2025/01/11 19:13:18 INFO dspy.evaluate.evaluate: Average Metric: 189 / 612 (30.9%)



Scores so far: [27.12, 27.12, 30.88, 30.88]
Best score so far: 30.88


  1%|▏         | 6/451 [00:02<03:34,  2.08it/s]


Bootstrapped 1 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.
Average Metric: 193.00 / 612 (31.5%): 100%|██████████| 612/612 [04:30<00:00,  2.26it/s]

2025/01/11 19:17:52 INFO dspy.evaluate.evaluate: Average Metric: 193 / 612 (31.5%)



New best score: 31.54 for seed 1
Scores so far: [27.12, 27.12, 30.88, 30.88, 31.54]
Best score so far: 31.54


  1%|          | 5/451 [00:01<02:43,  2.72it/s]


Bootstrapped 1 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Average Metric: 193.00 / 612 (31.5%): 100%|██████████| 612/612 [04:15<00:00,  2.40it/s]

2025/01/11 19:22:09 INFO dspy.evaluate.evaluate: Average Metric: 193 / 612 (31.5%)



Scores so far: [27.12, 27.12, 30.88, 30.88, 31.54, 31.54]
Best score so far: 31.54


  0%|          | 2/451 [00:00<02:44,  2.72it/s]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Average Metric: 186.00 / 612 (30.4%): 100%|██████████| 612/612 [04:14<00:00,  2.41it/s]

2025/01/11 19:26:24 INFO dspy.evaluate.evaluate: Average Metric: 186 / 612 (30.4%)



Scores so far: [27.12, 27.12, 30.88, 30.88, 31.54, 31.54, 30.39]
Best score so far: 31.54


  0%|          | 1/451 [00:00<03:00,  2.50it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Average Metric: 199.00 / 612 (32.5%): 100%|██████████| 612/612 [04:08<00:00,  2.47it/s]

2025/01/11 19:30:33 INFO dspy.evaluate.evaluate: Average Metric: 199 / 612 (32.5%)



New best score: 32.52 for seed 4
Scores so far: [27.12, 27.12, 30.88, 30.88, 31.54, 31.54, 30.39, 32.52]
Best score so far: 32.52


  0%|          | 2/451 [00:02<09:35,  1.28s/it]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Average Metric: 203.00 / 612 (33.2%): 100%|██████████| 612/612 [04:28<00:00,  2.28it/s]

2025/01/11 19:35:04 INFO dspy.evaluate.evaluate: Average Metric: 203 / 612 (33.2%)



New best score: 33.17 for seed 5
Scores so far: [27.12, 27.12, 30.88, 30.88, 31.54, 31.54, 30.39, 32.52, 33.17]
Best score so far: 33.17


  0%|          | 2/451 [00:00<02:48,  2.66it/s]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Average Metric: 200.00 / 612 (32.7%): 100%|██████████| 612/612 [04:55<00:00,  2.07it/s]

2025/01/11 19:39:59 INFO dspy.evaluate.evaluate: Average Metric: 200 / 612 (32.7%)



Scores so far: [27.12, 27.12, 30.88, 30.88, 31.54, 31.54, 30.39, 32.52, 33.17, 32.68]
Best score so far: 33.17


  1%|          | 5/451 [00:01<02:40,  2.77it/s]


Bootstrapped 1 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Average Metric: 192.00 / 612 (31.4%): 100%|██████████| 612/612 [04:25<00:00,  2.31it/s]

2025/01/11 19:44:26 INFO dspy.evaluate.evaluate: Average Metric: 192 / 612 (31.4%)



Scores so far: [27.12, 27.12, 30.88, 30.88, 31.54, 31.54, 30.39, 32.52, 33.17, 32.68, 31.37]
Best score so far: 33.17


  2%|▏         | 11/451 [00:04<03:17,  2.23it/s]


Bootstrapped 1 full traces after 11 examples for up to 1 rounds, amounting to 11 attempts.
Average Metric: 195.00 / 612 (31.9%): 100%|██████████| 612/612 [04:20<00:00,  2.35it/s]

2025/01/11 19:48:52 INFO dspy.evaluate.evaluate: Average Metric: 195 / 612 (31.9%)



Scores so far: [27.12, 27.12, 30.88, 30.88, 31.54, 31.54, 30.39, 32.52, 33.17, 32.68, 31.37, 31.86]
Best score so far: 33.17


  0%|          | 1/451 [00:00<02:51,  2.62it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Average Metric: 188.00 / 612 (30.7%): 100%|██████████| 612/612 [04:16<00:00,  2.39it/s]

2025/01/11 19:53:08 INFO dspy.evaluate.evaluate: Average Metric: 188 / 612 (30.7%)



Scores so far: [27.12, 27.12, 30.88, 30.88, 31.54, 31.54, 30.39, 32.52, 33.17, 32.68, 31.37, 31.86, 30.72]
Best score so far: 33.17


  0%|          | 1/451 [00:00<02:39,  2.83it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Average Metric: 185.00 / 612 (30.2%): 100%|██████████| 612/612 [04:19<00:00,  2.36it/s]

2025/01/11 19:57:28 INFO dspy.evaluate.evaluate: Average Metric: 185 / 612 (30.2%)



Scores so far: [27.12, 27.12, 30.88, 30.88, 31.54, 31.54, 30.39, 32.52, 33.17, 32.68, 31.37, 31.86, 30.72, 30.23]
Best score so far: 33.17


  0%|          | 1/451 [00:00<02:46,  2.70it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Average Metric: 186.00 / 612 (30.4%): 100%|██████████| 612/612 [04:18<00:00,  2.37it/s]

2025/01/11 20:01:47 INFO dspy.evaluate.evaluate: Average Metric: 186 / 612 (30.4%)



Scores so far: [27.12, 27.12, 30.88, 30.88, 31.54, 31.54, 30.39, 32.52, 33.17, 32.68, 31.37, 31.86, 30.72, 30.23, 30.39]
Best score so far: 33.17


  1%|          | 4/451 [00:01<02:48,  2.65it/s]


Bootstrapped 1 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Average Metric: 182.00 / 612 (29.7%): 100%|██████████| 612/612 [04:31<00:00,  2.26it/s]

2025/01/11 20:06:20 INFO dspy.evaluate.evaluate: Average Metric: 182 / 612 (29.7%)



Scores so far: [27.12, 27.12, 30.88, 30.88, 31.54, 31.54, 30.39, 32.52, 33.17, 32.68, 31.37, 31.86, 30.72, 30.23, 30.39, 29.74]
Best score so far: 33.17


  0%|          | 1/451 [00:00<00:00, 1343.04it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Average Metric: 199.00 / 612 (32.5%): 100%|██████████| 612/612 [00:00<00:00, 1081.71it/s]

2025/01/11 20:06:20 INFO dspy.evaluate.evaluate: Average Metric: 199 / 612 (32.5%)



Scores so far: [27.12, 27.12, 30.88, 30.88, 31.54, 31.54, 30.39, 32.52, 33.17, 32.68, 31.37, 31.86, 30.72, 30.23, 30.39, 29.74, 32.52]
Best score so far: 33.17


  0%|          | 1/451 [00:00<02:41,  2.78it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Average Metric: 195.00 / 612 (31.9%): 100%|██████████| 612/612 [04:19<00:00,  2.36it/s]

2025/01/11 20:10:40 INFO dspy.evaluate.evaluate: Average Metric: 195 / 612 (31.9%)



Scores so far: [27.12, 27.12, 30.88, 30.88, 31.54, 31.54, 30.39, 32.52, 33.17, 32.68, 31.37, 31.86, 30.72, 30.23, 30.39, 29.74, 32.52, 31.86]
Best score so far: 33.17
18 candidate programs found.
Average Metric: 203.00 / 612 (33.2%): 100%|██████████| 612/612 [00:00<00:00, 827.84it/s]


2025/01/11 20:10:41 INFO dspy.evaluate.evaluate: Average Metric: 203 / 612 (33.2%)


BootstrapFewShotWithRandomSearch score : 33.17


## Submission

In [10]:
from tqdm import tqdm
classifications = []
for sample in tqdm(sumbission_all_examples):
    classification = classify(question = sample.question, options = sample.options)
    classifications.append(classification.correct_marker)

100%|██████████| 751/751 [05:41<00:00,  2.20it/s]


In [15]:
import pandas as pd

pd.DataFrame({'id':list(range(751)), "correct_answers":classifications}).to_csv("submission_1.csv",index=False)