In [1]:
!wget https://raw.githubusercontent.com/wanadzhar913/aitinkerers-hackathon-supa-team-werecooked/refs/heads/master/datasets/for_presentation/boolq-eng-val-200.jsonl -q
!wget https://raw.githubusercontent.com/wanadzhar913/aitinkerers-hackathon-supa-team-werecooked/refs/heads/master/datasets/for_presentation/boolq-malay-val-200.jsonl -q
!wget https://raw.githubusercontent.com/wanadzhar913/aitinkerers-hackathon-supa-team-werecooked/refs/heads/master/datasets/for_presentation/fib-eng-val-200.jsonl -q
!wget https://raw.githubusercontent.com/wanadzhar913/aitinkerers-hackathon-supa-team-werecooked/refs/heads/master/datasets/for_presentation/fib-malay-val-200.jsonl -q

In [40]:
!pip install weave flash_attn accelerate bitsandbytes -U -q

In [61]:
import re
import json
from glob import glob
from typing import Dict

import weave
from tqdm.notebook import tqdm

import torch
from transformers import AutoTokenizer, DebertaV2ForSequenceClassification , AutoModelForSequenceClassification , \
                         BitsAndBytesConfig, pipeline

In [62]:
PROJECT_NAME = 'benchmark_malaysian_deberta_llmasajudge_v2'

weave.init(PROJECT_NAME)

Logged in as Weights & Biases user: naqibasri.
View Weave data at https://wandb.ai/naqibasri/benchmark_malaysian_deberta_llmasajudge_v2/weave


<weave.trace.weave_client.WeaveClient at 0x7fca62dd6500>

In [63]:
!nvidia-smi

Mon Oct 21 14:58:59 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   75C    P0              32W /  70W |    329MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### 1.0 Load models and dataset

In [64]:
dataset_list = glob('*.jsonl')
dataset_list

['boolq-eng-val-200.jsonl',
 'fib-eng-val-200.jsonl',
 'boolq-malay-val-200.jsonl',
 'fib-malay-val-200.jsonl']

In [65]:
# construct Malay + English dataset
data_all = []

for k in dataset_list:
    with open(k) as fopen:
        for d in tqdm(fopen):
            d = json.loads(d)
            data_all.append(d)

print(f'Size of dataset: {len(data_all)}')

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Size of dataset: 800


In [67]:
data_all[0]

{'question': 'does ethanol take more energy make that produces',
 'answer': 0,
 'passage': "All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a separat

In [68]:
# construct Malay + English dataset
data_malay = []

for k in dataset_list:
    if 'malay' in k:
        with open(k) as fopen:
            for d in tqdm(fopen):
                d = json.loads(d)
                data_malay.append(d)

print(f'Size of dataset: {len(data_malay)}')

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Size of dataset: 400


In [69]:
TORCH_DTYPE = 'bfloat16'

nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=getattr(torch, TORCH_DTYPE)
)

In [71]:
tokenizer = AutoTokenizer.from_pretrained('wanadzhar913/malaysian-debertav2-finetune-on-boolq')
model = DebertaV2ForSequenceClassification.from_pretrained(
    'wanadzhar913/malaysian-debertav2-finetune-on-boolq',
    quantization_config = nf4_config
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


### 2.0 Create scoring metrics and Classes

In [99]:
pipe = pipeline(
    "text-classification",
    tokenizer = tokenizer,
    model=model,
    padding=True
)

In [104]:
@weave.op()
def call_llm(passage: str, question: str) -> str:
    """Function to call the LLM and generate output"""
    x = pipe(
        (question,
        passage)
    )
    return 1 if x['label'] == 'entailment' else 0


In [105]:
call_llm('Susu minum ayam','Kucing makan air')

🍩 https://wandb.ai/naqibasri/benchmark_malaysian_deberta_llmasajudge_v2/r/call/0192afa7-37fe-72a1-aa8a-df1552079f08


1

In [106]:
class MalaysianMistralAsAJudge(weave.Model):

    @weave.op
    def predict(self, passage:str, question:str):
        return call_llm(passage=passage, question=question)

The below are scoring metrics we'll be using to evaluate the LLM's outputs.

In [107]:
def accuracy(model_output, answer):
    return {"accuracy": model_output == answer}

In [108]:
class BinaryMetrics(weave.Scorer):
    class_name: str
    eps: float = 1e-8

    @weave.op()
    def summarize(self, score_rows) -> dict:
        # filter out None rows, model may error out sometimes...
        score_rows = [score for score in score_rows if score["correct"] is not None]
        # Compute f1, precision, recall
        tp = sum([not score["negative"] and score["correct"] for score in score_rows])
        fp = sum([not score["negative"] and not score["correct"] for score in score_rows])
        fn = sum([score["negative"] and not score["correct"] for score in score_rows])
        precision = tp / (tp + fp + self.eps)
        recall = tp / (tp + fn + self.eps)
        f1 = 2 * precision * recall / (precision + recall + self.eps)
        result = {"f1": f1, "precision": precision, "recall": recall}
        return result

    @weave.op()
    def score(self, answer: dict, model_output: dict|str) -> dict:

        result = {
            "correct": model_output == answer,
            "negative": not model_output,
        }
        return result

F1 = BinaryMetrics(class_name="consistency")

### 3.0 Run evaluations

In [109]:
# Define prompt_v1
prompt_v1 = """Anda adalah pakar dalam mengesan ketidakkonsistenan fakta dan halusinasi. Anda akan diberi satu dokumen dan satu soalan. Baca
dokumen dan soalan/kenyataan yang diberikan dengan teliti dan kenal pasti Ketidakkonsistenan Fakta (iaitu mana-mana soalan/kenyataan yang
tidak disokong atau bercanggah dengan maklumat dalam dokumen).

### Anda perlu memilih antara dua pilihan berikut:
- Tidak Konsisten dengan Fakta: Jika mana-mana soalan/kenyataan tidak disokong, terjawab atau bercanggah dengan dokumen, labelkannya sebagai 0.
- Konsisten dengan Fakta: Jika semua soalan/kenyataan disokong/terjawab oleh dokumen, labelkannya sebagai 1.

### Sebagai contoh:
Dokumen: "Gajah adalah mamalia besar yang biasanya ditemui di Afrika dan Asia. Mereka hidup dalam kumpulan yang dikenali sebagai kawanan dan terkenal kerana mempunyai ingatan yang baik."

Soalan/Kenyataan: "Gajah adalah mamalia besar yang biasanya ditemui di Eropah."
Jawapan: {{'consistency': 0}}

Soalan/Kenyataan: "Gajah adalah mamalia besar yang biasanya ditemui di Afrika dan Asia."
Jawapan: {{'consistency': 1}}

### Jawab berdasarkan dokumen dan soalan/kenyataan berikut:
Dokumen: {passage}
Soalan/Kenyataan: {question}

Kembalikan jawapan dalam format JSON untuk pilihan yang diberikan. Sebagai contoh: {{'consistency': 1}} atau {{'consistency': 0}}"""

In [110]:
mistralasajudge = MalaysianMistralAsAJudge()

#### 3.1 Evaluate performance on English & Malay texts

In [111]:
evaluation_all = weave.Evaluation(dataset=data_all, scorers=[accuracy, F1])

In [112]:
await evaluation_all.evaluate(mistralasajudge)

🍩 https://wandb.ai/naqibasri/benchmark_malaysian_deberta_llmasajudge_v2/r/call/0192afa7-a461-7b51-b6aa-426a889a8bc9


{'model_output': {'mean': 0.4275},
 'accuracy': {'accuracy': {'true_count': 372, 'true_fraction': 0.465}},
 'BinaryMetrics': {'f1': 0.4636591429600945,
  'precision': 0.540935672498803,
  'recall': 0.40570175437706796},
 'model_latency': {'mean': 0.6870568716526031}}

#### 3.2 Evaluate performance on Malay texts only

In [113]:
evaluation_malay = weave.Evaluation(dataset=data_malay, scorers=[accuracy, F1])

In [114]:
await evaluation_malay.evaluate(mistralasajudge)

🍩 https://wandb.ai/naqibasri/benchmark_malaysian_deberta_llmasajudge_v2/r/call/0192afa8-d534-7d02-a6b6-5a4cba617d1f


{'model_output': {'mean': 0.4425},
 'accuracy': {'accuracy': {'true_count': 190, 'true_fraction': 0.475}},
 'BinaryMetrics': {'f1': 0.48275861574789486,
  'precision': 0.5536723163528998,
  'recall': 0.42794759823458745},
 'model_latency': {'mean': 0.6160252189636231}}