In [1]:
!pip install datasets transformers torch tqdm rouge-score sacrebleu

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/547.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m542.7/547.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_

In [2]:
# loading dataset

from datasets import load_dataset

naits = load_dataset('fahdsoliman/naits_lfqa_processed')
lfqa = load_dataset('fahdsoliman/lfqa_test_with_supports_v1')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/597 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/82.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/47.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/60 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/17 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/393k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/108 [00:00<?, ? examples/s]

In [3]:
# loading LFQA model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
model_name = "fahdsoliman/lfqa_naits"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [4]:
from datasets import concatenate_datasets

preprocessed_dataset = concatenate_datasets([naits['test'], lfqa['test']])

In [4]:
preprocessed_dataset = lfqa['test']

In [5]:
len(preprocessed_dataset)

125

In [None]:
lfqa

DatasetDict({
    test: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 108
    })
})

In [None]:
!pip install rouge-score sacrebleu



In [6]:
from datasets import load_metric, load_dataset
from tqdm.auto import tqdm
import torch

all_predictions = []
all_labels = []
def evaluate_model():
    metric_rouge = load_metric("rouge", trust_remote_code=True)
    metric_bleu = load_metric("sacrebleu", trust_remote_code=True)

    model.eval()

    for example in tqdm(preprocessed_dataset):
        # إعداد المدخلات
        inputs = tokenizer("question: {} context: {}".format(example["question"], example["context"]), truncation=True, padding=True, return_tensors="pt")
        # print(inputs)
        reference = example["answer"]

        # توليد التنبؤات
        with torch.no_grad():
            outputs = model.generate(input_ids=inputs["input_ids"].to(device),
                                    attention_mask=inputs["attention_mask"].to(device),
                                    min_length=64,
                                    max_length=256,
                                    do_sample=False,
                                    early_stopping=True,
                                    num_beams=8,
                                    temperature=1.0,
                                    top_k=None,
                                    top_p=None,
                                    eos_token_id=tokenizer.eos_token_id,
                                    no_repeat_ngram_size=3,
                                    num_return_sequences=1)

        prediction = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)

        # تجميع التنبؤات والمراجع
        all_predictions.append(prediction)
        all_labels.append(reference)

    # حساب ROUGE
    rouge_result = metric_rouge.compute(predictions=all_predictions, references=all_labels, use_stemmer=True)
    rouge_scores = {key: value.mid.fmeasure * 100 for key, value in rouge_result.items()}

    # حساب BLEU
    bleu_result = metric_bleu.compute(predictions=all_predictions, references=[[ref] for ref in all_labels])
    bleu_score = bleu_result["score"]

    print("ROUGE Scores:", rouge_scores)
    print("BLEU Score:", bleu_score)

# استدعاء دالة التقييم
evaluate_model()

  metric_rouge = load_metric("rouge", trust_remote_code=True)


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

  0%|          | 0/125 [00:00<?, ?it/s]

ROUGE Scores: {'rouge1': 29.777228133035372, 'rouge2': 7.629008198000467, 'rougeL': 16.438968250273618, 'rougeLsum': 16.44879506614558}
BLEU Score: 1.492481849665259


In [None]:
all_predictions[1]

['The basic services provided by accredited information security companies include, but are not limited to: developing an information security policy. developing a set of systematic processes to manage and address the repercussions of a security breach. developing plans to deal with emergency incidents. Information systems security audit. vulnerability assessment. security review of the code. Security vulnerability assessment: searching and investigating all potential gaps and weaknesses in an information system for the purpose of classifying them. Recovering lost data.']

In [None]:
all_labels[1]

'Certified information security companies provide several basic services, including developing information security policy, auditing information systems security, developing plans to deal with emergency incidents, risk assessment, security vulnerability assessment, penetration testing, recovering lost data, and security review of the code. These services ensure the protection of information infrastructure and enhance electronic security in private institutions, which contributes to reducing threats and reducing losses resulting from security incidents.'

In [11]:
i = 0
for example in tqdm(preprocessed_dataset):
  print('the question: ')
  print(example['question'])
  print('prediction answer: ')
  print(all_predictions[i][0])
  print('golden answer: ')
  print(all_labels[i])
  i=i+1


  0%|          | 0/125 [00:00<?, ?it/s]

the question: 
what are the main obligations that an accredited company must adhere to after obtaining accreditation?
prediction answer: 
I'm not sure if this is what you're looking for, but I can give you an idea of what the requirements are. I work for a company that has been accredited for a number of years, so I can tell you a little bit about the requirements. First of all, the company has to pass an accreditation test, which is basically a series of tests that the company must pass in order to be considered an accredited company. The company then has to submit a report to the accreditation authority detailing the results of the tests, as well as the requirements for the company to be accredited. This report is then submitted to the authority, and the authority then decides whether or not to grant the company its accreditation. If the company passes the test, then they are considered accredited. If they fail, then the company is not accredited, and they are no longer allowed to do

In [None]:
from datasets import load_metric

# تحميل مقياس BLEU
metric_bleu = load_metric("sacrebleu", trust_remote_code=True)

# أمثلة على التنبؤات والمراجع
predictions = [["This is not a test."]]
references = [[["This is a test."]]]

# حساب BLEU
bleu_result = metric_bleu.compute(predictions=predictions, references=references)
bleu_score = bleu_result["score"]

print("BLEU Score:", bleu_score)


BLEU Score: 59.694917920196445
