<a href="https://colab.research.google.com/github/croco22/CapstoneProjectTDS/blob/annelie/notebooks/Huggingface_QA_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 2: Evaluate Dataset

In [1]:
!pip install word2number

import json
import time
import google.generativeai as genai
from google.colab import userdata
import requests
from word2number import w2n
import re

# API setup
# The Gemini API key is read from the Colab secret store, not hard-coded.
key = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=key)
ai_model = genai.GenerativeModel('gemini-1.5-flash')

# Read dataset file
url = 'https://raw.githubusercontent.com/croco22/CapstoneProjectTDS/refs/heads/main/qa_dataset.json'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)

# Fail right here when the download did not succeed: every later cell needs
# `data`, and leaving it undefined would only surface as a NameError further down.
if response.status_code != 200:
    raise RuntimeError(f"Error while downloading qa_dataset.json: HTTP {response.status_code}")

data = response.json()
print("Retrieved file: qa_dataset.json")

Collecting word2number
  Downloading word2number-1.1.zip (9.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: word2number
  Building wheel for word2number (setup.py) ... [?25l[?25hdone
  Created wheel for word2number: filename=word2number-1.1-py3-none-any.whl size=5568 sha256=640464f10e7f849b44dee6371d41d40a36444fbc359c003ee9a8f8240d67348e
  Stored in directory: /root/.cache/pip/wheels/84/ff/26/d3cfbd971e96c5aa3737ecfced81628830d7359b55fbb8ca3b
Successfully built word2number
Installing collected packages: word2number
Successfully installed word2number-1.1
Retrieved file: qa_dataset.json


In [2]:
def convert_numbers_in_text(text):
    """Replace spelled-out numbers in ``text`` with digits and normalise ranges.

    Three passes are applied in order:
      1. Known number words (e.g. 'twenty-one') are replaced by their digits.
      2. Digit ranges written as '<n> to <m>' or '<n> and <m>' become '<n>-<m>'.
      3. The phrase 'more than 2000' is rewritten to 'larger than 2000'.

    Args:
        text: The free-text answer to normalise.

    Returns:
        The normalised text as a new string.
    """
    # Alternatives are tried left to right, so longer phrases must come before
    # the shorter ones they contain (e.g. 'fifty-one' before 'fifty').
    # The \b anchors restrict matches to whole words; without them number words
    # embedded in ordinary words were rewritten ('someone' -> 'some1',
    # 'often' -> 'of10').
    pattern = r'\b(?:two thousand|two hundred one|two hundred|fifty-one|thirty-one|twenty-one|sixteen|fifteen|eleven|thirty|twenty|fifty|forty|sixty|ten|five|six|one)\b'

    def convert(match):
        """Return the digit string for a matched number word, or the word itself."""
        word = match.group(0)
        try:
            # Convert the word to number
            return str(w2n.word_to_num(word))
        except ValueError:
            # Leave the word untouched if it cannot be converted.
            return word

    # Replace all number words in the text with their integer equivalents
    converted_text = re.sub(pattern, convert, text, flags=re.IGNORECASE)

    # Now convert ranges like 'twenty to thirty' (by now '20 to 30') into '20-30'
    converted_text = re.sub(r'(\d+)\s*(to|and)\s*(\d+)', r'\1-\3', converted_text)

    # Replace text
    # TODO: find a better solution; this is only an example and does not
    # generalise to similar sentences.
    converted_text = converted_text.replace('more than 2000', 'larger than 2000')
    converted_text = converted_text.replace('More than 2000', 'Larger than 2000')

    return converted_text


def is_exact_or_phrase_match(option, text):
    """Report whether ``option`` occurs in ``text`` as a whole word or phrase.

    The option is stripped of surrounding whitespace and regex-escaped, then
    searched for case-insensitively. The match may be preceded or followed by
    further whitespace-separated words and must start and end on a word
    boundary.

    Args:
        option: The answer option to look for.
        text: The text to search in.

    Returns:
        True if the pattern matches anywhere in ``text``, otherwise False.
    """
    # Escaping keeps characters such as '.' or '+' in the option literal.
    needle = re.escape(option.strip())

    matcher = re.compile(rf'\b(?:\w+\s+)*{needle}(?:\s+\w+)*\b', re.IGNORECASE)

    # A match object is always truthy, so bool() mirrors an "is not None" check.
    return bool(matcher.search(text))

## Evaluate different models

In [3]:
#from transformers import pipeline

#qa_pipeline1 = pipeline("question-answering", model="deepset/roberta-base-squad2")

In [4]:
#qa_pipeline2 = pipeline("question-answering", model='distilbert-base-cased-distilled-squad')

In [5]:
#qa_pipeline3 = pipeline("question-answering", model='google-bert/bert-large-uncased-whole-word-masking-finetuned-squad')

In [None]:
# Fine-tune model on data

! pip install datasets[torch]
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Step 1: Load and prepare data
def prepare_data(data):
    """Flatten QA items into binary classification examples.

    Every item contributes one example per entry in ``possible_answers``. The
    example text is "<question> <answer_text> <option>" and the label is 1 when
    the option is an intended answer, otherwise 0.

    Args:
        data: Iterable of dicts with the keys "question", "answer_text",
            "possible_answers" and "intended_answer". "intended_answer" may be
            a single string or a collection of strings (multi-label).

    Returns:
        A Hugging Face ``Dataset`` with the columns "text" and "label". It is
        empty (but still has both columns) when ``data`` yields no examples.
    """
    texts = []
    labels = []
    for item in data:
        context = item["answer_text"]
        question = item["question"]

        intended = item["intended_answer"]
        # Wrap a single string answer in a list: `option in "<string>"` would be
        # a substring test and would label e.g. "unsatisfied" as correct when
        # the intended answer is "very unsatisfied".
        if isinstance(intended, str):
            intended = [intended]

        for option in item["possible_answers"]:
            texts.append(f"{question} {context} {option}")
            labels.append(int(option in intended))  # Multi-label support

    # Build the column dictionary directly so that empty input yields an empty
    # dataset instead of failing on a missing first record.
    return Dataset.from_dict({"text": texts, "label": labels})

def split_dataset(data, train_size=0.8, random_seed=42):
    """Split ``data`` into a training part and a validation part.

    Args:
        data: The items to split; passed straight to ``train_test_split``.
        train_size: Forwarded as ``train_size`` to ``train_test_split``.
        random_seed: Forwarded as ``random_state`` to ``train_test_split``.

    Returns:
        A ``(train, validation)`` tuple.
    """
    training_part, validation_part = train_test_split(
        data,
        random_state=random_seed,
        train_size=train_size,
    )
    return (training_part, validation_part)

# Step 2: Split the raw items and convert each part into classification examples.
# train_size=0.6 overrides the 0.8 default of split_dataset.
train_data, val_data = split_dataset(data, train_size=0.6)

train_data_processed = prepare_data(train_data)
val_data_processed = prepare_data(val_data)

dataset = DatasetDict({
    "train": train_data_processed,
    "validation": val_data_processed
})

# Step 3: Initialize the tokenizer
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    """Tokenize the "text" column of a batch with max-length padding and truncation."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# batched=True hands tokenize_function several examples per call.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Step 4: Initialize the model
# num_labels=2 matches the 0/1 labels produced by prepare_data.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

# Step 5: Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    report_to="none",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=500,
    save_total_limit=2
)

# Step 6: Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer
)

# Step 7: Start training
# `model` and `tokenizer` remain module-level names and are used by
# predict_answers in a later cell.
trainer.train()

Collecting datasets[torch]
  Using cached datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets[torch])
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets[torch])
  Using cached xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets[torch])
  Using cached multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets[torch])
  Using cached fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Using cached dill-0.3.8-py3-none-any.whl (116 kB)
Using cached fsspec-2024.9.0-py3-none-any.whl (179 kB)
Using cached multiprocess-0.70.16-py310-none-any.whl (134 kB)
Using cached datasets-3.2.0-py3-none-any.whl (480 kB)
Using cached xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
Installing collected packages: xxhash, fsspec,

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/1086 [00:00<?, ? examples/s]

Map:   0%|          | 0/734 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


In [None]:
# predict answer with fine-tuned model
def _score_option_with_model(question, answer_text, option):
    """Return the fine-tuned classifier's probability of label 1 for one option."""
    # Imported locally so that this cell does not depend on another cell's imports.
    import torch

    # Switch off dropout so that repeated calls give the same score.
    model.eval()

    # truncation=True mirrors the tokenization used for training and keeps long
    # answers within the model's maximum input length. The tensors are moved to
    # the model's device because the model may sit on a GPU after training.
    inputs = tokenizer(
        f"{question} {answer_text} {option}",
        return_tensors="pt",
        truncation=True,
    ).to(model.device)

    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(**inputs)

    return outputs.logits.softmax(dim=-1).tolist()[0][1]  # probability of "label 1"


def _is_intended(predicted_option, intended_answer):
    """Check a single predicted option against a string or collection of answers."""
    if isinstance(intended_answer, str):
        return predicted_option == intended_answer
    try:
        # Multi-label items: the prediction counts if it is one of the answers.
        return predicted_option in intended_answer
    except TypeError:
        return predicted_option == intended_answer


def predict_answers(data):
    """
    Predict the answer for each item in the JSON data and return the accuracy.
    Printing only incorrectly predicted answers.

    For every option of an item, an exact/phrase match in the number-normalised
    answer text scores 0.95; otherwise the fine-tuned classifier's probability
    for label 1 is used. The highest-scoring option is the prediction.

    Args:
        data: Iterable of dicts with the keys "question", "answer_text",
            "possible_answers" and "intended_answer" (a string or a collection
            of strings).

    Returns:
        The fraction of items predicted correctly, or 0 if ``data`` is empty.
    """
    print("[INFO] Printing only incorrectly predicted answers.")
    correct_count = 0
    total_count = 0

    for item in data:
        # Entries are (option, score, tie_break).
        predictions = list()

        # Convert numbers contained in the text to actual integer values
        converted_text = convert_numbers_in_text(item['answer_text'])

        for option in item['possible_answers']:
            # Check for exact match or part of a phrase
            if is_exact_or_phrase_match(option, converted_text):
                # 95 % sure it is the correct answer. Several options can match
                # at once (e.g. "unsatisfied" and "very unsatisfied"); the
                # option length breaks that tie in favour of the most specific
                # option instead of whichever happens to be listed first.
                predictions.append((option, 0.95, len(option.strip())))
            else:
                score = _score_option_with_model(item['question'], item['answer_text'], option)
                predictions.append((option, score, 0))

        if predictions:
            predicted_option, confidence, _ = max(predictions, key=lambda p: (p[1], p[2]))
        else:
            # No options to choose from: count the item as incorrect.
            predicted_option, confidence = None, 0.0

        if _is_intended(predicted_option, item['intended_answer']):
            correct_count += 1
        else:
            print(f"Text: {item['answer_text']}")
            print(f"Correct: {item['intended_answer']}, Predicted: {predicted_option}, Confidence: {round(confidence, 4)} \n")
        total_count += 1

    accuracy = correct_count / total_count if total_count > 0 else 0
    return accuracy

In [None]:
# def predict_answers(data):
#     """
#     Predict the answer for each option in the JSON data.
#     Printing only incorrectly predicted answers.
#     """
#     print("[INFO] Printing only incorrectly predicted answers.")
#     correct_count = 0
#     total_count = 0

#     for item in data:
#         predictions = list()

#         # Convert numbers contained in the text to actual integer values
#         converted_text = convert_numbers_in_text(item['answer_text'])

#         for option in item['possible_answers']:
#             # Check for exact match or part of a phrase
#             # Todo: Problem: Da der Loop zuerst für unsatisfied durchlaufen wird, wird diesem 95% zugewiesen,
#             # erst danach wird very unsatisfied ebenfalls 95% zugewiesen --> falsche Zuordnung
#             exact_match = is_exact_or_phrase_match(option, converted_text)
#             if exact_match:
#                 predictions.append((option, 0.95)) # 95 % sure its the correct answer
#             else:
#                 # Hier den Namen der Pipeline eingeben, die man testen will:
#                 result = qa_pipeline1(question=item['question'], context=f"{converted_text} {option}")
#                 predictions.append((option, result['score']))

#         predicted_option, confidence = max(predictions, key=lambda x: x[1])

#         if predicted_option == item['intended_answer']:
#             correct_count += 1
#         else:
#             print(f"Text: {item['answer_text']}")
#             print(f"Correct: {item['intended_answer']}, Predicted: {predicted_option}, Confidence: {round(confidence, 4)} \n")
#         total_count += 1

#     accuracy = correct_count / total_count if total_count > 0 else 0
#     return accuracy

In [None]:
# Run the prediction over the dataset and report the accuracy as a percentage.
# NOTE(review): `data` is the full dataset, i.e. it also contains the items that
# the training cell put into `train_data`. Confirm whether the evaluation should
# use `val_data` only.
accuracy = predict_answers(data)
print(f"Accuracy: {accuracy * 100:.2f} %")

## Interesting Findings

*   Prediction of names was very poor because names carry no deeper meaning the model could use --> fixed by checking for exact matches
  * Maybe implement a name interpreter later?
*   Prediction of numerical values (e.g. company size) is very poor

* QA Pipelines
  * Pipelines 2 and 3 only reach an accuracy of roughly 60 %

