<a href="https://colab.research.google.com/github/croco22/CapstoneProjectTDS/blob/annelie/notebooks/Huggingface_QA_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 2: Evaluate Dataset

In [1]:
!pip install word2number

import json
import time
import google.generativeai as genai
from google.colab import userdata
import requests
from word2number import w2n
import re
import pandas as pd

# Gemini API Setup
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-1.5-flash')

# Read dataset file
url = 'https://raw.githubusercontent.com/croco22/CapstoneProjectTDS/refs/heads/main/qa_dataset.json'
data = pd.read_json(url)

data.head()

Collecting word2number
  Downloading word2number-1.1.zip (9.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: word2number
  Building wheel for word2number (setup.py) ... [?25l[?25hdone
  Created wheel for word2number: filename=word2number-1.1-py3-none-any.whl size=5568 sha256=0bcdd6b36dc476125ab2c8b74242852b41513726f2d42740f18ab09479071add
  Stored in directory: /root/.cache/pip/wheels/cd/ef/ae/073b491b14d25e2efafcffca9e16b2ee6d114ec5c643ba4f06
Successfully built word2number
Installing collected packages: word2number
Successfully installed word2number-1.1


Unnamed: 0,type,question,options,intended_answer,context
0,SINGLE_SELECT,Data processing consent,"[Yes, No]",Yes,"Yes, I absolutely give my consent for data pro..."
1,SINGLE_SELECT,Data processing consent,"[Yes, No]",Yes,"Oh yes, definitely, I'm happy for my data to b..."
2,SINGLE_SELECT,Data processing consent,"[Yes, No]",Yes,"Yes, I agree to the data processing, no proble..."
3,SINGLE_SELECT,Data processing consent,"[Yes, No]",Yes,Yes! Data processing consent granted.
4,SINGLE_SELECT,Data processing consent,"[Yes, No]",Yes,"Yep, that's a yes from me on data processing c..."


In [2]:
def convert_numbers_in_text(text):
    # Regular expression to find number words contained in questionnaires
    pattern = r'(two thousand|two hundred one|two hundred|fifty-one|thirty-one|twenty-one|sixteen|fifteen|eleven|thirty|twenty|fifty|forty|sixty|ten|five|six|one)'
    # Interesting finding: Regex only works if longer words are in order before shorter that contain similar parts, e.g. fifty-one has to be in front of fifty to work as intended

    def convert(match):
        word = match.group(0)
        try:
            # Convert the word to number
            return str(w2n.word_to_num(word))
        except ValueError:
            return word

    # Replace all number words in the text with their integer equivalents
    converted_text = re.sub(pattern, convert, text, flags=re.IGNORECASE)

    # Now convert ranges like 'twenty to thirty' into '20-30'
    converted_text = re.sub(r'(\d+)\s*(to|and)\s*(\d+)', r'\1-\3', converted_text)

    # Replace text
    # Todo: Dafür noch ne bessere Lösung finden, das ist eig nur n Beispiel und geht auch bei ähnlichen Sätzen nicht
    converted_text = converted_text.replace('more than 2000', 'larger than 2000')
    converted_text = converted_text.replace('More than 2000', 'Larger than 2000')

    return converted_text


def is_exact_or_phrase_match(option, text):
    # Escape the option to handle special characters
    escaped_option = re.escape(option.strip())

    # Pattern to match the option as a full word or part of a phrase
    pattern = rf'\b(?:\w+\s+)*{escaped_option}(?:\s+\w+)*\b'

    # Search for the pattern in the text (case-insensitive)
    return re.search(pattern, text, re.IGNORECASE) is not None

## Evaluate single-select questions with different models

In [3]:
from transformers import pipeline

qa_pipeline1 = pipeline("question-answering", model="deepset/roberta-base-squad2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
#qa_pipeline2 = pipeline("question-answering", model='distilbert-base-cased-distilled-squad')

In [None]:
#qa_pipeline3 = pipeline("question-answering", model='google-bert/bert-large-uncased-whole-word-masking-finetuned-squad')

In [4]:
def predict_answers(df):
    """
    Predict the answer for each row in the DataFrame.
    Prints only incorrectly predicted answers.
    """
    print("[INFO] Printing only incorrectly predicted answers.")
    correct_count = 0
    total_count = 0

    for index, row in df.iterrows():
        predictions = []

        # Convert numbers contained in the context to actual integer values
        converted_context = convert_numbers_in_text(row['context'])

        for option in row['options']:
            # Check for exact match or part of a phrase
            exact_match = is_exact_or_phrase_match(option, converted_context)
            if exact_match:
                predictions.append((option, 0.95))  # 95% sure it's the correct answer
            else:
                # Pipeline to test, replace 'qa_pipeline1' with the actual pipeline function
                result = qa_pipeline1(question=row['question'], context=f"{converted_context} {option}")
                predictions.append((option, result['score']))

        # Determine the prediction with the highest confidence
        predicted_option, confidence = max(predictions, key=lambda x: x[1])

        if predicted_option == row['intended_answer']:
            correct_count += 1
        else:
            print(f"Context: {row['context']}")
            print(f"Correct: {row['intended_answer']}, Predicted: {predicted_option}, Confidence: {round(confidence, 4)} \n")
        total_count += 1

    # Calculate accuracy
    accuracy = correct_count / total_count if total_count > 0 else 0
    return accuracy

In [5]:
accuracy = predict_answers(data)
print(f"Accuracy: {accuracy * 100:.2f} %")

[INFO] Printing only incorrectly predicted answers.
Context: Nope, I'm not giving my consent for that data processing thing.
Correct: No, Predicted: Yes, Confidence: 0.0 

Context: Absolutely not, I'm not consenting to data processing.
Correct: No, Predicted: Yes, Confidence: 0.0 

Context: The customer group?  In this case, it's the end users we're talking about.
Correct: End User, Predicted: Wholesaler, Distributor, Confidence: 0.0123 

Context: Oh, the customer group?  That would be wholesalers and distributors.
Correct: Wholesaler, Distributor, Predicted: Consultant, Planner, Architect, Confidence: 0.1977 

Context: It's wholesalers and distributors that I'm focusing on for this customer group.
Correct: Wholesaler, Distributor, Predicted: Consultant, Planner, Architect, Confidence: 0.1988 

Context: For this customer group, the main players are wholesalers and distributors.
Correct: Wholesaler, Distributor, Predicted: End User, Confidence: 0.0034 

Context: Oh, the customer group? 

TypeError: 'NoneType' object is not iterable

# Evaluate different question types

In [None]:
from pickle import NONE
!pip install dateparser
import dateparser
from datetime import datetime

def predict_answers_with_types(data):
    """
    Predict the answer for each option in the JSON data.
    Printing only incorrectly predicted answers.
    """
    print("[INFO] Printing only incorrectly predicted answers.")
    correct_count = 0
    total_count = 0

    for item in data:
        predictions = list()

        # Convert numbers contained in the text to actual integer values
        converted_text = convert_numbers_in_text(item['answer_text'])

        for option in item['possible_answers']:
            # Check for exact match or part of a phrase
            # Todo: Problem: Da der Loop zuerst für unsatisfied durchlaufen wird, wird diesem 95% zugewiesen,
            # erst danach wird very unsatisfied ebenfalls 95% zugewiesen --> falsche Zuordnung
            exact_match = is_exact_or_phrase_match(option, converted_text)
            if exact_match:
                predictions.append((option, 0.95)) # 95 % sure its the correct answer
            else:
                # Hier den Namen der Pipeline eingeben, die man testen will:
                result = qa_pipeline1(question=item['question'], context=f"{converted_text} {option}")
                predictions.append((option, result['score']))

            # Handle different question types differently
            is_correct = False

            if item.get('type') == "SINGLE_SELECT":
                predicted_option, confidence = max(predictions, key=lambda x: x[1])
                is_correct = predicted_option == item['intended_answer']

            elif item.get('type') == "MULTI_SELECT":
                predicted_option = [option for option, score in predictions if score >= 0.75]
                is_correct = set(predicted_option) == set(item['intended_answer']) #compare sets
                confidence = None  # Kein einzelner Confidence-Wert relevant

            elif item.get('type') == "DATE":
            # Zeitangabe aus dem Antworttext extrahieren
                question = "What is the proposed time?"
                try:
                    extracted_time = qa_pipeline1(question=question, context=item['answer_text'])['answer']
                    print(f"[INFO] Extracted Time: {extracted_time}")

                    # Basistimestamp berechnen (hier ein fiktiver Timestamp, dieser muss aus den Daten kommen)
                    base_timestamp = datetime(2025, 1, 14)  # Beispiel: Heute

                    # Datum basierend auf extrahierter Zeitangabe berechnen
                    user_date = dateparser.parse(extracted_time, settings={'RELATIVE_BASE': base_timestamp})
                    if not user_date:
                        raise ValueError(f"Unable to parse date from extracted time: {extracted_time}")

                    # Datum aus intended_answer berechnen
                    intended_seconds = item['intended_answer'][0]
                    intended_date = base_timestamp + timedelta(seconds=intended_seconds)
                    predicted_option = extracted_time
                    confidence = None

                    # Daten vergleichen
                    is_correct = user_date.date() == intended_date.date()

                except Exception as e:
                    print(f"[ERROR] Failed to process DATE question: {e}")
                    is_correct = False

            #elif item.get('type') == "NUMBER":

            #elif item.get('type') == "TEXT":

            # Output incorrect predictions
            if not is_correct:
                print(f"Text: {item['answer_text']}")
                print(f"Correct: {item['intended_answer']}, Predicted: {predicted_option}")
                if confidence is not None:
                    print(f"Confidence: {round(confidence, 4)}")
                print()

            if is_correct:
                correct_count += 1
            total_count += 1

    accuracy = correct_count / total_count if total_count > 0 else 0
    return accuracy

Collecting dateparser
  Downloading dateparser-1.2.0-py2.py3-none-any.whl.metadata (28 kB)
Downloading dateparser-1.2.0-py2.py3-none-any.whl (294 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/295.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dateparser
Successfully installed dateparser-1.2.0


In [None]:
accuracy = predict_answers_with_types(data)
print(f"Accuracy: {accuracy * 100:.2f} %")

# Fine-tune model

In [None]:
# #Fine-tune model on data

# ! pip install datasets[torch]
# from datasets import Dataset, DatasetDict
# from sklearn.model_selection import train_test_split
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# # Step 1: Load and prepare data
# def prepare_data(data):
#     records = []
#     for item in data:
#         context = item["answer_text"]
#         question = item["question"]
#         for option in item["possible_answers"]:
#             label = int(option in item["intended_answer"])  # Multi-Label Support
#             records.append({"text": f"{question} {context} {option}", "label": label})

#     # Convert list of dictionaries into dictionary
#     dataset_dict = {key: [record[key] for record in records] for key in records[0]}

#     # Convert dictionary into huggingface dataset
#     return Dataset.from_dict(dataset_dict)

# def split_dataset(data, train_size=0.8, random_seed=42):
#     train_data, val_data = train_test_split(data, train_size=train_size, random_state=random_seed)
#     return train_data, val_data

# train_data, val_data = split_dataset(data, train_size=0.3)

# train_data_processed = prepare_data(train_data)
# val_data_processed = prepare_data(val_data)

# dataset = DatasetDict({
#     "train": train_data_processed,
#     "validation": val_data_processed
# })

# # Schritt 3: Tokenizer initialisieren
# model_checkpoint = "distilbert-base-uncased"
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# def tokenize_function(examples):
#     return tokenizer(examples["text"], padding="max_length", truncation=True)

# tokenized_datasets = dataset.map(tokenize_function, batched=True)

# # Schritt 4: Modell initialisieren
# model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

# # Schritt 5: Trainingsargumente definieren
# training_args = TrainingArguments(
#     output_dir="./results",
#     report_to="none",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     gradient_accumulation_steps=2,
#     gradient_checkpointing=True,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     logging_steps=50,
#     save_steps=500,
#     save_total_limit=2
# )

# # Schritt 6: Trainer initialisieren
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["validation"],
#     tokenizer=tokenizer
# )

# # Schritt 7: Training starten
# trainer.train()

# Evaluate different question types with fine-tuned model

In [None]:
# # predict answer with fine-tuned model
# def predict_answers_finetuned(data):
#     """
#     Predict the answer for each option in the JSON data.
#     Printing only incorrectly predicted answers.
#     """
#     print("[INFO] Printing only incorrectly predicted answers.")
#     correct_count = 0
#     total_count = 0

#     for item in data:
#         predictions = list()

#         # Convert numbers contained in the text to actual integer values
#         converted_text = convert_numbers_in_text(item['answer_text'])


#         for option in item['possible_answers']:
#             # Check for exact match or part of a phrase
#             # Todo: Problem: Da der Loop zuerst für unsatisfied durchlaufen wird, wird diesem 95% zugewiesen,
#             # erst danach wird very unsatisfied ebenfalls 95% zugewiesen --> falsche Zuordnung
#             exact_match = is_exact_or_phrase_match(option, converted_text)
#             if exact_match:
#                 predictions.append((option, 0.95)) # 95 % sure its the correct answer
#             else:
#                 inputs = tokenizer(f"{item['question']} {item['answer_text']} {option}", return_tensors="pt")
#                 outputs = model(**inputs)
#                 score = outputs.logits.softmax(dim=-1).tolist()[0][1]  # Wahrscheinlichkeit für "Label 1"
#                 predictions.append((option, score))

#             # Handle different question types differently
#             is_correct = False

#             if item.get('type') == "SINGLE_SELECT":
#                 predicted_option, confidence = max(predictions, key=lambda x: x[1])
#                 is_correct = predicted_option == item['intended_answer']

#             elif item.get('type') == "MULTI_SELECT":
#                 predicted_option = [option for option, score in predictions if score >= 0.75]
#                 is_correct = set(predicted_option) == set(item['intended_answer']) #compare sets
#                 confidence = None  # Kein einzelner Confidence-Wert relevant

#             #elif item.get('type') == "DATE":

#             #elif item.get('type') == "NUMBER":

#             #elif item.get('type') == "TEXT":

#             # Output incorrect predictions
#             if not is_correct:
#                 print(f"Text: {item['answer_text']}")
#                 print(f"Correct: {item['intended_answer']}, Predicted: {predicted_option}")
#                 if confidence is not None:
#                     print(f"Confidence: {round(confidence, 4)}")
#                 print()

#             if is_correct:
#                 correct_count += 1
#             total_count += 1

#     accuracy = correct_count / total_count if total_count > 0 else 0
#     return accuracy

In [None]:
# accuracy = predict_answers_finetuned(data)
# print(f"Accuracy: {accuracy * 100:.2f} %")

[INFO] Printing only incorrectly predicted answers.


UnboundLocalError: local variable 'is_correct' referenced before assignment

# Evaluate continuous text with fine-tuned model

## Interesting Findings

*   Prediction of names very bad, because no deeper meaning --> fixed by checking for exact matches
  * Maybe implement name interpreter later?
*   Numerical values (size of company) prediction very bad

* QA Pipelines
  * Pipeline 2 und 3 haben nur eine accuracy von ungefähr 60 %

