<a href="https://colab.research.google.com/github/croco22/CapstoneProjectTDS/blob/annelie/notebooks/Huggingface_QA_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 2: Evaluate Dataset

In [46]:
!pip install word2number

import json
import time
import google.generativeai as genai
from google.colab import userdata
import requests
from word2number import w2n
import re
import pandas as pd

# Gemini API Setup
# The API key is read from the Colab secret store (google.colab.userdata), so it is
# never written into the notebook itself.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)
# NOTE(review): `model` is not referenced by the active evaluation cells visible in
# this notebook — confirm whether the Gemini client is still needed here.
model = genai.GenerativeModel('gemini-1.5-flash')

# Read dataset file
# The QA dataset is loaded straight from the project's GitHub repository (main branch).
url = 'https://raw.githubusercontent.com/croco22/CapstoneProjectTDS/refs/heads/main/qa_dataset.json'
data = pd.read_json(url)

# Show the first rows as a quick sanity check of the loaded columns.
data.head()



Unnamed: 0,type,question,options,intended_answer,context,timestamp
0,SINGLE_SELECT,Data processing consent,"[Yes, No]",Yes,"Absolutely, I'm totally okay with data process...",2024-12-26 13:54:58.593
1,SINGLE_SELECT,Data processing consent,"[Yes, No]",Yes,"Yeah, you have my full consent for data proces...",2024-12-26 13:54:58.593
2,SINGLE_SELECT,Data processing consent,"[Yes, No]",Yes,"Oh, data processing consent? Yes, you can go r...",2024-12-26 13:54:58.593
3,SINGLE_SELECT,Data processing consent,"[Yes, No]",Yes,Regarding your request for data processing con...,2024-12-26 13:54:58.593
4,SINGLE_SELECT,Data processing consent,"[Yes, No]",Yes,"Yes, please proceed with the data processing, ...",2024-12-26 13:54:58.593


In [47]:
def convert_numbers_in_text(text):
    """
    Replace the number words used in the questionnaires with digits.

    Three normalisation steps are applied in order:
      1. Known number words (e.g. "fifty-one", "two hundred") are converted to
         their integer form via ``w2n.word_to_num``.
      2. Numeric ranges written as "<n> to <m>" or "<n> and <m>" are rewritten
         as "<n>-<m>".
      3. The phrase "more than 2000" is rewritten to "larger than 2000".

    Args:
        text (str): Free-text context to normalise.

    Returns:
        str: The text with number words and ranges normalised.
    """
    # Number words that occur in the questionnaires.
    # The alternation is ordered longest-first: regex alternation takes the first
    # branch that matches, so e.g. "fifty-one" must be listed before "fifty".
    # The surrounding \b anchors restrict matches to whole words; without them
    # "one" / "ten" / "six" were replaced inside unrelated words
    # ("phone" -> "ph1", "often" -> "of10", "someone" -> "some1").
    pattern = r'\b(two thousand|two hundred one|two hundred|fifty-one|thirty-one|twenty-one|sixteen|fifteen|eleven|thirty|twenty|fifty|forty|sixty|ten|five|six|one)\b'

    def convert(match):
        word = match.group(0)
        try:
            # Convert the word to number
            return str(w2n.word_to_num(word))
        except ValueError:
            # Leave the original word untouched if it cannot be converted
            return word

    # Replace all number words in the text with their integer equivalents
    converted_text = re.sub(pattern, convert, text, flags=re.IGNORECASE)

    # Now convert ranges like 'twenty to thirty' (already '20 to 30' here) into '20-30'
    converted_text = re.sub(r'(\d+)\s*(to|and)\s*(\d+)', r'\1-\3', converted_text)

    # Replace text
    # TODO: find a better solution; this is only an example and does not cover
    # similarly worded sentences.
    converted_text = converted_text.replace('more than 2000', 'larger than 2000')
    converted_text = converted_text.replace('More than 2000', 'Larger than 2000')

    return converted_text


def is_exact_or_phrase_match(option, text):
    """
    Check whether an answer option occurs in the text as a whole word or phrase.

    The comparison is case-insensitive and ignores leading/trailing whitespace
    of the option. An option only counts as found when it is not glued to
    other word characters (so "No" does not match inside "know").

    Args:
        option (str): The answer option to look for.
        text (str): The text to search in.

    Returns:
        bool: True if the option appears as a standalone word/phrase, else False.
              An empty or whitespace-only option never matches.
    """
    needle = option.strip()
    if not needle:
        # An empty pattern would match any text and flag every row as an exact match
        return False

    # Escape the option to handle special characters
    escaped_option = re.escape(needle)

    # Require a word boundary only on sides where the option itself starts/ends
    # with a word character. A plain \b on both sides rejects options such as
    # "C++" when followed by punctuation or the end of the text. The lookarounds
    # also avoid the quadratic backtracking of wrapping the option in optional
    # "(\w+\s+)*" groups, which never changed the result of the search.
    leading = r'(?<!\w)' if re.match(r'\w', needle[0]) else ''
    trailing = r'(?!\w)' if re.match(r'\w', needle[-1]) else ''
    pattern = f'{leading}{escaped_option}{trailing}'

    # Search for the pattern in the text (case-insensitive)
    return re.search(pattern, text, re.IGNORECASE) is not None

In [48]:
from transformers import pipeline

# Question-answering pipeline used by predict_answers_with_types to score answer
# options and to extract date / phone-number answers from the context.
# Loading it fetches the "deepset/roberta-base-squad2" checkpoint by name.
qa_pipeline1 = pipeline("question-answering", model="deepset/roberta-base-squad2")

Device set to use cpu


In [49]:
#qa_pipeline2 = pipeline("question-answering", model='distilbert-base-cased-distilled-squad')

In [50]:
#qa_pipeline3 = pipeline("question-answering", model='google-bert/bert-large-uncased-whole-word-masking-finetuned-squad')

## Evaluate Dataset with Pre-trained Models

In [51]:
from pickle import NONE
!pip install dateparser
import dateparser
from datetime import datetime
from datetime import timedelta

def _score_options(row, qa_pipeline, exact_match_confidence):
    """Return a list of (option, confidence) pairs for the answer options of one row."""
    options = row['options']
    # Missing options can show up as None, NaN or an empty list; NaN is truthy,
    # so a plain truthiness check is not enough to guard the loop below.
    if not isinstance(options, (list, tuple)) or not options:
        return []

    converted_context = convert_numbers_in_text(row['context'])

    scored = []
    for option in options:
        # Check for exact match or part of a phrase
        if is_exact_or_phrase_match(option, converted_context):
            scored.append((option, exact_match_confidence))
        else:
            # Use QA pipeline to predict scores
            result = qa_pipeline(question=row['question'], context=f"{converted_context} {option}")
            scored.append((option, result['score']))
    return scored


def predict_answers_with_types(df, qa_pipeline=None, multi_select_threshold=0.75,
                               exact_match_confidence=0.95):
    """
    Predict the answer for each row in the DataFrame and report the accuracy.
    Prints only incorrectly predicted answers.

    Rows are handled according to their ``type`` column:
      * SINGLE_SELECT: the option with the highest confidence is predicted.
      * MULTI_SELECT:  all options with confidence >= ``multi_select_threshold``.
      * DATE:          a time expression is extracted with the QA pipeline, parsed
                       relative to the row's timestamp and compared by calendar day
                       with timestamp + intended_answer (read as seconds).
      * NUMBER:        a phone number is extracted with the QA pipeline and compared
                       verbatim with the intended answer.
      * TEXT:          the context itself is the answer and always counts as correct.
    Rows with any other type, or rows whose processing fails, count as incorrect.

    Args:
        df (pandas.DataFrame): Rows with the columns 'type', 'question', 'options',
            'intended_answer', 'context' and 'timestamp'.
        qa_pipeline (callable, optional): Called as
            ``qa_pipeline(question=..., context=...)`` and expected to return a
            mapping with 'answer' and 'score'. Defaults to the module-level
            ``qa_pipeline1``.
        multi_select_threshold (float): Minimum confidence for an option to be
            selected in MULTI_SELECT questions.
        exact_match_confidence (float): Confidence assigned to options that are
            found literally in the context.

    Returns:
        float: Share of correctly predicted rows (0 for an empty DataFrame).
    """
    if qa_pipeline is None:
        qa_pipeline = qa_pipeline1

    print("[INFO] Printing only incorrectly predicted answers.")
    correct_count = 0
    total_count = 0

    for _, row in df.iterrows():
        # Reset per row: otherwise a failed or unknown row would either raise an
        # UnboundLocalError in the report below or show the previous row's prediction.
        predicted_option = None
        is_correct = False
        question_type = row['type']

        # Handle different question types
        if question_type == "SINGLE_SELECT":
            predictions = _score_options(row, qa_pipeline, exact_match_confidence)
            if predictions:
                # Predict the option with the highest confidence.
                # On ties (e.g. several exact matches) the first option wins.
                predicted_option, _ = max(predictions, key=lambda x: x[1])
                is_correct = predicted_option == row['intended_answer']

        elif question_type == "MULTI_SELECT":
            predictions = _score_options(row, qa_pipeline, exact_match_confidence)
            # Predict all options whose confidence reaches the threshold
            predicted_option = [option for option, score in predictions
                                if score >= multi_select_threshold]
            # NOTE(review): assumes intended_answer is a collection of options — confirm
            is_correct = set(predicted_option) == set(row['intended_answer'])

        elif question_type == "DATE":
            try:
                # Base timestamp taken from the DataFrame column
                base_timestamp = pd.Timestamp(row['timestamp'], unit='ms')

                # Extract the time expression from the context
                extracted_time = qa_pipeline(question="What is the proposed time?", context=row['context'])['answer']

                # Parse the extracted expression relative to the base timestamp
                parsed_date = dateparser.parse(
                    extracted_time,
                    settings={'RELATIVE_BASE': base_timestamp.to_pydatetime()}
                )
                if not parsed_date:
                    raise ValueError(f"Unable to parse date from extracted time: {extracted_time}")

                predicted_option = parsed_date
                intended_seconds = int(row['intended_answer'])
                intended_date = base_timestamp + timedelta(seconds=intended_seconds)

                # Compare predicted and intended date by calendar day only
                is_correct = predicted_option.date() == intended_date.date()

            except Exception as e:
                # Best effort: a failing row is reported and counted as incorrect
                print(f"[ERROR] DATE question processing failed: {e}")

        elif question_type == "NUMBER":
            question = "Which phone number is given in the format +[country code]-[area code]-[local number], similar to +1-452-547-1970?"
            try:
                predicted_option = qa_pipeline(question=question, context=row['context'])['answer']
                is_correct = predicted_option == row['intended_answer']
            except Exception as e:
                # Best effort: a failing row is reported and counted as incorrect
                print(f"[ERROR] NUMBER question failed: {e}")

        elif question_type == "TEXT":
            # Free-text answers are taken over unchanged and always count as correct
            predicted_option = row['context']
            is_correct = True

        # Output incorrect predictions
        if not is_correct:
            print(f"Context: {row['context']}")
            print(f"Correct: {row['intended_answer']}, Predicted: {predicted_option}")
            print()

        if is_correct:
            correct_count += 1
        total_count += 1

    # Calculate accuracy
    accuracy = correct_count / total_count if total_count > 0 else 0
    return accuracy




In [52]:
# Run the evaluation on the full dataset; mispredicted rows are printed by the
# function itself, the overall accuracy is reported below as a percentage.
accuracy = predict_answers_with_types(data)
print(f"Accuracy: {accuracy * 100:.2f} %")

[INFO] Printing only incorrectly predicted answers.
Context: Okay, so when you're talking about the customer group, we're definitely thinking about folks who are either a wholesaler or a distributor, that’s who we're focused on here.
Correct: Wholesaler, Distributor, Predicted: End User

Context: Well, the customer group we need to consider in this case, is made up of both the wholesaler and the distributor types, you see.
Correct: Wholesaler, Distributor, Predicted: End User

Context: If we're looking at the different customer groups, the ones we're targeting are the wholesaler and the distributor, plain and simple.
Correct: Wholesaler, Distributor, Predicted: Consultant, Planner, Architect

Context: So, the customer group that we’ve got is comprised of, let me see, Consultants, naturally, then the Planners, and you can't forget the Architects, they're a big part of it.
Correct: Consultant, Planner, Architect, Predicted: Wholesaler, Distributor

Context: The various customer groups we

## Evaluate Dataset with Fine-tuned Model

In [53]:
# !pip install transformers[torch] sklearn pandas

# import pandas as pd
# from sklearn.model_selection import train_test_split
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# # Schritt 1: Datenvorbereitung
# def prepare_data(df):
#     # Erstelle einen neuen DataFrame, der das gewünschte Format enthält
#     records = []
#     for _, row in df.iterrows():
#         question = row["question"]
#         context = row["context"]
#         intended_answer = row["intended_answer"]
#         options = eval(row["options"])  # Umwandeln von String in Liste, falls nötig

#         for option in options:
#             label = int(option == intended_answer)
#             records.append({"text": f"{question} {context} {option}", "label": label})

#     return pd.DataFrame(records)

# # DataFrame laden (dein Screenshot zeigt bereits eine Beispielstruktur)
# # Beispiel: Falls du die Daten als CSV speicherst
# # df = pd.read_csv("your_data.csv")

# # Datenaufteilen in Trainings- und Validierungsdaten
# train_df, val_df = train_test_split(prepare_data(df), train_size=0.8, random_state=42)

# # Schritt 2: Tokenizer initialisieren
# model_checkpoint = "distilbert-base-uncased"
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# # Tokenisierung
# def tokenize_function(examples):
#     return tokenizer(examples["text"].tolist(), padding="max_length", truncation=True, return_tensors="pt")

# # Tokenisierung der Trainings- und Validierungsdaten
# train_encodings = tokenize_function(train_df)
# val_encodings = tokenize_function(val_df)

# # Labels extrahieren
# train_labels = train_df["label"].values
# val_labels = val_df["label"].values

# # Schritt 3: Huggingface Dataset verwenden (wenn nötig)
# import torch
# from torch.utils.data import Dataset

# class CustomDataset(Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels

#     def __len__(self):
#         return len(self.labels)

#     def __getitem__(self, idx):
#         item = {key: val[idx] for key, val in self.encodings.items()}
#         item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
#         return item

# train_dataset = CustomDataset(train_encodings, train_labels)
# val_dataset = CustomDataset(val_encodings, val_labels)

# # Schritt 4: Modell initialisieren
# model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

# # Schritt 5: Trainingsargumente definieren
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     save_steps=500,
#     save_total_limit=2
# )

# # Schritt 6: Trainer initialisieren
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     tokenizer=tokenizer
# )

# # Schritt 7: Training starten
# trainer.train()


# Evaluate different question types with fine-tuned model

In [54]:
# # predict answer with fine-tuned model
# def predict_answers_finetuned(data):
#     """
#     Predict the answer for each option in the JSON data.
#     Printing only incorrectly predicted answers.
#     """
#     print("[INFO] Printing only incorrectly predicted answers.")
#     correct_count = 0
#     total_count = 0

#     for item in data:
#         predictions = list()

#         # Convert numbers contained in the text to actual integer values
#         converted_text = convert_numbers_in_text(item['answer_text'])


#         for option in item['possible_answers']:
#             # Check for exact match or part of a phrase
#             # Todo: Problem: Da der Loop zuerst für unsatisfied durchlaufen wird, wird diesem 95% zugewiesen,
#             # erst danach wird very unsatisfied ebenfalls 95% zugewiesen --> falsche Zuordnung
#             exact_match = is_exact_or_phrase_match(option, converted_text)
#             if exact_match:
#                 predictions.append((option, 0.95)) # 95 % sure its the correct answer
#             else:
#                 inputs = tokenizer(f"{item['question']} {item['answer_text']} {option}", return_tensors="pt")
#                 outputs = model(**inputs)
#                 score = outputs.logits.softmax(dim=-1).tolist()[0][1]  # Wahrscheinlichkeit für "Label 1"
#                 predictions.append((option, score))

#             # Handle different question types differently
#             is_correct = False

#             if item.get('type') == "SINGLE_SELECT":
#                 predicted_option, confidence = max(predictions, key=lambda x: x[1])
#                 is_correct = predicted_option == item['intended_answer']

#             elif item.get('type') == "MULTI_SELECT":
#                 predicted_option = [option for option, score in predictions if score >= 0.75]
#                 is_correct = set(predicted_option) == set(item['intended_answer']) #compare sets
#                 confidence = None  # Kein einzelner Confidence-Wert relevant

#             #elif item.get('type') == "DATE":

#             #elif item.get('type') == "NUMBER":

#             #elif item.get('type') == "TEXT":

#             # Output incorrect predictions
#             if not is_correct:
#                 print(f"Text: {item['answer_text']}")
#                 print(f"Correct: {item['intended_answer']}, Predicted: {predicted_option}")
#                 if confidence is not None:
#                     print(f"Confidence: {round(confidence, 4)}")
#                 print()

#             if is_correct:
#                 correct_count += 1
#             total_count += 1

#     accuracy = correct_count / total_count if total_count > 0 else 0
#     return accuracy

In [55]:
# accuracy = predict_answers_finetuned(data)
# print(f"Accuracy: {accuracy * 100:.2f} %")

## Evaluate continuous text

## Interesting Findings

*   Prediction of names is very bad because names carry no deeper meaning --> fixed by checking for exact matches
  * Maybe implement a name interpreter later?
*   Prediction of numerical values (company size) is very bad

* QA pipelines
  * Pipelines 2 and 3 only reach an accuracy of approximately 60 %

