<a href="https://colab.research.google.com/github/croco22/CapstoneProjectTDS/blob/annelie/notebooks/Huggingface_QA_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 2: Evaluate Dataset

In [1]:
!pip install word2number

import json
import time
import google.generativeai as genai
from google.colab import userdata
import requests
from word2number import w2n
import re
import pandas as pd

# Gemini API Setup
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-1.5-flash')

# Read dataset file
url = 'https://raw.githubusercontent.com/croco22/CapstoneProjectTDS/refs/heads/main/qa_dataset.json'
data = pd.read_json(url)

data.head()

Collecting word2number
  Downloading word2number-1.1.zip (9.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: word2number
  Building wheel for word2number (setup.py) ... [?25l[?25hdone
  Created wheel for word2number: filename=word2number-1.1-py3-none-any.whl size=5568 sha256=a40e54850482354214d98c1f040be3ffb5892f68b965179bfc8a0aa7fcfe5317
  Stored in directory: /root/.cache/pip/wheels/cd/ef/ae/073b491b14d25e2efafcffca9e16b2ee6d114ec5c643ba4f06
Successfully built word2number
Installing collected packages: word2number
Successfully installed word2number-1.1


Unnamed: 0,type,question,options,intended_answer,context,timestamp
0,SINGLE_SELECT,Data processing consent,"[Yes, No]",Yes,"Absolutely, I'm totally okay with data process...",2024-12-26 13:54:58.593
1,SINGLE_SELECT,Data processing consent,"[Yes, No]",Yes,"Yeah, you have my full consent for data proces...",2024-12-26 13:54:58.593
2,SINGLE_SELECT,Data processing consent,"[Yes, No]",Yes,"Oh, data processing consent? Yes, you can go r...",2024-12-26 13:54:58.593
3,SINGLE_SELECT,Data processing consent,"[Yes, No]",Yes,Regarding your request for data processing con...,2024-12-26 13:54:58.593
4,SINGLE_SELECT,Data processing consent,"[Yes, No]",Yes,"Yes, please proceed with the data processing, ...",2024-12-26 13:54:58.593


In [2]:
def convert_numbers_in_text(text):
    # Regular expression to find number words contained in questionnaires
    pattern = r'(two thousand|two hundred one|two hundred|fifty-one|thirty-one|twenty-one|sixteen|fifteen|eleven|thirty|twenty|fifty|forty|sixty|ten|five|six|one)'
    # Interesting finding: Regex only works if longer words are in order before shorter that contain similar parts, e.g. fifty-one has to be in front of fifty to work as intended

    def convert(match):
        word = match.group(0)
        try:
            # Convert the word to number
            return str(w2n.word_to_num(word))
        except ValueError:
            return word

    # Replace all number words in the text with their integer equivalents
    converted_text = re.sub(pattern, convert, text, flags=re.IGNORECASE)

    # Now convert ranges like 'twenty to thirty' into '20-30'
    converted_text = re.sub(r'(\d+)\s*(to|and)\s*(\d+)', r'\1-\3', converted_text)

    # Replace text
    # Todo: Dafür noch ne bessere Lösung finden, das ist eig nur n Beispiel und geht auch bei ähnlichen Sätzen nicht
    converted_text = converted_text.replace('more than 2000', 'larger than 2000')
    converted_text = converted_text.replace('More than 2000', 'Larger than 2000')

    return converted_text


def is_exact_or_phrase_match(option, text):
    # Escape the option to handle special characters
    escaped_option = re.escape(option.strip())

    # Pattern to match the option as a full word or part of a phrase
    pattern = rf'\b(?:\w+\s+)*{escaped_option}(?:\s+\w+)*\b'

    # Search for the pattern in the text (case-insensitive)
    return re.search(pattern, text, re.IGNORECASE) is not None

In [3]:
from transformers import pipeline

qa_pipeline1 = pipeline("question-answering", model="deepset/roberta-base-squad2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
qa_pipeline2 = pipeline("question-answering", model='distilbert-base-cased-distilled-squad')

In [None]:
qa_pipeline3 = pipeline("question-answering", model='google-bert/bert-large-uncased-whole-word-masking-finetuned-squad')

## Evaluate Dataset with Pre-trained Models

In [4]:
from pickle import NONE
!pip install dateparser
import dateparser
from datetime import datetime
from datetime import timedelta

def predict_answers(df, pipeline):
    """
    Predict the answer for each row in the DataFrame.
    Prints only incorrectly predicted answers.
    """
    print("[INFO] Printing only incorrectly predicted answers.")
    correct_count = 0
    total_count = 0
    qa_pipeline = pipeline

    for _, row in df.iterrows():

        predictions = []

        # Regex-check only for single- and multi-select questions
        if row['options']:  # Evaluates to False if options is None or empty
            converted_context = convert_numbers_in_text(row['context'])

            for option in row['options']:
                # Check for exact match or part of a phrase
                exact_match = is_exact_or_phrase_match(option, converted_context)
                if exact_match:
                    predictions.append((option, 0.95))  # 95% confidence for exact match
                else:
                    result = qa_pipeline(question=row['question'], context=f"{converted_context} {option}")
                    predictions.append((option, result['score']))

        is_correct = False

        # Handle different question types
        if row['type'] == "SINGLE_SELECT":
            # Predict the option with the highest confidence
            predicted_option, confidence = max(predictions, key=lambda x: x[1])
            is_correct = predicted_option == row['intended_answer']

        elif row['type'] == "MULTI_SELECT":
            predicted_option = [option for option, score in predictions if score >= 0.95]
            is_correct = set(predicted_option) == set(row['intended_answer'])

        elif row['type'] == "DATE":
            try:
                # Basis-Timestamp aus der Dataframe-Spalte (Unix-Timestamp)
                base_timestamp = pd.Timestamp(row['timestamp'], unit='ms')

                # Extrahiere Zeitangabe aus dem Kontext
                extracted_time = qa_pipeline(question=row['question'], context=row['context'])['answer']

                # Konvertiere extrahierte Zeitangabe in Sekunden
                parsed_seconds = dateparser.parse(
                    extracted_time,
                    settings={'RELATIVE_BASE': base_timestamp.to_pydatetime()}
                )
                if not parsed_seconds:
                    raise ValueError(f"Unable to parse date from extracted time: {extracted_time}")

                predicted_option = parsed_seconds
                intended_seconds = int(row['intended_answer'])
                intended_date = base_timestamp + timedelta(seconds=intended_seconds)

                # Vergleich der vorhergesagten und intendierten Daten
                is_correct = predicted_option.date() == intended_date.date()

            except Exception as e:
                print(f"[ERROR] DATE question processing failed: {e}")

        elif row['type'] == "NUMBER":
            try:
                predicted_option = qa_pipeline(question=row['question'], context=row['context'])['answer']
                is_correct = predicted_option == row['intended_answer']
            except Exception as e:
                print(f"[ERROR] NUMBER question failed: {e}")

        elif row['type'] == "TEXT":
            continue
            # predicted_option = row['context']
            # is_correct = True

        # Output incorrect predictions
        if not is_correct:
            print(f"Context: {row['context']}")
            print(f"Correct: {row['intended_answer']}, Predicted: {predicted_option}")
            print()

        if is_correct:
            correct_count += 1
        total_count += 1

    # Calculate accuracy
    accuracy = correct_count / total_count if total_count > 0 else 0
    return accuracy


Collecting dateparser
  Downloading dateparser-1.2.0-py2.py3-none-any.whl.metadata (28 kB)
Downloading dateparser-1.2.0-py2.py3-none-any.whl (294 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/295.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dateparser
Successfully installed dateparser-1.2.0


In [None]:
accuracy = predict_answers(data, qa_pipeline1)
print(f"Accuracy: {accuracy * 100:.2f} %")

[INFO] Printing only incorrectly predicted answers.
Context: Okay, so when you're talking about the customer group, we're definitely thinking about folks who are either a wholesaler or a distributor, that’s who we're focused on here.
Correct: Wholesaler, Distributor, Predicted: End User

Context: Well, the customer group we need to consider in this case, is made up of both the wholesaler and the distributor types, you see.
Correct: Wholesaler, Distributor, Predicted: End User

Context: If we're looking at the different customer groups, the ones we're targeting are the wholesaler and the distributor, plain and simple.
Correct: Wholesaler, Distributor, Predicted: Consultant, Planner, Architect

Context: So, the customer group that we’ve got is comprised of, let me see, Consultants, naturally, then the Planners, and you can't forget the Architects, they're a big part of it.
Correct: Consultant, Planner, Architect, Predicted: Wholesaler, Distributor

Context: The various customer groups we

## Fine-tune SQUAD-Model

In [15]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
!pip install datasets
from datasets import Dataset

# Funktion zur Vorbereitung der Daten im SQuAD-Format
def prepare_squad_data(df):
    squad_data = {"data": []}

    for idx, row in df.iterrows():
        question = row["question"]
        context = row["context"]
        intended_answer = row["intended_answer"]
        if intended_answer is None: #TEXT questions are not used
            continue

        start_positions = []
        end_positions = []

        for answer in intended_answer:
            start = context.find(answer)
            if start != -1:
              start_positions.append(start)
              end_positions.append(start + len(answer))
            if answer not in context:
              print(f"Warnung: Antwort '{intended_answer}' nicht im Kontext enthalten. Überspringe Datensatz.")
              continue

        # Konvertieren in das SQuAD-Format
        squad_data["data"].append({
    "paragraphs": [
        {
            "context": context,
            "qas": [
                {
                    "question": question,
                    "id": f"q_{idx}",
                    "answers": [
                        {"text": answer, "answer_start": start} for answer in intended_answer
                    ],
                    "is_impossible": False
                        }
                    ]
                }
            ]
        })

    return squad_data



In [16]:
# Funktion zur Tokenisierung
def tokenize_squad_data(squad_data, tokenizer, max_length=512):
    """
    Tokenisiert das vorbereitete SQuAD-Dataset.
    """
    tokenized_examples = []

    for data in squad_data["data"]:
        for paragraph in data["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                question = qa["question"]
                answers = qa["answers"]
                start_positions = [answer["answer_start"] for answer in answers]
                answer_texts = [answer["text"] for answer in answers]

                # Tokenize Frage und Kontext
                tokenized_example = tokenizer(
                    question,
                    context,
                    max_length=max_length,
                    truncation="only_second",  # Kontext wird bei Überlänge abgeschnitten
                    padding="max_length",
                    return_offsets_mapping=True
                )

                # Berechne Token-Startpositionen der Antworten
                offsets = tokenized_example.pop("offset_mapping")
                token_start_positions = []
                token_end_positions = []
                for start_position, answer_text in zip(start_positions, answer_texts):
                    for i, (offset_start, offset_end) in enumerate(offsets):
                        if offset_start <= start_position < offset_end:
                            token_start_positions.append(i)
                            token_end_positions.append(i + len(answer_text))
                            break

                # Tokenisierte Daten speichern
                # Check if token_start_positions is not empty before accessing element 0
                if token_start_positions:
                    tokenized_example["start_positions"] = token_start_positions[0]
                    tokenized_example["end_positions"] = token_end_positions[0]
                    tokenized_examples.append(tokenized_example)
                else:
                    # Handle the case where the answer is not found in the context
                    print(f"Warning: Answer not found in context for question: {question}")
                    print(f"Intended answer: {answer_texts}")
                    print(f"Context: {context}")
    return tokenized_examples

In [24]:
# Test data nicht umwandeln
train_data, test_data = train_test_split(data, test_size=0.7, random_state=42)

# SQuAD-Daten vorbereiten
squad_data = prepare_squad_data(train_data)

# Tokenizer laden
tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")

# Tokenisierte Daten erstellen
tokenized_data = tokenize_squad_data(squad_data, tokenizer)

# Aufteilen in Trainings- und Validierungsdaten
train_data, val_data = train_test_split(tokenized_data, test_size=0.2, random_state=42)

# Dataset-Klasse für PyTorch
class SquadDataset:
    def __init__(self, data):
        self.data = Dataset.from_dict({k: [v] for k, v in data[0].items()})
        for item in data[1:]:
            self.data = self.data.add_item(item)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        return {
            "input_ids": item["input_ids"], # Return as NumPy array or list
            "attention_mask": item["attention_mask"], # Return as NumPy array or list
            "start_positions": item["start_positions"], # Return as NumPy array or list
            "end_positions": item["end_positions"] # Return as NumPy array or list
        }

# PyTorch Datasets erstellen
train_dataset = SquadDataset(train_data)
val_dataset = SquadDataset(val_data)
print(len(train_dataset))

Warnung: Antwort '1814400' nicht im Kontext enthalten. Überspringe Datensatz.
Warnung: Antwort '1814400' nicht im Kontext enthalten. Überspringe Datensatz.
Warnung: Antwort '1814400' nicht im Kontext enthalten. Überspringe Datensatz.
Warnung: Antwort '1814400' nicht im Kontext enthalten. Überspringe Datensatz.
Warnung: Antwort '1814400' nicht im Kontext enthalten. Überspringe Datensatz.
Warnung: Antwort '1814400' nicht im Kontext enthalten. Überspringe Datensatz.
Warnung: Antwort '1814400' nicht im Kontext enthalten. Überspringe Datensatz.
Warnung: Antwort 'Wholesaler, Distributor' nicht im Kontext enthalten. Überspringe Datensatz.
Warnung: Antwort 'Wholesaler, Distributor' nicht im Kontext enthalten. Überspringe Datensatz.
Warnung: Antwort 'Track project progress' nicht im Kontext enthalten. Überspringe Datensatz.
Warnung: Antwort 'Review sales data' nicht im Kontext enthalten. Überspringe Datensatz.
Warnung: Antwort 'Education sector' nicht im Kontext enthalten. Überspringe Datensatz

In [26]:
# Modell laden
model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

# TrainingArguments definieren
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="loss"
)

# Trainer erstellen
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [27]:
# Training starten
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,3.4596,2.892447
2,2.528,2.768064
3,2.4798,2.71767


TrainOutput(global_step=114, training_loss=3.0241787726419016, metrics={'train_runtime': 169.2828, 'train_samples_per_second': 5.317, 'train_steps_per_second': 0.673, 'total_flos': 235167081062400.0, 'train_loss': 3.0241787726419016, 'epoch': 3.0})

In [28]:
# Save model
from google.colab import drive
drive.mount('/content/drive')
!cp -r ./results /content/drive/MyDrive

Mounted at /content/drive


In [32]:
# Pfad zu deinem trainierten Modell
model_path = '/content/drive/MyDrive/results/checkpoint-114'

# Modell und Tokenizer laden
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


# Erstelle eine Pipeline mit dem trainierten Modell
qa_pipeline_squad = pipeline("question-answering", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [33]:
accuracy = predict_answers(test_data, qa_pipeline_squad)
print(f"Accuracy: {accuracy * 100:.2f} %")

[INFO] Printing only incorrectly predicted answers.
Context: For contact purposes, my phone number is +1-452-547-1970, I'm usually available during business hours or evenings, but just try and call then, thanks so much.
Correct: +1-452-547-1970, Predicted: 1970,



You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Context: We’ve got a couple of things in the works, you see, we’ll send an Email as a first step, and then follow that up with a Phone call to make sure everything’s clear.
Correct: ['Email', 'Phone'], Predicted: ['Email']

Context: Well, let me think a moment, ah yes, my business phone number is +91-315-620-2304, you can reach me there anytime during business hours.
Correct: +91-315-620-2304, Predicted: 2304,

Context: Of course, I understand the need for contact information, so yes, my number for business purposes is +91-957-634-5668, feel free to reach out whenever you need.
Correct: +91-957-634-5668, Predicted: 5668,

Context: Actually, for anything business related, you're best off calling my dedicated number, which I have made very easy to remember because it’s +1-372-816-8684, so don't hesitate to use that!
Correct: +1-372-816-8684, Predicted: 8684,

Context: "Okay, no problem at all! My business phone number, which you need for verification, is +33-735-372-6411. Just let me kno

## Fine-tune T5-Model (generative)

In [None]:
!pip install datasets
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer

# Prepare data
def prepare_data_t5(df):
    records = []
    for _, row in df.iterrows():
        question = row["question"]
        context = row["context"]
        intended_answer = row["intended_answer"]
        if intended_answer is None:
            intended_answer = row["context"] #TEXT questions

        if isinstance(intended_answer, list):
            target_text = ", ".join(intended_answer) # MULTI_SELECT: Combine answers
        else:
            target_text = str(intended_answer)  # Other types

        # Füge das formatierte Beispiel hinzu
        input_text = f"question: {question} context: {context}"
        records.append({"input_text": input_text, "target_text": target_text})

    return pd.DataFrame(records)

prepared_data = prepare_data_t5(data)

# Split the data
train_data, val_data = train_test_split(prepared_data, test_size=0.2, random_state=42)

# Convert to Huggingface Datasets
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Tokenize
model_name = "t5-small"  # Alternativen: "t5-base", "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    model_inputs = tokenizer(examples["input_text"], max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(examples["target_text"], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

print(tokenized_train_dataset)
print(len(tokenized_train_dataset))

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/1132 [00:00<?, ? examples/s]

Map:   0%|          | 0/283 [00:00<?, ? examples/s]

Dataset({
    features: ['input_text', 'target_text', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1132
})
1132


In [None]:
# Initialize model
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="epoch",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(


In [None]:
# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Verwendetes Gerät: {device}")

Verwendetes Gerät: cuda


In [None]:
# Start training
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,0.219423
2,No log,0.145591
3,No log,0.129408


TrainOutput(global_step=426, training_loss=1.4509462168518925, metrics={'train_runtime': 114.4024, 'train_samples_per_second': 29.685, 'train_steps_per_second': 3.724, 'total_flos': 459620757798912.0, 'train_loss': 1.4509462168518925, 'epoch': 3.0})

In [None]:
# Save model
from google.colab import drive
drive.mount('/content/drive')
!cp -r ./results /content/drive/MyDrive

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Generate pipeline
model_path = '/content/drive/MyDrive/results/checkpoint-426'
model = model_path
tokenizer = model_path

t5_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Device set to use cuda:0


In [None]:
predictions = t5_pipeline(tokenized_val_dataset["input_text"])

correct_predictions = sum([pred == target for pred, target in zip(predictions, tokenized_val_dataset["target_text"])])
accuracy = correct_predictions / len(predictions)
print(f"Accuracy auf dem Valuation-Dataset: {accuracy}")

Accuracy auf dem Valuation-Dataset: 0.0


## Falsches Model trainiert (Text-Klassifizierung)

In [None]:
# from sklearn.model_selection import train_test_split
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
# import torch
# from torch.utils.data import Dataset

# # Prepare new dataset with labels
# def prepare_data(df):
#     records = []
#     for _, row in df.iterrows():
#         question = row["question"]
#         context = row["context"]
#         intended_answer = row["intended_answer"]
#         options = row["options"]
#         if options is None:
#             options = row["intended_answer"] #Date und Number
#             if intended_answer is None:
#                 options = row["context"] #Text

#         for option in options:
#             label = int(option == intended_answer)
#             records.append({"text": f"{question} {context} {option}", "label": label})

#     return pd.DataFrame(records)

# prepared_df = prepare_data(data)

# print(prepared_df.size)
# pd.set_option('display.max_colwidth', None)
# prepared_df.head()

54186


Unnamed: 0,text,label
0,"Data processing consent Absolutely, I'm totally okay with data processing, so yes. Yes",1
1,"Data processing consent Absolutely, I'm totally okay with data processing, so yes. No",0
2,"Data processing consent Yeah, you have my full consent for data processing, that's a yes from me. Yes",1
3,"Data processing consent Yeah, you have my full consent for data processing, that's a yes from me. No",0
4,"Data processing consent Oh, data processing consent? Yes, you can go right ahead with that, no problem at all. Yes",1


In [None]:
# # Stratified sampling
# half_df,_ = train_test_split(
#     prepared_df,
#     train_size=0.2, #~10.000 Einträge
#     stratify=prepared_df["label"],
#     random_state=42,
# )

# # Split into training and validation data
# train_df, val_df = train_test_split(
#     half_df,
#     train_size=0.8,  # 80% Training, 20% Validation
#     stratify=half_df["label"],
#     random_state=42,
# )
# train_df.size

8668

In [None]:
# # Tokenize data
# model_checkpoint = "deepset/roberta-base-squad2"
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# def tokenize_function(examples):
#     return tokenizer(examples["text"].tolist(), padding="max_length", truncation=True, return_tensors="pt")

# train_encodings = tokenize_function(train_df)
# val_encodings = tokenize_function(val_df)

# # Extract labels
# train_labels = train_df["label"].values
# val_labels = val_df["label"].values

# # Create custom dataset
# class CustomDataset(Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels

#     def __len__(self):
#         return len(self.labels)

#     def __getitem__(self, idx):
#         item = {key: val[idx] for key, val in self.encodings.items()}
#         item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
#         return item

# train_dataset = CustomDataset(train_encodings, train_labels)
# val_dataset = CustomDataset(val_encodings, val_labels)

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
# # Initialize model
# model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# # Define training arguments
# training_args = TrainingArguments(
#     fp16=True,
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,  # Erhöht
#     per_device_eval_batch_size=16,
#     gradient_accumulation_steps=2,  # Optional: Für größere effektive Batches
#     num_train_epochs=3,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     logging_steps=500,  # Weniger Logs schreiben
#     save_steps=2000,  # Modelle seltener speichern
#     save_total_limit=2,
#     load_best_model_at_end=True,
#     warmup_steps=500,  # Optional: Lernrate langsam ansteigen lassen
#     lr_scheduler_type="cosine",  # Optional: Lernraten-Scheduler
# )


# # Initialize trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     tokenizer=tokenizer
# )

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [None]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Verwendetes Gerät: {device}")

Verwendetes Gerät: cuda


In [None]:
# # Start training
# trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,No log,0.083845
2,No log,0.039063


TrainOutput(global_step=405, training_loss=0.2922822928722994, metrics={'train_runtime': 217.2832, 'train_samples_per_second': 59.839, 'train_steps_per_second': 1.864, 'total_flos': 1712008660230144.0, 'train_loss': 0.2922822928722994, 'epoch': 2.981549815498155})

In [None]:
# Save model
from google.colab import drive
drive.mount('/content/drive')
!cp -r ./results /content/drive/MyDrive


Mounted at /content/drive


In [None]:
# from transformers import pipeline
# from transformers import AutoModelForSequenceClassification, AutoTokenizer

# # Pfad zu deinem trainierten Modell
# model_path = '/content/drive/MyDrive/results/checkpoint-405'

# # Modell und Tokenizer laden
# model = AutoModelForSequenceClassification.from_pretrained(model_path)
# tokenizer = AutoTokenizer.from_pretrained(model_path)


# # Erstelle eine Pipeline mit dem trainierten Modell
# qa_pipeline_ft = pipeline("question-answering", model=model, tokenizer=tokenizer)

Device set to use cuda:0
The model 'DistilBertForSequenceClassification' is not supported for question-answering. Supported models are ['AlbertForQuestionAnswering', 'BartForQuestionAnswering', 'BertForQuestionAnswering', 'BigBirdForQuestionAnswering', 'BigBirdPegasusForQuestionAnswering', 'BloomForQuestionAnswering', 'CamembertForQuestionAnswering', 'CanineForQuestionAnswering', 'ConvBertForQuestionAnswering', 'Data2VecTextForQuestionAnswering', 'DebertaForQuestionAnswering', 'DebertaV2ForQuestionAnswering', 'DistilBertForQuestionAnswering', 'ElectraForQuestionAnswering', 'ErnieForQuestionAnswering', 'ErnieMForQuestionAnswering', 'FalconForQuestionAnswering', 'FlaubertForQuestionAnsweringSimple', 'FNetForQuestionAnswering', 'FunnelForQuestionAnswering', 'GPT2ForQuestionAnswering', 'GPTNeoForQuestionAnswering', 'GPTNeoXForQuestionAnswering', 'GPTJForQuestionAnswering', 'IBertForQuestionAnswering', 'LayoutLMv2ForQuestionAnswering', 'LayoutLMv3ForQuestionAnswering', 'LEDForQuestionAnswer

In [None]:
# accuracy = predict_answers(data, qa_pipeline_ft)
# print(f"Accuracy of fine-tuned model: {accuracy * 100:.2f} %")

[INFO] Printing only incorrectly predicted answers.


TypeError: TextClassificationPipeline.__call__() missing 1 required positional argument: 'inputs'

## Evaluate continuous text

## Interesting Findings

*   Prediction of names very bad, because no deeper meaning --> fixed by checking for exact matches
  * Maybe implement name interpreter later?
*   Numerical values (size of company) prediction very bad

* QA Pipelines
  * Pipeline 2 und 3 haben nur eine accuracy von ungefähr 60 %

