In [13]:
# prepare_dataset.py
import pandas as pd
import glob
import os
from sklearn.model_selection import train_test_split

In [14]:
data_path = "../data/data_synthetic"  # folder where your synthetic txt files are stored
output_path = "../data/data_processed"
os.makedirs(output_path, exist_ok=True)

In [15]:
# Step 2: Collect and label all .txt files
files = glob.glob(os.path.join(data_path, "*.txt"))
data = []


In [16]:
if not files:
    raise FileNotFoundError(f"No .txt files found in '{data_path}'. Run make_synthetic_data.py first.")

for f in files:
    with open(f, "r", encoding="utf-8") as file:
        text = file.read().strip()
        # Extract label from filename prefix, e.g. "invoice_12.txt" -> "invoice"
        filename = os.path.basename(f)
        label = filename.split("_")[0].lower()
        data.append({"filename": filename, "text": text, "label": label})

In [17]:
df = pd.DataFrame(data)
print("ðŸ“Š Label distribution:")
print(df["label"].value_counts())

ðŸ“Š Label distribution:
label
reminder     200
order        200
invoice      200
contract     200
complaint    200
Name: count, dtype: int64


In [18]:
from tqdm import tqdm
import re

# Text preprocessing utilities
def preprocess_text(text, min_words=4):
    """Clean a single text string.
    - remove newlines
    - remove HTML tags
    - normalize whitespace
    - remove unwanted characters while keeping common punctuation and currency symbols
    - lower-case
    - return None for very short texts
    """
    if text is None:
        return None
    # ensure string
    if not isinstance(text, str):
        text = str(text)
    # Remove newline characters
    text = text.replace('\n', ' ')
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)
    # Keep letters, numbers, whitespace and common punctuation ($ â‚¬ % . , - /)
    text = re.sub(r"[^\w\s$â‚¬%.,/-]", ' ', text)
    # Lowercase & strip
    text = text.strip().lower()
    # Require a minimum number of words
    if len(text.split()) < min_words:
        return None
    return text


def apply_preprocessing(df, text_column='text', min_words=4):
    """Apply preprocessing to a dataframe column and drop empty results."""
    processed = []
    for t in tqdm(df[text_column].fillna('').astype(str), desc='Preprocessing'):
        processed.append(preprocess_text(t, min_words=min_words))
    df[text_column] = processed
    # Drop rows where preprocessing returned None
    df = df[df[text_column].notna()].reset_index(drop=True)
    return df

# Apply preprocessing before splitting the dataset
df = apply_preprocessing(df, text_column='text', min_words=4)
print(f"After preprocessing: {len(df)} samples")
print(df['label'].value_counts())


Preprocessing: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1000/1000 [00:00<00:00, 30806.04it/s]

After preprocessing: 1000 samples
label
reminder     200
order        200
invoice      200
contract     200
complaint    200
Name: count, dtype: int64





In [19]:
# Step 3: Save full labeled dataset
df.to_csv(os.path.join(output_path, "all_data.csv"), index=False, encoding="utf-8")


In [20]:
# Step 4: Split into train, validation, test sets (80 / 10 / 10)
# train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)
# val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42)

In [21]:

# train_df.to_csv(os.path.join(output_path, "train.csv"), index=False, encoding="utf-8")
# val_df.to_csv(os.path.join(output_path, "val.csv"), index=False, encoding="utf-8")
# test_df.to_csv(os.path.join(output_path, "test.csv"), index=False, encoding="utf-8")


In [22]:
print("\nâœ… Data preparation complete!")
# print(f"Train set: {len(train_df)} | Validation set: {len(val_df)} | Test set: {len(test_df)}")
# print(f"Labeled CSVs saved in '{output_path}/'")


âœ… Data preparation complete!


In [23]:
df

Unnamed: 0,filename,text,label
0,reminder_172.txt,letzte mahnung vor aussenstellung rechnung re...,reminder
1,reminder_166.txt,erste mahnung mahnung zur rechnung nr. re-8676...,reminder
2,reminder_199.txt,erste mahnung mahnung zur rechnung nr. re-7506...,reminder
3,order_138.txt,bestellung bestellnummer po-2024-3137 bestell...,order
4,order_110.txt,dringende bestellung - eilt bestellnummer ei...,order
...,...,...,...
995,invoice_136.txt,abonnement-rechnung rechnung abo-14446 abrech...,invoice
996,order_135.txt,bestellung bestellnummer po-2024-3134 bestell...,order
997,complaint_2.txt,qualitÃ¤tsreklamation lieferung vom 11.11.2025...,complaint
998,reminder_143.txt,freundliche erinnerung sehr geehrte damen und ...,reminder


# Importing Dataset from Huggingface 

In [24]:
from datasets import load_dataset
import pandas as pd
import os



In [25]:
# Load the dataset from Hugging Face
dataset = load_dataset("Aoschu/donut_model_data_for_german_invoice")


In [26]:
print("Dataset structure:", dataset)
print("\nAvailable splits:", list(dataset.keys()))

Dataset structure: DatasetDict({
    train: Dataset({
        features: ['image', 'ground_truth'],
        num_rows: 97
    })
    validation: Dataset({
        features: ['image', 'ground_truth'],
        num_rows: 14
    })
    test: Dataset({
        features: ['image', 'ground_truth'],
        num_rows: 18
    })
})

Available splits: ['train', 'validation', 'test']


In [27]:
print(dataset['train'][0])

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1414x2000 at 0x309178F40>, 'ground_truth': '{"gt_parse": {"Der Name der Firma": "PaulCheung", "Die Adresse der Firma": "Paul CheungMayerhofer-Allee 812345 Freiberg", "Telefonnummer": "0123 5678-90", "Rechnungsdatum": "1234", "Summe": "7.735,00\\u20ac", "Der Name der Bank": "Freiberuflerbank", "IBAN": "DE12 3456 7890 1234 5678 90"}}'}


In [28]:
from PIL import Image

example = dataset["train"][3]
img = example["image"]
img.show()  # opens in your image viewer


In [29]:
print(dataset["train"].column_names)
example["ground_truth"]


['image', 'ground_truth']


'{"gt_parse": {"Rechnungsdatum": "21.07.2021", "Falligkeitsdatum": "04.08.2021", "Summe": "595,00:", "Rechnungsnummer": "257"}}'

In [30]:
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'ground_truth'],
        num_rows: 97
    })
    validation: Dataset({
        features: ['image', 'ground_truth'],
        num_rows: 14
    })
    test: Dataset({
        features: ['image', 'ground_truth'],
        num_rows: 18
    })
})