In [1]:
# prepare_dataset.py
import pandas as pd
import glob
import os
from sklearn.model_selection import train_test_split

In [2]:
data_path = "../data/data_synthetic"  # folder where your synthetic txt files are stored
output_path = "../data/data_processed"
os.makedirs(output_path, exist_ok=True)

In [3]:
# Step 2: Collect and label all .txt files
files = glob.glob(os.path.join(data_path, "*.txt"))
data = []


In [4]:
if not files:
    raise FileNotFoundError(f"No .txt files found in '{data_path}'. Run make_synthetic_data.py first.")

for f in files:
    with open(f, "r", encoding="utf-8") as file:
        text = file.read().strip()
        # Extract label from filename prefix, e.g. "invoice_12.txt" -> "invoice"
        filename = os.path.basename(f)
        label = filename.split("_")[0].lower()
        data.append({"filename": filename, "text": text, "label": label})

In [5]:
df = pd.DataFrame(data)
print("üìä Label distribution:")
print(df["label"].value_counts())

üìä Label distribution:
label
contract     100
complaint    100
invoice      100
order        100
reminder     100
Name: count, dtype: int64


In [6]:
from tqdm import tqdm
import re

# Text preprocessing utilities
def preprocess_text(text, min_words=4):
    """Clean a single text string.
    - remove newlines
    - remove HTML tags
    - normalize whitespace
    - remove unwanted characters while keeping common punctuation and currency symbols
    - lower-case
    - return None for very short texts
    """
    if text is None:
        return None
    # ensure string
    if not isinstance(text, str):
        text = str(text)
    # Remove newline characters
    text = text.replace('\n', ' ')
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)
    # Keep letters, numbers, whitespace and common punctuation ($ ‚Ç¨ % . , - /)
    text = re.sub(r"[^\w\s$‚Ç¨%.,/-]", ' ', text)
    # Lowercase & strip
    text = text.strip().lower()
    # Require a minimum number of words
    if len(text.split()) < min_words:
        return None
    return text


def apply_preprocessing(df, text_column='text', min_words=4):
    """Apply preprocessing to a dataframe column and drop empty results."""
    processed = []
    for t in tqdm(df[text_column].fillna('').astype(str), desc='Preprocessing'):
        processed.append(preprocess_text(t, min_words=min_words))
    df[text_column] = processed
    # Drop rows where preprocessing returned None
    df = df[df[text_column].notna()].reset_index(drop=True)
    return df

# Apply preprocessing before splitting the dataset
df = apply_preprocessing(df, text_column='text', min_words=4)
print(f"After preprocessing: {len(df)} samples")
print(df['label'].value_counts())


Preprocessing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:00<00:00, 72410.47it/s]

After preprocessing: 500 samples
label
contract     100
complaint    100
invoice      100
order        100
reminder     100
Name: count, dtype: int64





In [7]:
# Step 3: Save full labeled dataset
df.to_csv(os.path.join(output_path, "all_data.csv"), index=False, encoding="utf-8")


In [8]:
# Step 4: Split into train, validation, test sets (80 / 10 / 10)
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42)


In [9]:

train_df.to_csv(os.path.join(output_path, "train.csv"), index=False, encoding="utf-8")
val_df.to_csv(os.path.join(output_path, "val.csv"), index=False, encoding="utf-8")
test_df.to_csv(os.path.join(output_path, "test.csv"), index=False, encoding="utf-8")


In [10]:
print("\n‚úÖ Data preparation complete!")
print(f"Train set: {len(train_df)} | Validation set: {len(val_df)} | Test set: {len(test_df)}")
print(f"Labeled CSVs saved in '{output_path}/'")


‚úÖ Data preparation complete!
Train set: 400 | Validation set: 50 | Test set: 50
Labeled CSVs saved in '../data/data_processed/'


In [11]:
df

Unnamed: 0,filename,text,label
0,contract_7.txt,kaufvertrag verk√§ufer m√∂rth ag k√§ufer klotz ...,contract
1,complaint_58.txt,reklamation falsche lieferung unsere bestell...,complaint
2,contract_15.txt,arbeitsvertrag arbeitgeber schmid k√ºng co. ...,contract
3,invoice_66.txt,rechnung - plath kunde dipl.-ing. leonardo sc...,invoice
4,complaint_64.txt,mangelanzeige bei der lieferung vom 20.01.2025...,complaint
...,...,...,...
495,complaint_55.txt,beschwerde √ºber lieferung am 12.06.2024 haben ...,complaint
496,complaint_41.txt,"reklamation sehr geehrte damen und herren, die...",complaint
497,invoice_43.txt,rechnung rechnungsnummer 1772 rechnungsdatum ...,invoice
498,contract_18.txt,dienstleistungsvertrag zwischen pohl maier kg ...,contract
