In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Select device: prefer MPS on macOS, then CUDA, then CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available(): 
    device = torch.device("cuda")               
else:
    device = torch.device("cpu")

print("Using:", device)




Using: mps


In [4]:
# Use a German BERT model from Hugging Face
model_name = "dbmdz/bert-base-german-cased"

# Load tokenizer and model for sequence classification
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,  # adjust if your dataset has a different number of labels
    ignore_mismatched_sizes=True,
)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31102, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [5]:
import pandas as pd

df = pd.read_csv('../data/data_processed/all_data.csv')
df.head()

Unnamed: 0,filename,text,label
0,contract_7.txt,kaufvertrag verkäufer mörth ag käufer klotz ...,contract
1,complaint_58.txt,reklamation falsche lieferung unsere bestell...,complaint
2,contract_15.txt,arbeitsvertrag arbeitgeber schmid küng co. ...,contract
3,invoice_66.txt,rechnung - plath kunde dipl.-ing. leonardo sc...,invoice
4,complaint_64.txt,mangelanzeige bei der lieferung vom 20.01.2025...,complaint


In [6]:
df = df[['text','label']]
df.head()

Unnamed: 0,text,label
0,kaufvertrag verkäufer mörth ag käufer klotz ...,contract
1,reklamation falsche lieferung unsere bestell...,complaint
2,arbeitsvertrag arbeitgeber schmid küng co. ...,contract
3,rechnung - plath kunde dipl.-ing. leonardo sc...,invoice
4,mangelanzeige bei der lieferung vom 20.01.2025...,complaint


In [7]:
df=df.dropna()

In [8]:
from sklearn.preprocessing import LabelEncoder
# Initialize the label encoder
label_encoder = LabelEncoder()
# Encode the labels
df['label'] = label_encoder.fit_transform(df['label'])
df.head()

Unnamed: 0,text,label
0,kaufvertrag verkäufer mörth ag käufer klotz ...,1
1,reklamation falsche lieferung unsere bestell...,0
2,arbeitsvertrag arbeitgeber schmid küng co. ...,1
3,rechnung - plath kunde dipl.-ing. leonardo sc...,2
4,mangelanzeige bei der lieferung vom 20.01.2025...,0


In [9]:
from sklearn.model_selection import train_test_split
# Step 4: Split into train, validation, test sets (80 / 10 / 10)
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42)
# No further splitting here — we already have pre-split train/test CSVs
print(f'Training set size: {len(train_df)}')
print(f'Testing set size: {len(test_df)}')
print(f'Validating set size: {len(val_df)}')

Training set size: 400
Testing set size: 50
Validating set size: 50


In [10]:
from datasets import DatasetDict, Dataset
# convert the merged dataframe to a dictionary
data_train_dict = train_df.to_dict('list')
dataset_train = Dataset.from_dict(data_train_dict)
data_train_dict = test_df.to_dict('list')
dataset_test = Dataset.from_dict(data_train_dict)
data_train_dict = val_df.to_dict('list')
dataset_validation = Dataset.from_dict(data_train_dict)
# create a dataset dictionary
dataset = DatasetDict({'train': dataset_train,'test':dataset_test,'validation':dataset_validation})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 400
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 50
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 50
    })
})

In [11]:
text_p = []
max_length=512
def tokenize_data(example):
    try:
        return tokenizer(str(example['text']),
                         padding='max_length',
                         truncation=True,
                         max_length=max_length,
                      )
    #truncation=True, padding=True ,max_length=128, return_overflowing_tokens=True,
    except:
        print("Error")

In [12]:
dataset = dataset.map(tokenize_data ,remove_columns=["text"])

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [13]:
from transformers import TrainingArguments
model_path = "./../models/bert_finetuned_company_dataset"
training_args = TrainingArguments(
    model_path,
    num_train_epochs=3,                    # Better results
    save_total_limit=1,
    learning_rate=3e-5,
    save_strategy="epoch",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,         # Eval can handle more
    gradient_accumulation_steps=2,         # Effective batch_size=16
    dataloader_num_workers=0,              # Simpler, fewer bugs
    fp16=False,
    logging_steps=50,
    report_to="none"
)

In [15]:
#number 32218
train_dataset = dataset['train']
eval_dataset = dataset['test']


In [16]:
import numpy as np
from evaluate import load
from sklearn.metrics import precision_recall_fscore_support

metric = load("accuracy")

preds = []

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    preds.append(eval_pred)
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true=labels, y_pred=predictions, average='weighted'
    )

    accuracy = metric.compute(predictions=predictions, references=labels)

    return {
        'accuracy': accuracy['accuracy'],  # <- note: dict from `evaluate`
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

Using the latest cached version of the module from /Users/harsh/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Tue Nov  4 03:33:53 2025) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.


In [17]:
from transformers import Trainer

trainer = Trainer(
    model=model, 
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
    
)


In [18]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.089995,0.8,0.666667,0.8,0.72
2,0.302800,0.008862,0.8,0.667857,0.8,0.720513
3,0.302800,0.005734,0.8,0.667857,0.8,0.720513


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=75, training_loss=0.20503752569357553, metrics={'train_runtime': 189.5759, 'train_samples_per_second': 6.33, 'train_steps_per_second': 0.396, 'total_flos': 315738936115200.0, 'train_loss': 0.20503752569357553, 'epoch': 3.0})