In [1]:
#!pip install -q transformers datasets torch scikit-learn
#!pip install -q evaluate

In [2]:
# 2. Imports
import pandas as pd
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments
)
import evaluate
import numpy as np


In [3]:
# Load the labelled contracts table from the CSV file.
csv_path = "/content/scontracts.csv"
df = pd.read_csv(csv_path)

In [4]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,contract_id,contract_type,contract_text
0,C0001,License Agreement,This License Agreement is made on 2023-04-26 b...
1,C0002,License Agreement,This License Agreement is made on 2011-11-09 b...
2,C0003,Employment Agreement,This Employment Agreement is entered into by W...
3,C0004,Non-Disclosure Agreement,This Non-Disclosure Agreement (NDA) is made be...
4,C0005,Vendor Agreement,This Vendor Agreement is made on 1984-12-30 be...


In [5]:
 #Keep only needed columns
# Only the raw text and its contract type are used from here on.
df = df.loc[:, ['contract_text', 'contract_type']]


In [6]:
# Encode contract types as integer class ids.
# Series.unique() yields values in order of first appearance, so without the
# sort the ids would change whenever the CSV rows are reordered. Sorting the
# class names pins a stable, reproducible label <-> id mapping.
label2id = {label: i for i, label in enumerate(sorted(df['contract_type'].unique()))}
id2label = {i: label for label, i in label2id.items()}
df['label'] = df['contract_type'].map(label2id)

In [7]:
# Hold out 20% of the contracts, stratified on the label so both splits keep
# the same class proportions; the fixed random_state makes the split repeatable.
label_column = df['label']
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['contract_text'],
    label_column,
    test_size=0.2,
    random_state=42,
    stratify=label_column,
)


In [8]:
from datasets import Dataset

# Wrap each split in a Hugging Face Dataset with 'text' and 'label' columns.
train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})
test_df = pd.DataFrame({'text': test_texts, 'label': test_labels})
# The split Series carry the shuffled row index of the original frame, and
# from_pandas would otherwise store it as an extra `__index_level_0__` column.
# preserve_index=False keeps only the two intended columns.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [9]:
# Load the tokenizer that belongs to the pretrained checkpoint.
# `model_name` is reused further down when the model itself is loaded.
model_name = "nlpaueb/legal-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
def tokenize(batch, max_length=256):
    """Tokenize a batch of contract texts into fixed-length model inputs.

    Args:
        batch: Object indexed with "text" that yields the raw contract strings.
        max_length: Token count each sequence is truncated/padded to. Defaults
            to 256, the value that used to be hard-coded; raise it to keep more
            of each contract.

    Returns:
        The result of calling the module-level `tokenizer` on the batch texts.
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

# Tokenize both splits batch-wise and rebind the names to the mapped datasets.
train_dataset, test_dataset = (
    split.map(tokenize, batched=True) for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [11]:
# Load the pretrained checkpoint with a sequence-classification head sized to
# the label set; the label maps are passed along to the model as well.
head_config = {
    "num_labels": len(label2id),
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForSequenceClassification.from_pretrained(model_name, **head_config)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Metric objects used by compute_metrics during evaluation.
accuracy, f1 = (evaluate.load(metric_name) for metric_name in ("accuracy", "f1"))

def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy and weighted F1.

    Args:
        eval_pred: Pair that unpacks into (logits, labels).

    Returns:
        Dict with an "accuracy" entry and an "f1" entry.
    """
    scores, references = eval_pred
    # The predicted class is the position of the largest logit on the last axis.
    predicted = np.argmax(scores, axis=-1)
    shared = {"predictions": predicted, "references": references}
    return {
        "accuracy": accuracy.compute(**shared)["accuracy"],
        "f1": f1.compute(average="weighted", **shared)["f1"],
    }

In [13]:
# Fine-tuning hyper-parameters. Evaluation and checkpointing both run once per
# epoch, and the best checkpoint is reloaded when training finishes.
training_options = dict(
    output_dir="./legalbert_contracts",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # lower this if the GPU runs out of memory
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=50,
    push_to_hub=False,
)
training_args = TrainingArguments(**training_options)

In [14]:
# Fine-tune the classifier.
# NOTE: the held-out split is also the per-epoch evaluation set in this setup.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated in recent transformers releases;
    # `processing_class=` is its replacement keyword.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()


  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mobengstoney[0m ([33mobengstoney-data-teqs[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.921838,0.375,0.240741
2,2.140600,1.214441,1.0,1.0
3,1.471800,0.987058,1.0,1.0




TrainOutput(global_step=120, training_loss=1.6965452035268147, metrics={'train_runtime': 2253.3654, 'train_samples_per_second': 0.213, 'train_steps_per_second': 0.053, 'total_flos': 63151189032960.0, 'train_loss': 1.6965452035268147, 'epoch': 3.0})

In [15]:
# Write the fine-tuned model and its tokenizer into the same directory.
export_dir = "./legalbert_contract_classifier"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)

print("✅ Training complete! Model saved to ./legalbert_contract_classifier")

✅ Training complete! Model saved to ./legalbert_contract_classifier
