In [49]:
import pandas as pd
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import re
import string
import pdfplumber
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
custom_stopwords = set(STOP_WORDS)
nlp = spacy.load("en_core_web_sm")

In [50]:
import pandas as pd
df=pd.read_csv("../data/scontracts.csv")
df.head()

Unnamed: 0,contract_id,contract_type,contract_text
0,C0001,License Agreement,This License Agreement is made on 2023-04-26 b...
1,C0002,License Agreement,This License Agreement is made on 2011-11-09 b...
2,C0003,Employment Agreement,This Employment Agreement is entered into by W...
3,C0004,Non-Disclosure Agreement,This Non-Disclosure Agreement (NDA) is made be...
4,C0005,Vendor Agreement,This Vendor Agreement is made on 1984-12-30 be...


In [51]:
df.shape

(200, 3)

In [52]:
def clean_contract_text(text):
    """Clean contract text for better model performance"""
    text = str(text).lower()  # normalize case (optional: keep uppercase acronyms)
    
    # Remove references in square brackets
    text = re.sub(r'\[.*?\]', '', text)
    
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # Remove HTML tags (if any)
    text = re.sub(r'<.*?>+', '', text)
    
    # Remove extra whitespace and newlines
    text = re.sub(r'\s+', ' ', text).strip()
    
    # DO NOT remove numbers (keep dates, monetary values, durations)
    # DO NOT remove punctuation fully — keep . : ; () for legal structure
    # Instead, maybe only remove stray characters like @,#,$
    text = re.sub(r'[%s]' % re.escape("@#$%^&*_+=~`"), '', text)
    
    return text

def extract_text_from_pdf(file_path):
    """Extract text from PDF using pdfplumber"""
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + " "
    return text


In [53]:
# --- Step 3: Apply cleaning to the contract text ---
df["cleaned_text"] = df["contract_text"].apply(clean_contract_text)

# --- Step 4: Save prepared dataset ---
df.to_csv("contracts_cleaned.csv", index=False)

df.head()

Unnamed: 0,contract_id,contract_type,contract_text,cleaned_text
0,C0001,License Agreement,This License Agreement is made on 2023-04-26 b...,this license agreement is made on 2023-04-26 b...
1,C0002,License Agreement,This License Agreement is made on 2011-11-09 b...,this license agreement is made on 2011-11-09 b...
2,C0003,Employment Agreement,This Employment Agreement is entered into by W...,this employment agreement is entered into by w...
3,C0004,Non-Disclosure Agreement,This Non-Disclosure Agreement (NDA) is made be...,this non-disclosure agreement (nda) is made be...
4,C0005,Vendor Agreement,This Vendor Agreement is made on 1984-12-30 be...,this vendor agreement is made on 1984-12-30 be...


In [58]:
# To prevent spacy from removing important words
important_words = {"may", "january", "february", "march", "april", "june", "july",
                   "august", "september", "october", "november", "december",
                   "agreement", "contract", "party", "shall", "effective", "term"}
custom_stopwords = custom_stopwords.difference(important_words)


In [55]:
# tokenization, lematatization, stop words
def token_lemema_nonstop(text, remove_stopwords=False):
     doc=nlp(text)
     output=[token.lemma_ for token in doc if token not in custom_stopwords]
     output= ' '.join(output)
     return output

In [56]:
df.cleaned_text= df.cleaned_text.apply(token_lemema_nonstop)

In [57]:
# stop words


df.head()








Unnamed: 0,contract_id,contract_type,contract_text,cleaned_text
0,C0001,License Agreement,This License Agreement is made on 2023-04-26 b...,this license agreement be make on 2023 - 04 - ...
1,C0002,License Agreement,This License Agreement is made on 2011-11-09 b...,this license agreement be make on 2011 - 11 - ...
2,C0003,Employment Agreement,This Employment Agreement is entered into by W...,this employment agreement be enter into by wal...
3,C0004,Non-Disclosure Agreement,This Non-Disclosure Agreement (NDA) is made be...,this non - disclosure agreement ( nda ) be mak...
4,C0005,Vendor Agreement,This Vendor Agreement is made on 1984-12-30 be...,this vendor agreement be make on 1984 - 12 - 3...


In [60]:
# vectorization
from sklearn.feature_extraction.text import  TfidfVectorizer

In [66]:
tfidf=TfidfVectorizer(stop_words="english", ngram_range=(1,2), min_df=.2)

In [67]:
tfidf_cv=tfidf.fit_transform(df.cleaned_text)

x_df=pd.DataFrame(tfidf_cv.toarray(), columns=tfidf.get_feature_names_out())

In [68]:
x_df

Unnamed: 0,agree,agreement,agreement enter,agreement make,enter,grant,make,right,sign
0,0.000000,0.209436,0.000000,0.513779,0.000000,0.496672,0.445871,0.496672,0.0
1,0.000000,0.209436,0.000000,0.513779,0.000000,0.496672,0.445871,0.496672,0.0
2,0.000000,0.285740,0.677626,0.000000,0.677626,0.000000,0.000000,0.000000,0.0
3,0.695697,0.305404,0.000000,0.000000,0.000000,0.000000,0.650180,0.000000,0.0
4,0.556770,0.244417,0.000000,0.599593,0.000000,0.000000,0.520343,0.000000,0.0
...,...,...,...,...,...,...,...,...,...
195,0.695697,0.305404,0.000000,0.000000,0.000000,0.000000,0.650180,0.000000,0.0
196,0.000000,0.285740,0.677626,0.000000,0.677626,0.000000,0.000000,0.000000,0.0
197,0.000000,0.209436,0.000000,0.513779,0.000000,0.496672,0.445871,0.496672,0.0
198,0.556770,0.244417,0.000000,0.599593,0.000000,0.000000,0.520343,0.000000,0.0


In [70]:
# modeling 
y=df.contract_type
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix

In [76]:
# split data into training and testing sets 20% for testing
X_train, X_test, y_train, y_test = train_test_split(x_df, y, test_size=0.2, random_state=42)


#intialize the model
svm_model=LinearSVC()

#train the model
svm_model.fit(X_train, y_train)

#predict the model
y_pred=svm_model.predict(X_test)

#evaluate the model
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))








                          precision    recall  f1-score   support

    Consulting Agreement       0.00      0.00      0.00         2
    Employment Agreement       1.00      1.00      1.00         4
     Franchise Agreement       1.00      1.00      1.00         6
         Lease Agreement       0.50      1.00      0.67         5
       License Agreement       1.00      1.00      1.00         7
          Loan Agreement       1.00      1.00      1.00         3
Non-Disclosure Agreement       1.00      1.00      1.00         4
   Partnership Agreement       0.00      0.00      0.00         5
 Service Level Agreement       0.50      1.00      0.67         2
        Vendor Agreement       1.00      1.00      1.00         2

                accuracy                           0.82        40
               macro avg       0.70      0.80      0.73        40
            weighted avg       0.74      0.82      0.77        40

[[0 0 0 0 0 0 0 0 2 0]
 [0 4 0 0 0 0 0 0 0 0]
 [0 0 6 0 0 0 0 0 0 0]
 [0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [80]:
#save the model and the vectorizer
import joblib
joblib.dump(svm_model, "svm_contract_classifier.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']

In [None]:
# use pretrained model from hugging face
# I wil fine tune LegalBert for this task
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
import pandas as pd

# Load tokenizer for LegalBERT
tokenizer = BertTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

class ContractDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Load your dataset
df = pd.read_csv("./data/scontracts.csv")

# Encode labels
label2id = {label: i for i, label in enumerate(df["contract_type"].unique())}
id2label = {i: label for label, i in label2id.items()}
df["label"] = df["contract_type"].map(label2id)

# Train/test split
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

train_dataset = ContractDataset(train_df["text"].tolist(), train_df["label"].tolist(), tokenizer)
test_dataset = ContractDataset(test_df["text"].tolist(), test_df["label"].tolist(), tokenizer)

# Load LegalBERT for classification
model = BertForSequenceClassification.from_pretrained(
    "nlpaueb/legal-bert-base-uncased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)


In [None]:
training_args = TrainingArguments(
    output_dir="./legalbert_contracts",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

trainer.train()


In [None]:
# save the model
model.save_pretrained("./legalbert_model")
tokenizer.save_pretrained("./legalbert_model")
