In [None]:
import os
import io
import pandas as pd
import numpy as np
import torch
import joblib
import re
import unicodedata
import contractions
import spacy
from textblob import TextBlob
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from google.cloud import storage
import warnings
warnings.filterwarnings('ignore')

# -------------------------------
# Configs
# -------------------------------
# Hugging Face checkpoint name; passed to both AutoModel.from_pretrained and
# AutoTokenizer.from_pretrained below.
MODEL_NAME = "bert-base-uncased"
# Token length every encoded example is padded/truncated to (tokenizer max_length).
MAX_LEN = 128
# Batch size used for both the training and validation DataLoaders.
BATCH_SIZE = 8
# Number of passes train_model makes over the training loader.
EPOCHS = 1
# Local directory that receives model.pth, tokenizer.pkl and label_encoder.pkl.
ARTIFACTS_DIR = "model_artifacts"
# GCS bucket the input CSV is downloaded from and the artifacts are uploaded to.
BUCKET_NAME = 'contract-risk-classifier'

# Create the artifacts directory up front; exist_ok keeps re-running the cell safe.
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

# -------------------------------
# Preprocessing
# -------------------------------
# Shared spaCy pipeline: data_cleaning uses it for tokenization, lemmas and the
# default stop-word list. The "ner" and "parser" components are disabled; the
# code visible in this notebook never reads their output.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

def data_cleaning(sentence, legal_terms=None, custom_stop_words=None):
    """Normalize one raw text value into a space-joined string of lemmas.

    Pipeline, in order: ASCII-fold the text (NFKD, dropping non-ASCII
    characters), expand contractions, lowercase, strip HTML-style tags, strip
    URLs and e-mail addresses, drop every character that is not ``a-z`` or
    whitespace, collapse whitespace, lemmatize with the module-level spaCy
    pipeline ``nlp``, remove stop words and one-character lemmas, and finally
    apply the ``legal_terms`` substitutions.

    Args:
        sentence: Text to clean. ``None`` and float NaN (what pandas yields
            for an empty CSV cell) produce ``''``; any other non-string value
            is converted with ``str()`` before cleaning.
        legal_terms: Optional mapping ``lemma -> replacement`` applied to the
            surviving lemmas. Defaults to
            ``{'agreement': 'contract', 'parties': 'party'}``.
        custom_stop_words: Optional iterable of extra stop words merged with
            spaCy's default list. Defaults to ``{'shall', 'hereinafter'}``.

    Returns:
        str: The cleaned tokens joined by single spaces (may be empty).
    """
    # Missing values carry no text. Handle them before touching any text API,
    # otherwise unicodedata.normalize raises TypeError on a float NaN.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return ''
    if not isinstance(sentence, str):
        sentence = str(sentence)

    if legal_terms is None:
        legal_terms = {'agreement': 'contract', 'parties': 'party'}
    if custom_stop_words is None:
        custom_stop_words = {'shall', 'hereinafter'}
    stop_words = nlp.Defaults.stop_words.union(custom_stop_words)

    sentence = unicodedata.normalize('NFKD', sentence).encode('ascii', 'ignore').decode('ascii')
    sentence = contractions.fix(sentence)
    sentence = sentence.lower()
    # Strip tags BEFORE URLs: `http\S+` is greedy up to the next whitespace, so
    # on `<a href="http://x">terms</a>` it would otherwise swallow the anchor
    # text and leave a dangling `<a href="` fragment behind.
    sentence = re.sub(r'<[^>]+>', '', sentence)
    sentence = re.sub(r'http\S+|www\S+|[\w\.-]+@[\w\.-]+', '', sentence)
    sentence = re.sub(r'[^a-z\s]', '', sentence)
    sentence = re.sub(r'\s+', ' ', sentence).strip()

    doc = nlp(sentence)
    # Stop-word and length filters are applied to the lemma, not the surface form.
    tokens = [token.lemma_ for token in doc if token.lemma_ not in stop_words and len(token.lemma_) > 1]
    tokens = [legal_terms.get(token, token) for token in tokens]
    return ' '.join(tokens)

# -------------------------------
# Dataset
# -------------------------------
class ContractDataset(Dataset):
    """Map-style dataset pairing text strings with integer class labels.

    Each item is tokenized on access with ``tokenizer.encode_plus`` and
    padded/truncated to ``MAX_LEN`` tokens.
    """

    def __init__(self, texts, labels, tokenizer):
        """Store the parallel ``texts``/``labels`` sequences and the tokenizer."""
        self.texts, self.labels, self.tokenizer = texts, labels, tokenizer

    def __len__(self):
        """Return the number of examples (length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode example ``idx``.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` as flattened tensors,
            plus ``labels`` as a ``torch.long`` tensor.
        """
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(self.texts[idx], **tokenizer_options)

        # return_tensors='pt' adds a leading batch axis; flatten drops it so the
        # DataLoader can stack items itself.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# -------------------------------
# Model
# -------------------------------
class ContractRiskClassifier(nn.Module):
    """Pretrained encoder (``MODEL_NAME``) with a single linear classification head.

    Every encoder parameter is frozen except those of the last encoder layer;
    the linear head is trainable.
    """

    def __init__(self, n_classes):
        """Load the pretrained encoder and build a head with ``n_classes`` outputs."""
        super().__init__()
        backbone = AutoModel.from_pretrained(MODEL_NAME)

        # Freeze the whole backbone first ...
        for weight in backbone.parameters():
            weight.requires_grad = False
        # ... then re-enable gradients for the final encoder layer only.
        final_layer = backbone.encoder.layer[-1]
        for weight in final_layer.parameters():
            weight.requires_grad = True

        self.bert = backbone
        self.classifier = nn.Linear(backbone.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the hidden state at sequence position 0."""
        hidden_states = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        first_token = hidden_states[:, 0, :]
        return self.classifier(first_token)

# -------------------------------
# Train & Eval
# -------------------------------
def train_model(model, train_loader, val_loader, optimizer, loss_fn, device,
                target_names=None, epochs=None):
    """Train ``model`` and print a classification report on the validation set.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns per-class logits.
        train_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        val_loader: Same batch layout as ``train_loader``; used only for the
            final evaluation.
        optimizer: Optimizer stepped once per training batch.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss.
        device: Device every batch tensor is moved to.
        target_names: Optional display names for the report, indexed by
            encoded label id (assumes ids are ``0..n-1``, as produced by
            ``LabelEncoder``). When omitted, the notebook-level
            ``label_encoder.classes_`` is used if that global exists;
            otherwise the report shows the raw label ids.
        epochs: Optional number of training epochs; defaults to the
            module-level ``EPOCHS``.

    Returns:
        None. Progress and the validation report are printed.
    """
    if epochs is None:
        epochs = EPOCHS
    if target_names is None:
        # Backward-compatible fallback: the original code read the global
        # directly and raised NameError when it was not defined.
        encoder = globals().get("label_encoder")
        if encoder is not None:
            target_names = encoder.classes_

    for epoch in range(epochs):
        # Set inside the loop so training mode is guaranteed for every epoch.
        model.train()
        print(f"\nEpoch {epoch+1} Training...")
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            output = model(input_ids, mask)
            loss = loss_fn(output, labels)
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1} complete.")

    print("\nEvaluating on validation set...")
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            output = model(input_ids, mask)
            predictions = output.argmax(dim=1)
            preds.extend(predictions.cpu().tolist())
            targets.extend(labels.cpu().tolist())

    report_kwargs = {}
    if target_names is not None:
        # Passing `labels` explicitly keeps the report from raising when a
        # class happens to be absent from the validation targets/predictions.
        report_kwargs = {
            'labels': list(range(len(target_names))),
            'target_names': target_names,
        }
    print("\nValidation Report:\n", classification_report(targets, preds, **report_kwargs))

# -------------------------------
# Upload to GCS
# -------------------------------
def upload_to_gcs(local_path, blob_name):
    """Upload the file at ``local_path`` to ``BUCKET_NAME`` as ``blob_name``.

    A new ``storage.Client`` is created on every call. Prints a confirmation
    line once the upload call returns.
    """
    destination = storage.Client().get_bucket(BUCKET_NAME).blob(blob_name)
    destination.upload_from_filename(local_path)
    print(f"Uploaded: {local_path} --> gs://{BUCKET_NAME}/{blob_name}")

# -------------------------------
# Main
# -------------------------------
if __name__ == "__main__":
    # Seed the RNGs first so the classifier-head initialisation and the
    # training DataLoader's shuffle order repeat on a fresh-kernel re-run.
    # (GPU kernels may still introduce small nondeterminism.)
    SEED = 42
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    print("Loading CSV from GCS...")
    client = storage.Client()
    bucket = client.get_bucket(BUCKET_NAME)
    blob = bucket.blob("cleaned_contract_data.csv")
    data_bytes = blob.download_as_bytes()
    data = pd.read_csv(io.BytesIO(data_bytes))
    # Keep at most the first 1000 rows of each of the three risk classes.
    privacy_data = data[data['risk'] == 'Privacy'].head(1000)
    ip_data = data[data['risk'] == 'IP'].head(1000)
    compliance_data = data[data['risk'] == 'Compliance'].head(1000)
    df = pd.concat([privacy_data, ip_data, compliance_data])
    print(f"Loaded {len(df)} rows")

    print("Cleaning text...")
    df['cleaned_text'] = df['text'].apply(data_cleaning)

    print("Encoding labels...")
    label_encoder = LabelEncoder()
    df['label'] = label_encoder.fit_transform(df['risk'])

    print("Splitting dataset...")
    # Stratified 80/20 split so each class keeps the same share in both parts.
    train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=SEED)

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    print("Creating datasets...")
    # .tolist() drops the (non-contiguous) DataFrame index so the dataset can
    # index examples positionally.
    train_data = ContractDataset(train_df['cleaned_text'].tolist(), train_df['label'].tolist(), tokenizer)
    val_data = ContractDataset(val_df['cleaned_text'].tolist(), val_df['label'].tolist(), tokenizer)
    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=BATCH_SIZE)

    print("Initializing model...")
    model = ContractRiskClassifier(n_classes=len(label_encoder.classes_))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Hand the optimizer only the parameters that can receive gradients; the
    # frozen encoder weights would never be updated anyway.
    trainable_params = [param for param in model.parameters() if param.requires_grad]
    optimizer = torch.optim.Adam(trainable_params, lr=2e-5)
    loss_fn = nn.CrossEntropyLoss()

    print("Training model...")
    train_model(model, train_loader, val_loader, optimizer, loss_fn, device)

    # -------------------------------
    # Save Artifacts Locally
    # -------------------------------
    print("\nSaving model artifacts locally...")
    torch.save(model.state_dict(), f"{ARTIFACTS_DIR}/model.pth")
    # NOTE(review): a pickled tokenizer is presumably tied to the installed
    # transformers version; tokenizer.save_pretrained() would be more portable.
    # Kept as a pickle because the uploaded blob is named tokenizer.pkl.
    joblib.dump(tokenizer, f"{ARTIFACTS_DIR}/tokenizer.pkl")
    joblib.dump(label_encoder, f"{ARTIFACTS_DIR}/label_encoder.pkl")

    # -------------------------------
    # Upload to GCS (bucket root)
    # -------------------------------
    print("\nUploading artifacts to GCS bucket root...")
    upload_to_gcs(f"{ARTIFACTS_DIR}/model.pth", "model.pth")
    upload_to_gcs(f"{ARTIFACTS_DIR}/tokenizer.pkl", "tokenizer.pkl")
    upload_to_gcs(f"{ARTIFACTS_DIR}/label_encoder.pkl", "label_encoder.pkl")

    print("\n🎉 All done. Artifacts uploaded to GCS bucket root.")

Loading CSV from GCS...
Loaded 3000 rows
Cleaning text...
Encoding labels...
Splitting dataset...
Loading tokenizer...
Creating datasets...
Initializing model...
Training model...

Epoch 1 Training...
Epoch 1 complete.

Evaluating on validation set...

Validation Report:
               precision    recall  f1-score   support

  Compliance       1.00      0.99      1.00       200
          IP       0.96      0.99      0.97       200
     Privacy       0.98      0.95      0.97       200

    accuracy                           0.98       600
   macro avg       0.98      0.98      0.98       600
weighted avg       0.98      0.98      0.98       600


Saving model artifacts locally...

Uploading artifacts to GCS bucket root...
Uploaded: model_artifacts/model.pth --> gs://contract-risk-classifier/model.pth
Uploaded: model_artifacts/tokenizer.pkl --> gs://contract-risk-classifier/tokenizer.pkl
Uploaded: model_artifacts/label_encoder.pkl --> gs://contract-risk-classifier/label_encoder.pkl

🎉 All done. Artifacts uploaded to GCS bucket root.

In [47]:
# Example clause to classify
test_sentence = "The contractor shall be liable  for any breach of confidentiality disclosed during the term of this agreement."

# Apply the same normalization that was applied to the training text
cleaned = data_cleaning(test_sentence)

# Encode to fixed-length tensors (same tokenizer settings as ContractDataset)
encoding = tokenizer.encode_plus(
    cleaned,
    return_tensors='pt',
    return_attention_mask=True,
    truncation=True,
    padding='max_length',
    max_length=MAX_LEN,
    add_special_tokens=True,
)

# Place both tensors on the model's device
input_ids, attention_mask = (
    encoding['input_ids'].to(device),
    encoding['attention_mask'].to(device),
)

# Forward pass in eval mode without gradient tracking
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask)
# The batch holds a single example, so take element 0 of the argmax.
pred_label_id = outputs.argmax(dim=1).cpu().numpy()[0]

# Map the class id back to its original risk name
predicted_label = label_encoder.inverse_transform([pred_label_id])[0]

print(f"🔎 Input: {test_sentence}")
print(f"📊 Predicted Risk Level: {predicted_label}")

🔎 Input: The contractor shall be liable  for any breach of confidentiality disclosed during the term of this agreement.
📊 Predicted Risk Level: Privacy


In [53]:
# Display the distinct values of the `risk` column in the sampled frame.
df['risk'].unique()

array(['Privacy', 'IP', 'Compliance'], dtype=object)