In [1]:
from bs4 import BeautifulSoup
import os


In [2]:
def extract_excerpt_from_txt(file_path):
    """Pull the excerpt paragraph out of an HTML-formatted text file.

    The file is read as UTF-8 and parsed with BeautifulSoup's "html.parser".
    The first ``<p class="excerpt">`` element is looked up and its text is
    returned with surrounding whitespace stripped from each text fragment.

    Args:
        file_path: Path of the file to read.

    Returns:
        The excerpt text, or ``None`` when no matching ``<p>`` element exists.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        markup = handle.read()

    # The whole file is already in memory, so parsing can happen after the
    # handle has been closed.
    parsed = BeautifulSoup(markup, "html.parser")
    node = parsed.find("p", class_="excerpt")
    if not node:
        return None
    return node.get_text(strip=True)

In [4]:
clause_dir = "../data/clause_boxes"

clause_texts = []
clause_names = []

# os.listdir() yields entries in arbitrary, platform-dependent order; sorting
# keeps clause_texts / clause_names in the same order on every run.
for fname in sorted(os.listdir(clause_dir)):
    if not fname.endswith(".txt"):
        continue
    full_path = os.path.join(clause_dir, fname)
    excerpt = extract_excerpt_from_txt(full_path)
    # Files without an excerpt paragraph (None) or with an empty one are skipped.
    if excerpt:
        clause_texts.append(excerpt)
        clause_names.append(fname)


In [6]:
from transformers import AutoTokenizer, AutoModel
import torch

# Relative path handed to both from_pretrained() calls below, so the tokenizer
# files and the encoder weights are expected in the same directory.
model_path = "../contract_climate_bert" 
tokenizer = AutoTokenizer.from_pretrained(model_path)
# NOTE: the warning recorded in this cell's output reports the RobertaModel
# pooler weights (roberta.pooler.dense.*) as newly initialized, i.e. they are
# not in the checkpoint. embed_text() pools `last_hidden_state` itself and
# never reads the pooler output.
model = AutoModel.from_pretrained(model_path)


  return self.fget.__get__(instance, owner)()
Some weights of RobertaModel were not initialized from the model checkpoint at ../contract_climate_bert and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def embed_text(text, tokenizer, model, method, specific_span=(1, 25)):
    """Encode ``text`` and pool the model's last hidden states into an embedding.

    Args:
        text: Input handed straight to ``tokenizer`` (a single string, or a
            list of strings to embed as one padded batch).
        tokenizer: Callable invoked with ``return_tensors="pt"``, truncation,
            padding and ``max_length=512``; must return a mapping of tensors.
        model: Callable that accepts those tensors as keyword arguments and
            returns an object exposing ``last_hidden_state`` with shape
            (batch, tokens, hidden).
        method: Pooling strategy:
            ``"cls"``      - hidden state of the first token;
            ``"mean"``     - average over non-padding tokens;
            ``"max"``      - element-wise maximum over non-padding tokens;
            ``"concat"``   - ``"cls"`` and ``"mean"`` joined on the feature axis;
            ``"specific"`` - average over the token positions in ``specific_span``.
        specific_span: ``(start, end)`` token positions (end exclusive) used by
            the ``"specific"`` method.

    Returns:
        A NumPy array with all size-1 dimensions squeezed away (1-D for a
        single input text).

    Raises:
        ValueError: If ``method`` is not one of the names above, or if
            ``specific_span`` selects no non-padding token for some input.
    """
    # Reject a bad method name before paying for tokenization and a forward pass.
    if method not in ("cls", "mean", "max", "concat", "specific"):
        raise ValueError(f"Unknown embedding method: {method}")

    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Run on whatever device the model lives on (when it advertises one).
    device = getattr(model, "device", None)
    inputs = {
        key: value.to(device) if device is not None and hasattr(value, "to") else value
        for key, value in encoded.items()
    }

    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state

    # (batch, tokens, 1) weights: 1 for real tokens, 0 for padding. Without this,
    # batched inputs would have padding positions mixed into mean/max pooling.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        attention_mask = torch.ones(hidden.shape[:2], device=hidden.device)
    mask = attention_mask.unsqueeze(-1).to(hidden.dtype)

    def masked_mean(states, weights):
        # clamp() avoids 0/0 if a row has no real tokens at all.
        return (states * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)

    if method == "cls":
        pooled = hidden[:, 0, :]
    elif method == "mean":
        pooled = masked_mean(hidden, mask)
    elif method == "max":
        # Padding positions are pushed to -inf so they can never win the max.
        pooled = hidden.masked_fill(mask == 0, float("-inf")).max(dim=1).values
    elif method == "concat":
        pooled = torch.cat([hidden[:, 0, :], masked_mean(hidden, mask)], dim=1)
    else:  # method == "specific"
        start, end = specific_span
        span_mask = mask[:, start:end, :]
        # An empty span used to produce a silent NaN embedding; fail loudly instead.
        if bool((span_mask.sum(dim=1) == 0).any()):
            raise ValueError(
                f"specific_span {specific_span} selects no tokens for at least one input"
            )
        pooled = masked_mean(hidden[:, start:end, :], span_mask)

    # .cpu() makes the NumPy conversion work for models on an accelerator.
    return pooled.squeeze().cpu().numpy()

In [13]:
file_names = []
clause_texts = []
methods = ["cls", "mean", "max", "concat", "specific"]
embeddings = {method: [] for method in methods}

# os.listdir() yields entries in arbitrary, platform-dependent order. Sorting
# fixes the row order of every embeddings[method] list (and of file_names /
# clause_texts, which are appended in lockstep) so repeated runs agree.
for fname in sorted(os.listdir(clause_dir)):
    if not fname.endswith(".txt"):
        continue
    path = os.path.join(clause_dir, fname)
    excerpt = extract_excerpt_from_txt(path)
    # Files without an excerpt paragraph (None) or with an empty one are skipped.
    if not excerpt:
        continue
    file_names.append(fname)
    clause_texts.append(excerpt)
    # Each embed_text() call tokenizes and runs the model again, so every
    # excerpt is encoded once per pooling method.
    for method in methods:
        emb = embed_text(excerpt, tokenizer, model, method)
        embeddings[method].append(emb)

In [14]:
# Output directory: the save cell writes one "<method>_embeddings.npy" file per
# pooling method plus "clause_texts.pkl" here.
embeddings_dir = "../CC_BERT/CC_embeddings"

In [16]:
import pickle 
import numpy as np

# Create the output directory if needed; an existing one is left untouched.
os.makedirs(embeddings_dir, exist_ok=True)

# Stack each method's per-clause vectors row-wise and store one .npy per method.
for method in methods:
    stacked = np.vstack(embeddings[method])
    out_path = os.path.join(embeddings_dir, f"{method}_embeddings.npy")
    np.save(out_path, stacked)

# File names and excerpt texts, in the same order the vectors were appended.
metadata = {
    "file_names": file_names,
    "texts": clause_texts
}
metadata_path = os.path.join(embeddings_dir, "clause_texts.pkl")
with open(metadata_path, "wb") as f:
    pickle.dump(metadata, f)