## BERT based approach

In [17]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer, util
import os

### Load and prepare data

In [18]:
data = pd.read_csv("full_data_filtered.csv")
data.rename(columns={'text': 'text', 'folder': 'company'}, inplace=True)
data.drop(columns=['folderfiletext'], inplace=True, errors='ignore')
print(f"Loaded {len(data)} documents.")

Loaded 528 documents.


In [19]:
climate_model_name = "climatebert/distilroberta-base-climate-detector"
climate_model = AutoModelForSequenceClassification.from_pretrained(climate_model_name)
climate_tokenizer = AutoTokenizer.from_pretrained(climate_model_name, max_len=512)
climate_pipe = pipeline("text-classification", model=climate_model, tokenizer=climate_tokenizer, device=0)

Device set to use mps:0


In [20]:
legal_model = SentenceTransformer("nlpaueb/legal-bert-base-uncased")

No sentence-transformers model found with name nlpaueb/legal-bert-base-uncased. Creating a new one with mean pooling.


In [21]:
seed_phrases = [
    # Climate litigation-specific
    "We have been named in lawsuits related to climate change.",
    "Legal proceedings seek damages for climate-related impacts.",
    "Municipalities have filed suits against the company over emissions.",
    "We face public nuisance and product liability claims.",
    "These lawsuits include claims under state consumer protection statutes.",
    "Ongoing litigation alleges misleading ESG disclosures.",
    "Litigation risks could materially impact our financial condition.",
    "We have received investigative requests from government entities.",
    "Subject to congressional inquiry on climate policies.",
    "The SEC is reviewing ESG-related statements for greenwashing.",
    "Facing regulatory action over environmental claims.",
    "Increased legal and regulatory scrutiny related to climate risks.",
    "Lawsuits allege our ESG disclosures are false or misleading.",
    "Greenwashing claims brought under consumer protection laws.",
    "Allegations of deceptive climate-related marketing.",
    "Legal challenges to our pipeline and infrastructure permits.",
    "Efforts to shut down operations through litigation.",
    "Court action on easement compliance threatens project continuity.",
    "These are unprecedented legal theories with uncertain outcomes.",
    "We cannot predict the impact of climate litigation.",
    "Significant uncertainty surrounds the scope of legal claims.",
    
    # General litigation-related
    "The company is subject to ongoing litigation.",
    "Legal risks may impact our financial performance.",
    "We are defending against multiple lawsuits.",
    "Pending legal actions could result in significant costs.",
    "The outcome of litigation is inherently uncertain.",
    "Litigation could adversely affect our business operations.",
    "We are involved in regulatory enforcement proceedings.",
    "Certain lawsuits have been dismissed or remain pending.",
    "We may incur legal liabilities or penalties.",
    "Litigation expenses are expected to increase in the future."
]


## Tokenizing and chunking

In [22]:
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

In [23]:
def tokenize_and_chunk(row, tokenizer, max_tokens=512, text_col='text'):
    sentences = re.split(r'(?<=[.!?]) +', row[text_col])
    chunks = []
    current_chunk = []
    current_tokens = 0

    for sentence in sentences:
        token_count = len(tokenizer.tokenize(sentence))

        if current_tokens + token_count <= max_tokens:
            current_chunk.append(sentence)
            current_tokens += token_count
        else:
            if current_chunk:
                chunk_row = row.to_dict()
                chunk_row[text_col] = ' '.join(current_chunk)
                chunks.append(chunk_row)
            current_chunk = [sentence]
            current_tokens = token_count

    if current_chunk:
        chunk_row = row.to_dict()
        chunk_row[text_col] = ' '.join(current_chunk)
        chunks.append(chunk_row)

    return chunks

In [24]:
expanded_rows = []
for _, row in data.iterrows():
    expanded_rows.extend(tokenize_and_chunk(row, tokenizer))

Token indices sequence length is longer than the specified maximum sequence length for this model (685 > 512). Running this sequence through the model will result in indexing errors


In [25]:
df = pd.DataFrame(expanded_rows)

## Finding Relevant Chunks

In [26]:
print("Classifying chunks with ClimateBERT...")
climate_results = []

# Process in batches to avoid memory issues
batch_size = 32
for i in tqdm(range(0, len(df), batch_size)):
    batch_texts = df["text"][i:i+batch_size].tolist()
    batch_results = climate_pipe(batch_texts, padding=True, truncation=True)
    climate_results.extend(batch_results)

Classifying chunks with ClimateBERT...


100%|██████████| 4515/4515 [35:36<00:00,  2.11it/s]


In [30]:
#add the climate results to the dataframe
df['climate_label'] = [result['label'] for result in climate_results]
df['climate_score'] = [result['score'] for result in climate_results]

In [34]:
climate_df = df[df['climate_label'] == 'yes'].copy()

In [36]:
print(f"{len(climate_df)} chunks passed the climate relevance filter.")

# print the percentage of chunks that passed the climate relevance filter
total_chunks = len(df)
print(f"Percentage of chunks that passed the climate relevance filter: {len(climate_df) / total_chunks * 100:.2f}%")

44086 chunks passed the climate relevance filter.
Percentage of chunks that passed the climate relevance filter: 30.52%


In [37]:
print("Encoding seed phrases with LegalBERT...")
seed_embeddings = legal_model.encode(seed_phrases, convert_to_tensor=True, normalize_embeddings=True)

print("Encoding climate-relevant chunks with LegalBERT...")
climate_embeddings_legal = legal_model.encode(climate_df["text"].tolist(), batch_size=16, convert_to_tensor=True, normalize_embeddings=True)

Encoding seed phrases with LegalBERT...
Encoding climate-relevant chunks with LegalBERT...


In [38]:
similarity_matrix = util.cos_sim(climate_embeddings_legal, seed_embeddings)
max_similarities = similarity_matrix.max(dim=1).values
climate_df["litigation_score"] = max_similarities.cpu().numpy()

In [5]:
import os
import pandas as pd
#load in litigation df 
csv = 'BERT_results.csv'
if os.path.exists(csv):
    climate_df = pd.read_csv(csv)

In [7]:
litigation_df = climate_df[climate_df["litigation_score"] > 0.88].copy()  # Adjust threshold as needed
print(f"{len(litigation_df)} chunks flagged as climate litigation.")

432 chunks flagged as climate litigation.


In [51]:
litigation_df

Unnamed: 0,company,year,text,climate_label,climate_score,litigation_score
18,AEP,2014,"6 Repositioning Efforts\nIn April 2012, we ini...",yes,0.997724,0.879293
19,AEP,2014,ENVIRONMENTAL ISSUES\nWe are implementing a su...,yes,0.998389,0.886057
21,AEP,2014,"8\nIn addition, we are in the process of obtai...",yes,0.997970,0.879882
24,AEP,2014,Supreme Court. Nearly all of the states in wh...,yes,0.996630,0.875335
27,AEP,2014,Supreme Court was granted in June 2013. Separ...,yes,0.995758,0.875759
...,...,...,...,...,...,...
144230,XOM,2024,This is due to the exclusion from capitalized ...,yes,0.995250,0.875785
144240,XOM,2024,For additional information on natural gas liqu...,yes,0.990576,0.882572
144262,XOM,2025,The continued adoption of similar legal practi...,yes,0.998388,0.876660
144431,XOM,2025,This is due to the exclusion from capitalized ...,yes,0.995250,0.875785


In [52]:
print("\nSample climate litigation chunks:")
for idx, row in litigation_df.head(3).iterrows():
    print(f"Company: {row['company']}")
    print(f"Climate Score: {row['climate_score']:.3f}")
    print(f"Litigation Score: {row['litigation_score']:.3f}")
    print(f"Text: {row['text'][:200]}...")
    print("-" * 80)


Sample climate litigation chunks:
Company: AEP
Climate Score: 0.998
Litigation Score: 0.879
Text: 6 Repositioning Efforts
In April 2012, we initiated a process to identify strategic repositioning opportunities and efficiencies that resulted in sustainable cost savings.  This process included evalu...
--------------------------------------------------------------------------------
Company: AEP
Climate Score: 0.998
Litigation Score: 0.886
Text: ENVIRONMENTAL ISSUES
We are implementing a substantial capital investment program and incurring additional operational costs to comply with environmental control requirements.  We will need to make ad...
--------------------------------------------------------------------------------
Company: AEP
Climate Score: 0.998
Litigation Score: 0.880
Text: 8
In addition, we are in the process of obtaining permits and other necessary regulatory approvals for either the conversion of some of our coal units to natural gas or installing emission control equ...

In [53]:
# Save the final DataFrame to a CSV file
output_file = "BERT_results.csv"
litigation_df.to_csv(output_file, index=False)