In [209]:
import pandas as pd
import re
from transformers import AutoTokenizer
import openai
import time

In [96]:
few_shot_examples = 'groundtruth_classifications.xlsx'
text_data = 'full_data_filtered.csv'

data = pd.read_csv(text_data)
examples = pd.read_excel(few_shot_examples)


In [97]:
data.columns

Index(['folder', 'year', 'text', 'folderfiletext'], dtype='object')

In [98]:
#cleaning up the examples
examples = examples[examples['File name'].notna()]

#remove the note column
examples = examples.drop(columns=['Note'])

In [99]:
#extract the year from filename; it will be two digits in this pattern: -XX-

examples['year'] = examples['File name'].str.extract(r'-(\d{2})-') 

In [100]:
#turn yeses into 1s and nos into 0s
examples = examples.replace({'Yes': 1, 'No': 0})

  examples = examples.replace({'Yes': 1, 'No': 0})


In [101]:
litigation_examples = examples.drop(columns=['Climate', 'Litigation', 'General risk', 'Specific lawsuit(s)', 'File name'])

In [102]:
litigation_examples

Unnamed: 0,Company,Climate Litigation,Paragraph,year
0,AIG,0,Pricing for our products is subject to our abi...,19
1,AIG,0,We are exposed to certain risks if we are unab...,19
2,AIG,0,If our businesses do not perform well and/or t...,19
3,AIG,0,We recognize that climate change has implicati...,19
4,Chevron,0,Petroleum industry operations and profitabilit...,24
...,...,...,...,...
56,United Airlines,1,"In addition, the Company believes it is possib...",24
57,Peadbody,1,The plaintiffs are the governing bodies of a v...,13
58,Conoco Philipps,1,"For example, in June 2007, the New York Office...",23
59,Conoco Philipps,1,"Beginning in 2017, governmental and other enti...",23


In [103]:
litigation_examples.rename(columns={'Paragraph': 'text', 'Company': 'company', 'Climate Litigation': 'climate_litigation'}, inplace=True)

In [105]:
data.rename(columns={'text': 'text', 'folder': 'company'}, inplace=True)
data.drop(columns=['folderfiletext'], inplace=True, errors='ignore')

In [106]:
data

Unnamed: 0,company,year,text
0,AEP,2014,EX-13 20 ye13aepar.htm ANNUAL REPORT ye13aepar...
1,AEP,2015,XML 119 R17.htm IDEA: XBRL DOCUMENT v2.4.1.9 B...
2,AEP,2016,EX-13 9 ye15aepar.htm ANNUAL REPORT Exhibit 20...
3,AEP,2017,EX-13 10 aep10kfrex1320164q.htm ANNUAL REPORT ...
4,AEP,2018,EX-13 16 aep10kfrex1320174q.htm ANNUAL REPORT ...
...,...,...,...
523,XOM,2021,"SECURITIES AND EXCHANGE COMMISSION WASHINGTON,..."
524,XOM,2022,"SECURITIES AND EXCHANGE COMMISSION WASHINGTON,..."
525,XOM,2023,"SECURITIES AND EXCHANGE COMMISSION WASHINGTON,..."
526,XOM,2024,"SECURITIES AND EXCHANGE COMMISSION WASHINGTON,..."


In [107]:
litigation_examples

Unnamed: 0,company,climate_litigation,text,year
0,AIG,0,Pricing for our products is subject to our abi...,19
1,AIG,0,We are exposed to certain risks if we are unab...,19
2,AIG,0,If our businesses do not perform well and/or t...,19
3,AIG,0,We recognize that climate change has implicati...,19
4,Chevron,0,Petroleum industry operations and profitabilit...,24
...,...,...,...,...
56,United Airlines,1,"In addition, the Company believes it is possib...",24
57,Peadbody,1,The plaintiffs are the governing bodies of a v...,13
58,Conoco Philipps,1,"For example, in June 2007, the New York Office...",23
59,Conoco Philipps,1,"Beginning in 2017, governmental and other enti...",23


### Chunking the text

In [108]:
tokenizer = AutoTokenizer.from_pretrained("nomic-ai/nomic-embed-text-v1")

In [122]:
def tokenize_and_chunk(row, tokenizer, max_tokens=512, text_col='text'):
    sentences = re.split(r'(?<=[.!?]) +', row[text_col])
    chunks = []
    current_chunk = []
    current_tokens = 0

    for sentence in sentences:
        token_count = len(tokenizer.tokenize(sentence))

        if current_tokens + token_count <= max_tokens:
            current_chunk.append(sentence)
            current_tokens += token_count
        else:
            if current_chunk:
                chunk_row = row.to_dict()
                chunk_row[text_col] = ' '.join(current_chunk)
                chunks.append(chunk_row)
            current_chunk = [sentence]
            current_tokens = token_count

    if current_chunk:
        chunk_row = row.to_dict()
        chunk_row[text_col] = ' '.join(current_chunk)
        chunks.append(chunk_row)

    return chunks

In [111]:
expanded_rows = []
for _, row in data.iterrows():
    expanded_rows.extend(tokenize_and_chunk(row, tokenizer))

Token indices sequence length is longer than the specified maximum sequence length for this model (44218 > 8192). Running this sequence through the model will result in indexing errors


In [123]:
df = pd.DataFrame(expanded_rows)

In [124]:
groundtruth_expanded = []
for _, row in litigation_examples.iterrows():
    groundtruth_expanded.extend(tokenize_and_chunk(row, tokenizer))

In [125]:
groundtruth_df = pd.DataFrame(groundtruth_expanded)

## Using an LLM to assess the chunks

In [214]:
OPEN_ROUTER_KEY = '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'

In [215]:
client = openai.OpenAI(
    api_key=OPEN_ROUTER_KEY,
    base_url="https://openrouter.ai/api/v1"
)

In [216]:
SYSTEM_PROMPT = """You are a legal and environmental disclosure expert. Your task is to determine whether a paragraph of text qualifies as climate litigation.

Climate litigation refers to legal actions that materially concern climate change science, policy, or law. These include, but are not limited to:
- Lawsuits targeting false or misleading climate claims (e.g. greenwashing)
- Legal actions over a company’s contribution to climate-related impacts
- Efforts to force climate alignment through human rights or fiduciary duty arguments
- Failure to disclose climate-related risks or impacts
- Breaches of climate-related regulations
- Litigation seeking damages for harms caused by climate change
- Legal challenges to regulatory approvals on the basis of climate misalignment

Your classification must be binary:
- climate_litigation: 1 if the paragraph relates to litigation that is specifically about climate change
- climate_litigation: 0 otherwise

Be especially careful not to classify the following as climate litigation:
- Environmental lawsuits unrelated to climate change, such as:
  - Pollution from toxic substances (e.g., PFAS, oil spills)
  - Destruction of ecosystems not linked to climate change
  - Breaches of water, soil, or conservation laws without reference to climate change

Do not classify as climate litigation simply because the case mentions sustainability, ESG, or environmental risk. Focus only on litigation where climate change itself is central to the legal reasoning, claims, or remedies sought.
"""

In [217]:
def build_backlog(few_shot_df, max_examples=62):
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    
    few_shot_subset = few_shot_df.sample(n=min(max_examples, len(few_shot_df)), random_state=42)
    for _, row in few_shot_subset.iterrows():
        messages.append({
            "role": "user",
            "content": f"Paragraph: {row['text']}\nIs this climate litigation? Respond with 'climate_litigation: 1' or 'climate_litigation: 0'"
        })
        messages.append({
            "role": "assistant",
            "content": f"climate_litigation: {row['climate_litigation']}"
        })

    return messages

In [218]:
backlog_messages = build_backlog(groundtruth_df)

In [219]:
def add_classification_request(backlog_messages, new_text):
    messages = backlog_messages.copy()

    messages.append({
        "role": "user",
        "content": f"Paragraph: {new_text}\nIs this climate litigation? Respond with 'climate_litigation: 1' or 'climate_litigation: 0'"
    })

    return messages

In [None]:
def classify_paragraph(text, backlog_messages):
    try:
        messages = add_classification_request(backlog_messages, text)

        response = client.chat.completions.create(
            model="meta-llama/llama-4-scout:free",
            messages=messages,
            temperature=0
        )

        response_dict = response.to_dict()
        content = response_dict['choices'][0]['message']['content']
        return content

    except Exception as e:
        print(f"[ERROR] Failed to classify paragraph: {text[:80]}...\nException: {e}")
        return "SKIPPED"

In [246]:
for index, row in df.iloc[2507:].iterrows():
    text = row['text']
    classification = classify_paragraph(text, backlog_messages)
    print(f"Classifying paragraph {index + 1}/{len(df)}: {text[:10]}... -> {classification}")
    time.sleep(3)
    
    #add the classification to the DataFrame
    df.at[index, 'climate_litigation'] = classification
    if index % 10 == 0:
        df.to_csv("classified_output.csv", index=False)

Classifying paragraph 2508/145734: See the ta... -> climate_litigation: 0
Classifying paragraph 2509/145734: Based on A... -> climate_litigation: 0
Classifying paragraph 2510/145734: Transource... -> climate_litigation: 0
Classifying paragraph 2511/145734: The call e... -> climate_litigation: 0
Classifying paragraph 2512/145734: The method... -> climate_litigation: 0
Classifying paragraph 2513/145734: and Subsid... -> climate_litigation: 0
Classifying paragraph 2514/145734: and Subsid... -> climate_litigation: 0
Classifying paragraph 2515/145734: (b)
Includ... -> climate_litigation: 0
Classifying paragraph 2516/145734: 319
SWEPCo... -> climate_litigation: 0
Classifying paragraph 2517/145734: Principal ... -> climate_litigation: 0
Classifying paragraph 2518/145734: PATH is a ... -> climate_litigation: 0
Classifying paragraph 2519/145734: The FERC o... -> climate_litigation: 0
Classifying paragraph 2520/145734: As of Dece... -> climate_litigation: 0
Classifying paragraph 2521/145734: AEP

KeyboardInterrupt: 

In [248]:
#find the rows that are classified as climate_litigation: 1
climate_litigation_rows = df[df['climate_litigation'] == 'climate_litigation: 1']
#save the climate litigation rows to a new CSV file
climate_litigation_rows.to_csv("classified_output_1s.csv", index=False)

In [249]:
climate_litigation_rows

Unnamed: 0,company,year,text,climate_litigation
26,AEP,2014,"10\nNumerous affected entities, states and oth...",climate_litigation: 1
29,AEP,2014,"CO 2 Regulation\nIn March 2012, the Federal EP...",climate_litigation: 1
141,AEP,2014,and the Sierra Club. The modified settlement ...,climate_litigation: 1
150,AEP,2014,ENVIRONMENTAL CONTINGENCIES\nCarbon Dioxide Pu...,climate_litigation: 1
356,AEP,2014,267\nIndemnifications and Other Guarantees – A...,climate_litigation: 1
...,...,...,...,...
13469,AIG,2021,Factors that could cause AIG’s actual results ...,climate_litigation: 1
13873,AIG,2021,"In addition, AIG Parent guarantees various obl...",climate_litigation: 1
17104,BTU,2017,"The EPA released the final rule on August 3, 2...",climate_litigation: 1
17105,BTU,2017,"On August 3, 2015, the EPA announced the final...",climate_litigation: 1
