## Two step RAG method

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import openai
from transformers import AutoTokenizer
import os
import utils

  from .autonotebook import tqdm as notebook_tqdm


### Loading and cleaning the data

In [2]:
few_shot_examples = 'groundtruth_classifications.xlsx'
text_data = 'full_data_filtered.csv'

data = pd.read_csv(text_data)
examples = pd.read_excel(few_shot_examples)

In [3]:
examples

Unnamed: 0,File name,Company,Climate,Litigation,Climate Litigation,General risk,Specific lawsuit(s),Note,Paragraph
0,AIG_0000005272-19-000023,AIG,No,Yes,No,Yes,No,,Pricing for our products is subject to our abi...
1,AIG_0000005272-19-000023,AIG,No,Yes,No,Yes,No,,We are exposed to certain risks if we are unab...
2,AIG_0000005272-19-000023,AIG,No,No,No,No,No,,If our businesses do not perform well and/or t...
3,AIG_0000005272-19-000023,AIG,Yes,No,No,Yes,No,,We recognize that climate change has implicati...
4,Chevron_0000093410-24-000014,Chevron,Yes,No,No,Yes,No,,Petroleum industry operations and profitabilit...
...,...,...,...,...,...,...,...,...,...
61,,,,,,,,,
62,,,,,,,,,
63,,,,,,,,,
64,,,,Non-climate litigation,,32,,,


In [4]:
# Clean up the examples
examples = examples[examples['File name'].notna()]
examples = examples.drop(columns=['Note'])
examples['year'] = examples['File name'].str.extract(r'-(\d{2})-')
examples = examples.replace({'Yes': 1, 'No': 0})

  examples = examples.replace({'Yes': 1, 'No': 0})


In [5]:
litigation_examples = examples.drop(columns=['Climate', 'Litigation', 'General risk', 'Specific lawsuit(s)', 'File name'])
litigation_examples.rename(columns={'Paragraph': 'text', 'Company': 'company', 'Climate Litigation': 'climate_litigation'}, inplace=True)

In [6]:
data.rename(columns={'text': 'text', 'folder': 'company'}, inplace=True)
data.drop(columns=['folderfiletext'], inplace=True, errors='ignore')

In [7]:
print(f"Loaded {len(data)} documents and {len(litigation_examples)} ground truth examples")


Loaded 528 documents and 61 ground truth examples


### Chunking the text

In [8]:
tokenizer = AutoTokenizer.from_pretrained("nomic-ai/nomic-embed-text-v1")

In [9]:
print("Chunking documents...")
expanded_rows = []
for _, row in data.iterrows():
    expanded_rows.extend(utils.tokenize_and_chunk(row, tokenizer))
df = pd.DataFrame(expanded_rows)

print("Chunking ground truth examples...")
groundtruth_expanded = []
for _, row in litigation_examples.iterrows():
    groundtruth_expanded.extend(utils.tokenize_and_chunk(row, tokenizer))
groundtruth_df = pd.DataFrame(groundtruth_expanded)

print(f"Created {len(df)} document chunks and {len(groundtruth_df)} ground truth chunks")

Chunking documents...


Token indices sequence length is longer than the specified maximum sequence length for this model (44218 > 8192). Running this sequence through the model will result in indexing errors


Chunking ground truth examples...
Created 145734 document chunks and 61 ground truth chunks


### Setting up embedding

In [10]:
print("Loading embedding model...")
embedding_model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

Loading embedding model...


<All keys matched successfully>
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
print("Encoding document embeddings...")
doc_embeddings = utils.encode_in_batches(df['text'].tolist(), embedding_model, batch_size=10)

print("Encoding ground truth embeddings...")
gt_embeddings = utils.encode_in_batches(groundtruth_df['text'].tolist(), embedding_model, batch_size=10)

Encoding document embeddings...


Embedding:   1%|          | 125/14574 [01:37<3:07:11,  1.29it/s]


KeyboardInterrupt: 

In [None]:
doc_embeddings, valid_doc_idx = doc_embeddings
gt_embeddings, valid_gt_idx = gt_embeddings

In [None]:
np.save("doc_embeddings.npy", doc_embeddings)
np.save("gt_embeddings.npy", gt_embeddings)

### Using the AI model

In [12]:
doc_embeddings = np.load("doc_embeddings.npy")
gt_embeddings = np.load("gt_embeddings.npy")

In [13]:
df["embedding"] = list(doc_embeddings)
groundtruth_df["embedding"] = list(gt_embeddings)

In [14]:
company_groups = df.groupby('company')

In [15]:
OPEN_ROUTER_KEY = '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'
client = openai.OpenAI(
    api_key=OPEN_ROUTER_KEY,
    base_url="https://openrouter.ai/api/v1"
)

In [20]:
for company, company_df in company_groups:
    # Ensure company-level folder exists
    for year, year_df in company_df.groupby('year'):
        # Define nested output directory for company and year
        nested_output_dir = os.path.join("rag_results", company, str(year))
        os.makedirs(nested_output_dir, exist_ok=True)

        # Run classification for this year subset
        results_df = utils.run_rag_classification_for_company(
            embedding_model=embedding_model,
            groundtruth_df=groundtruth_df,
            client=client,
            company_df=year_df.reset_index(drop=True),
            company_name=company,
            retrieval_k=100,
            example_k=5,
            start_index=0,
            output_dir=nested_output_dir
        )


🔍 Processing company: AEP with 470 chunks
Index(['company', 'year', 'text', 'embedding'], dtype='object')
Classifying 100 candidate chunks for AEP
[ERROR] Failed to classify paragraph: ENVIRONMENTAL ISSUES
We are implementing a substantial capital investment progra...
Exception: Error code: 401 - {'error': {'message': 'No auth credentials found', 'code': 401}}
[AEP][Year 2014] Chunk 1/100: ENVIRONMENTAL ISSUES
We are implementing a substan... -> SKIPPED
[ERROR] Failed to classify paragraph: 376
The Registrant Subsidiaries are engaged in litigation about environmental is...
Exception: Error code: 401 - {'error': {'message': 'No auth credentials found', 'code': 401}}
[AEP][Year 2014] Chunk 2/100: 376
The Registrant Subsidiaries are engaged in lit... -> SKIPPED


KeyboardInterrupt: 