In [None]:
!pip install transformers datasets scikit-learn nltk torch -q evaluate

import inspect
import os

import evaluate
import nltk
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding
)

In [None]:
# Training a sentence classifier for ESG (Environmental, Social, and Governance) detection

# Sentence splitter shared by the scoring code further down.
# NOTE(review): the tokenizer below is constructed from an empty PunktParameters()
# object, not from the "punkt" data fetched by nltk.download -- confirm that an
# untrained splitter is what is wanted here.
nltk.download("punkt", quiet=True)
punkt_params = PunktParameters()
tokenizer_sent = PunktSentenceTokenizer(punkt_params)


def safe_sent_tokenize(text):
    """Split ``text`` into sentences using the module-level ``tokenizer_sent``.

    Args:
        text: The string to split.

    Returns:
        Whatever ``tokenizer_sent.tokenize`` returns for ``text``.
    """
    sentences = tokenizer_sent.tokenize(text)
    return sentences

# Load and tokenize ESG sentence dataset
dataset = load_dataset("climatebert/climate_detection")

# The same checkpoint name is reused below to load the model that gets fine-tuned.
model_name = "climatebert/distilroberta-base-climate-detector"
tokenizer_esg = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(example):
    """Tokenize the ``"text"`` field of a dataset batch with ``tokenizer_esg``.

    Truncation is enabled; padding is not applied here.
    """
    texts = example["text"]
    return tokenizer_esg(texts, truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# The "train" split is used for fitting, the "test" split for evaluation.
train_dataset, eval_dataset = tokenized_datasets["train"], tokenized_datasets["test"]

# Fine-tune ESG classifier
model_esg = AutoModelForSequenceClassification.from_pretrained(model_name)
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score evaluation output with the ``accuracy`` metric.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each
        logits row against ``labels``.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return accuracy.compute(predictions=predicted_classes, references=gold_labels)

# transformers renamed two keyword arguments used below:
# `evaluation_strategy` -> `eval_strategy` (TrainingArguments) and
# `tokenizer` -> `processing_class` (Trainer). Newer releases deprecate and
# eventually drop the old names, and the pip install at the top of this notebook
# is unpinned, so use whichever name the installed version actually accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

# Evaluate and checkpoint once per epoch; both strategies are set to "epoch"
# together with load_best_model_at_end=True.
training_args = TrainingArguments(
    output_dir="./esg_classifier_model",
    save_strategy="epoch",
    logging_dir="./logs",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    load_best_model_at_end=True,
    **{_eval_strategy_key: "epoch"},
)
trainer = Trainer(
    model=model_esg,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Pads each batch to its longest member (tokenize_function does not pad).
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer_esg),
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer_esg},
)
trainer.train()

# Save model and tokenizer side by side so the directory can be reloaded with
# from_pretrained("esg-sentence-classifier").
trainer.save_model("esg-sentence-classifier")
tokenizer_esg.save_pretrained("esg-sentence-classifier")

In [None]:
year = 2018

# Run on the GPU when torch reports one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fine-tuned relevance classifier, read back from the local save directory.
tokenizer_esg = AutoTokenizer.from_pretrained("esg-sentence-classifier")
model_esg = AutoModelForSequenceClassification.from_pretrained("esg-sentence-classifier")

# Load sentiment model
tokenizer_sentiment = AutoTokenizer.from_pretrained("climatebert/distilroberta-base-climate-sentiment")
model_sentiment = AutoModelForSequenceClassification.from_pretrained("climatebert/distilroberta-base-climate-sentiment")

# Both models are only used for inference: switch to eval mode and move to `device`.
for _inference_model in (model_esg, model_sentiment):
    _inference_model.eval()
    _inference_model.to(device)

# Define ESG + sentiment scoring function
def _label_index(model, label, default):
    """Return the class index that ``model.config.label2id`` assigns to ``label``.

    Matching is case-insensitive. ``default`` is returned when the model has no
    ``config.label2id`` mapping or the mapping has no entry named ``label``
    (for example when the labels are generic ``LABEL_0`` style names).
    """
    label2id = getattr(getattr(model, "config", None), "label2id", None) or {}
    for name, index in label2id.items():
        if str(name).strip().lower() == label:
            return int(index)
    return default


def _iter_batch_probs(sentences, tokenizer, model, batch_size):
    """Yield one softmax probability tensor per batch of ``sentences``.

    Each batch is tokenized with padding and truncation to 512 tokens, moved to
    the module-level ``device`` and run through ``model`` without gradients.
    """
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(
            batch, return_tensors="pt", padding=True, truncation=True, max_length=512
        ).to(device)
        with torch.no_grad():
            logits = model(**inputs).logits
        yield torch.nn.functional.softmax(logits, dim=1)


def compute_esg_exposure_and_sentiment(text, batch_size=32):
    """Score one document for ESG exposure and ESG sentiment.

    The text is split into sentences, every sentence is classified by
    ``model_esg``, and the sentences whose ESG-class probability exceeds 0.5
    are then scored by ``model_sentiment`` as P(opportunity) - P(risk).

    Class positions are looked up by name in each model's ``config.label2id``
    ("yes", "opportunity", "risk") so the score does not depend on how a
    checkpoint happens to order its classes; if a name is absent the original
    fixed positions (1, 2 and 0 respectively) are used.

    Args:
        text: Document text. Anything that is not a non-blank string yields
            the all-zero result.
        batch_size: Number of sentences sent through a model at once.

    Returns:
        A tuple ``(exposure_score, esg_sentences_count, total_sentences,
        avg_sentiment)`` where ``exposure_score`` is the fraction of sentences
        flagged as ESG and ``avg_sentiment`` is the mean sentiment score over
        the flagged sentences (0.0 when none were flagged).
    """
    empty_result = (0.0, 0, 0, 0.0)
    if not isinstance(text, str) or not text.strip():
        return empty_result

    sentences = safe_sent_tokenize(text)
    total_sentences = len(sentences)
    if total_sentences == 0:
        return empty_result

    esg_index = _label_index(model_esg, "yes", 1)
    opportunity_index = _label_index(model_sentiment, "opportunity", 2)
    risk_index = _label_index(model_sentiment, "risk", 0)

    # -------- Batch ESG relevance classification --------
    esg_probs = []
    for probs in _iter_batch_probs(sentences, tokenizer_esg, model_esg, batch_size):
        esg_probs.extend(probs[:, esg_index].cpu().tolist())

    # -------- Filter ESG-relevant sentences --------
    esg_sentences = [sent for sent, prob in zip(sentences, esg_probs) if prob > 0.5]
    esg_sentences_count = len(esg_sentences)

    # -------- Batch sentiment scoring only for ESG sentences --------
    sentiment_scores = []
    for probs in _iter_batch_probs(esg_sentences, tokenizer_sentiment, model_sentiment, batch_size):
        # opportunity - risk, so positive values lean towards "opportunity"
        sentiment_scores.extend((probs[:, opportunity_index] - probs[:, risk_index]).cpu().tolist())

    exposure_score = esg_sentences_count / total_sentences
    # Cast so both branches return a plain Python float.
    avg_sentiment = float(np.mean(sentiment_scores)) if sentiment_scores else 0.0

    return exposure_score, esg_sentences_count, total_sentences, avg_sentiment

# Filing sections to score, and the position-ordered names of the four values
# that compute_esg_exposure_and_sentiment returns for each section.
item_names = ["Item_1", "Item_1A", "Item_7", "Item_8"]
metric_suffixes = ["ESG_Exposure", "ESG_Sentences", "Total_Sentences", "ESG_Sentiment"]

items_path = f"Data/Data_Cleaning/items_cleaned_10K_filings_{year}_1.csv"
output_path = f"Data/Climate_bert/Climatebert_10K_filings_{year}.csv"
items_df = pd.read_csv(items_path)

for item in item_names:
    print(f"Processing {item}...")
    results = items_df[item].apply(compute_esg_exposure_and_sentiment)
    # Unpack each result tuple into one column per metric.
    for position, suffix in enumerate(metric_suffixes):
        items_df[f"{item}_{suffix}"] = results.apply(
            lambda scores, position=position: scores[position]
        )

# Compute overall metrics
# Row-wise mean across the four sections.
exposure_columns = [f"{item}_ESG_Exposure" for item in item_names]
sentiment_columns = [f"{item}_ESG_Sentiment" for item in item_names]
items_df["Overall_ESG_Exposure"] = items_df[exposure_columns].mean(axis=1)
items_df["Overall_ESG_Sentiment"] = items_df[sentiment_columns].mean(axis=1)

os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Identifier columns, then exposure/sentiment per section, then the overall scores.
columns_to_keep = ["year", "company", "cik"]
for item in item_names:
    columns_to_keep += [f"{item}_ESG_Exposure", f"{item}_ESG_Sentiment"]
columns_to_keep += ["Overall_ESG_Exposure", "Overall_ESG_Sentiment"]

items_df[columns_to_keep].to_csv(output_path, index=False)
print(f"✅ ESG exposure + sentiment scores saved to: {output_path}")

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file esg-sentence-classifier/config.json
Model config RobertaConfig {
  "_name_or_path": "esg-sentence-classifier",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "no",
    "1": "yes"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "no": 0,
    "yes": 1
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_cla

Processing Item_1...
Processing Item_1A...
Processing Item_7...
Processing Item_8...
✅ ESG exposure + sentiment scores saved to: Data/Climate_bert/Climatebert_10K_filings_2023.csv
