In [18]:
import nltk
nltk.download('punkt')
import fitz  # PyMuPDF
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import re
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from tqdm.notebook import tqdm  # To show progress

# Download NLTK data
nltk.download('punkt')
from nltk.tokenize import sent_tokenize


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("nbroad/ESG-BERT")
model = AutoModelForSequenceClassification.from_pretrained("nbroad/ESG-BERT")

# Create the pipeline for text classification
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)

# Define a class to parse PDF documents from file path using PyMuPDF
class PDFParser:
    def __init__(self, file_path):
        self.file_path = file_path
        self.text = self._extract_text()

    def _extract_text(self):
        doc = fitz.open(self.file_path)
        text = ""
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text += page.get_text()
        return text

    def get_text(self):
        return self.text

    def get_text_clean(self):
        text = self.text
        text = re.sub(r'\n+', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def get_text_clean_list(self):
        text = self.get_text_clean()
        text_list = sent_tokenize(text)
        return text_list

# Function to split long sentences into smaller chunks
def split_into_chunks(text, max_length):
    tokens = tokenizer.tokenize(text)
    chunks = []
    for i in range(0, len(tokens), max_length):
        chunk = tokens[i:i + max_length]
        chunk = tokenizer.convert_tokens_to_string(chunk)
        chunks.append(chunk)
    return chunks

# Function to run the classifier on a PDF file
def run_classifier(file_path):
    pp = PDFParser(file_path)
    sentences = pp.get_text_clean_list()
    print(f"The annual report has {len(sentences):,d} sentences")

    # Split long sentences into chunks
    chunks = []
    for sentence in sentences:
        if len(tokenizer.tokenize(sentence)) > tokenizer.model_max_length:
            chunks.extend(split_into_chunks(sentence, tokenizer.model_max_length))
        else:
            chunks.append(sentence)

    # Process chunks in batches
    results = []
    batch_size = 32  # Adjust the batch size according to your memory capacity
    for i in tqdm(range(0, len(chunks), batch_size)):
        batch = chunks[i:i + batch_size]
        # Ensure each chunk is within the max length
        batch = [chunk if len(tokenizer.encode(chunk)) <= tokenizer.model_max_length else chunk[:tokenizer.model_max_length] for chunk in batch]
        results.extend(classifier(batch))

    df = pd.DataFrame(results)
    return df

# Example usage with a downloaded PDF file
file_path = "tata steel annual report.pdf" 
df = run_classifier(file_path)
print(df.groupby(['label']).mean().sort_values('score', ascending=False))

Token indices sequence length is longer than the specified maximum sequence length for this model (748 > 512). Running this sequence through the model will result in indexing errors


The CSR report has 2,154 sentences


  0%|          | 0/69 [00:00<?, ?it/s]

                                                 score
label                                                 
Employee_Engagement_Inclusion_And_Diversity   0.774872
Human_Rights_And_Community_Relations          0.718000
Labor_Practices                               0.676128
Business_Ethics                               0.668220
Management_Of_Legal_And_Regulatory_Framework  0.650603
Supply_Chain_Management                       0.601165
Systemic_Risk_Management                      0.584649
Data_Security                                 0.564688
Director_Removal                              0.532769
Access_And_Affordability                      0.520689
Physical_Impacts_Of_Climate_Change            0.520122
Product_Design_And_Lifecycle_Management       0.518621
Energy_Management                             0.508104
Waste_And_Hazardous_Materials_Management      0.445777
Employee_Health_And_Safety                    0.433264
Competitive_Behavior                          0.404748
Business_M

In [21]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("nbroad/ESG-BERT")
model = AutoModelForSequenceClassification.from_pretrained("nbroad/ESG-BERT")

# Create the pipeline for text classification
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, truncation=True, padding=True)

# Function to extract and pre-process text from a PDF file
def extract_and_preprocess_text(file_path):
    document = fitz.open(file_path)
    text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text("text")
    
    # Basic preprocessing to remove text lines that look like tables or image captions
    lines = text.split('\n')
    filtered_lines = []
    for line in lines:
        # Ignore lines with numbers/tables
        if re.match(r'^\d+(\.\d+)*$', line.strip()):
            continue
        # Ignore lines that are likely image captions
        if re.match(r'Figure \d+|Table \d+', line.strip()):
            continue
        filtered_lines.append(line)
    
    filtered_text = '\n'.join(filtered_lines)
    sentences = sent_tokenize(filtered_text)
    return sentences

# Function to split long sentences into smaller chunks
def split_into_chunks(text, max_length):
    tokens = tokenizer.tokenize(text)
    chunks = []
    for i in range(0, len(tokens), max_length):
        chunk = tokens[i:i + max_length]
        chunk = tokenizer.convert_tokens_to_string(chunk)
        chunks.append(chunk)
    return chunks

# Function to run the classifier on a PDF file
def run_classifier(file_path):
    sentences = extract_and_preprocess_text(file_path)
    print(f"The annual report has {len(sentences):,d} sentences")

    # Split long sentences into chunks
    chunks = []
    for sentence in sentences:
        if len(tokenizer.tokenize(sentence)) > tokenizer.model_max_length:
            chunks.extend(split_into_chunks(sentence, tokenizer.model_max_length))
        else:
            chunks.append(sentence)

    # Process chunks in batches
    results = []
    batch_size = 32  
    for i in tqdm(range(0, len(chunks), batch_size)):
        batch = chunks[i:i + batch_size]
        results.extend(classifier(batch))

    df = pd.DataFrame(results)
    return df

# Example usage with a downloaded PDF file
file_path = "tata steel annual report.pdf" 
df = run_classifier(file_path)
print(df.groupby(['label']).mean().sort_values('score', ascending=False))

Token indices sequence length is longer than the specified maximum sequence length for this model (649 > 512). Running this sequence through the model will result in indexing errors


The CSR report has 2,154 sentences


  0%|          | 0/69 [00:00<?, ?it/s]

                                                 score
label                                                 
Human_Rights_And_Community_Relations          0.720804
Employee_Engagement_Inclusion_And_Diversity   0.685169
Business_Ethics                               0.673271
Labor_Practices                               0.670070
Management_Of_Legal_And_Regulatory_Framework  0.654894
Supply_Chain_Management                       0.610824
Systemic_Risk_Management                      0.584443
Data_Security                                 0.564688
Director_Removal                              0.532877
Product_Design_And_Lifecycle_Management       0.524354
Physical_Impacts_Of_Climate_Change            0.520122
Access_And_Affordability                      0.510279
Energy_Management                             0.504862
Employee_Health_And_Safety                    0.431209
Waste_And_Hazardous_Materials_Management      0.416364
Competitive_Behavior                          0.405797
Business_M

In [24]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("nbroad/ESG-BERT")
model = AutoModelForSequenceClassification.from_pretrained("nbroad/ESG-BERT")

# Create the pipeline for text classification
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, truncation=True, padding=True)

# Function to extract and pre-process text from a PDF file
def extract_and_preprocess_text(file_path):
    document = fitz.open(file_path)
    text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text("text")
    
    # Basic preprocessing to remove text lines that look like tables or image captions
    lines = text.split('\n')
    filtered_lines = []
    for line in lines:
        # Ignore lines with numbers/tables
        if re.match(r'^\d+(\.\d+)*$', line.strip()):
            continue
        # Ignore lines that are likely image captions
        if re.match(r'Figure \d+|Table \d+', line.strip()):
            continue
        filtered_lines.append(line)
    
    filtered_text = '\n'.join(filtered_lines)
    sentences = sent_tokenize(filtered_text)
    return sentences

# Function to run the classifier on a PDF file
def run_classifier(file_path):
    sentences = extract_and_preprocess_text(file_path)
    print(f"The annual report has {len(sentences):,d} sentences")

    # Process sentences in batches
    results = []
    batch_size = 32  # Adjust the batch size according to your memory capacity
    for i in tqdm(range(0, len(sentences), batch_size)):
        batch = sentences[i:i + batch_size]
        results.extend(classifier(batch))

    df = pd.DataFrame(results)
    return df

# Example usage with a downloaded PDF file
file_path = "tata steel annual report.pdf"  # Replace with the actual file path
df = run_classifier(file_path)
print(df.groupby(['label']).mean().sort_values('score', ascending=False))

The CSR report has 2,154 sentences


  0%|          | 0/68 [00:00<?, ?it/s]

                                                 score
label                                                 
Employee_Engagement_Inclusion_And_Diversity   0.786158
Human_Rights_And_Community_Relations          0.720804
Business_Ethics                               0.673271
Labor_Practices                               0.670070
Management_Of_Legal_And_Regulatory_Framework  0.655406
Supply_Chain_Management                       0.610824
Systemic_Risk_Management                      0.588404
Data_Security                                 0.564688
Director_Removal                              0.532870
Energy_Management                             0.532710
Product_Design_And_Lifecycle_Management       0.524354
Physical_Impacts_Of_Climate_Change            0.520122
Access_And_Affordability                      0.510279
Employee_Health_And_Safety                    0.431209
Waste_And_Hazardous_Materials_Management      0.416364
Competitive_Behavior                          0.405797
Business_M

In [25]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("nbroad/ESG-BERT")
model = AutoModelForSequenceClassification.from_pretrained("nbroad/ESG-BERT")

# Create the pipeline for text classification
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, truncation=True, padding=True)

# Function to extract and pre-process text from a PDF file
def extract_and_preprocess_text(file_path):
    document = fitz.open(file_path)
    text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text("text")
    
    # Basic preprocessing to remove text lines that look like tables or image captions
    lines = text.split('\n')
    filtered_lines = []
    for line in lines:
        # Ignore lines with numbers/tables
        if re.match(r'^\d+(\.\d+)*$', line.strip()):
            continue
        # Ignore lines that are likely image captions
        if re.match(r'Figure \d+|Table \d+', line.strip()):
            continue
        filtered_lines.append(line)
    
    filtered_text = '\n'.join(filtered_lines)
    sentences = sent_tokenize(filtered_text)
    return sentences

# Function to run the classifier on a PDF file
def run_classifier(file_path):
    sentences = extract_and_preprocess_text(file_path)
    print(f"The annual report has {len(sentences):,d} sentences")

    # Process all sentences at once
    results = classifier(sentences)
    
    df = pd.DataFrame(results)
    return df

# Example usage with a downloaded PDF file
file_path = "tata steel annual report.pdf"  # Replace with the actual file path
df = run_classifier(file_path)
print(df.groupby(['label']).mean().sort_values('score', ascending=False))

The annual report has 2,154 sentences
                                                 score
label                                                 
Employee_Engagement_Inclusion_And_Diversity   0.786158
Human_Rights_And_Community_Relations          0.720804
Business_Ethics                               0.673271
Labor_Practices                               0.670070
Management_Of_Legal_And_Regulatory_Framework  0.655406
Supply_Chain_Management                       0.610824
Systemic_Risk_Management                      0.588404
Data_Security                                 0.564688
Director_Removal                              0.532870
Energy_Management                             0.532710
Product_Design_And_Lifecycle_Management       0.524354
Physical_Impacts_Of_Climate_Change            0.520122
Access_And_Affordability                      0.510279
Employee_Health_And_Safety                    0.431209
Waste_And_Hazardous_Materials_Management      0.416364
Competitive_Behavior       