In [7]:
import fitz  # PyMuPDF
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import os


In [2]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:

# Define keywords for Environment, Social, and Governance
environment_keywords = [
    'sustainable', 'eco-friendly', 'renewable', 'carbon neutral', 'organic',
    'biodegradable', 'recycling', 'energy-efficient', 'green energy',
    'environmental impact', 'solar', 'wind', 'hydropower', 'clean energy',
    'compostable', 'low carbon', 'zero waste', 'green building', 'LEED certified',
    'carbon footprint', 'eco-conscious', 'sustainability', 'upcycled', 'green technology',
    'conservation', 'water-saving', 'eco-innovation', 'green design', 'eco-label',
    'green seal', 'energy star', 'greenhouse gases', 'environmental stewardship',
    'eco-certification', 'carbon offset', 'natural resources', 'pollution reduction',
    'sustainable agriculture', 'eco-tourism', 'alternative energy', 'clean technology',
    'eco-products', 'sustainable development', 'environmental policy', 'green practices',
    'sustainable sourcing', 'eco-initiatives', 'low impact', 'carbon reduction',
    'green standards', 'renewable resources', 'sustainable energy',
    'environmental conservation', 'green finance', 'eco-friendly packaging',
    'sustainable packaging', 'circular economy', 'eco-friendly products',
    'green logistics', 'eco-friendly transportation', 'sustainable transport',
    'environmental footprint', 'renewable energy sources', 'eco-friendly materials',
    'sustainable materials', 'eco-friendly manufacturing', 'sustainable manufacturing',
    'eco-friendly practices', 'sustainable practices', 'eco-friendly design',
    'sustainable design', 'green lifestyle', 'sustainable living', 'green living',
    'eco-friendly lifestyle', 'green certifications', 'eco-friendly business',
    'sustainable business', 'eco-friendly solutions', 'sustainable solutions',
    'eco-friendly initiatives', 'sustainable initiatives', 'green initiatives',
    'eco-friendly innovations', 'sustainable innovations', 'eco-friendly developments',
    'sustainable developments', 'green policies', 'eco-friendly policies',
    'sustainable policies', 'green strategies', 'eco-friendly strategies', 'sustainable strategies'
]

social_keywords = [
    'diversity', 'inclusion', 'equity', 'community', 'human rights',
    'social responsibility', 'labor rights', 'fair labor', 'worker rights',
    'workplace safety', 'employee welfare', 'employee engagement',
    'corporate citizenship', 'social justice', 'humanitarian', 'child labor',
    'forced labor', 'discrimination', 'equal opportunity', 'fair wages',
    'work-life balance', 'employee development', 'community development',
    'volunteering', 'social impact', 'stakeholder engagement', 'social equity',
    'gender equality', 'racial equality', 'LGBTQ+ rights', 'cultural diversity',
    'employee training', 'workplace diversity', 'employee well-being', 'community support',
    'social initiatives', 'social programs', 'philanthropy', 'donations', 'charity',
    'affordable housing', 'education support', 'healthcare access', 'local communities',
    'minority support', 'support for disabled', 'workforce diversity', 'social inclusion',
    'accessibility', 'elder care', 'youth development', 'poverty alleviation', 'social welfare',
    'employee benefits', 'social engagement', 'volunteerism', 'social impact investing'
]

governance_keywords = [
    'corporate governance', 'board diversity', 'board independence', 'board oversight',
    'shareholder rights', 'executive compensation', 'ethical behavior', 'transparency',
    'accountability', 'anti-corruption', 'anti-bribery', 'whistleblower protection',
    'risk management', 'compliance', 'code of conduct', 'business ethics',
    'corporate ethics', 'governance structure', 'regulatory compliance', 'internal controls',
    'audit committee', 'governance policies', 'board practices', 'governance frameworks',
    'stakeholder engagement', 'conflict of interest', 'corporate responsibility',
    'corporate integrity', 'governance initiatives', 'independent directors',
    'governance standards', 'governance codes', 'shareholder engagement', 'executive pay',
    'financial transparency', 'reporting standards', 'audit integrity', 'leadership ethics',
    'corporate accountability', 'ethical standards', 'compliance programs', 'governance reforms',
    'governance practices', 'corporate oversight', 'fiduciary duty', 'governance principles',
    'board effectiveness', 'executive oversight', 'regulatory standards', 'ethical governance'
]

def load_pdf_text(file_path):
    doc = fitz.open(file_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

def preprocess_text(text):
    words = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
    return filtered_words

def analyze_text(words, keywords):
    keyword_counts = {keyword: words.count(keyword) for keyword in keywords}
    return keyword_counts

def calculate_score(keyword_counts):
    total_keywords = sum(keyword_counts.values())
    unique_keywords = sum(1 for count in keyword_counts.values() if count > 0)
    score = total_keywords + unique_keywords * 5  # Adjust weighting as needed
    return score

def apply_count_logic(keyword_counts):
    adjusted_counts = {keyword: 1 if count > 0 else 0 for keyword, count in keyword_counts.items()}
    return adjusted_counts

def save_to_csv(keyword_counts, adjusted_counts, category, output_file):
    df = pd.DataFrame({
        'Keyword': list(keyword_counts.keys()),
        'Count': list(keyword_counts.values()),
        'Adjusted Count': list(adjusted_counts.values()),
        'Category': category
    })
    df.to_csv(output_file, mode='a', header=not os.path.exists(output_file), index=False)

def main(file_path, output_file):
    text = load_pdf_text(file_path)
    words = preprocess_text(text)
    
    environment_counts = analyze_text(words, environment_keywords)
    social_counts = analyze_text(words, social_keywords)
    governance_counts = analyze_text(words, governance_keywords)
    
    adjusted_environment_counts = apply_count_logic(environment_counts)
    adjusted_social_counts = apply_count_logic(social_counts)
    adjusted_governance_counts = apply_count_logic(governance_counts)
    
    print("Environment Keyword Counts:")
    for keyword, count in environment_counts.items():
        print(f"{keyword}: {count}, Count: {adjusted_environment_counts[keyword]}")
    
    print("\nSocial Keyword Counts:")
    for keyword, count in social_counts.items():
        print(f"{keyword}: {count}, Count: {adjusted_social_counts[keyword]}")
    
    print("\nGovernance Keyword Counts:")
    for keyword, count in governance_counts.items():
        print(f"{keyword}: {count}, Count: {adjusted_governance_counts[keyword]}")
    
    environment_score = calculate_score(environment_counts)
    social_score = calculate_score(social_counts)
    governance_score = calculate_score(governance_counts)
    
    final_score = (environment_score + social_score + governance_score) / 3
    print(f"\nEnvironment Score: {environment_score}")
    print(f"Social Score: {social_score}")
    print(f"Governance Score: {governance_score}")
    print(f"\nFinal ESG Score: {final_score}")
    
    save_to_csv(environment_counts, adjusted_environment_counts, 'Environment', output_file)
    save_to_csv(social_counts, adjusted_social_counts, 'Social', output_file)
    save_to_csv(governance_counts, adjusted_governance_counts, 'Governance', output_file)

# Specify the path to your PDF document and output CSV file
file_path = 'tata steel annual report.pdf'
output_file = 'ESG_keyword_counts_with_adjusted.csv'
main(file_path, output_file)


Environment Keyword Counts:
sustainable: 2, Count: 1
eco-friendly: 0, Count: 0
renewable: 2, Count: 1
carbon neutral: 0, Count: 0
organic: 0, Count: 0
biodegradable: 0, Count: 0
recycling: 0, Count: 0
energy-efficient: 0, Count: 0
green energy: 0, Count: 0
environmental impact: 0, Count: 0
solar: 7, Count: 1
wind: 0, Count: 0
hydropower: 0, Count: 0
clean energy: 0, Count: 0
compostable: 0, Count: 0
low carbon: 0, Count: 0
zero waste: 0, Count: 0
green building: 0, Count: 0
LEED certified: 0, Count: 0
carbon footprint: 0, Count: 0
eco-conscious: 0, Count: 0
sustainability: 1, Count: 1
upcycled: 0, Count: 0
green technology: 0, Count: 0
conservation: 2, Count: 1
water-saving: 0, Count: 0
eco-innovation: 0, Count: 0
green design: 0, Count: 0
eco-label: 0, Count: 0
green seal: 0, Count: 0
energy star: 0, Count: 0
greenhouse gases: 0, Count: 0
environmental stewardship: 0, Count: 0
eco-certification: 0, Count: 0
carbon offset: 0, Count: 0
natural resources: 0, Count: 0
pollution reduction: