In [1]:
# ✅ Import Required Libraries
import requests
import pandas as pd
import os
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import numpy as np
from collections import Counter
from sentence_transformers import SentenceTransformer
import faiss
import json

In [2]:
# ✅ Load SentenceTransformer Model for FAISS Embeddings
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# ✅ Initialize FAISS Vector Store
dimension = embedding_model.get_sentence_embedding_dimension()  # Get vector size
index = faiss.IndexFlatL2(dimension)  # Create FAISS index

# ✅ Load ESGify Classification Model & Tokenizer
MODEL_NAME = "ai-lab/ESGify"
classification_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# ✅ Initialize Dictionary to Store FAISS Mappings
stored_perceptions = {}

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at ai-lab/ESGify and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
#newsapi.io

NEWS_API_KEY = os.getenv("NEWS_API_KEY")

def fetch_newsdata(company_name, NEWS_API_KEY, max_results=10):
    """
    Fetches the top news articles related to ESG controversies for a given company using newsdata.io.
    """
    base_url = "https://newsdata.io/api/1/latest?"

    # Construct query to capture ESG controversies (e.g., controversy, emissions, sustainability)
    query = f"{company_name} ESG OR controversy OR emissions OR sustainability"
    
    # Define request parameters
    params = {
        "apikey": NEWS_API_KEY,
        "q": query,
        "language": "en",
    }
    
    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        data = response.json()
        # Convert the results into a DataFrame for easy processing
        articles = data.get("results", [])
        return pd.DataFrame(articles)

        # ✅ Ensure 'description' exists
        if "description" not in df.columns:
            print(f"Warning: 'description' missing for {company_name}. Creating empty column.")
            df["description"] = None  # Create empty column to avoid KeyError
    else:
        print(f"Error fetching news: {response.status_code}")
        return None

In [4]:
# ✅ Function to Classify ESG Risks
def classify_news_batch(news_df):
    """Classifies each news article's description using the ESGify model."""
    
    if "description" not in news_df.columns:
        print("Skipping classification: No 'description' column in DataFrame.")
        news_df["esg_classification"] = None  # Ensure column exists
        return news_df  

    classifications = [None] * len(news_df)  # Pre-fill with None

    for idx, description in enumerate(news_df["description"]):
        if pd.isna(description):  # Handle missing values
            classifications[idx] = None  # Explicitly assign None
        else:
            inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = classification_model(**inputs)
            scores = torch.nn.functional.softmax(outputs.logits, dim=-1).squeeze().tolist()

            if hasattr(classification_model.config, "id2label"):
                ID2LABEL = classification_model.config.id2label  # Use model's label mapping
            else:
                print("Warning: No ID2LABEL mapping found. Defaulting to generic categories.")
                ID2LABEL = {i: f"Category {i}" for i in range(len(scores))}

            # ✅ Ensure length consistency
            top_indices = np.argsort(scores)[-3:][::-1]
            top_categories = [(ID2LABEL.get(i, f"Category {i}"), round(scores[i], 3)) for i in top_indices]

            classifications[idx] = top_categories  # Assign to the same index

    # ✅ Store classifications in DataFrame
    news_df["esg_classification"] = classifications
    return news_df

In [None]:
def compute_esg_perception(news_df):
    """
    Aggregates ESG classification scores across all news articles 
    and computes an overall perception score.
    """
    # Extract classifications from DataFrame
    all_risks = []
    all_scores = []

    for classifications in news_df["esg_classification"]:
        if classifications is not None:
            for category, score in classifications:
                all_risks.append(category)
                all_scores.append(score)

    if not all_scores:  # Handle empty scores
        return {
            "Average_Perception_Score": None,
            "Dominant_ESG_Risks": [],
            "Risk_Frequency": {}
        }

    # Compute average perception score
    avg_score = np.mean(all_scores)

    # Identify the most common ESG concerns
    risk_counts = Counter(all_risks)
    dominant_risks = risk_counts.most_common(3)  # Top 3 most frequent risks

    # Format results
    esg_summary = {
        "Average_Perception_Score": round(avg_score, 3),
        "Dominant_ESG_Risks": [risk[0] for risk in dominant_risks],
        "Risk_Frequency": dict(risk_counts)
    }

    return esg_summary

In [6]:
# ✅ Function to Store ESG Perceptions in FAISS
def store_esg_perception_in_faiss(company_name, esg_summary, index, stored_perceptions):
    """
    Converts ESG perception summary into embeddings, stores in FAISS, 
    and saves a mapping from FAISS index to actual ESG summary.
    """
    perception_text = f"""
    ESG Perception Summary for {company_name}:
    - Average ESG Risk Score: {esg_summary["Average_Perception_Score"]}
    - Dominant ESG Risks: {", ".join(esg_summary["Dominant_ESG_Risks"])}
    - Risk Breakdown: {esg_summary["Risk_Frequency"]}
    """

    # ✅ Use SentenceTransformer Model to Generate Embedding
    embedding = embedding_model.encode([perception_text])

    # ✅ Store Embedding in FAISS
    index.add(np.array(embedding))

    # ✅ Store ESG Summary in External Dictionary (Mapped to FAISS)
    stored_perceptions[len(stored_perceptions)] = perception_text

    return index


In [7]:
# ✅ Function to Retrieve ESG Perceptions from FAISS
def retrieve_esg_perception(query, index, embedding_model, stored_perceptions):
    """
    Retrieves the most relevant ESG perception summary from FAISS based on a query.
    """
    query_embedding = embedding_model.encode([query])
    _, indices = index.search(np.array(query_embedding), k=1)  # Retrieve top match

    retrieved_index = indices[0][0]  # Extract the best match

    if retrieved_index == -1 or retrieved_index not in stored_perceptions:
        return "No matching ESG perception found."

    return stored_perceptions[retrieved_index]  # Return the actual ESG perception data

In [8]:
# ✅ Function to Retrieve or Compute ESG Perception
def retrieve_or_compute_esg_perception(company_name, index, stored_perceptions):
    """Retrieves ESG perception from FAISS or computes and stores it if missing."""
    # ✅ Step 1: Encode the query and check if the company exists in FAISS
    query_embedding = embedding_model.encode([company_name])
    _, indices = index.search(np.array(query_embedding), k=1)
    retrieved_index = indices[0][0]  # Extract the best match

    # ✅ Step 2: If the company exists, return the stored perception
    if retrieved_index != -1 and retrieved_index in stored_perceptions:
        print(f"Retrieved {company_name}'s ESG perception from FAISS.")
        return stored_perceptions[retrieved_index]
    
    print(f"{company_name} not found in FAISS. Computing new ESG perception...")
    news_df = fetch_newsdata(company_name, NEWS_API_KEY)
    news_df = classify_news_batch(news_df)
    esg_summary = compute_esg_perception(news_df)
    
    return store_esg_perception_in_faiss(company_name, esg_summary, index, stored_perceptions)

In [13]:
def populate_vector_store(api_key, json_file):
    """Populates FAISS vector store with ESG perceptions for first 50 companies."""
    with open(json_file, "r") as file:
        company_data = json.load(file)
    
    first_50_companies = list(company_data.items())[:25]
    
    for _, company in first_50_companies:
        company_name = company["title"]
        print(f"Processing {company_name}...")
        news_df = fetch_newsdata(company_name, NEWS_API_KEY)
        news_df = classify_news_batch(news_df)
        esg_summary = compute_esg_perception(news_df)
        store_esg_perception_in_faiss(company_name, esg_summary, index, stored_perceptions)
    
    print("Vector store populated with first 50 companies.")

In [14]:
populate_vector_store(NEWS_API_KEY, "data/company_tickers.json")

Processing Apple Inc....
Processing MICROSOFT CORP...
Processing NVIDIA CORP...
Processing AMAZON COM INC...
Processing Alphabet Inc....
Processing Meta Platforms, Inc....
Processing BERKSHIRE HATHAWAY INC...
Processing Tesla, Inc....
Processing TAIWAN SEMICONDUCTOR MANUFACTURING CO LTD...
Processing Broadcom Inc....
Processing ELI LILLY & Co...
Processing Walmart Inc....
Processing JPMORGAN CHASE & CO...
Processing VISA INC....
Processing SPDR S&P 500 ETF TRUST...
Processing Mastercard Inc...
Processing EXXON MOBIL CORP...
Processing COSTCO WHOLESALE CORP /NEW...
Processing ORACLE CORP...
Processing UNITEDHEALTH GROUP INC...
Processing NETFLIX INC...
Processing PROCTER & GAMBLE Co...
Processing JOHNSON & JOHNSON...
Processing NOVO NORDISK A S...
Processing HOME DEPOT, INC....
Vector store populated with first 50 companies.


In [11]:
print("Number of ESG perceptions stored in FAISS:", index.ntotal)

Number of ESG perceptions stored in FAISS: 2


In [19]:
company_name = "what are the ESG risks of EPIC?"
esg_summary = retrieve_or_compute_esg_perception(company_name, index, stored_perceptions)
esg_summary

Retrieved what are the ESG risks of EPIC?'s ESG perception from FAISS.


"\n    ESG Perception Summary for Mastercard Inc:\n    - Average ESG Risk Score: 0.023\n    - Dominant ESG Risks: Environmental Management, Animal Welfare, Natural Resources\n    - Risk Breakdown: {'Environmental Management': 1, 'Animal Welfare': 1, 'Natural Resources': 1, 'Supply Chain (Social)': 1, 'Landscape Transformation': 1, 'Disclosure': 1}\n    "

In [1]:
from news import ESGNewsProcessor


In [2]:
import os
news_classifier = ESGNewsProcessor()
news_classifier.populate_vector_store("data/company_tickers.json")

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at ai-lab/ESGify and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded existing FAISS vector store.
Loaded stored perceptions.
Processing COCA COLA CO...
Stored COCA COLA CO ESG perception and updated FAISS vector store.
Processing REELCAUSE INC...
Skipping REELCAUSE INC: No news data found.
Processing ASML HOLDING NV...
Skipping ASML HOLDING NV: No news data found.
Processing Salesforce, Inc....
Stored Salesforce, Inc. ESG perception and updated FAISS vector store.
Processing CHEVRON CORP...
Stored CHEVRON CORP ESG perception and updated FAISS vector store.
Processing CISCO SYSTEMS, INC....
Stored CISCO SYSTEMS, INC. ESG perception and updated FAISS vector store.
Processing WELLS FARGO & COMPANY/MN...
Stored WELLS FARGO & COMPANY/MN ESG perception and updated FAISS vector store.
Processing Philip Morris International Inc....
Stored Philip Morris International Inc. ESG perception and updated FAISS vector store.
Processing ABBOTT LABORATORIES...
Stored ABBOTT LABORATORIES ESG perception and updated FAISS vector store.
Processing TOYOTA MOTOR CORP/..

In [6]:
news_classifier = ESGNewsProcessor()
news_classifier.retrieve_or_compute_esg_perception("what are the ESG risks of Alphabet?")

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at ai-lab/ESGify and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded existing FAISS vector store.
Loaded stored perceptions.
Retrieved what are the ESG risks of Alphabet?'s ESG perception from FAISS.


"\n        ESG Perception Summary for BERKSHIRE HATHAWAY INC:\n        - Average ESG Risk Score: 0.023\n        - Dominant ESG Risks: Responsible Investment & Greenwashing, Energy Efficiency and Renewables, Freedom of Association and Right to Organise\n        - Risk Breakdown: {'Responsible Investment & Greenwashing': 10, 'Energy Efficiency and Renewables': 8, 'Freedom of Association and Right to Organise': 6, 'Labor Relations Management': 2, 'Planning Limitations': 1, 'Hazardous Materials Management': 2, 'Discrimination': 1}\n        "