In [1]:
# --- Project modules ---
import metrics3_utilities as m3
import kadoa_functions as kadoa
import esg_ai_pipeline_functions as esg

# --- Environment / config ---
from dotenv import load_dotenv
from pathlib import Path

# --- Standard library ---
import os
import sys
import csv
import json
import shutil
import time
import re
import logging
import traceback
from uuid import uuid4
from datetime import datetime
from pathlib import Path
from pprint import pprint
from urllib.parse import urlparse, parse_qs
from concurrent.futures import ThreadPoolExecutor, as_completed

# --- Third-party libraries ---
import pandas as pd
import requests as req
from bs4 import BeautifulSoup
import boto3
import psycopg2
import psycopg2.extras
from flask import request, jsonify, Blueprint
from pypdf import PdfReader
import tiktoken

# --- AI clients ---
from openai import OpenAI

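# Gemini SDKs — the new google.genai SDK is imported here and is used by this notebook (genai.Client);
# the old google.generativeai SDK (noted as the one the pipeline uses) is left commented out below.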
from google import genai          # new SDK (unused but harmless)
from google.genai import types    # new SDK types (unused but harmless)
#import google.generativeai as genai  # OLD SDK used by pipeline

# --- Hot reload for notebook dev ---
# Re-import the project modules so that edits made to their .py files are
# picked up without restarting the kernel.
import importlib

# Reload inside a loop (same order as before: esg, kadoa, m3) so the cell ends
# on a statement rather than an expression. This stops the notebook from
# echoing the last module's repr, which contains the absolute local file path.
for _project_module in (esg, kadoa, m3):
    importlib.reload(_project_module)

<module 'metrics3_utilities' from 'c:\\Users\\FernandoMaldondoTheD\\FernandoGit\\rag-fernando-testing\\metrics3_utilities.py'>

In [2]:
# Most of the functions you will need are in the esg_ai_pipeline_functions.py file.
# If you need to add a new function, do NOT add it to esg_ai_pipeline_functions.py. Instead, create a new file with a clear name, such as "new_functions.py".

In [3]:
# VARIABLE SET UP
# Only `instructions` is actually defined by this cell; the database connection
# and question loading below are commented out, so `conn` and `questions` are
# NOT defined unless those lines are re-enabled.

# Connect to RDS for questions
#conn = m3.rds_start_connection("dev", secrets = None)

# Get all questions from the database.
#   - SurveyID 3333 will give you 120 questions, all looking for explicit disclosures.
#   - SurveyID 1111 will give you 23 questions, more general in nature.
# Use 3333 for L3.
#questions = esg.rds_get_prompts(conn, survey_id=3333)
# print(questions)

# The following code lets you pick and choose which questions you want to run
# (here: the first five).
# questions = questions[:5]
# pprint(questions)

# These are the system instructions for the AI, taken from the pipeline module. Do NOT change.
instructions = esg.instructions
# print(instructions)


In [4]:
# Load the variables from the local .env file into the process environment.
# Without this call os.getenv() only sees variables that were already exported
# in the shell that launched the kernel. By default load_dotenv() does not
# override variables that are already set.
load_dotenv()

# Read keys from the environment (populated from .env above).
# os.getenv returns None for any key that is missing.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_ORG_ID = os.getenv("OPENAI_ORG_ID")
OPENAI_PROJECT_ID = os.getenv("OPENAI_PROJECT_ID")

# Set the API key for GEMINI (client from the google.genai SDK)
genai_client = genai.Client(api_key=GEMINI_API_KEY)

# Set variables for the OpenAI API
open_ai_client = OpenAI(
    api_key=OPENAI_API_KEY,
    organization=OPENAI_ORG_ID,
    project=OPENAI_PROJECT_ID
)

In [5]:
###### PROCESS ALL COMPANIES IN "TESTING COMPANIES" FOLDER
# For each selected company: extract the text of its PDFs into a sibling
# "<company>_txt" folder and, optionally, run the scoring step on that folder.
base_path = Path.cwd() / "Testing Companies"

# use this to do specific companies.
do_companies = ["test2"]

# Set to True to run the scoring step after text extraction. It is off by
# default, matching the previous state where the call was disabled by wrapping
# it in a string literal.
# NOTE: the scoring call needs `questions`; the code that loads it in the
# variable set-up cell is currently commented out, so define it before enabling.
RUN_SCORING = False

for company in do_companies:
    print("RUNNING LOCAL RAG FOR COMPANY: ", company)

    # Paths are handed to the esg helpers as plain strings, as before.
    pdf_folder = str(base_path / company)
    txt_folder = pdf_folder + "_txt"

    # Chunking-and-embedding approach: extract text from the PDFs and save it as
    # .txt files. If you do not want this approach, skip the extraction, keep
    # folder_path pointing at the PDF folder and pass use_chunking=False below.
    txt_files = esg.extract_pdfs_to_txt_files(
        pdf_folder_path=pdf_folder,    # Folder containing PDFs
        output_folder_path=txt_folder  # Per the original note: if None, .txt files are saved next to the PDFs
    )

    # The scoring step reads the extracted .txt files, so point it at the txt folder.
    folder_path = txt_folder

    if RUN_SCORING:
        # Use the txt folder for chunking
        esg.ai_run_and_save_scores_locally(
            open_ai_client,
            genai_client,
            folder_path,  # Use the folder with .txt files
            company,
            instructions,
            questions,
            use_chunking=True
        )

RUNNING LOCAL RAG FOR COMPANY:  test2
✅ Extracted text from cnx.pdf → cnx.txt
✅ Extracted text from iqvia.pdf → iqvia.txt
✅ Extracted text from jet2plc.pdf → jet2plc.txt
✅ Extracted text from mindgym.pdf → mindgym.txt

📄 Created 4 .txt files in c:\Users\FernandoMaldondoTheD\FernandoGit\rag-fernando-testing\Testing Companies\test2_txt
