In [15]:
import glob
import os

def get_file_paths(directory):
    search_pattern = os.path.join(directory, '**', '*')
    file_paths = glob.glob(search_pattern, recursive=True)
    file_paths = [path for path in file_paths if os.path.isfile(path)]
    return file_paths

allfiles = get_file_paths('/home/Malaysia 1/KL Policies and Plans')
allfiles.extend(get_file_paths("/home/Malaysia 1/National Policies"))
allfiles

['/home/Malaysia 1/KL Policies and Plans/DBKLCMP/DBKL Carbon Management Plan.pdf',
 '/home/Malaysia 1/KL Policies and Plans/DBKLCMP/DBKL Carbon Management Plan.xlsx',
 '/home/Malaysia 1/KL Policies and Plans/KLCAP2050/C40_KLCAP2050_Release 2021.pdf',
 '/home/Malaysia 1/KL Policies and Plans/KLCAP2050/C40_KLCAP2050_viewing-only-MR-single.pdf',
 '/home/Malaysia 1/KL Policies and Plans/KLCAP2050/KL2050 plans.docx',
 '/home/Malaysia 1/KL Policies and Plans/KLCAP2050/KL2050 quantitative.docx',
 '/home/Malaysia 1/KL Policies and Plans/KLCAP2050/KL2050 spreadsheet info.docx',
 '/home/Malaysia 1/KL Policies and Plans/KLCAP2050/KLCAP2050 - doc to pull from.docx',
 '/home/Malaysia 1/KL Policies and Plans/KLCAP2050/Kuala Lumpur Climate Action Plan 2050 notes.docx',
 '/home/Malaysia 1/KL Policies and Plans/KLLCSBP2030/KL LCS BP 2030 - doc to pull from.docx',
 '/home/Malaysia 1/KL Policies and Plans/KLLCSBP2030/KL LCS BP 2030 - full.docx',
 '/home/Malaysia 1/KL Policies and Plans/KLLCSBP2030/KL LCS

In [4]:
from langchain.document_loaders import PyPDFLoader, CSVLoader, UnstructuredWordDocumentLoader
from langchain.schema import Document
import pandas as pd


# parse .xlxs files 
def load_excel_as_documents(file_path: str):
    dfs = pd.read_excel(file_path, sheet_name=None) # dict: {sheet_name: DataFrame}
    docs = []
    for sheet_name, df in dfs.items():
        text = df.to_csv(index=False)
        metadata = {"source": file_path, "sheet_name": sheet_name}
        docs.append(Document(page_content=text, metadata=metadata))
    return docs


# parse all the files and store the `Document` objects returned in a list
all_documents = []
for file_path in allfiles:
    ext = os.path.splitext(file_path)[1].lower()
    docs = []
    if ext == ".pdf":
        loader = PyPDFLoader(file_path)
        docs = loader.load()
    elif ext == ".docx":
        loader = UnstructuredWordDocumentLoader(file_path)
        docs = loader.load()
    elif ext == ".csv":
        csv_loader = CSVLoader(file_path=file_path)
        docs = csv_loader.load()
    elif ext == ".xlsx":
        docs = load_excel_as_documents(file_path)
    else:
        print("missed")
    all_documents.extend(docs)
print(f"Loaded {len(all_documents)} chunks of Documents from {len(allfiles)} files.")

Loaded 2195 chunks of Documents from 52 files.


In [5]:
from langchain.text_splitter import CharacterTextSplitter

# separate a given text into 2000 sized chunks where each chunkoverlaps by 100
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=2000,
    chunk_overlap=100
)

# split the files into chunks and store all the chunks as `Document` objects 
split_docs = []
for doc in all_documents:
    chunks = text_splitter.split_text(doc.page_content)
    for chunk in chunks:
        split_docs.append(
            Document(page_content=chunk, metadata=doc.metadata)
        )

print(f"Total chunks after splitting: {len(split_docs)}")

Created a chunk of size 2512, which is longer than the specified 2000
Created a chunk of size 2512, which is longer than the specified 2000
Created a chunk of size 2512, which is longer than the specified 2000


Total chunks after splitting: 3478


In [7]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from time import sleep


# make the embedding function with the embedding model
embedding_function = OpenAIEmbeddings(model="text-embedding-3-small-birthright", openai_api_key="sk-KbR-Csnj0L7vUssXqZJQew", 
                                      openai_api_base="https://ai-incubator-api.pnnl.gov")
# make a vector database to store all the embeddings 
vectorstore = Chroma(
    collection_name="my_collection",
    embedding_function=embedding_function,
    persist_directory="./chroma_db"
)

# add 20 documents at a time to the database, each document will first be converted into embeddings 
# by the embedding model and then added to the database. 
batch_size = 20
for i in range(0, len(split_docs), batch_size):
    batch = split_docs[i : i + batch_size]
    vectorstore.add_documents(batch)
    # sleep to prevent too many requests error from the embedding model server 
    sleep(3)

In [8]:
# save the database

In [9]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

# make retriever object which would pick "k" (in this case 10) chunks for each query to the llm model
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
# make the question-answer chain object
qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model_name="gpt-4o-birthright", temperature=0, openai_api_key="sk-KbR-Csnj0L7vUssXqZJQew", 
                                      openai_api_base="https://ai-incubator-api.pnnl.gov"),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

question = "What are the policies and targets related to conventional coal phase-out in electricity generation in Malaysia? What, specifically, will they be if extrapolated to 2050 with reasonable assumptions?"
result = qa_chain.invoke(question)

print("Answer:\n", result["result"])
print("\nSources used:")
for doc in result["source_documents"]:
    print(doc.metadata)

Answer:
 The policies and targets related to conventional coal phase-out in electricity generation in Malaysia are primarily outlined in the New Capacity Target scenario. This scenario advocates for no new coal plants post-2025 and aims to accelerate Malaysia's energy transition by replacing some retiring coal plants with renewable energy (RE) and gas-fired capacities. By 2035, coal's contribution to the capacity mix is expected to sharply decline from 37% in 2020 to 18%.

Extrapolating these targets to 2050 with reasonable assumptions would likely involve further reductions in coal usage, potentially phasing it out entirely, as Malaysia continues to increase its reliance on renewable energy sources and natural gas. The focus would be on achieving a higher share of RE in the capacity mix, further reducing dependency on coal, and aligning with global decarbonization goals. However, specific targets for 2050 are not detailed in the provided context, so these assumptions are based on the 

In [10]:
question = "Can I still get specific estimates? Just have general assumptions."
result = qa_chain.invoke(question)

print("Answer:\n", result["result"])

Answer:
 I don't know.


In [None]:
# document
# converting ai output to gcam input
# malaysia data
# test a parameter
# check which could work and not working
# ability to extrapolate
# document everything

# track issues and hours and priorities

# maridee: if she extrapolated
# iea data