In [1]:
import os
import glob
import tempfile

import numpy as np
import pandas as pd
import streamlit as st
# from streamlit_chat import message
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
# from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import DataFrameLoader
from langchain.vectorstores import FAISS

from langchain.indexes import VectorstoreIndexCreator
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain 
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain.chains import RetrievalQA



In [2]:
# Locations of the source PDFs and of the persisted vector store. The defaults
# are the original workstation paths; set ARCHIVE_DATA_DIR / ARCHIVE_DB_DIR to
# run the notebook on another machine without editing this cell.
data_dir = os.environ.get(
    "ARCHIVE_DATA_DIR", "/Users/d3y010/repos/crvernon/archive/data/documents"
)
db_dir = os.environ.get(
    "ARCHIVE_DB_DIR", "/Users/d3y010/repos/crvernon/archive/data/archive"
)

# get a list of PDF files; glob returns matches in arbitrary order, so sort
# them to make positional indexing into the list reproducible
pdf_files = sorted(glob.glob(os.path.join(data_dir, "*.pdf")))


In [3]:
# Directory-level loader: files under data_dir matching "*.pdf" are each
# handed to PyPDFLoader.
loader = DirectoryLoader(
    data_dir,
    glob="*.pdf",
    loader_cls=PyPDFLoader,
)


In [4]:
# Run the loader; the result is what gets passed to
# text_splitter.split_documents() further down.
documents = loader.load()


In [5]:
# Chunking policy: chunk size 500 with an overlap of 100 between neighbouring
# chunks (presumably measured in characters -- confirm against the splitter's
# length function).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=500,
)


In [6]:
# Apply the splitter to the loaded documents; `texts` is the input to
# Chroma.from_documents() when the vector store is built.
texts = text_splitter.split_documents(documents)


In [7]:
# Embedding client used to build the vector store. No API key is passed here,
# so it is presumably read from the environment -- TODO confirm.
embeddings = OpenAIEmbeddings()


In [8]:
# Build a Chroma vector store from the chunks using the embedding client,
# with db_dir as its persistence directory.
db = Chroma.from_documents(
    texts,
    embedding=embeddings,
    persist_directory=db_dir,
)

# Write the store out to its persistence directory.
db.persist()


In [None]:
# # load from disk once created
# db = Chroma(embedding_function=embeddings,
#             persist_directory=db_dir)


In [3]:
# target_pdf = pdf_files[1]


In [4]:
# target_pdf

'/Users/d3y010/projects/gcims/gcam_advisor/data/documents/1-s2.0-S0301421517304469-main.pdf'

In [5]:
# loader = PyPDFLoader(target_pdf)


In [6]:
# documents = loader.load()


In [7]:
# split the documents into chunks
# text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# texts = text_splitter.split_documents(documents)


In [8]:
# embeddings = OpenAIEmbeddings()


In [13]:
# Retriever over the vector store: similarity search with k=4 results per
# query (search_type="mmr" is the alternative that was tried).
retriever = db.as_retriever(search_kwargs={"k": 4}, search_type="similarity")

                            

In [14]:
# Smoke-test the retriever with a sample question.
docs = retriever.get_relevant_documents("What is GCAM?")

# Number of documents returned; the retriever was configured with k=4.
len(docs)


4

In [24]:
# Summarize the retrieved documents as one table row each: source file name,
# page value from the loader metadata, and the chunk text as a one-line excerpt.
records = []
for doc in docs:
    # Join wrapped lines with a space; deleting the newline outright fuses the
    # last word of one line with the first word of the next.
    extracted_text = doc.page_content.replace("\n", " ")

    records.append(
        {
            "source": os.path.basename(doc.metadata["source"]),
            "page_number": doc.metadata["page"],
            "text_excerpt": f"...{extracted_text}...",
        }
    )

# Explicit columns keep the table shape stable even when nothing was retrieved.
df = pd.DataFrame(records, columns=["source", "page_number", "text_excerpt"])

df


Unnamed: 0,source,page_number,text_excerpt
0,gmd-12-677-2019.pdf,0,...decades into the future. GCAM has its roots...
1,gmd-12-677-2019.pdf,18,...All code and inputs are available at https:...
2,gmd-12-677-2019.pdf,0,...Revised: 6 January 2019 – Accepted: 17 Janu...
3,gmd-12-677-2019.pdf,17,...important for informing both our scientiﬁc ...


In [25]:
# Inspect the third retrieved document in full (page content plus metadata).
docs[2]

Document(page_content='Revised: 6 January 2019 – Accepted: 17 January 2019 – Published: 15 February 2019\nAbstract. This paper describes GCAM v5.1, an open source\nmodel that represents the linkages between energy, water,\nland, climate, and economic systems. GCAM is a market\nequilibrium model, is global in scope, and operates from\n1990 to 2100 in 5-year time steps. It can be used to examine,\nfor example, how changes in population, income, or tech-\nnology cost might alter crop production, energy demand, or', metadata={'page': 0, 'source': '/Users/d3y010/repos/crvernon/archive/data/documents/gmd-12-677-2019.pdf'})

In [None]:
# Pull the provenance fields of the first retrieved document.
metadata = docs[0].metadata

# Single quotes inside the replacement field: reusing the f-string's own double
# quotes there is a SyntaxError on Python versions before 3.12.
page_number = f"Extracted from page {metadata['page']}."
source_document = metadata["source"]


In [34]:
# Single-question QA chain on top of the retriever. Source documents are
# returned with each answer so the supporting files can be listed.
qa_chain = RetrievalQA.from_llm(
    retriever=retriever,
    return_source_documents=True,
    llm=ChatOpenAI(model_name="gpt-4", max_tokens=500, temperature=0.0),
)


In [47]:
# Ask the QA chain one question.
response = qa_chain("Has any modeling been conducted in India with GCAM?")

# File name of every document the chain returned alongside its answer.
source_docs = [
    os.path.basename(doc.metadata["source"])
    for doc in response["source_documents"]
]

print("Response:")
print(response["result"])

print("\nSources:")
# Unique file names, in sorted order.
for name in sorted(set(source_docs)):
    print(name)


Response:
Yes, modeling has been conducted in India with GCAM. The model has been extended to create GCAM-India and GCAM-Gujarat, which are global integrated assessment models with additional details for India and Gujarat. The GCAM-India model includes a detailed India building energy model.

Sources:
1-s2.0-S0301421517304469-main.pdf
908307.pdf
Yu_2018_Environ._Res._Lett._13_034034.pdf


In [28]:
# Scratch check: print the list elements from last to first.
l = ["c", "a", "G", "b"]
for item in l[::-1]:
    print(item)

b
G
a
c


In [11]:
# Conversational variant of the retrieval chain; it is called with a question
# together with the running chat history.
chain = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(model_name="gpt-4", max_tokens=500, temperature=0.0),
    retriever=retriever,
)


In [12]:
# Running conversation log; (question, answer) tuples are appended after each
# call to the conversational chain.
chat_history = []


In [14]:
# Question to send to the conversational chain.
query = "What is GCAM?"


In [15]:
# One conversational turn: send the question together with the history so far.
result = chain(
    {"question": query, "chat_history": chat_history},
    return_only_outputs=True,
)

# Record the exchange in the history, then display the answer.
answer = result["answer"]
chat_history.append((query, answer))

answer


'GCAM (Global Change Assessment Model) is a model designed to answer "what if" questions about the future. It helps us understand how the future will evolve under a particular set of conditions and how the system will change under the influence of external factors. GCAM represents five different interacting and interconnected systems: energy, water, land, socioeconomics, and climate. It allows users to examine the influence of changes in socioeconomics or policy on energy, water, and land. The model can also be used to explore the implications of changes in one region on other regions.'