In [1]:

import os
from dotenv import load_dotenv
from pathlib import Path


In [2]:
# Load environment variables
dotenv_path = Path('../.env')

if not load_dotenv(dotenv_path):
    print("Could not load .env file or it is empty. Please check if it exists and is readable.")
    exit(1)


OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")


# Load Documents

In [4]:
from langchain.document_loaders import PyPDFLoader # also look into UnstructuredPDFLoader

In [5]:
loader = PyPDFLoader("../data/FY2223-Q3-JFM-8-K-Final.pdf")

# Load the document by calling loader.load()
pages = loader.load()

print(len(pages))
print(pages[0].page_content[0:500])

print(pages[0].metadata)

15
News Release The Procter & Gamble Company
One P&G Plaza
Cincinnati, OH 45202
P&G ANNOUNCES FISCAL YEAR 2023  THIRD QUARTER  RESULTS
Net Sales +4%; Organic Sales +7%
Diluted EPS and Core EPS $1.37, each +3%
RAISES SALES GROWTH AND CASH RETURN GUIDANCE
MAINTAINS FISCAL YEAR EPS GROWTH GUIDANCE RANGE
 
CINCINNATI, April 21, 2023  - The Procter & Gamble Company (NYSE:PG) reported third 
quarter  fiscal year 2023 net sales of $20.1 billion, an increase of four percent versus the prior year. 
Organic 
{'source': '../data/FY2223-Q3-JFM-8-K-Final.pdf', 'page': 0}


In [6]:
#To load multiple PDFs (Source: https://www.youtube.com/watch?v=wrD-fZvT6UI&ab_channel=PromptEngineering)
# data_file_path = "../data" 
# os.listdir(data_file_path) # TODO: maybe use glob or FilePath instead?

# loaders = [PyPDFLoader(os.path.join(data_file_path, filename)) for filename in os.listdir(data_file_path)]
# pages = []
# for loader in loaders:
#     pages.extend(loader.load())

In [7]:
# To load in directories with multiple file types
# # source: https://www.linkedin.com/pulse/build-qa-bot-over-private-data-openai-langchain-leo-wang
# from langchain.document_loaders import DirectoryLoader

# pdf_loader = DirectoryLoader('./Reports/', glob="**/*.pdf")
# txt_loader = DirectoryLoader('./Reports/', glob="**/*.txt")
# word_loader = DirectoryLoader('./Reports/', glob="**/*.docx")

# loaders = [pdf_loader, txt_loader, word_loader]
# documents = []
# for loader in loaders:
#     documents.extend(loader.load())

# print(f"Total number of documents: {len(documents)}")

# Chunk Documents

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [9]:
# text_splitter = CharacterTextSplitter(separator='\n') #CharacterTextSplitter does not use chunk_size and chunk_overlap params
# texts = text_splitter.split_documents(pages)

In [10]:
# Define the Text Splitter 
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 150,
    chunk_overlap = 15,
    separators=["\n\n", "\n", " ", ""] #adjust these as necessary
)

#Create a split of the document using the text splitter
texts = text_splitter.split_documents(pages)

In [11]:
# Splitting by tokens requires tiktoken, which is OpenAI
from langchain.text_splitter import TokenTextSplitter

text_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=0)
texts = text_splitter.split_documents(pages)

In [12]:
len(pages)

15

In [13]:
len(texts)

85

# Embedding

## LlamaCpp Embeddings

In [14]:
from langchain.embeddings import LlamaCppEmbeddings

In [15]:
# embeddings = LlamaCppEmbeddings(model_path="../models/llama-2-7b-chat.Q4_K_M.gguf")

In [16]:
_texts = []
for i in range(len(texts)):
    _texts.append(texts[i].page_content)

In [17]:
# embedded_texts = embeddings.embed_documents(_texts) # Don't really need this code anymore. Most vector stores embed docs for you
# len(embedded_texts), len(embedded_texts[0])

## OpenAI Embeddings

In [18]:
from langchain.embeddings.openai import OpenAIEmbeddings

In [19]:
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

## HuggingFace Embeddings

In [20]:
# from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.vectorstores import FAISS

# model_name = "sentence-transformers/all-mpnet-base-v2"
# model_kwargs = {"device": "cuda"}

# embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs) #TODO: Look into cache_folder param for caching embeddings

# # storing embeddings in the vector store
# vectorstore = FAISS.from_documents(texts, embeddings)

# Vector Store

In [21]:
from langchain.vectorstores import Chroma
# Also look into langchain.indexes import VectorstoreIndexCreator

In [22]:
# persist_directory = 'docs/chroma/' # Option to save vector store in a persistent directory so that we can use it in future.
db = Chroma.from_documents(documents=texts, embedding=embeddings)#, persist_directory=persist_directory)
# db.persist()

# Retrieval

In [40]:
from langchain.llms import OpenAI #No longer supported, use ChatOpenAI instead
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

# llm = OpenAI(model_name="gpt-3.5-turbo")
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
chain = ConversationalRetrievalChain.from_llm(llm, db.as_retriever(), return_source_documents=True)

response = chain({"question": "summarize the information", "chat_history": []})

In [41]:
response

{'question': 'summarize the information',
 'chat_history': [],
 'answer': 'The information provided discusses various risks and challenges that a company may face in its operations. These include risks associated with third-party relationships, information and technology systems, political conditions, competition, customer relationships, reputation and brand equity, financial and legal matters, and costs of labor and resources. The company also emphasizes the importance of innovation, intellectual property protection, and staying competitive in sales channels.',
 'source_documents': [Document(page_content='ational and operational risk \nassociated with third-party relationships, such as our suppliers, contract manufacturers, distributors, \ncontractors and external business partners; (11) the ability to rely on and maintain key company and third-\nparty information and operational technology systems, networks and services and maintain the security \nand functionality of such systems, n

# Scratch Work: Calling LLMs

In [23]:
from langchain.llms import LlamaCpp

llm = LlamaCpp(model_path="../models/llama-2-7b-chat.Q4_K_M.gguf", verbose=False, n_ctx=4048, temperature=0)
llm.client.verbose = False

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../models/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_K     [  4096,  4096,     

In [25]:
response = llm("What is batman's real name")

response = llm("""Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know.
Don't try to make up an answer.

Context: Batman[a] is a superhero appearing in American comic books published by DC Comics. The character was created by artist Bob Kane and writer Bill Finger, and debuted in the 27th issue of the comic book Detective Comics on March 30, 1939. In the DC Universe continuity, Batman is the alias of Bruce Wayne, a wealthy American playboy, philanthropist, and industrialist who resides in Gotham City. Batman's origin story features him swearing vengeance against criminals after witnessing the murder of his parents Thomas and
        DC Comics concept art of Bruce Wayne by Mikel Janín Batman's secret identity is Bruce Wayne, a wealthy American industrialist. As a child, Bruce witnessed the murder of his parents, Dr. Thomas Wayne and Martha Wayne, which ultimately led him to craft the Batman persona and seek justice against criminals. He resides on the outskirts of Gotham City in his personal residence, Wayne Manor. Wayne averts suspicion by acting the part of a superficial playboy idly living off his family's fortune and the               

Question: What is batman's real name
Answer: 
""")

print(response)

I don't know


In [32]:
from langchain.llms import CTransformers

config = {'max_new_tokens': 256, 'temperature': 0.3}
llm = CTransformers(model='../models/llama-2-7b-chat.ggmlv3.q8_0.bin',
                    model_type='llama', config=config)

In [33]:
response = llm("What is batman's real name")

response = llm("""Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know.
Don't try to make up an answer.

Context: 
time recurring foes that are part of Batman's rogues gallery include Catwoman (a cat burglar anti-heroine who is variously an ally and romantic interest), the Penguin, Ra's al Ghul, Two-Face, the Riddler, the Scarecrow, Mr. Freeze, Poison Ivy, Harley Quinn, Bane, Clayface, and Killer Croc, among others. Many of Batman's adversaries are often psychiatric patients at Arkham Asylum. Allies Alfred Main article: Alfred Pennyworth
time recurring foes that are part of Batman's rogues gallery include Catwoman (a cat burglar anti-heroine who is variously an ally and romantic interest), the Penguin, Ra's al Ghul, Two-Face, the Riddler, the Scarecrow, Mr. Freeze, Poison Ivy, Harley Quinn, Bane, Clayface, and Killer Croc, among others. Many of Batman's adversaries are often psychiatric patients at Arkham Asylum. Allies Alfred Main article: Alfred Pennyworth
                
Question: What is batman's real name
Answer: 
""")

print(response)

(Type "Bruce Wayne" or "Thomas Wayne")


In [42]:
# This is the direct OpenAI API, but better to use langchain wrapper
import openai

openai.api_key = os.getenv("OPENAI_API_KEY")

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who played Batman in The Dark Knight? Only provide the first and last name"}
    ],
    max_tokens = 50
)

response['choices'][0]['message']['content']

'Christian Bale'

In [44]:
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI

llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
# llm = OpenAI(model_name="gpt-3.5-turbo")

question = "Which language is used to create chatgpt?"
print(question, llm.predict(question))


Which language is used to create chatgpt ? ChatGPT is primarily developed using the Python programming language. The underlying model is built using the PyTorch deep learning framework.


## Scratch: Prompting

In [25]:
from langchain import PromptTemplate

In [None]:
template = "Tell me a {adjective} joke."

prompt = PromptTemplate.from_template(template)
prompt

PromptTemplate(input_variables=['adjective'], output_parser=None, partial_variables={}, template='Tell me a {adjective} joke.', template_format='f-string', validate_template=True)

In [None]:
formatted_prompt = prompt.format(adjective="animal")
formatted_prompt

'Tell me a animal joke.'

In [None]:
response = llm(prompt=formatted_prompt)
print(response)

TypeError: BaseChatModel.__call__() missing 1 required positional argument: 'messages'

## Scratch: Chaining

In [28]:
from langchain.chains import LLMChain

In [45]:
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [46]:
print(llm_chain.run("goat"))

ValueError: A single string input was passed in, but this chain expects multiple inputs ({'context', 'question'}). When a chain expects multiple inputs, please call it by passing in a dictionary, eg `chain({'foo': 1, 'bar': 2})`

# Putting it all together
Tutorial from [here](https://ai.plainenglish.io/%EF%B8%8F-langchain-streamlit-llama-bringing-conversational-ai-to-your-local-machine-a1736252b172)

In [30]:
template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know.
Don't try to make up an answer.
{context}
Question: {question}
Answer: 
"""

prompt = PromptTemplate.from_template(template)

query = "What was P&G's net sales"
query = "Who are the enemies of Batman"
docs = db.similarity_search(query=query, k=3)

context = docs[0].page_content

query_llm = LLMChain(llm=llm, prompt=prompt) #TODO: also look into langchain.chains.question_answering import load_qa_chain, and RetrievalQA chain
response = query_llm.run({"context": context, "question": query})
print(response)



The enemies of Batman include Catwoman, the Penguin, Ra's al Ghul, Two-Face, the Riddler, the Scarecrow, Mr. Freeze, Poison Ivy, Harley Quinn, Bane, Clayface, and Killer Croc, among others.


In [4]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import TokenTextSplitter
from langchain import PromptTemplate
from langchain.callbacks import get_openai_callback
import os
# os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY #don't think this is needed 

# Define the LLM
# llm = ChatOpenAI(temperature=0,model_name="gpt-4")
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

# Data Ingestion
loader = PyPDFLoader("../data/FY2223-Q3-JFM-8-K-Final.pdf")
# loader = PyPDFLoader("../data/Batman_wiki.pdf")
pages = loader.load()

# Chunk and Embeddings
text_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=0)
texts = text_splitter.split_documents(pages)

embeddings = OpenAIEmbeddings()

# Vector Store
db = Chroma.from_documents(documents=texts, embedding=embeddings)

# Create Prompt
template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know.
Don't try to make up an answer.
{context}

Question: {question}
Answer: 
"""

prompt = PromptTemplate.from_template(template)

# Initialise RetrievalQA Chain
chain = RetrievalQA.from_chain_type(
    llm,
    retriever=db.as_retriever(),#search_type="mmr"),#search_kwargs={"k":3}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt},
    # persona="Shakespeare"
)



In [5]:
question = "What net sales in healthcare?"# of $20.1 billion net sales?"
# question = "Who are the enemies of Batman"

with get_openai_callback() as cb: # Token usage only possible for OpenAI API
    response = chain({"query": question, "persona": "Shakespeare"})

print(response)
print(cb)

{'query': 'What net sales in healthcare?', 'persona': 'Shakespeare', 'result': 'The net sales in healthcare are $2,828.', 'source_documents': [Document(page_content='494 3% $763 (4)% $608 (6)%\nGrooming 1,495 1% 382 8% 308 6%\nHealth Care 2,828 6% 667 7% 523 8%\nFabric & Home Care 7,016 5% 1,538 21% 1,174 21%\nBaby, Feminine & Family Care 5,062 3% 1,206 11% 925 11%\nCorporate 173 N/A (268', metadata={'page': 8, 'source': '../data/FY2223-Q3-JFM-8-K-Final.pdf'}), Document(page_content='494 3% $763 (4)% $608 (6)%\nGrooming 1,495 1% 382 8% 308 6%\nHealth Care 2,828 6% 667 7% 523 8%\nFabric & Home Care 7,016 5% 1,538 21% 1,174 21%\nBaby, Feminine & Family Care 5,062 3% 1,206 11% 925 11%\nCorporate 173 N/A (268', metadata={'page': 8, 'source': '../data/FY2223-Q3-JFM-8-K-Final.pdf'}), Document(page_content='6)% 10% (2)% —% 1%\nHealth Care 1% 1% (3)% 6% 3% (1)% 6%\nFabric & Home Care (5)% (5)% (4)% 13% 1% —% 5%\nBaby, Feminine & Family Care (4)% (4)% (3)% 8% 2% —% 3%\nTotal Company (3)% (3)% (

In [6]:
print(response['result'])

The net sales in healthcare are $2,828.


In [16]:
response['source_documents'][0]

Document(page_content='494 3% $763 (4)% $608 (6)%\nGrooming 1,495 1% 382 8% 308 6%\nHealth Care 2,828 6% 667 7% 523 8%\nFabric & Home Care 7,016 5% 1,538 21% 1,174 21%\nBaby, Feminine & Family Care 5,062 3% 1,206 11% 925 11%\nCorporate 173 N/A (268', metadata={'page': 8, 'source': '../data/FY2223-Q3-JFM-8-K-Final.pdf'})