In [130]:
# pip install -r requirements.txt

### Using LangChain basics

You can use prompt templates to insert data into your prompt easily. Don't forget to create environment variables for your OpenAI or Azure OpenAI keys.

In [1]:
from langchain.prompts import PromptTemplate
from langchain.llms import AzureOpenAI
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import HumanMessage
import os
from dotenv import load_dotenv
load_dotenv()
#  https://learn.microsoft.com/en-us/azure/ai-services/openai/reference

True

### Getting Started

In [2]:
# Using completions API
llm = AzureChatOpenAI(deployment_name='gpt-35-turbo',temperature=0)

# invoke llm
llm(
    [
        HumanMessage(
            content='Tell me a joke and output the result in json.'
        )
    ]
)

AIMessage(content='{\n  "joke": "Why don\'t scientists trust atoms? Because they make up everything!"\n}', additional_kwargs={}, example=False)

### Using LangChain to get structured data outputs

You can use LangChain to get JSON responses. Below I am wanting my question and answer in a structured format. But you can build whatever models you want.

In [133]:
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, validator

In [134]:
# I am chosing Q&A here, but really you can chose any structure you need for your class. 
# It knows how to generate the class structure based on what you put in description - magic?
class QnA(BaseModel):
    question: str = Field(description="question")
    answer: str = Field(description="answer")
    # word: str = Field(description="the first word of the question")

In [135]:
# You can use different parsers, or even construct your own. Pydantic creates nice objects for python so I am using that.
parser = PydanticOutputParser(pydantic_object=QnA)
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

In [136]:
# Format the prompt
actor_query = "What is a good name for a company that makes colorful socks?"
_input = prompt.format_prompt(query=actor_query)
print(_input)


text='Answer the user query.\nThe output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"question": {"title": "Question", "description": "question", "type": "string"}, "answer": {"title": "Answer", "description": "answer", "type": "string"}}, "required": ["question", "answer"]}\n```\nWhat is a good name for a company that makes colorful socks?\n'


In [137]:
# invoke the llm using the prompt template and formatter
output = llm([HumanMessage(content=_input.to_string())])
print(output.content)

{"question": "What is a good name for a company that makes colorful socks?", "answer": "Rainbow Threads"}


### The following demonstrates how to use Conversation History and Memory in a conversation chain

LangChain can help remember what the user is saying for back and forth chatgpt like capabilities

In [138]:
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory


# llm = AzureChatOpenAI(deployment_name='gpt-35-turbo',temperature=0)

# The conversation object manages the back and forward conversation, including memory.
conversation = ConversationChain(
    llm=llm, 
    verbose=True, 
    memory=ConversationBufferMemory()
)

# We just add user input and let the conversation object handle the rest.
conversation.predict(input="What does Microsoft do?")



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:

Human: What does Microsoft do?
AI:[0m

[1m> Finished chain.[0m


'Microsoft is a multinational technology company that develops, manufactures, licenses, supports, and sells computer software, consumer electronics, personal computers, and related services. They are best known for their operating system, Microsoft Windows, which is used by the majority of personal computers worldwide. Microsoft also offers a wide range of other software products, including the Microsoft Office suite, which includes popular applications like Word, Excel, and PowerPoint. Additionally, they provide cloud services through their Azure platform, develop video game consoles like the Xbox, and have a presence in the hardware market with products like the Surface line of tablets and laptops.'

In [139]:
# Adding more user input to the conversation chain keeps the memory, see the verbose output below.
conversation.predict(input="How can they partner together with NASA?")



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
Human: What does Microsoft do?
AI: Microsoft is a multinational technology company that develops, manufactures, licenses, supports, and sells computer software, consumer electronics, personal computers, and related services. They are best known for their operating system, Microsoft Windows, which is used by the majority of personal computers worldwide. Microsoft also offers a wide range of other software products, including the Microsoft Office suite, which includes popular applications like Word, Excel, and PowerPoint. Additionally, they provide cloud services through their Azure platform, develop video game consoles like the Xbox, and have a pres

"Microsoft has a long-standing partnership with NASA. They collaborate on various projects and initiatives that involve the use of technology in space exploration and research. One notable example is the use of Microsoft's HoloLens mixed reality headset in NASA's Project Sidekick, which aims to provide astronauts with virtual assistance during space missions. Microsoft also works with NASA on data analysis and visualization tools, as well as cloud computing solutions to support the agency's scientific research and data processing needs. Additionally, Microsoft has partnered with NASA to develop educational programs and initiatives to inspire and engage students in STEM fields."

### LangChain Tools

You can set custom tools to be used, for example here is the bing search tool

In [140]:
# import os
# os.environ["BING_SUBSCRIPTION_KEY"] = ""
# os.environ["BING_SEARCH_URL"] = "https://api.bing.microsoft.com/v7.0/search"

# from langchain.utilities import BingSearchAPIWrapper
# search = BingSearchAPIWrapper(k=1)
# search.run("Dave Enright from Singapore on LinkedIn")

### You can use agents to orchestrate multiple tools together

Below the agent is being told to use the Bing Search tool when it needs to answer current events.The agent will use this tool based on the kinds of questions the user is asking.

In [141]:
# from langchain.agents import Tool, AgentExecutor, BaseSingleActionAgent
# from langchain.utilities import BingSearchAPIWrapper

# # The tool description helps langchain work out when to use the tool. Be careful with your descriptions as its not completely deterministic.
# search = BingSearchAPIWrapper(k=1)
# tools = [
#     Tool(
#         name = "Intermediate Answer",
#         func=search.run,
#         description="Use this when you need up to date information that is timely",
#         return_direct=True
#     )
# ]

Initialising the agent to use the tool

In [142]:
# from typing import List, Tuple, Any, Union
# from langchain.agents import initialize_agent
# from langchain.agents import AgentType

# # There are different kinds of agents documented on the langchain website. This particular one is basically just a chat with search agent.
# self_ask_with_search = initialize_agent(tools, llm, agent=AgentType.SELF_ASK_WITH_SEARCH, verbose=True)
# self_ask_with_search.run("Give me a list of to places to visit where weather is not too hot and do not have a lot of tourists")

In [143]:
# from langchain.agents import load_tools
# from langchain.agents import initialize_agent
# from langchain.agents import AgentType
# from langchain.llms import OpenAI

# llm = OpenAI(temperature=0)
# tools = load_tools(["bing-search", "llm-math"], llm=llm)

# agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)
# agent.run("Who is Leo DiCaprio's girlfriend? What is her current age raised to the 0.43 power?")

### Work with Documents

In [144]:
from langchain.document_loaders import PyPDFLoader

# There are different document loaders available, this is the basic PDF loader.
loader = PyPDFLoader("Benefit_Options.pdf")
pages = loader.load_and_split()

In [145]:
# Splitting into pages is useful to give page number back as a source
pages[0]

Document(page_content='Contoso Electronics  \nPlan and Benefit Packages', metadata={'source': 'Benefit_Options.pdf', 'page': 0})

In [146]:
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

# This is creating an index based on embeddings, but as you can see by the output it's a bit ugly to work with.

faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings(chunk_size=1))
docs_pdf = faiss_index.similarity_search("What's the difference between plus and standard?", k=2)
for doc in docs_pdf:
    print(str(doc.metadata["page"]) + ":", doc.page_content[:300])

3: offers a wider range of prescription drug coverage than Northwind Standard. Both plans offer coverage 
for vision and dental services, as well as medical services.  
Next Steps  
We hope that this information has been helpful in understanding the differe nces between Northwind 
Health Plus and North
2: Welcome to Contoso  Electronics ! We are excited to offer our employees two comprehensive health 
insurance plans through Northwind Health.  
Northwind Health Plus  
Northwind Health Plus is a comprehensive plan that provides comprehensive coverage for medical, 
vision, and dental services. T his pl


In [147]:
# Does not work yet with Azure.
#from langchain.indexes import VectorstoreIndexCreator

# Using the vector store from the loader means langchain can abstract away a lot of the under the hood stuff.
# Note I am using a local python package for the vectorDB. You may get errors.
#index = VectorstoreIndexCreator().from_loaders([loader])

In [148]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

chain_pdf = load_qa_with_sources_chain(llm, chain_type="stuff")

query = "What's the difference between plus and standard?"
chain_pdf({"input_documents": docs_pdf, "question": query}, return_only_outputs=True)

{'output_text': 'The main difference between Northwind Health Plus and Northwind Standard is that Northwind Health Plus offers more comprehensive coverage. Northwind Health Plus provides coverage for emergency services, mental health and substance abuse, and out-of-network services, while Northwind Standard does not. Additionally, Northwind Health Plus offers a wider range of prescription drug coverage, including generic, brand-name, and specialty drugs, whereas Northwind Standard only covers generic and brand-name drugs. Both plans offer coverage for vision and dental services, as well as medical services. \nSOURCES: Benefit_Options.pdf'}

In [149]:
# Langchain CSV example

from langchain.agents import create_csv_agent
from langchain.llms import AzureOpenAI

agent = create_csv_agent(llm, 'sample.csv', verbose=True)

In [150]:
agent.run("how many rows are there?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: To find the number of rows in the dataframe, I can use the `shape` attribute of the dataframe.

Action: python_repl_ast
Action Input: df.shape[0][0m
Observation: [36;1m[1;3m5[0m
Thought:[32;1m[1;3mThe dataframe has 5 rows.
Final Answer: 5[0m

[1m> Finished chain.[0m


'5'

In [151]:
from langchain.llms import AzureOpenAI
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter

# llm = AzureOpenAI(deployment_name="text-davinci-003", model_name="text-davinci-003", temperature=0)

text_splitter = CharacterTextSplitter()

with open("state_of_the_union.txt", encoding="utf8") as f:
    sotu = f.read()
texts = text_splitter.split_text(sotu)

In [152]:
from langchain.docstore.document import Document
docs_txt = [Document(page_content=t) for t in texts[:3]]

In [153]:
from langchain.chains.summarize import load_summarize_chain
chain_summarize_txt = load_summarize_chain(llm, chain_type="map_reduce")
chain_summarize_txt.run(docs_txt)

'President Biden addresses the conflict between Russia and Ukraine in his State of the Union address, emphasizing the unity and resolve of the United States and its allies in holding Russia accountable. The US is taking action against Russian oligarchs and corrupt leaders, implementing economic sanctions, and providing support to Ukraine. American forces are mobilized to defend NATO allies, not engaged in the conflict directly. The US is implementing targeted sanctions on the Russian economy and releasing oil reserves to help with gas prices. The speaker highlights the rise of democracies and the support for Ukraine worldwide. The American Rescue Plan is praised for providing relief and support to Americans, and the success of the plan in creating jobs and stimulating economic growth is highlighted. The need to invest in infrastructure is emphasized, with gratitude for bipartisan support for the infrastructure law.'

In [154]:
from langchain.chains.question_answering import load_qa_chain
chain_qa_txt = load_qa_chain(llm, chain_type="stuff")
query = "Did the president say selamat pagi in english?"
chain_qa_txt.run(input_documents=docs_txt, question=query)

'No, the President did not say "selamat pagi" in English.'

### Azure Cognitive Search

In [155]:
# test using cog search vector store.
# reference: https://python.langchain.com/docs/integrations/vectorstores/azuresearch
# !pip install --index-url=https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/ azure-search-documents==11.4.0a20230509004
# !pip install azure-identity
# !pip install azure-search-documents==11.4.0b6
# !pip install langchain --upgrade

In [156]:
import os, json
import openai
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import BaseRetriever
from langchain.vectorstores.azuresearch import AzureSearch

In [157]:
vector_store_address: str = 'https://globecogse.search.windows.net'
vector_store_password: str = 'Jk3aMWZNUeOdjiPZxDWgaWS4lXOY001YQcRTZXVUbZAzSeCiFO0l'
index_name: str = 'globevectorindex'

embeddings: OpenAIEmbeddings = OpenAIEmbeddings(chunk_size=1)  
vector_store_union: AzureSearch = AzureSearch(azure_search_endpoint=vector_store_address,  
                                        azure_search_key=vector_store_password,  
                                        index_name=index_name,  
                                        embedding_function=embeddings.embed_query) 


In [158]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
loader_cog_txt = TextLoader('state_of_the_union.txt', encoding='utf-8')
documents_union_txt = loader_cog_txt.load()

In [159]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs_union_txt = text_splitter.split_documents(documents_union_txt)

Created a chunk of size 1434, which is longer than the specified 1000
Created a chunk of size 3063, which is longer than the specified 1000


In [160]:
vector_store_union.add_documents(documents=docs_union_txt)

['NmE5M2M2N2MtMjY2MC00NDllLTkyY2UtYjFjYjcxNjQ2ZjFj',
 'YTYzMTVhZDgtMjkzYy00ODA5LTg4Y2QtNzQ1YTM2NmQyMjVm',
 'ZGYzMTBlMGItYzk1ZS00ZDgwLWJlYjYtMDU0M2NhNDA3Y2Y4',
 'NjEyNjE1MzUtZjVhNC00M2MxLWIzYzAtMzFjMjQ3OWQwZjhj',
 'YWE0ZmM0MmEtNGJiYS00YzFjLTk4YjQtYmJiZDZkMGU3NGJl',
 'MDc2YzIxNGEtZWY3Yi00MDRjLWI5OTctMmQ2MmViMTJjMGNj',
 'NTE0OTdiY2EtNDcxNi00MjY0LWE3NTEtZWViYTc5OTBmMTA0',
 'ZTgzNzU3NDktMjMzOS00ZjFiLTlkYWItMzU3ZjJjMjViZTFh',
 'OWMwZWVhMDYtMGRjZC00ZDA1LWI0MDMtMTMzMDAyN2IwZWNh',
 'YzhhZWE5ZGItMGE2MC00ZTc0LWJkYzMtNWMwNmRhOGMwOWVk',
 'NzZhZjk4OGYtMjQ1MC00NDgyLTlkNzctNmM1OTc4ODI2Nzhj',
 'OTViOWU0MWQtMTBhMS00Mzg5LTk0NjQtNTU2MTFhZjQxOTll',
 'ZTdmY2RlNDAtMmZkNC00YWNmLTg1NDgtOGEwNjE2NWFjYmFi',
 'OGI1ZjhmOWMtY2NhMC00M2ZkLTk2NjYtNDdmYTJiMzFjZDc2',
 'ODQ1ZmU2OTctM2ZlYy00NjI1LTliYTUtMDhlZTUzZjNmZTNi',
 'NmFiMzQ0MDQtMGU1Ni00Nzc1LWFhODUtZWNjOGJjMDM5ZDY4',
 'NzE2MmE0MjYtZDc1Ny00ZDBmLWFhNGEtM2IwMGNmZWU3YmFj',
 'NzkxMzQ2OGItZWEyNy00MzkxLTkyNGYtYzNjMDVkOWQxMzIz',
 'ODM3ZThmZDgtYWVkYS00ZmMzLWJkYjktM2E2MWE1MDUy

In [161]:
# using retrievalQA
from langchain.chains import RetrievalQA
qa_cog_union = RetrievalQA.from_llm(llm=llm, retriever=vector_store_union.as_retriever(), verbose=False)
qa_cog_union.run('What did the president say about Ketanji Brown Jackson')

'The president said that he has nominated Ketanji Brown Jackson to serve on the United States Supreme Court.'

In [162]:
# using similarity search
docus_similar_union = vector_store_union.similarity_search(query="What did the president say about Ketanji Brown Jackson", k=3, search_type='similarity')
print(docus_similar_union[0].page_content)

And we all know—no matter what your ideology, we all know one of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. As I did 4 days ago, I've nominated a Circuit Court of Appeals—Ketanji Brown Jackson.


### VectoreStoreIndex (Not Working)

In [163]:
# # use custom index with existing vectorstore. This opens up the ability to do query_with_sources etc
# from langchain.indexes import VectorstoreIndexCreator

# vector_store: AzureSearch = AzureSearch(azure_search_endpoint=vector_store_address,  
#                                         azure_search_key=vector_store_password,  
#                                         index_name=index_name,  
#                                         embedding_function=embeddings.embed_query) 

# index_wrapper = VectorstoreIndexCreator(
#     vectorstore_cls=AzureSearch,
#     embedding=OpenAIEmbeddings(chunk_size=1),
#     text_splitter=CharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
# )

In [164]:
# query = "What did the president say about Ketanji Brown Jackson"
# index_wrapper.query_with_sources(query)

### Google Drive Loader

In [20]:
from langchain.document_loaders import GoogleDriveLoader

In [36]:
# loader_drive = GoogleDriveLoader(document_ids=["1s_-umZEhfcNVcyEZ6FbOi0pPIr-UEadEbz_OyAerpqc"],credentials_path='cred-globe.json')
# loader_drive = GoogleDriveLoader(document_ids=["15pprQ_7recgz1aKNkeITReDjGPVOywzm0jDqMKLHdJA"])

# docs_drive = loader_drive.load()
# print(f'{len(docs_drive)} document/s, {len(docs_drive[0].page_content)} characters')

In [33]:
# document question and aswering
from langchain.chains.question_answering import load_qa_chain
query = 'What are the tracks offered by the cadetship program?'
chain_drive = load_qa_chain(llm, chain_type='stuff', verbose=False)
chain_drive.run(input_documents=docs_drive,question=query)

'The tracks offered by the cadetship program are AI/ML, Software Development, Product Management and Delivery, Security, IT Quality Management and Testing, and Platform Management.'

In [6]:
# text splitting 
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter_drive = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs_split_drive = text_splitter_drive.split_documents(docs_drive)

for i,doc in enumerate(docs_split_drive):
    print(f'Document {i+1}: {len(docs_split_drive[i].page_content)} characters')

Document 1: 992 characters
Document 2: 546 characters
Document 3: 947 characters
Document 4: 913 characters
Document 5: 717 characters
Document 6: 964 characters
Document 7: 915 characters
Document 8: 769 characters


In [7]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch

vector_store_address: str = 'https://globecogse.search.windows.net'
vector_store_password: str = 'Jk3aMWZNUeOdjiPZxDWgaWS4lXOY001YQcRTZXVUbZAzSeCiFO0l'
index_name: str = 'globevectorindex'

embeddings: OpenAIEmbeddings = OpenAIEmbeddings(chunk_size=1)  
vector_store_drive: AzureSearch = AzureSearch(azure_search_endpoint=vector_store_address,  
                                        azure_search_key=vector_store_password,  
                                        index_name=index_name,  
                                        embedding_function=embeddings.embed_query) 
                                        
vector_store_drive.add_documents(documents=docs_split_drive)

['ZjdjMGFkNzAtN2QyYi00Nzg0LTlkYmQtODBhZDkzZTk5ZDk3',
 'NWYzNWQwMWMtYWM3MC00MTAwLTk4ZTEtZTA1MzcwNTBjMzc1',
 'OGIwMmFiNmQtY2YyNC00YTI5LTkxNjItNGJkMzBjYTVmMjgz',
 'ODdiZGIwOTYtYjEwNS00YzhlLThjZmUtMDcyZWIwYzQ4MzQz',
 'ZmIyYTQ1NjItZTdlYS00OTU3LTg4MTMtZjljZjIwZDZkODg2',
 'NzAwMmU4NDEtZTEzNi00ZGEyLTgwZjktMWFiYTUwZDNhODZh',
 'MzViZTdjM2EtNDY4NC00NzdiLTgyY2ItNzU5YThhNDQyODY1',
 'ZTE2NTFjNDktZGEwYS00NTk0LTliZGYtN2EzMjM1N2JmNDY2']

In [8]:
# from langchain.chains import RetrievalQA
qa_drive = RetrievalQA.from_llm(llm=llm, retriever=vector_store_drive.as_retriever(k=1), verbose=True)
qa_drive.run('How do I apply for the technical development program?')

NameError: name 'RetrievalQA' is not defined

In [13]:
docus_similar_drive = vector_store_drive.similarity_search(query="How do I apply for the technical development program?", k=3, search_type='similarity')

In [None]:
from langchain.chains import RetrievalQA
qa_drive = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vector_store_drive.as_retriever(k=2),verbose=True)
qa_drive.run('How do I apply for the technical development program?')