## Integrating Google Search with LangChain for Intelligent Query Handling

In [74]:
import os
import re
from langchain.docstore.document import Document
from langchain import hub
from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.tools.retriever import create_retriever_tool
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

In [64]:

# Chat model handed to create_openai_tools_agent() when the agent is built.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0.7,
    max_retries=2,
    max_tokens=None,
    timeout=None,
)

In [49]:
# Credential variables for the Google and OpenAI clients. The values are
# blank in this file and must be filled in before the cells below can run.
os.environ.update(
    {
        "GOOGLE_CSE_ID": "",
        "GOOGLE_API_KEY": "",
        "OPENAI_API_KEY": "",
    }
)
 

In [84]:
# Fetch the "hwchase17/openai-tools-agent" prompt from the LangChain hub;
# it is passed to create_openai_tools_agent() when the agent is built.
prompt = hub.pull("hwchase17/openai-tools-agent")   

In [87]:
# Notebook display of the pulled prompt object (repr shown below).
prompt

ChatPromptTemplate(input_variables=['agent_scratchpad', 'input'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]], 'agent_scratchpad': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, metadata={'lc_hub_owner': 'hwchase17', 'lc_hub_repo': 'openai-tools-agent', 'lc_hub_commit_hash': 'c18672812789a3b9697656dd539edf0120285dcae36396d0b548ae42a4ed66f5'}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a helpful assistant')), MessagesPlacehold

In [51]:
# Embedding model used by retriever_tool() to build the FAISS index.
openai_embeddings = OpenAIEmbeddings()

In [7]:
def clean_text(raw_text):
    """Normalize scraped page text to single-spaced ASCII.

    Whitespace runs (newlines, tabs, and Unicode spaces such as the
    non-breaking space) are collapsed to a single space *before* non-ASCII
    characters are dropped, so words separated only by a Unicode space are
    not glued together.

    Args:
        raw_text: The text to clean.

    Returns:
        The cleaned text, with leading and trailing whitespace removed.
    """
    # ``\s`` also matches newlines, so one pass handles every whitespace run.
    cleaned_text = re.sub(r'\s+', ' ', raw_text)
    # Keep only 7-bit ASCII characters.
    cleaned_text = ''.join(char for char in cleaned_text if ord(char) < 128)
    # Dropping characters can leave neighbouring spaces behind; collapse again.
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    return cleaned_text.strip()


In [10]:
# Google Custom Search client, configured with k=6; used by top5_results().
search = GoogleSearchAPIWrapper(k = 6)

In [15]:
def top5_results(query, num_results=6):
    """Run a Google search for *query* and return the result entries.

    Args:
        query: Search string passed to the module-level ``search`` wrapper.
        num_results: Number of results to request. Defaults to 6, the count
            this function has always requested (despite the "top5" in its
            name), so existing callers see no change.

    Returns:
        Whatever ``search.results`` returns for the query.
    """
    return search.results(query, num_results)


In [83]:
 
def get_urls_of_results(results):
    """Collect the ``link`` value of every search result that has one.

    Entries without a ``link`` key are skipped rather than raising
    ``KeyError``. This matters when the search backend returns a placeholder
    entry instead of real hits (NOTE(review): GoogleSearchAPIWrapper is
    understood to return such an entry when nothing is found; confirm
    against its documentation).

    Args:
        results: Iterable of dict-like search results.

    Returns:
        A list of the ``link`` values, in the order the results were given.
    """
    return [
        each_result['link']
        for each_result in results
        if 'link' in each_result
    ]


In [71]:
def get_page_contents_from_url(urls):
    """Load the given URLs and return the text content of each loaded page.

    Args:
        urls: URLs handed to ``UnstructuredURLLoader``.

    Returns:
        A list with the ``page_content`` of every document the loader
        produced, in loader order.
    """
    loaded_pages = UnstructuredURLLoader(urls=urls).load()
    return [page.page_content for page in loaded_pages]



In [73]:
def get_documents(page_contents):
    """Clean the page texts, merge them, and split the result into chunks.

    Each page is passed through ``clean_text``. The cleaned pages are joined
    with a single space so that the last word of one page is not fused with
    the first word of the next; pages that are empty after cleaning are
    skipped so they add no stray separators.

    Args:
        page_contents: Iterable of raw page text strings.

    Returns:
        The list of ``Document`` chunks produced by
        ``RecursiveCharacterTextSplitter`` (chunk_size=500, chunk_overlap=80).
    """
    cleaned_pages = (
        clean_text(each_page_content) for each_page_content in page_contents
    )
    # str.join avoids the quadratic cost of repeated ``+=`` on a string.
    cleaned_text = " ".join(page for page in cleaned_pages if page)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=80)
    return text_splitter.split_documents([Document(page_content=cleaned_text)])


In [75]:
def retriever_tool(documents):
    """Index *documents* in FAISS and wrap the retriever as an agent tool.

    Args:
        documents: Documents to embed with the module-level
            ``openai_embeddings`` and store in a FAISS index.

    Returns:
        A one-element list containing the ``fetch_data_from_documents``
        retriever tool; the retriever is configured with ``k=7``.
    """
    vector_store = FAISS.from_documents(documents, openai_embeddings)
    document_retriever = vector_store.as_retriever(search_kwargs={"k": 7})
    document_tool = create_retriever_tool(
        document_retriever,
        "fetch_data_from_documents",
        """your task is to retrieve information from the provided documents relevant to the queries I ask. Retrieve the most relevant information from the entire corpus of documents.""",
    )
    return [document_tool]
    
    
    

In [78]:
def agent_executor(tools):
    """Build an ``AgentExecutor`` around an OpenAI-tools agent.

    The agent is created from the module-level ``llm`` and ``prompt``
    together with the supplied tools.

    Args:
        tools: Tools the agent may call.

    Returns:
        An ``AgentExecutor`` that returns intermediate steps and handles
        parsing errors.
    """
    return AgentExecutor.from_agent_and_tools(
        agent=create_openai_tools_agent(llm, tools, prompt),
        tools=tools,
        return_intermediate_steps=True,
        handle_parsing_errors=True,
    )


In [96]:
# End-to-end run: search -> URLs -> page text -> chunks -> retriever tool -> agent.
query = "what are the teams qualified for t20 world cup"

# Search Google for the query and keep the link of each hit.
results = top5_results(query)
urls = get_urls_of_results(results)
# Download the pages and split their cleaned text into chunks.
page_contents = get_page_contents_from_url(urls)
documents = get_documents(page_contents)
# Index the chunks and build the agent that can query them.
tools = retriever_tool(documents)
executor = agent_executor(tools)

In [97]:
# Ask the agent the same query that was used for the web search.
result = executor.invoke({"input": query})    

In [82]:
# Notebook display of the agent's final answer.
result['output']

'The teams that have qualified for the T20 World Cup Super 8 stage are India, Australia, Afghanistan, South Africa, USA, and West Indies.'

In [None]:
# Overwrite the template of the prompt's first (system) message so the agent
# is asked for descriptive answers. The nested template is addressed
# explicitly instead of relying on the iteration order of the message's fields.
# NOTE: `result` is not recomputed here; call executor.invoke() again to get
# an answer produced with the new system message.
prompt.messages[0].prompt.template = "You are a helpful AI assistant, your task is generate descriptive answers for the given query from the provided documents"

In [98]:
# Notebook display of the agent's final answer.
# NOTE(review): in file order `result` is not reassigned after the prompt
# change; presumably the invoke cell was re-run before this output was
# captured. Confirm.
result['output']

'The teams that have qualified for the T20 World Cup Super 8 stage are India, Australia, Afghanistan, South Africa, USA, and West Indies. These teams secured their places in the Super 8 by finishing in the top two positions in their respective groups during the tournament.'