<a href="https://colab.research.google.com/github/danielsteman/textifai/blob/notebooks/Langchain_Research_Assistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from IPython.display import clear_output

In [None]:
# Install the notebook's third-party dependencies (-q: quiet, -U: upgrade to latest).
# NOTE(review): versions are unpinned, so a later re-run may pull incompatible releases — consider pinning.
!pip install -qU python-magic openai langchain pypdf chromadb tiktoken pinecone-client
# Remove the installer output from the cell.
clear_output()

In [None]:
import requests
import openai
import pinecone
from dotenv import load_dotenv


import langchain
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain import OpenAI, VectorDBQA
from langchain.chains import RetrievalQAWithSourcesChain, RetrievalQA
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.indexes import VectorstoreIndexCreator
from langchain.document_loaders import DirectoryLoader, PyPDFLoader

import nltk
# Fetch the NLTK data packages; 'stopwords' backs preprocess_query().
# NOTE(review): 'punkt' is not referenced by any cell visible in this part of the notebook — confirm it is needed.
nltk.download('punkt')
nltk.download('stopwords')
import os

# Mount Google Drive so the PDF folder under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from getpass import getpass

In [None]:
# Pinecone Config
# Entered interactively through getpass so the values are not written into the notebook.
PINECONE_API_KEY = getpass('Pinecone API Key:')
PINECONE_ENV = getpass('Pinecone Env:')

# OpenAI Config
# Stored in the process environment; later cells read os.environ['OPENAI_API_KEY'].
os.environ['OPENAI_API_KEY'] = getpass('OpenAI API Key:')

# Semantic Scholar Config
# Sent as the x-api-key header by search_literature().
S2_API_KEY = getpass('Semantic Scholar API Key:')

Pinecone API Key:··········
Pinecone Env:··········
OpenAI API Key:··········
Semantic Scholar API Key:··········


In [None]:
index_name = 'textifai'

# Connect the Pinecone client with the credentials entered earlier.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

# Only create the index when it is missing; otherwise report that it is reused.
if index_name in pinecone.list_indexes():
    print("Index already exists")
else:
    pinecone.create_index(
        name=index_name,
        dimension=1536,  # 1536 dim of text-embedding-ada-002
        metric='cosine',
    )

In [None]:
# Handle to the Pinecone index named by index_name; used later to inspect index statistics.
index = pinecone.Index(index_name)

# Set up Functions

In [None]:
def search_literature(query, limit=20, fields=["title", "authors", "year", "openAccessPdf", "abstract"], timeout=30):
    """Search Semantic Scholar for papers matching a free-text query.

    The query is first normalised with preprocess_query() and then sent to the
    Graph API paper-search endpoint.

    Args:
        query: Free-text search string.
        limit: Maximum number of papers to request.
        fields: Names of the paper fields to include in each result.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status
            (for example an invalid key or rate limiting).
        requests.RequestException: On connection problems or timeout.
    """
    # `fields` is only read, never mutated, so the shared list default is safe.
    # Passing the values through `params` lets requests URL-encode them, so
    # characters such as '&', '#' or '%' in the query cannot corrupt the URL
    # (spaces are still sent as '+').
    params = {
        "query": preprocess_query(query),
        "limit": limit,
        "fields": ",".join(fields),
    }
    headers = {"Accept": "*/*", "x-api-key": S2_API_KEY}

    response = requests.get(
        "https://api.semanticscholar.org/graph/v1/paper/search",
        params=params,
        headers=headers,
        timeout=timeout,  # without a timeout a stalled connection would hang the cell
    )
    # Surface API errors instead of returning an error payload as if it were a result.
    response.raise_for_status()
    return response.json()

In [None]:
def preprocess_query(query):
    """Lower-case a query and drop English stopwords.

    Args:
        query: Free-text query string.

    Returns:
        The remaining whitespace-separated words joined by single spaces.
        Punctuation attached to a word is kept as part of that word.
    """
    lowered = query.lower()
    english_stopwords = set(nltk.corpus.stopwords.words("english"))

    kept_words = []
    for token in lowered.split():
        if token in english_stopwords:
            continue
        kept_words.append(token)

    return " ".join(kept_words)

In [None]:
def load_documents_pdf(path):
    """Load the PDF files matching ``*.pdf`` in a directory.

    Args:
        path: Directory to scan.

    Returns:
        Whatever ``DirectoryLoader.load()`` yields when each matching file is
        read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(path, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [None]:
def split_text(documents, chunk_size=1000, chunk_overlap=0):
    """Split documents into chunks with a RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split.
        chunk_size: Passed through as the splitter's ``chunk_size``.
        chunk_overlap: Passed through as the splitter's ``chunk_overlap``.

    Returns:
        The result of ``split_documents(documents)``.
    """
    # Separator priority: paragraph break, line break, space, then any character.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separators": ["\n\n", "\n", " ", ""],
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

In [None]:
def create_embeddings(model='text-embedding-ada-002'):
    """Build an OpenAIEmbeddings client.

    Args:
        model: Name of the OpenAI embedding model to use.

    Returns:
        An ``OpenAIEmbeddings`` instance configured with the API key read from
        the ``OPENAI_API_KEY`` environment variable.

    Raises:
        KeyError: If ``OPENAI_API_KEY`` is not set in the environment.
    """
    api_key = os.environ['OPENAI_API_KEY']
    return OpenAIEmbeddings(openai_api_key=api_key, model=model)

In [None]:
# Create embeddings and index them in the Pinecone instance
# NOTE(review): the Drive folder is hard-coded — adjust it to wherever the PDFs live.
documents = load_documents_pdf('/content/drive/MyDrive/Colab Notebooks/docs/')
# Split with the defaults of split_text (chunk_size=1000, chunk_overlap=0).
docs = split_text(documents)
# Only builds the embedding client; the upload happens in the Pinecone.from_documents cell.
embed = create_embeddings()

In [None]:
# Set up vectorstore
# Builds the store from `docs` using `embed`, targeting the Pinecone index named by index_name.
# NOTE(review): re-running this cell presumably inserts the same chunks again — confirm before re-running.
vectorstore = Pinecone.from_documents(docs, embed, index_name=index_name)

In [None]:
# Sanity check: display the index dimension and the number of stored vectors.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 301}},
 'total_vector_count': 301}

# Test code for searching literature

In [None]:
# Smoke-test the Semantic Scholar search with a natural-language question.
# NOTE(review): the query contains typos ("Im", "explaineble") that preprocess_query does not
# correct; this may be why the recorded run returned a single, unrelated paper.
query = 'Im searching for literature how explaineble ai helps employees understand ai?'
search_literature(query)

{'total': 1,
 'offset': 0,
 'data': [{'paperId': '09cbf4462e8e5f47505286401ed78dd078a0be20',
   'title': "How IT Influence Chinese ITP's Job Burnout in the COVID-19 epidemic outbreak period?",
   'abstract': "Since 2000, the speedy development of information technology (IT), Internet, information systems (IS), social networks service (SNS), and artificial intelligence (AI), is indeed a significant issue to support the enterprises to offer their products and services to their global customers. Meanwhile, adopting these technologies to help employees work also pushes IT development progress from the SARS crisis in 2003, to the COVID-19 epidemic outbreak period At the same time, the information systems (IS) is a core capability of an organization in controlling the critical downstream and upstream data. Therefore, how to develop a robust IS to support the globalization of enterprises has been the critical issue from the 2008 financial crisis to the COVID-19 epidemic outbreak period. Since

# Initializing the Conversational Agent

In [None]:
# Prompts for the "Research Assistant" chain.
# Question prompt: receives one document portion as {context} together with the {question}
# and asks for the relevant text; it is passed as `question_prompt` to the map_reduce chain.
kb_question_prompt_template = """Use the following portion of a long document to see if any of the text is relevant to answer the question. 

{context}
Question: {question}
Relevant text, if any:"""
KB_QUESTION_PROMPT = PromptTemplate(
    template=kb_question_prompt_template, input_variables=["context", "question"]
)

# Combine prompt: receives the collected {summaries} and the {question} and asks for a
# final answer that always ends with a "SOURCES" list.
kb_combine_prompt_template = """
You are a professional research assistant and reacts to questions in a professional manner. 

Given the following extracted parts of a scientific paper and a question.  
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
Create a final answer with references ("SOURCES")
ALWAYS return a "SOURCES" part at the end of your answer. Return sources as a list of strings, e.g. ["source1", "source2", ...]

QUESTION: {question}
=========
{summaries}
=========
FINAL ANSWER:"""

KB_COMBINE_PROMPT = PromptTemplate(
    template=kb_combine_prompt_template, input_variables=["summaries", "question"]
)

In [None]:
# Prompts for the "Financial Analyst" chain.
# Question prompt: same text as the research-assistant question prompt; receives one document
# portion as {context} together with the {question}.
fa_question_prompt_template = """Use the following portion of a long document to see if any of the text is relevant to answer the question. 

{context}
Question: {question}
Relevant text, if any:"""
FA_QUESTION_PROMPT = PromptTemplate(
    template=fa_question_prompt_template, input_variables=["context", "question"]
)

# Combine prompt: receives the collected {summaries} and the {question}.
# NOTE(review): unlike the research-assistant combine prompt, this one has no
# 'ALWAYS return a "SOURCES" part' instruction — confirm whether that is intentional.
fa_combine_prompt_template = """
You are a professional financial analyst and reacts to questions in a professional manner. 

Given the following extracted parts of a financial report and a question.  
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
Create a final answer with references ("SOURCES")

QUESTION: {question}
=========
{summaries}
=========
FINAL ANSWER:"""

FA_COMBINE_PROMPT = PromptTemplate(
    template=fa_combine_prompt_template, input_variables=["summaries", "question"]
)

In [None]:
# Chat-completion LLM shared by the agent and by both retrieval chains
# (deterministic output: temperature 0).
llm = ChatOpenAI(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    model_name='gpt-3.5-turbo',
    temperature=0.0
)

# Conversational memory: a window of k=5 over the chat history, exposed to the
# agent under the 'chat_history' key as message objects.
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True
)


def _build_sources_chain(question_prompt, combine_prompt):
    """Create a map_reduce QA-with-sources chain for one prompt pair.

    Both personas (research assistant, financial analyst) use the same chain
    configuration and differ only in their prompts, so the construction lives
    here instead of being copy-pasted.

    Args:
        question_prompt: Prompt applied to each retrieved document portion.
        combine_prompt: Prompt that merges the per-portion results into the
            final answer.

    Returns:
        The chain returned by ``load_qa_with_sources_chain``.
    """
    return load_qa_with_sources_chain(
        llm,  # same model settings the original per-chain ChatOpenAI instances used
        chain_type="map_reduce",
        return_intermediate_steps=False,
        question_prompt=question_prompt,
        combine_prompt=combine_prompt,
    )


# Research-assistant retrieval QA over the Pinecone vector store.
kb_chain = _build_sources_chain(KB_QUESTION_PROMPT, KB_COMBINE_PROMPT)
kb = RetrievalQA(combine_documents_chain=kb_chain, retriever=vectorstore.as_retriever())

# Financial-analyst retrieval QA over the same vector store.
fa_chain = _build_sources_chain(FA_QUESTION_PROMPT, FA_COMBINE_PROMPT)
fa = RetrievalQA(combine_documents_chain=fa_chain, retriever=vectorstore.as_retriever())

In [None]:
from langchain.agents import Tool

# Tools the agent can route a question to. The `description` text is what the
# agent's LLM reads when choosing a tool, so it should be spelled correctly and
# clearly separate the two use cases.
tools = [
    Tool(
        name='Research Assistant',
        func=kb.run,
        description=(
            'use this tool when answering general knowledge queries about scientific or research related topics to get more information'
        )
    ),
    Tool(
        name='Financial Analyst',
        func=fa.run,
        description=(
            'use this tool when answering questions about finances, financial statements or financial questions in general'
        )
    )
]

In [None]:
from langchain.agents import initialize_agent

# Conversational agent that can call the two retrieval tools and keeps the
# chat history in `conversational_memory`.
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent='chat-conversational-react-description',
    memory=conversational_memory,
    max_iterations=3,
    early_stopping_method='generate',
    verbose=True,  # print the intermediate action/observation trace
)

In [None]:
# Opening turn: a plain greeting; the exchange is kept in conversational_memory for later turns.
agent("Hello, my name is Thijs")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "Hello Thijs! How can I assist you today?"
}[0m

[1m> Finished chain.[0m


{'input': 'Hello, my name is Thijs',
 'chat_history': [],
 'output': 'Hello Thijs! How can I assist you today?'}

In [None]:
# Document question about the Patagonia report.
# NOTE(review): the question seems to be missing a word ("In which year ...") — confirm the intended wording.
agent("In which Patagonia plans to be carbon neutral?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Research Assistant",
    "action_input": "Patagonia carbon neutral plan"
}[0m
Observation: [36;1m[1;3mThere is no relevant information in the provided document about Patagonia's carbon neutral plan. 
SOURCES: /content/drive/MyDrive/Colab Notebooks/docs/Patagonia-2021-BCorp-Report-Updated-2-15-22.pdf[0m
Thought:[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "I'm sorry, but I was unable to find any information about Patagonia's carbon neutral plan in the provided document. It's possible that more information is available elsewhere, or that Patagonia has not yet released a public plan for achieving carbon neutrality." 
}[0m

[1m> Finished chain.[0m


{'input': 'In which Patagonia plans to be carbon neutral?',
 'chat_history': [HumanMessage(content='Hello, my name is Thijs', additional_kwargs={}, example=False),
  AIMessage(content='Hello Thijs! How can I assist you today?', additional_kwargs={}, example=False)],
 'output': "I'm sorry, but I was unable to find any information about Patagonia's carbon neutral plan in the provided document. It's possible that more information is available elsewhere, or that Patagonia has not yet released a public plan for achieving carbon neutrality."}

In [None]:
# Follow-up using a pronoun ("its"): the agent has to resolve the company from the chat history.
agent("What was its revenue?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Financial Analyst",
    "action_input": "What was Patagonia's revenue in the last fiscal year?"
}[0m
Observation: [33;1m[1;3mI'm sorry, but there is no information available in the provided financial report to answer the question about Patagonia's revenue in the last fiscal year. No revenue figures were mentioned in the report. 
Source: /content/drive/MyDrive/Colab Notebooks/docs/Patagonia-2021-BCorp-Report-Updated-2-15-22.pdf[0m
Thought:[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "I'm sorry, but there is no information available in the provided financial report to answer the question about Patagonia's revenue in the last fiscal year. No revenue figures were mentioned in the report."
}[0m

[1m> Finished chain.[0m


{'input': 'What was its revenue?',
 'chat_history': [HumanMessage(content='Hello, my name is Thijs', additional_kwargs={}, example=False),
  AIMessage(content='Hello Thijs! How can I assist you today?', additional_kwargs={}, example=False),
  HumanMessage(content='In which Patagonia plans to be carbon neutral?', additional_kwargs={}, example=False),
  AIMessage(content="I'm sorry, but I was unable to find any information about Patagonia's carbon neutral plan in the provided document. It's possible that more information is available elsewhere, or that Patagonia has not yet released a public plan for achieving carbon neutrality.", additional_kwargs={}, example=False)],
 'output': "I'm sorry, but there is no information available in the provided financial report to answer the question about Patagonia's revenue in the last fiscal year. No revenue figures were mentioned in the report."}

In [None]:
# Factual question about grants in the Patagonia report.
# NOTE(review): the recorded output shows a different input string (ending in "in 2021?"),
# so the saved output appears to come from an earlier version of this cell — re-run to refresh it.
agent("How many grants did Patagonia made to groups fighting to protect lands, waters and wildlife?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Research Assistant",
    "action_input": "How many grants did Patagonia make to groups fighting to protect lands, waters and wildlife in 2021?"
}[0m
Observation: [36;1m[1;3mBased on the available information, there is no text in the Patagonia 2021 BCorp Report that provides the number of grants made to groups fighting to protect lands, waters and wildlife specifically in 2021. The report only provides information about grants and donations made in the past. 
SOURCES: 
- /content/drive/MyDrive/Colab Notebooks/docs/Patagonia-2021-BCorp-Report-Updated-2-15-22.pdf[0m
Thought:[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "There is no information available in the Patagonia 2021 BCorp Report about the number of grants made to groups fighting to protect lands, waters and wildlife specifically in 2021."
}[0m

[1m> Finished chain.[0m


{'input': 'How many grants did Patagonia made to groups fighting to protect lands, waters and wildlife in 2021?',
 'chat_history': [HumanMessage(content='Hello, my name is Thijs', additional_kwargs={}, example=False),
  AIMessage(content='Hello Thijs! How can I assist you today?', additional_kwargs={}, example=False)],
 'output': 'There is no information available in the Patagonia 2021 BCorp Report about the number of grants made to groups fighting to protect lands, waters and wildlife specifically in 2021.'}

In [None]:
# Same question asked again as a follow-up, this time without restricting it to a year.
agent("Then how many grants did Patagonia made to groups fighting to protect lands, waters and wildlife?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Research Assistant",
    "action_input": "How many grants did Patagonia make to groups fighting to protect lands, waters and wildlife?"
}
```[0m
Observation: [36;1m[1;3mPatagonia made 705 grants to groups fighting to protect lands, waters and wildlife in FY21, totaling $5,700,000+. 
SOURCES: Patagonia-2021-BCorp-Report-Updated-2-15-22.pdf[0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "Patagonia made 705 grants to groups fighting to protect lands, waters and wildlife in FY21, totaling $5,700,000+."
}
```[0m

[1m> Finished chain.[0m


{'input': 'Then how many grants did Patagonia made to groups fighting to protect lands, waters and wildlife?',
 'chat_history': [HumanMessage(content='Hello, my name is Thijs', additional_kwargs={}, example=False),
  AIMessage(content='Hello Thijs! How can I assist you today?', additional_kwargs={}, example=False),
  HumanMessage(content='How many grants did Patagonia made to groups fighting to protect lands, waters and wildlife in 2021?', additional_kwargs={}, example=False),
  AIMessage(content='There is no information available in the Patagonia 2021 BCorp Report about the number of grants made to groups fighting to protect lands, waters and wildlife specifically in 2021.', additional_kwargs={}, example=False)],
 'output': 'Patagonia made 705 grants to groups fighting to protect lands, waters and wildlife in FY21, totaling $5,700,000+.'}

In [None]:
# Question on a different topic (gendered wording in job advertisements).
# NOTE(review): "commumal" in the question looks like a typo for "communal".
agent("What are typical words that are used in job advertisements that are seen as commumal and which words are seen as agentic?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Research Assistant",
    "action_input": "What are typical words used in job advertisements that are seen as communal and which words are seen as agentic?"
}[0m
Observation: [36;1m[1;3mUnfortunately, specific words used in job advertisements that are seen as communal or agentic are not discussed in the sources provided. However, published lists of agentic and communal words include individualistic, competitive, committed, caring, supportive, and responsible. It is important to balance communal and agentic traits in job requirements to minimize gender biases. 
SOURCES: 
- Horvath, L. K., & Sczesny, S. (2015). Reducing gender bias in a traditionally male dominated field: How language reforms can contribute to a more balanced representation of women in STEM. Frontiers in psychology, 6, 812. 
- Gaucher, D., Friesen, J., & Kay, A. C. (2011). Evidence that gendered wording in job advertisements exists and sustai

{'input': 'What are typical words that are used in job advertisements that are seen as commumal and which words are seen as agentic?',
 'chat_history': [HumanMessage(content='Hello, my name is Thijs', additional_kwargs={}, example=False),
  AIMessage(content='Hello Thijs! How can I assist you today?', additional_kwargs={}, example=False),
  HumanMessage(content='How many grants did Patagonia made to groups fighting to protect lands, waters and wildlife in 2021?', additional_kwargs={}, example=False),
  AIMessage(content='There is no information available in the Patagonia 2021 BCorp Report about the number of grants made to groups fighting to protect lands, waters and wildlife specifically in 2021.', additional_kwargs={}, example=False),
  HumanMessage(content='Then how many grants did Patagonia made to groups fighting to protect lands, waters and wildlife?', additional_kwargs={}, example=False),
  AIMessage(content='Patagonia made 705 grants to groups fighting to protect lands, waters 