# Wiki Assistant

In [11]:
import os
import time
from dotenv import load_dotenv
load_dotenv()
import pandas as pd
from operator import itemgetter

# langchain 
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chat_models import ChatOpenAI
from langchain import hub
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough, RunnableParallel
from langchain.prompts import PromptTemplate, ChatPromptTemplate, MessagesPlaceholder
from langchain.schema.messages import AIMessage, HumanMessage

from pinecone_datasets import load_dataset
import pinecone

# internal imports
from utils import chunks

# Step1: Load data

In [2]:
# Fetch the pre-embedded Simple Wikipedia sample through pinecone_datasets
# and preview its first rows.
dataset_name = 'wikipedia-simple-text-embedding-ada-002-100K'
dataset = load_dataset(dataset_name)
dataset.head()

Unnamed: 0,id,values,sparse_values,metadata,blob
0,1-0,"[-0.011254455894231796, -0.01698738895356655, ...",,,"{'chunk': 0, 'source': 'https://simple.wikiped..."
1,1-1,"[-0.0015197008615359664, -0.007858820259571075...",,,"{'chunk': 1, 'source': 'https://simple.wikiped..."
2,1-2,"[-0.009930099360644817, -0.012211072258651257,...",,,"{'chunk': 2, 'source': 'https://simple.wikiped..."
3,1-3,"[-0.011600767262279987, -0.012608098797500134,...",,,"{'chunk': 3, 'source': 'https://simple.wikiped..."
4,1-4,"[-0.026462381705641747, -0.016362832859158516,...",,,"{'chunk': 4, 'source': 'https://simple.wikiped..."


In [18]:
# Grab the dataset's documents as a pandas DataFrame (columns shown below:
# id, values, sparse_values, metadata, blob) and display it.
document_df: pd.DataFrame = dataset.documents
document_df

Unnamed: 0,id,values,sparse_values,metadata,blob
0,1-0,"[-0.011254455894231796, -0.01698738895356655, ...",,,"{'chunk': 0, 'source': 'https://simple.wikiped..."
1,1-1,"[-0.0015197008615359664, -0.007858820259571075...",,,"{'chunk': 1, 'source': 'https://simple.wikiped..."
2,1-2,"[-0.009930099360644817, -0.012211072258651257,...",,,"{'chunk': 2, 'source': 'https://simple.wikiped..."
3,1-3,"[-0.011600767262279987, -0.012608098797500134,...",,,"{'chunk': 3, 'source': 'https://simple.wikiped..."
4,1-4,"[-0.026462381705641747, -0.016362832859158516,...",,,"{'chunk': 4, 'source': 'https://simple.wikiped..."
...,...,...,...,...,...
99995,234934-0,"[0.004071333445608616, 0.007617993280291557, -...",,,"{'chunk': 0, 'source': 'https://simple.wikiped..."
99996,234940-0,"[-0.019249631091952324, 0.0005083759315311909,...",,,"{'chunk': 0, 'source': 'https://simple.wikiped..."
99997,234940-1,"[-0.009684711694717407, 0.005518495570868254, ...",,,"{'chunk': 1, 'source': 'https://simple.wikiped..."
99998,234940-2,"[-0.012945059686899185, 0.004518118686974049, ...",,,"{'chunk': 2, 'source': 'https://simple.wikiped..."


In [48]:
# Save to a pickle file in order to keep the column types when reloading.
# to_pickle does not create missing parent directories, so make sure
# ./data exists first; otherwise a fresh checkout fails here.
os.makedirs('./data', exist_ok=True)
document_df.to_pickle('./data/wikipedia-simple-text-embedding-ada-002-100K.pkl')

In [53]:
# Reload the pickled frame and reshape it into id / values / metadata:
# 'sparse_values' and the original 'metadata' column are dropped, and 'blob'
# takes over the 'metadata' name.
df = (
    pd.read_pickle('./data/wikipedia-simple-text-embedding-ada-002-100K.pkl')
    .drop(columns=['sparse_values', 'metadata'])
    .rename(columns={'blob': 'metadata'})
)
# Report the Python type stored in each remaining column (taken from row 0).
for column_name in df.columns:
    print(f"{column_name}: {type(df[column_name][0])}")
df.head()

id: <class 'str'>
values: <class 'numpy.ndarray'>
metadata: <class 'dict'>


Unnamed: 0,id,values,metadata
0,1-0,"[-0.011254455894231796, -0.01698738895356655, ...","{'chunk': 0, 'source': 'https://simple.wikiped..."
1,1-1,"[-0.0015197008615359664, -0.007858820259571075...","{'chunk': 1, 'source': 'https://simple.wikiped..."
2,1-2,"[-0.009930099360644817, -0.012211072258651257,...","{'chunk': 2, 'source': 'https://simple.wikiped..."
3,1-3,"[-0.011600767262279987, -0.012608098797500134,...","{'chunk': 3, 'source': 'https://simple.wikiped..."
4,1-4,"[-0.026462381705641747, -0.016362832859158516,...","{'chunk': 4, 'source': 'https://simple.wikiped..."


# Step2: Create index

In [12]:
# Connect to Pinecone using the API key from the environment and make sure
# the 'wiki-pages' index exists before getting a handle to it.
pinecone_key = os.getenv('PINECONE_API_KEY')

pinecone.init(api_key=pinecone_key, environment='gcp-starter')
index_name = 'wiki-pages'
if index_name not in pinecone.list_indexes():
    pinecone.create_index(name=index_name, metric='cosine', dimension=1536)  # returned dim from text-embeddings-ada-002 openai
    # A fixed sleep does not guarantee the new index is usable: poll the
    # reported status instead, and give up after a bounded wait.
    deadline = time.monotonic() + 120
    while not pinecone.describe_index(index_name).status['ready']:
        if time.monotonic() > deadline:
            raise TimeoutError(f"Index {index_name} was not ready after 120 seconds")
        time.sleep(1)
    print(f" Created: {pinecone.list_indexes()[0]}")   # there is only one index in this starter plan
else:
    print(f'{index_name} already exists')
index = pinecone.Index(index_name)
index.describe_index_stats()

wiki-pages already exists


{'dimension': 1536,
 'index_fullness': 1.0,
 'namespaces': {'': {'vector_count': 100000}},
 'total_vector_count': 100000}

### Step2.1: upsert data

In [60]:
# convert np.ndarray to list
df['values'] = df['values'].apply(lambda x: x.tolist())

batch_number = 0
for chunk in chunks(df, batch_size=100):
    batch_number += 1
    print(f"Batch number: {batch_number}")
    index.upsert(vectors=chunk.to_dict(orient='records'))
    
time.sleep(2)

Batch number: 1
Batch number: 2
Batch number: 3
Batch number: 4
Batch number: 5
Batch number: 6
Batch number: 7
Batch number: 8
Batch number: 9
Batch number: 10
Batch number: 11
Batch number: 12
Batch number: 13
Batch number: 14
Batch number: 15
Batch number: 16
Batch number: 17
Batch number: 18
Batch number: 19
Batch number: 20
Batch number: 21
Batch number: 22
Batch number: 23
Batch number: 24
Batch number: 25
Batch number: 26
Batch number: 27
Batch number: 28
Batch number: 29
Batch number: 30
Batch number: 31
Batch number: 32
Batch number: 33
Batch number: 34
Batch number: 35
Batch number: 36
Batch number: 37
Batch number: 38
Batch number: 39
Batch number: 40
Batch number: 41
Batch number: 42
Batch number: 43
Batch number: 44
Batch number: 45
Batch number: 46
Batch number: 47
Batch number: 48
Batch number: 49
Batch number: 50
Batch number: 51
Batch number: 52
Batch number: 53
Batch number: 54
Batch number: 55
Batch number: 56
Batch number: 57
Batch number: 58
Batch number: 59
Batch 

In [3]:
# Display the index statistics (dimension, fullness, vector counts) after the upsert.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 1.0,
 'namespaces': {'': {'vector_count': 100000}},
 'total_vector_count': 100000}

# Step3: Query

In [13]:
# Embedding client used to embed queries; the key is read from the environment.
openai_key = os.environ.get('OPENAI_API_KEY')
embeddings = OpenAIEmbeddings(openai_api_key=openai_key)

# Load the already created index as a LangChain vector store.
docsearch = Pinecone.from_existing_index(index_name, embeddings)

# Run a plain similarity search and show the five closest documents.
query = "Tell me about the Brazilian researcher Marcos Kalinowski"
docs = docsearch.similarity_search(query=query, k=5)
docs

[Document(page_content='Marcos Alberto Skavinski (born 28 March 1975) is a Brazilian football player. He plays for Goiás.\n\nClub career statistics\n\n|-\n|2006||Kawasaki Frontale||J. League 1||28||2||2||0||9||1||39||3\n28||2||2||0||9||1||39||3\n28||2||2||0||9||1||39||3\n|}\n\n1975 births\nLiving people\nBrazilian footballers\nPeople from Curitiba', metadata={'chunk': 0.0, 'source': 'https://simple.wikipedia.org/wiki/Marcos%20Alberto%20Skavinski', 'title': 'Marcos Alberto Skavinski', 'wiki-id': '200643'}),
 Document(page_content='Marcos Gomes de Araujo (born 23 March 1976) is a Brazilian football player. He plays for Kashima Antlers.\n\nClub career statistics', metadata={'chunk': 0.0, 'source': 'https://simple.wikipedia.org/wiki/Marcos%20Gomes%20de%20Araujo', 'title': 'Marcos Gomes de Araujo', 'wiki-id': '199408'}),
 Document(page_content='Marcelo Baron Polanczyk (born 19 January 1974) is a former Brazilian football player.\n\nClub career statistics', metadata={'chunk': 0.0, 'source': 

In [14]:
# Print each hit as: title line, page text, separator line.
for hit in docs:
    print(
        f"Title: {hit.metadata['title']}",
        hit.page_content,
        '-------------------',
        sep="\n",
    )

Title: Marcos Alberto Skavinski
Marcos Alberto Skavinski (born 28 March 1975) is a Brazilian football player. He plays for Goiás.

Club career statistics

|-
|2006||Kawasaki Frontale||J. League 1||28||2||2||0||9||1||39||3
28||2||2||0||9||1||39||3
28||2||2||0||9||1||39||3
|}

1975 births
Living people
Brazilian footballers
People from Curitiba
-------------------
Title: Marcos Gomes de Araujo
Marcos Gomes de Araujo (born 23 March 1976) is a Brazilian football player. He plays for Kashima Antlers.

Club career statistics
-------------------
Title: Marcelo Baron Polanczyk
Marcelo Baron Polanczyk (born 19 January 1974) is a former Brazilian football player.

Club career statistics
-------------------
Title: Warwick Estevam Kerr
Warwick Estevam Kerr (born September 9, 1922, Santana do Parnaíba, São Paulo, Brazil- 15 September 2018) is a Brazilian engineer, geneticist, entomologist, and professor. He made many discoveries in the genetics and sex determination of bees. He is also responsible 

# Step4: Retrieve
search_type (Optional[str]) – Defines the type of search that the Retriever should perform. Can be:
 - “similarity” (default), 
 - “mmr”, or 
 - “similarity_score_threshold”.

search_kwargs (Optional[Dict]) – Keyword arguments to pass to the search function. Can include things like:

- k: Amount of documents to return (Default: 4),
- score_threshold: Minimum relevance threshold for similarity_score_threshold
- fetch_k: Amount of documents to pass to MMR algorithm (Default: 20) 
- lambda_mult: Diversity of results returned by MMR; 1 for minimum diversity and 0 for maximum. (Default: 0.5)
- filter: Filter by document metadata

In [15]:
# Wrap the vector store as a retriever that returns the 6 most similar documents.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=6),
)

# Fetch documents for the same query used in the previous step.
retrieved_docs = retriever.get_relevant_documents(query=query)
retrieved_docs

[Document(page_content='Marcos Alberto Skavinski (born 28 March 1975) is a Brazilian football player. He plays for Goiás.\n\nClub career statistics\n\n|-\n|2006||Kawasaki Frontale||J. League 1||28||2||2||0||9||1||39||3\n28||2||2||0||9||1||39||3\n28||2||2||0||9||1||39||3\n|}\n\n1975 births\nLiving people\nBrazilian footballers\nPeople from Curitiba', metadata={'chunk': 0.0, 'source': 'https://simple.wikipedia.org/wiki/Marcos%20Alberto%20Skavinski', 'title': 'Marcos Alberto Skavinski', 'wiki-id': '200643'}),
 Document(page_content='Marcos Gomes de Araujo (born 23 March 1976) is a Brazilian football player. He plays for Kashima Antlers.\n\nClub career statistics', metadata={'chunk': 0.0, 'source': 'https://simple.wikipedia.org/wiki/Marcos%20Gomes%20de%20Araujo', 'title': 'Marcos Gomes de Araujo', 'wiki-id': '199408'}),
 Document(page_content='Marcelo Baron Polanczyk (born 19 January 1974) is a former Brazilian football player.\n\nClub career statistics', metadata={'chunk': 0.0, 'source': 

In [16]:
# Print each retrieved document as: title line, page text, separator line.
for hit in retrieved_docs:
    print(
        f"Title: {hit.metadata['title']}",
        hit.page_content,
        '-------------------',
        sep="\n",
    )

Title: Marcos Alberto Skavinski
Marcos Alberto Skavinski (born 28 March 1975) is a Brazilian football player. He plays for Goiás.

Club career statistics

|-
|2006||Kawasaki Frontale||J. League 1||28||2||2||0||9||1||39||3
28||2||2||0||9||1||39||3
28||2||2||0||9||1||39||3
|}

1975 births
Living people
Brazilian footballers
People from Curitiba
-------------------
Title: Marcos Gomes de Araujo
Marcos Gomes de Araujo (born 23 March 1976) is a Brazilian football player. He plays for Kashima Antlers.

Club career statistics
-------------------
Title: Marcelo Baron Polanczyk
Marcelo Baron Polanczyk (born 19 January 1974) is a former Brazilian football player.

Club career statistics
-------------------
Title: Warwick Estevam Kerr
Warwick Estevam Kerr (born September 9, 1922, Santana do Parnaíba, São Paulo, Brazil- 15 September 2018) is a Brazilian engineer, geneticist, entomologist, and professor. He made many discoveries in the genetics and sex determination of bees. He is also responsible 

# Step5: Generate
__Prompt definition__: "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:"

In [17]:
# Chat model used for generation, plus the RAG prompt pulled from the LangChain hub.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
prompt = hub.pull("rlm/rag-prompt")

# Render the prompt with placeholder values to inspect its wording.
filler_inputs = {"context": "filler context", "question": "filler question"}
print(prompt.invoke(filler_inputs).to_string())

Human: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: filler question 
Context: filler context 
Answer:


In [18]:
def format_docs(docs):
    """Join the ``page_content`` of every document into one string.

    Documents are separated by a blank line; an empty iterable yields "".
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)


# Inputs for the prompt: retrieved documents flattened to text as "context",
# and the raw user question passed through unchanged.
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# retrieve -> fill prompt -> call the chat model -> plain string
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

In [36]:
# Stream the answer and print each piece as it arrives, without line breaks.
answer_stream = rag_chain.stream("Tell me about how we landed on the moon")
for piece in answer_stream:
    print(piece, end="", flush=True)

Apollo 11 was the first flight to send people to the moon. It was done by NASA and carried three astronauts: Neil Armstrong, Buzz Aldrin, and Michael Collins. Armstrong and Aldrin became the first humans to land on the moon on July 20, 1969. They conducted experiments, collected moon rocks, and set up an American flag before returning to Earth.

### Now with Llama 2 7B

In [31]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
#from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import LlamaCpp, Ollama

In [33]:
# Local Llama 2 model served through Ollama.
llama_llm = Ollama(model="llama2")

# Quick sanity check. Use .invoke() (as the chains in this notebook do)
# rather than calling the LLM object directly, which LangChain has deprecated.
llama_llm.invoke("What is the capital of France?")

'The capital of France is Paris.'

In [34]:
# Same RAG pipeline as before, with the local Llama model as the generator.
# format_docs is reused from its definition earlier in the notebook rather
# than being redefined here (the duplicate silently shadowed the original).
rag_chain_llama = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llama_llm
    | StrOutputParser()
)

In [35]:
# Stream the Llama-backed answer and print each piece as it arrives.
llama_stream = rag_chain_llama.stream("Tell me about how we landed on the moon")
for piece in llama_stream:
    print(piece, end="", flush=True)

The Apollo program was a project by NASA to send humans to explore the Moon and bring them back to Earth safely. The program started in 1961 due to the Soviet Union's success in sending a person into outer space during the Cold War. The spacecraft used for the Apollo missions were made up of a Command and Service Module and a Lunar Module, which docked on the way to the Moon. The program consisted of several mission types, each testing specific parts and tasks, with the goal of landing a person on the Moon and returning them safely to Earth.
Some conspiracy theories claim that the Apollo Moon landings were faked or hoaxed, but there is no credible evidence to support these claims. The rocks collected during the Apollo missions have been extensively studied by scientists around the world, and they are found to be very different from meteorites and other rocks found on Earth. In addition, the Lunar Reconnaissance Orbiter was able to photograph the Apollo 14 landing site in 2009, clearly 

# Customized prompt

In [20]:
# Prompt that asks the model to answer in the form of a short email.
template = """Use the following pieces of context to make an email answering the question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Start every email with Dear [Name] , and end it respectfully [Name].
keep the answer as concise as possible.
{context}
Question: {question}
Email Answer:"""
rag_prompt_custom = PromptTemplate.from_template(template)

# Same retrieval inputs as the earlier chain, now feeding the email prompt.
email_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
rag_chain = email_inputs | rag_prompt_custom | llm | StrOutputParser()

In [21]:
# Stream the email-style answer piece by piece.
email_stream = rag_chain.stream("was there a diplomatic situation between Britains' war declaration and the germany invasion of Poland, between Great britain and Germany during the 30s?")
for piece in email_stream:
    print(piece, end="", flush=True)

Dear [Name],

Yes, there was a diplomatic situation between Great Britain and Germany during the 1930s. After Germany's invasion of Poland, Britain and France warned Germany to move all soldiers out of Poland or there would be war. Germany did not respond, and as a result, Britain and France declared war on Germany on September 3, 1939.
Thank you.
Respectfully,
[Your Name]

# Adding Source to the Response

In [22]:
# Answering sub-chain: expects a dict holding the retrieved "documents" and
# the "question", and produces the answer text.
answer_inputs = {
    "context": lambda state: format_docs(state["documents"]),
    "question": itemgetter("question"),
}
rag_chain_from_docs = answer_inputs | rag_prompt_custom | llm | StrOutputParser()

# First step: retrieve documents while passing the question through unchanged.
retrieve_step = RunnableParallel(
    {"documents": retriever, "question": RunnablePassthrough()}
)

# Second step: return the metadata of each retrieved document next to the answer.
rag_chain_with_source = retrieve_step | {
    "documents": lambda state: [doc.metadata for doc in state["documents"]],
    "answer": rag_chain_from_docs,
}

rag_chain_with_source.invoke("What are the biggest oil companies?")

{'documents': [{'chunk': 0.0,
   'source': 'https://simple.wikipedia.org/wiki/ExxonMobil',
   'title': 'ExxonMobil',
   'wiki-id': '37649'},
  {'chunk': 0.0,
   'source': 'https://simple.wikipedia.org/wiki/Shell%20Oil%20Company',
   'title': 'Shell Oil Company',
   'wiki-id': '72099'},
  {'chunk': 0.0,
   'source': 'https://simple.wikipedia.org/wiki/Petroleum%20industry',
   'title': 'Petroleum industry',
   'wiki-id': '126836'},
  {'chunk': 0.0,
   'source': 'https://simple.wikipedia.org/wiki/List%20of%20oil%20fields',
   'title': 'List of oil fields',
   'wiki-id': '4806'},
  {'chunk': 0.0,
   'source': 'https://simple.wikipedia.org/wiki/List%20of%20oil-producing%20nations',
   'title': 'List of oil-producing nations',
   'wiki-id': '4813'},
  {'chunk': 0.0,
   'source': 'https://simple.wikipedia.org/wiki/List%20of%20companies%20based%20in%20Tulsa%2C%20Oklahoma',
   'title': 'List of companies based in Tulsa, Oklahoma',
   'wiki-id': '132570'}],
 'answer': 'Dear [Name],\n\nThe bigges

In [23]:
# Stream the same chain: each streamed item is a partial dict, printed back to back.
source_stream = rag_chain_with_source.stream("What are the biggest oil companies?")
for partial in source_stream:
    print(partial, end="", flush=True)

{'documents': [{'chunk': 0.0, 'source': 'https://simple.wikipedia.org/wiki/ExxonMobil', 'title': 'ExxonMobil', 'wiki-id': '37649'}, {'chunk': 0.0, 'source': 'https://simple.wikipedia.org/wiki/Shell%20Oil%20Company', 'title': 'Shell Oil Company', 'wiki-id': '72099'}, {'chunk': 0.0, 'source': 'https://simple.wikipedia.org/wiki/Petroleum%20industry', 'title': 'Petroleum industry', 'wiki-id': '126836'}, {'chunk': 0.0, 'source': 'https://simple.wikipedia.org/wiki/List%20of%20oil%20fields', 'title': 'List of oil fields', 'wiki-id': '4806'}, {'chunk': 0.0, 'source': 'https://simple.wikipedia.org/wiki/List%20of%20oil-producing%20nations', 'title': 'List of oil-producing nations', 'wiki-id': '4813'}, {'chunk': 0.0, 'source': 'https://simple.wikipedia.org/wiki/List%20of%20companies%20based%20in%20Tulsa%2C%20Oklahoma', 'title': 'List of companies based in Tulsa, Oklahoma', 'wiki-id': '132570'}]}{'answer': ''}{'answer': 'Dear'}{'answer': ' ['}{'answer': 'Name'}{'answer': '],\n\n'}{'answer': 'The'}

# Adding Memory 

In [24]:
# System instruction: rewrite a follow-up question so it stands on its own.
condense_q_system_prompt = """Given a chat history and the latest user question \
which might reference the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

# Message layout: system instruction, prior conversation, then the new question.
condense_q_messages = [
    ("system", condense_q_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{question}"),
]
condense_q_prompt = ChatPromptTemplate.from_messages(condense_q_messages)

# prompt -> chat model -> plain string (the reformulated question)
condense_q_chain = condense_q_prompt | llm | StrOutputParser()

In [25]:
# Follow-up that depends on the history: "large" only makes sense given the prior answer.
condense_q_chain.invoke(
    dict(
        chat_history=[
            HumanMessage(content="What does LLM stand for?"),
            AIMessage(content="Large language model"),
        ],
        question="What is meant by large",
    )
)

'What is the definition of "large" in the context of a language model?'

In [26]:
# Question that is already self-contained, asked with the same history.
condense_q_chain.invoke(
    dict(
        chat_history=[
            HumanMessage(content="What does LLM stand for?"),
            AIMessage(content="Large language model"),
        ],
        question="How do transformers work",
    )
)

'How do transformer models function?'

In [27]:
# System instruction for answering; the retrieved context is appended at the end.
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""

# Message layout: system instruction with context, prior conversation, new question.
qa_messages = [
    ("system", qa_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{question}"),
]
qa_prompt = ChatPromptTemplate.from_messages(qa_messages)


def condense_question(input: dict):
    """Choose how to obtain the question used for retrieval.

    With no (or an empty) "chat_history" entry, the raw "question" value is
    returned as-is. Otherwise the condense_q_chain runnable is returned.
    """
    if not input.get("chat_history"):
        return input["question"]
    return condense_q_chain


# Build the "context" value: pick/condense the question, retrieve documents,
# then flatten them to text.
context_chain = condense_question | retriever | format_docs

# Add "context" to the incoming dict, fill the QA prompt, call the chat model.
# There is no output parser here, so the chain returns the model's message object.
rag_chain = RunnablePassthrough.assign(context=context_chain) | qa_prompt | llm

In [28]:
# Start with an empty conversation.
chat_history = []

# First turn: ask, then record both the question and the model's reply.
question = "Was there a conflict involving Australia and its neighbor countries for the last 100 years?"
ai_msg = rag_chain.invoke(dict(question=question, chat_history=chat_history))
chat_history.append(HumanMessage(content=question))
chat_history.append(ai_msg)

# Second turn: a follow-up that only makes sense with the recorded history.
second_question = "Any of those countries had any other kind of conflict?"
rag_chain.invoke(dict(question=second_question, chat_history=chat_history))

AIMessage(content='Yes, several of those countries have had conflicts with each other or within their own borders. For example, in the last 100 years, there have been conflicts such as the Korean War between North and South Korea, the Vietnam War between North and South Vietnam, and the ongoing conflicts in Afghanistan and Iraq. Additionally, countries like Rwanda, Congo, Liberia, and Ivory Coast in West Africa have experienced internal conflicts and civil wars.')