In [None]:
#pip install datasets

In [1]:
from datasets import load_dataset

In [2]:
data = load_dataset("wikipedia", "20220301.simple", split='train[:10]', trust_remote_code=True)
data

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 10
})

In [3]:
import pandas as pd

In [4]:
# Convert to DataFrame
df = pd.DataFrame(data)

In [5]:
df.sample(10)

Unnamed: 0,id,url,title,text
2,6,https://simple.wikipedia.org/wiki/Art,Art,Art is a creative activity that expresses imag...
7,14,https://simple.wikipedia.org/wiki/Alanis%20Mor...,Alanis Morissette,"Alanis Nadine Morissette (born June 1, 1974) i..."
1,2,https://simple.wikipedia.org/wiki/August,August,August (Aug.) is the eighth month of the year ...
0,1,https://simple.wikipedia.org/wiki/April,April,April is the fourth month of the year in the J...
9,18,https://simple.wikipedia.org/wiki/Andouille,Andouille,Andouille is a type of pork sausage. It is spi...
8,17,https://simple.wikipedia.org/wiki/Adobe%20Illu...,Adobe Illustrator,Adobe Illustrator is a computer program for ma...
4,9,https://simple.wikipedia.org/wiki/Air,Air,Air refers to the Earth's atmosphere. Air is a...
3,8,https://simple.wikipedia.org/wiki/A,A,A or a is the first letter of the English alph...
5,12,https://simple.wikipedia.org/wiki/Autonomous%2...,Autonomous communities of Spain,Spain is divided in 17 parts called autonomous...
6,13,https://simple.wikipedia.org/wiki/Alan%20Turing,Alan Turing,"Alan Mathison Turing OBE FRS (London, 23 June ..."


In [6]:
df.columns

Index(['id', 'url', 'title', 'text'], dtype='object')

In [7]:
# Save to CSV
df.to_csv('wikipedia_data.csv', index=False)

In [8]:
data[6]

{'id': '13',
 'url': 'https://simple.wikipedia.org/wiki/Alan%20Turing',
 'title': 'Alan Turing',
 'text': 'Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. Longford); Protestant Anglo-Irish gentry.\n\nEducated in Dub

In [9]:
#!pip install langchain

In [10]:
#!pip install openai

In [11]:
#!pip install pinecone-client

In [12]:
#!pip install tiktoken

Every record contains a lot of text. Our first task is therefore to identify a good preprocessing methodology for chunking these articles into more "concise" chunks that will later be embedded and stored in our Pinecone vector database.

In [9]:
import tiktoken

tiktoken.encoding_for_model('gpt-3.5-turbo')

<Encoding 'cl100k_base'>

In [10]:
tokenizer = tiktoken.get_encoding('cl100k_base')

In [11]:
# create the length function
def tiktoken_len(text):
    """Return the number of tokens ``text`` occupies under the notebook's ``tokenizer``.

    ``disallowed_special=()`` is forwarded to ``tokenizer.encode`` so that no
    special-token string found in ``text`` is treated as disallowed input.
    """
    token_ids = tokenizer.encode(text, disallowed_special=())
    return len(token_ids)

In [12]:
tiktoken_len("hello I am a chunk of text and using the tiktoken_len function "
             "we can find the length of this chunk of text in tokens")

26

In [13]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [14]:
# Chunk sizes and overlaps are measured with tiktoken_len (tokens, not characters).
# Separators are listed from coarsest (blank line) to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=400,
    chunk_overlap=20,
)

In [15]:
chunks = text_splitter.split_text(data[6]['text'])[:3]
chunks

['Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. Longford); Protestant Anglo-Irish gentry.\n\nEducated in Dublin at Alexandra School and College; on October 1st 1907 she married Julius Mathison Turing, latter son o

In [16]:
tiktoken_len(chunks[0]), tiktoken_len(chunks[1]), tiktoken_len(chunks[2])

(299, 323, 382)

In [17]:
import os

In [18]:
#pip install -U langchain-community

In [19]:
from langchain_openai import OpenAIEmbeddings

In [20]:
# Embedding model shared by the notebook: `embed.embed_documents` is used for
# the text chunks and `embed.embed_query` for search queries.
# No API key is passed explicitly here.
model_name = 'text-embedding-ada-002'
embed = OpenAIEmbeddings(model=model_name)

Now we embed some text like so:

In [21]:
texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here'
]


In [22]:
res = embed.embed_documents(texts)
len(res), len(res[0])

(2, 1536)

From this we get two (aligning to our two chunks of text) 1536-dimensional embeddings.

#### Vector Database

In [23]:
from pinecone import Pinecone

In [24]:
# Read the Pinecone API key from the environment and fail fast with an
# actionable message. Without this check a missing variable is passed on as
# None and only surfaces when the client is constructed, as a
# PineconeConfigurationError.
pc_api_key = os.getenv('PINECONE_API_KEY')
if not pc_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; define it in the environment "
        "before running the Pinecone cells."
    )

In [25]:
# configure client
pc = Pinecone(api_key=pc_api_key)

PineconeConfigurationError: You haven't specified an Api-Key.

In [None]:
from pinecone import ServerlessSpec

spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

In [None]:
import time

index_name = 'langchain-retrieval-augmentation'

# names of every index currently reported by the Pinecone client
existing_indexes = [info["name"] for info in pc.list_indexes()]

# create the index only when it is not there yet (expected on the first run)
if index_name not in existing_indexes:
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of ada 002
        metric='dotproduct',
        spec=spec,
    )
    # poll once per second until the new index reports it is ready
    while True:
        if pc.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# open a handle to the index, pause briefly, then show its stats as the cell output
index = pc.Index(index_name)
time.sleep(1)
index.describe_index_stats()

In [None]:
len(data)

In [None]:
from tqdm.auto import tqdm
from uuid import uuid4

batch_limit = 100


def upsert_chunks(chunk_texts, chunk_metadatas, batch_size=batch_limit):
    """Embed text chunks and upsert them into the Pinecone ``index``.

    The input is processed in slices of at most ``batch_size`` items, so a
    single long article cannot push one embed/upsert request past the
    intended limit. An empty input is a no-op.

    Args:
        chunk_texts: list of chunk strings to embed.
        chunk_metadatas: list of metadata dicts, one per entry of ``chunk_texts``.
        batch_size: maximum number of vectors sent per upsert call.
    """
    for start in range(0, len(chunk_texts), batch_size):
        batch_texts = chunk_texts[start:start + batch_size]
        batch_metadatas = chunk_metadatas[start:start + batch_size]
        # every chunk gets a fresh random ID
        ids = [str(uuid4()) for _ in batch_texts]
        embeds = embed.embed_documents(batch_texts)
        index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))


texts = []
metadatas = []

for record in tqdm(data):
    # metadata fields shared by every chunk of this record
    metadata = {
        'wiki-id': str(record['id']),
        'source': record['url'],
        'title': record['title']
    }
    # split the article text into chunks
    record_texts = text_splitter.split_text(record['text'])
    # one metadata dict per chunk: its position, its text, and the shared fields
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]
    # accumulate into the pending batch
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    # flush once enough chunks are pending
    if len(texts) >= batch_limit:
        upsert_chunks(texts, metadatas)
        texts = []
        metadatas = []

# flush whatever is left after the last record (no-op when nothing is pending)
upsert_chunks(texts, metadatas)

In [None]:
index.describe_index_stats()

#### Creating a Vector Store and Querying
Now that we've built our index, we can switch back over to LangChain. We start by initializing a vector store using the same index we just built. We do that like so:

In [None]:
from langchain.vectorstores import Pinecone

In [None]:
text_field = "text"  # the metadata field that contains our text

In [None]:
# initialize the vector store object
# The name `Pinecone` is imported twice in this notebook: first from `pinecone`
# (the client class) and later from `langchain.vectorstores` (the vector store).
# Importing the vector store under an explicit alias makes this cell
# independent of which of those two imports ran last.
from langchain.vectorstores import Pinecone as LangChainPinecone

vectorstore = LangChainPinecone(
    index, embed.embed_query, text_field
)

In [None]:
query = "who was Benito Mussolini?"

In [None]:
# retrieve the chunks whose embeddings are most similar to the query
vectorstore.similarity_search(
    query,  # our search query
    k=3  # return 3 most relevant docs
)

#### Generative Question-Answering
In GQA we take the query as a question to be answered by an LLM, but the LLM must base its answer on the information returned from the vectorstore.

To do this we initialize a RetrievalQA object like so:

In [1]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

In [None]:
# completion llm
# No key is passed explicitly: `api_key` is not defined in any visible cell of
# this notebook, so passing it raised NameError. With the argument omitted,
# ChatOpenAI falls back to the OPENAI_API_KEY environment variable, the same
# way the OpenAIEmbeddings object in this notebook is configured.
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0.0
)


In [None]:
# Build the retrieval QA chain.
# The number of documents to fetch is a retriever setting, so it is passed via
# `search_kwargs` on `as_retriever` rather than as an argument to
# `from_chain_type` (the original also lacked a comma before `k = 3`, which
# made the cell a syntax error).
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="refine",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
)

In [None]:
qa.run(query)

We can also include the sources of information that the LLM is using to answer our question. We can do this using a slightly different version of RetrievalQA called RetrievalQAWithSourcesChain:

In [None]:
from langchain.chains import RetrievalQAWithSourcesChain

In [None]:
# QA chain variant that is also meant to report the sources behind its answer;
# it uses the same llm and a default retriever over the same vector store.
qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    llm=llm,
)

In [None]:
qa_with_sources(query)

In [None]:
pc.delete_index(index_name)

-------------------------
#### Map reduce chain
---------------------