In [None]:
#pip install datasets

In [1]:
from datasets import load_dataset

In [2]:
# Load only the first 10 records of the Simple English Wikipedia dump.
# NOTE(review): trust_remote_code=True presumably allows the dataset's own
# loading script to execute locally — confirm the source is trusted.
data = load_dataset(
    "wikipedia",
    "20220301.simple",
    split='train[:10]',
    trust_remote_code=True,
)
data

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 10
})

In [3]:
import pandas as pd

In [4]:
# Materialise the loaded records as a pandas DataFrame for tabular inspection.
df = pd.DataFrame(data)

In [5]:
df

Unnamed: 0,id,url,title,text
0,1,https://simple.wikipedia.org/wiki/April,April,April is the fourth month of the year in the J...
1,2,https://simple.wikipedia.org/wiki/August,August,August (Aug.) is the eighth month of the year ...
2,6,https://simple.wikipedia.org/wiki/Art,Art,Art is a creative activity that expresses imag...
3,8,https://simple.wikipedia.org/wiki/A,A,A or a is the first letter of the English alph...
4,9,https://simple.wikipedia.org/wiki/Air,Air,Air refers to the Earth's atmosphere. Air is a...
5,12,https://simple.wikipedia.org/wiki/Autonomous%2...,Autonomous communities of Spain,Spain is divided in 17 parts called autonomous...
6,13,https://simple.wikipedia.org/wiki/Alan%20Turing,Alan Turing,"Alan Mathison Turing OBE FRS (London, 23 June ..."
7,14,https://simple.wikipedia.org/wiki/Alanis%20Mor...,Alanis Morissette,"Alanis Nadine Morissette (born June 1, 1974) i..."
8,17,https://simple.wikipedia.org/wiki/Adobe%20Illu...,Adobe Illustrator,Adobe Illustrator is a computer program for ma...
9,18,https://simple.wikipedia.org/wiki/Andouille,Andouille,Andouille is a type of pork sausage. It is spi...


In [6]:
# Save the sample as CSV in the working directory; index=False leaves the row index out of the file.
df.to_csv('wikipedia_data.csv', index=False)

In [7]:
data[6]

{'id': '13',
 'url': 'https://simple.wikipedia.org/wiki/Alan%20Turing',
 'title': 'Alan Turing',
 'text': 'Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. Longford); Protestant Anglo-Irish gentry.\n\nEducated in Dub

In [8]:
#!pip install langchain

In [9]:
#!pip install openai

In [10]:
#!pip install pinecone-client

In [11]:
#!pip install tiktoken

**Pre-processing of the text**

- chunking and related information

In [12]:
import tiktoken

tiktoken.encoding_for_model('gpt-3.5-turbo')

<Encoding 'cl100k_base'>

In [13]:
# Load the encoding by name; 'cl100k_base' is what encoding_for_model reported
# for gpt-3.5-turbo in the previous cell.
tokenizer = tiktoken.get_encoding('cl100k_base')

In [14]:
def tiktoken_len(text):
    """Return how many tokens `text` encodes to with the module-level `tokenizer`.

    `disallowed_special=()` is passed to the encoder, i.e. no special-token
    strings are treated as disallowed while encoding.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [15]:
tiktoken_len("hello I am a chunk of text and using the tiktoken_len function "
             "we can find the length of this chunk of text in tokens")

26

In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [17]:
# Separators are listed from coarsest (blank line) to finest (empty string).
# Chunk size and overlap are measured with tiktoken_len, i.e. in tokens
# rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=400,
    chunk_overlap=20,
)

In [18]:
# Split the article at position 6 (Alan Turing, per the record shown earlier)
# and keep only its first three chunks for inspection.
chunks = text_splitter.split_text(
    data[6]['text']
)[:3]
chunks

['Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. Longford); Protestant Anglo-Irish gentry.\n\nEducated in Dublin at Alexandra School and College; on October 1st 1907 she married Julius Mathison Turing, latter son o

In [19]:
# Token length of every chunk kept above. Iterating (instead of indexing
# chunks[0..2]) avoids an IndexError when a text yields fewer than three chunks.
tuple(tiktoken_len(chunk) for chunk in chunks)

(299, 323, 382)

In [20]:
from langchain_openai import OpenAIEmbeddings

In [21]:
# Embedding model shared by document indexing and query embedding.
model_name = 'text-embedding-ada-002'

# No API key is passed explicitly.
# NOTE(review): presumably the client picks it up from the OPENAI_API_KEY
# environment variable — confirm.
embed = OpenAIEmbeddings(model=model_name)

In [22]:
texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here'
]

In [23]:
res = embed.embed_documents(texts)
len(res), len(res[0])

(2, 1536)

#### Vector database

In [26]:
import os
from pinecone import Pinecone

In [27]:
# Read the Pinecone key from the environment (None when the variable is unset).
pc_api_key = os.environ.get('PINECONE_API_KEY')

In [29]:
# configure client
pc = Pinecone(api_key=pc_api_key)

In [30]:
from pinecone import ServerlessSpec

# Serverless deployment target used when the index is created.
spec = ServerlessSpec(cloud="aws", region="us-east-1")

In [31]:
# Name of the index this notebook creates and queries.
index_name = 'langchain-retrieval-augmentation'

# Names of the indexes currently listed by the Pinecone client.
existing_indexes = [info["name"] for info in pc.list_indexes()]

In [32]:
existing_indexes

[]

In [33]:
# `time` is used by the readiness poll below, but the notebook only imports it
# in a later cell; importing it here keeps this cell runnable on a fresh kernel.
import time

# Create the index only when it is not already listed (expected on first run).
if index_name not in existing_indexes:
    pc.create_index(
        index_name,
        dimension = 1536,  # dimensionality of ada 002
        metric    = 'dotproduct',
        spec      = spec
    )

    # Poll every 2 seconds until the new index reports itself as ready.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(2)

In [35]:
import time

In [36]:
# connect to index
index = pc.Index(index_name)

time.sleep(1)

In [37]:
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [38]:
len(data)

10

In [39]:
from tqdm.auto import tqdm
from uuid import uuid4

# Flush to Pinecone once at least this many chunks have accumulated. The check
# runs after a whole record has been added, so a batch can exceed this number.
batch_limit = 100


def _upsert_batch(batch_texts, batch_metadatas):
    """Embed `batch_texts` and upsert them with their metadata under fresh UUID ids."""
    ids    = [str(uuid4()) for _ in batch_texts]
    embeds = embed.embed_documents(batch_texts)
    # Build a concrete list of (id, vector, metadata) tuples rather than
    # handing the client a one-shot zip iterator.
    index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))


texts     = []
metadatas = []

# NOTE: ids are random, so re-running this cell adds the same chunks again
# under new ids instead of overwriting the earlier vectors.
for record in tqdm(data):

    # metadata fields shared by every chunk of this record
    metadata = {
        'wiki-id': str(record['id']),
        'source':  record['url'],
        'title':   record['title']
    }

    # split the record text into chunks
    record_texts = text_splitter.split_text(record['text'])

    # one metadata dict per chunk: chunk position, chunk text, shared fields
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]

    # accumulate into the current batch
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)

    # flush once the batch is large enough, then start a new one
    if len(texts) >= batch_limit:
        _upsert_batch(texts, metadatas)

        texts     = []
        metadatas = []

# flush whatever is left after the last record
if len(texts) > 0:
    _upsert_batch(texts, metadatas)

  0%|          | 0/10 [00:00<?, ?it/s]

In [40]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 40}},
 'total_vector_count': 40}

#### Create the vector store and apply some querying
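- A vector store is NOT the same thing as an index.
- A vector store (as used by LLM frameworks such as LangChain) wraps an index, together with the embedding function and the metadata field that holds the text.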

In [41]:
from langchain.vectorstores import Pinecone

In [44]:
text_field = 'text'

In [45]:
# NOTE: `Pinecone` here is the LangChain vector store class imported just above,
# which rebinds the name earlier imported from the `pinecone` client package.
# Arguments: the connected index, the query-embedding function, and the
# metadata field that holds each chunk's text.
vectorstore = Pinecone(index, embed.embed_query, text_field)

  vectorstore = Pinecone(


In [46]:
query = "who was Benito Mussolini?"

In [47]:
# Fetch the 3 stored chunks most similar to the query.
vectorstore.similarity_search(query, k=3)

[Document(metadata={'chunk': 1.0, 'source': 'https://simple.wikipedia.org/wiki/Alan%20Turing', 'title': 'Alan Turing', 'wiki-id': '13'}, page_content='A brilliant mathematician and cryptographer Alan was to become the founder of modern-day computer science and artificial intelligence; designing a machine at Bletchley Park to break secret Enigma encrypted messages used by the Nazi German war machine to protect sensitive commercial, diplomatic and military communications during World War 2. Thus, Turing made the single biggest contribution to the Allied victory in the war against Nazi Germany, possibly saving the lives of an estimated 2 million people, through his effort in shortening World War II.\n\nIn 2013, almost 60 years later, Turing received a posthumous Royal Pardon from Queen Elizabeth II. Today, the “Turing law” grants an automatic pardon to men who died before the law came into force, making it possible for living convicted gay men to seek pardons for offences now no longer on

#### Generative QnA
- Documents are retrieved from the vector store (or database).
- `RetrievalQA` sends these documents (plus the question) to the LLM.
- The LLM then generates a cohesive answer.



In [50]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

In [51]:
# Chat model that produces the final answer. No API key is passed explicitly,
# and temperature is fixed at 0.0.
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0.0)


In [54]:
# Retrieval-augmented QA chain: the vector store supplies documents, the LLM answers.
# NOTE(review): "stuff" presumably places all retrieved documents into a single
# prompt — confirm against the LangChain documentation.
retriever = vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

In [55]:
# `Chain.run` is deprecated in LangChain; `invoke` takes the inputs as a dict
# and returns a dict whose "result" entry is the answer string.
# NOTE(review): none of the 10 indexed articles listed earlier covers this
# query's subject, so the answer likely comes from the model's own knowledge
# rather than from the retrieved context — verify before relying on it.
qa.invoke({"query": query})["result"]

'Benito Mussolini was an Italian politician and leader who founded the Fascist Party in Italy, ruling as Prime Minister from 1922 until his ousting in 1943. He was a key figure in the creation of Fascism, a totalitarian ideology that sought to create a centralized, authoritarian state. Mussolini allied Italy with Nazi Germany during World War II.'