In [2]:
from openai import OpenAI
import os
from datasets import load_dataset
from pinecone import Pinecone
from tqdm.auto import tqdm
from warnings import filterwarnings

# Silence all warnings for the rest of the notebook (this also hides
# deprecation notices from the libraries used below).
filterwarnings("ignore")

# Keep secrets out of the notebook: use the key already exported in the
# environment and only fall back to the blank placeholder when none is set.
# (Unconditionally assigning "" would wipe out a valid exported key.)
os.environ.setdefault("OPENAI_API_KEY", "")

In [3]:
# Build the Pinecone client. The key is read from the PINECONE_API_KEY
# environment variable instead of being hardcoded in the notebook; when the
# variable is unset this falls back to the blank placeholder.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY", ""))

In [4]:
# Load the first 10,000 rows of the "20220301.simple" Wikipedia configuration
# and display the dataset summary.
data = load_dataset(
    path="wikipedia",
    name="20220301.simple",
    split="train[:10000]",
)
data

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 10000
})

In [5]:
# Inspect one full record (index 6, the "Alan Turing" article) to see the
# available fields: id, url, title, text.
data[6]

{'id': '13',
 'url': 'https://simple.wikipedia.org/wiki/Alan%20Turing',
 'title': 'Alan Turing',
 'text': 'Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. Longford); Protestant Anglo-Irish gentry.\n\nEducated in Dub

In [6]:
import tiktoken
# Look up which tiktoken encoding the chat model uses, so chunk lengths can be
# measured with the same tokenizer.
tiktoken.encoding_name_for_model("gpt-3.5-turbo")

'cl100k_base'

In [7]:
# Shared tokenizer instance, reused by every call to tiktoken_len.
tokenizer = tiktoken.get_encoding("cl100k_base")


def tiktoken_len(text):
    """Return the number of tokens in ``text`` under the module-level tokenizer.

    ``disallowed_special=()`` turns off the special-token check, so text that
    happens to contain special-token strings is encoded as ordinary text
    instead of raising.
    """
    return len(tokenizer.encode(text, disallowed_special=()))


# Quick check of the length function on a short sentence.
tiktoken_len("Hello I am a chunk of text and using the tik-token len function we can find blah blah")

19

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Token-aware splitter: chunk size and overlap are measured with tiktoken_len
# (tokens, not characters). Separators are listed from coarsest (blank line)
# to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=400,
    chunk_overlap=20,
)

In [9]:
# Try the splitter on a single article (record 6) and display the resulting
# list of text chunks.
chunks = text_splitter.split_text(data[6]["text"])
chunks

['Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. Longford); Protestant Anglo-Irish gentry.\n\nEducated in Dublin at Alexandra School and College; on October 1st 1907 she married Julius Mathison Turing, latter son o

In [11]:
# Import from langchain_openai (the package this notebook already uses for
# ChatOpenAI) rather than the deprecated langchain.embeddings path.
from langchain_openai import OpenAIEmbeddings

# Embedding model used for both the indexed chunks and the queries.
model_name = "text-embedding-ada-002"
embed = OpenAIEmbeddings(model=model_name)

In [12]:
# Sanity check: embed two short strings.
text = ["hi this is diljyot", "how you doingh"]

res = embed.embed_documents(text)
# Displays (number of vectors returned, length of the first vector).
len(res), len(res[0])

(2, 1536)

In [39]:
# Get a handle to the Pinecone index named "wiki".
# NOTE(review): assumes this index already exists and was created with
# dimension 1536 to match the embedding length shown above — confirm.
index = pc.Index("wiki")

In [15]:
# Show the first record of the dataset.
data[0]

{'id': '1',
 'url': 'https://simple.wikipedia.org/wiki/April',
 'title': 'April',
 'text': 'April is the fourth month of the year in the Julian and Gregorian calendars, and comes between March and May. It is one of four months to have 30 days.\n\nApril always begins on the same day of week as July, and additionally, January in leap years. April always ends on the same day of the week as December.\n\nApril\'s flowers are the Sweet Pea and Daisy. Its birthstone is the diamond. The meaning of the diamond is innocence.\n\nThe Month \n\nApril comes between March and May, making it the fourth month of the year. It also comes first in the year out of the four months that have 30 days, as June, September and November are later in the year.\n\nApril begins on the same day of the week as July every year and on the same day of the week as January in leap years. April ends on the same day of the week as December every year, as each other\'s last days are exactly 35 weeks (245 days) apart.\n\nIn co

In [None]:
# Chunk every article, embed the chunks and upsert them into Pinecone.
#
# Vectors are buffered in `batch` and sent in slices of at most `batch_size`.
# The buffer is emptied after each send, so no vector is uploaded more than
# once and the request size stays bounded.
batch_size = 100  # maximum number of vectors per upsert request
batch = []        # vectors waiting to be sent
with tqdm(total=len(data)) as pbar:
    for record in data:
        texts = text_splitter.split_text(record["text"])
        if texts:  # an article may yield no chunks; then there is nothing to embed
            vectors = embed.embed_documents(texts)
            for chunk_no, (chunk_text, chunk_vector) in enumerate(zip(texts, vectors)):
                batch.append(
                    {
                        # "<article id>-<chunk number>" keeps ids unique per chunk.
                        "id": f"{record['id']}-{chunk_no}",
                        "values": chunk_vector,
                        "metadata": {"source": record["url"], "text": chunk_text},
                    }
                )
        # Flush every complete slice; a long article can fill several at once.
        while len(batch) >= batch_size:
            index.upsert(vectors=batch[:batch_size])
            del batch[:batch_size]
        pbar.update(1)

# Send whatever is left over after the last article.
if batch:
    index.upsert(vectors=batch)
    batch.clear()

index.describe_index_stats()

In [41]:
# List the titles of the first 27 articles.
data[:27]["title"]

['April',
 'August',
 'Art',
 'A',
 'Air',
 'Autonomous communities of Spain',
 'Alan Turing',
 'Alanis Morissette',
 'Adobe Illustrator',
 'Andouille',
 'Farming',
 'Arithmetic',
 'Addition',
 'Australia',
 'American English',
 'Aquaculture',
 'Abbreviation',
 'Angel',
 'Ad hominem',
 'Native American',
 'Apple',
 'Abrahamic religion',
 'Algebra',
 'Atom',
 'Astronomy',
 'Architecture',
 'Anatomy']

In [27]:
# Import under an alias so the LangChain vector store does not rebind the
# `Pinecone` client class imported from the `pinecone` SDK at the top of the
# notebook (re-running the client cell would otherwise break).
from langchain.vectorstores import Pinecone as PineconeVectorStore

text_field = "text"  # metadata key under which each chunk's text is stored

index = pc.Index("wiki")

# Wrap the index so LangChain can embed queries and read chunk text back
# from the metadata.
vectorstore = PineconeVectorStore(index, embed.embed_query, text_field)

In [43]:
# A two-part question, used for the raw retrieval below and for both QA chains.
query = "who is Alan Turing and what is Adobe Illustrator?"

# Retrieve the 5 stored chunks most similar to the query.
vectorstore.similarity_search(query, k = 5)

[Document(page_content='Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. Longford); Protestant Anglo-Irish gentry.\n\nEducated in Dublin at Alexandra School and College; on October 1st 1907 she married Julius Mathiso

In [34]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain

# Chat model shared by the QA chains; temperature 0 keeps sampling randomness
# to a minimum.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

In [44]:
# Retrieval-augmented QA chain: the retrieved chunks are "stuffed" into a
# single prompt for the LLM.
# The number of retrieved chunks must be passed through `search_kwargs`; a
# bare `k=5` keyword on as_retriever() is not a retriever field and is
# silently ignored, leaving the default top-k in effect.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
)

In [45]:
# Run the QA chain on the query and print only the value under the "result" key.
print(qa.invoke(query)["result"])

Alan Turing was an English mathematician and computer scientist born in 1912. He is known for his work in breaking the Enigma code during World War II and his contributions to computer science and artificial intelligence. Turing was also a victim of discrimination due to his homosexuality, which led to his tragic death in 1954.

Adobe Illustrator is a computer program developed by Adobe Systems for creating graphic designs and illustrations. It is a vector graphics editor that allows users to create scalable images that can be resized without losing quality. Adobe Illustrator was first released in 1986 for the Apple Macintosh and is part of the Adobe Creative Suite.


In [46]:
# Sources-aware variant of the "stuff" QA chain, using the same LLM.
# The retriever is left at its default settings here.
qa_source = RetrievalQAWithSourcesChain.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    llm=llm,
)

In [48]:
# Run the sources-aware chain on the same query.
# Note: this rebinds `res`, which earlier held the embedding sanity-check output.
res = qa_source.invoke(query)

In [50]:
# Print only the value under the "answer" key of the chain's response.
print(res["answer"])

Alan Turing was an English mathematician and computer scientist who made significant contributions to computer science and artificial intelligence. Adobe Illustrator is a computer program for graphic design and illustrations made by Adobe Systems.

