In [2]:
import re
import time

from pinecone import Pinecone, ServerlessSpec
from langchain import  hub
from langchain_community.llms import Ollama
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents.base import Document

from constants import PINECONE_API_KEY, GOOGLE_API_KEY

## Load and Preprocess the Data

In [3]:
PDF_PATH = 'data/An_Introduction_to_Space_Exploration_TPDas.pdf'

In [4]:
# Define a function to preprocess text
def preprocess_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Spaces, tabs and newlines are all treated as whitespace, so line breaks
    are flattened too. Leading and trailing whitespace is reduced to one
    space, not stripped.

    Args:
        text: The string to normalise.

    Returns:
        The string with each whitespace run replaced by one space.
    """
    whitespace_run = r'\s+'
    return re.sub(whitespace_run, ' ', text)



# Define a function to load the PDF and process the text
def process_pdf(file_path, chunk_size=1000, chunk_overlap=0):
    """Load a PDF, split it into chunks, and return the cleaned chunk texts.

    Args:
        file_path: Path of the PDF file handed to ``PyPDFLoader``.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
            Defaults to 1000, the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults to 0,
            the value that used to be hard-coded.

    Returns:
        list of str: the ``page_content`` of every chunk after it has been
        passed through ``preprocess_text``.
    """
    # Load the PDF into LangChain documents
    loaded_docs = PyPDFLoader(file_path).load()
    # Split the loaded documents into smaller chunks
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(loaded_docs)
    # Keep only the normalised text of each chunk (metadata is dropped)
    return [preprocess_text(chunk.page_content) for chunk in chunks]

In [5]:
texts = process_pdf(PDF_PATH)

In [6]:
len(texts)

199

In [7]:
type(texts[50])

str

In [8]:
texts[20]

'10 cosmos. Laboratory studies of meteoroids and analysis of returned samples further enrich our knowledge base, offering insights into extra -terrestrial materials. This triad of modelling, simulation, and ob servation functions as a continuous cycle. Models are refined and validated through simulations, aligning them with real - world observations. Observations, in turn, inform and improve models and simulations, creating a symbiotic relationship that propels sc ientific discovery. This iterative process is fundamental to advancing our comprehension of space, fostering a holistic and evolving understanding of the universe. At this juncture, we have to remember that all the space research we do, are from the perspective of th e Earth. Thus, our discussion on space should start from an understanding of the Earth. Earth We, the Earthlings, study space, in general, from the perspective of Earth. This necessitates understanding the'

## Create the Text Embeddings

In [16]:
# Define a function to create embeddings
def create_embeddings(texts):
    """Embed ``texts`` with a newly constructed ``GPT4AllEmbeddings`` model.

    Args:
        texts: The strings to embed.

    Returns:
        The result of ``GPT4AllEmbeddings().embed_documents(texts=texts)``.
    """
    embedder = GPT4AllEmbeddings()
    vectors = embedder.embed_documents(texts=texts)
    return vectors

In [17]:
embeddings = create_embeddings(texts)

In [18]:
# NOTE(review): this displays the whole embeddings list; a slice such as
# embeddings[0][:5] would keep the saved output short.
embeddings

[[0.06757830083370209,
  -0.0014203997561708093,
  -0.009418759495019913,
  0.039185624569654465,
  -0.11130187660455704,
  0.001261185621842742,
  0.05152640864253044,
  0.06042821332812309,
  -0.01965845189988613,
  0.06731638312339783,
  -0.018751850351691246,
  -0.014509358443319798,
  -0.007748221512883902,
  0.004976609721779823,
  -0.06219692900776863,
  0.028518596664071083,
  0.006324948742985725,
  -0.043545179069042206,
  0.032187569886446,
  -0.008911081589758396,
  -0.033034294843673706,
  -0.02237018756568432,
  0.009822426363825798,
  -0.009727166965603828,
  -0.09556970745325089,
  0.04467947781085968,
  0.026292935013771057,
  -0.008590245619416237,
  0.08151644468307495,
  -0.0007592612528242171,
  -0.005442642606794834,
  0.0862206518650055,
  0.02665402926504612,
  -0.030199434608221054,
  -0.005703269504010677,
  0.11541562527418137,
  -0.005953015759587288,
  -0.03376762941479683,
  -0.05742054805159569,
  0.02170766331255436,
  -0.05110638961195946,
  0.003305809

In [19]:
len(embeddings)

199

In [20]:
len(embeddings[1]) # embedding length (384 in the recorded output); the Pinecone index dimension must match it

384

## Initialize Pinecone Vector Database and create an index for the texts

In [21]:
# Initialize the Pinecone client.
# The legacy `environment` argument is not passed: the index in this notebook is
# created with a ServerlessSpec (aws / us-east-1), so cloud and region come from
# that spec rather than from the client.
pinecone = Pinecone(api_key=PINECONE_API_KEY)

In [22]:
index_name='space-gk-index'

In [63]:
# Names of the indexes reported by pinecone.list_indexes()
existing_indexes = [description['name'] for description in pinecone.list_indexes()]
existing_indexes

[]

In [64]:
# Start from a fresh index: drop any existing index with this name, then create it.
# Re-running this cell therefore removes vectors that were upserted earlier.
if index_name in existing_indexes:
    pinecone.delete_index(index_name)

pinecone.create_index(
    name=index_name,
    dimension=384,  # must equal the length of the embedding vectors
    metric="cosine",
    spec=ServerlessSpec(cloud='aws', region='us-east-1'),
)

# Wait for the index to report ready before connecting to it
while not pinecone.describe_index(index_name).status['ready']:
    time.sleep(1)

# Instantiate the index handle used for upserts and queries
index = pinecone.Index(index_name)
print('created')

created


In [65]:
def create_documents(texts, embeddings):
    """Pair each text with its embedding as a vector record for upserting.

    Args:
        texts: Sequence of text chunks.
        embeddings: Sequence of embedding vectors, one per entry of ``texts``
            and in the same order.

    Returns:
        list of dict: one record per text with the keys ``'id'``
        (``'item-<position>'``), ``'metadata'`` (``{'text': <text>}``) and
        ``'values'`` (the matching embedding).

    Raises:
        ValueError: If ``texts`` and ``embeddings`` differ in length. Without
            this check, surplus embeddings were silently ignored and missing
            ones surfaced as a bare ``IndexError``.
    """
    if len(texts) != len(embeddings):
        raise ValueError(
            'texts and embeddings must have the same length, got '
            + str(len(texts)) + ' and ' + str(len(embeddings))
        )
    return [
        {'id': 'item-' + str(i), 'metadata': {'text': text}, 'values': vector}
        for i, (text, vector) in enumerate(zip(texts, embeddings))
    ]

In [66]:
documents = create_documents(texts, embeddings)
documents[10]

{'id': 'item-10',
 'metadata': {'text': "5 What is Space? There may not be a unique answer ust ask ten peopl e what they mean by the term ‘Space’, and, quite surprisingly, you may end up receiving ten different types of answers. I must say, all the ten answers you received may be correct in their own merit. It all depends on the perspective. Scientists from diffe rent backgrounds perceive the notion of space through the lens of their respective fields, bringing unique perspectives to its understanding. As for example, a n atmospheric scientist views space as the region beyond the Earth's atmosphere where the interac tions between celestial bodies and the effects of solar radiation occur. They study how space weather and phenomena such as solar flares impact our planet's atmospheric conditions. For a mechanical engineer, space presents itself as a challenging environme nt with harsh conditions. They focus on designing spacecraft and technologies that can"},
 'values': [-0.02725607156753

In [27]:
# Define a function to upsert embeddings to Pinecone
def upsert_embeddings_to_pinecone(index, docs, batch_size=100):
    """Upsert vector records into an index, ``batch_size`` records per request.

    Sending the records in batches keeps each request small. With the default
    batch size, any input of up to 100 records is still sent as a single
    ``upsert`` call, as before. An empty ``docs`` results in no call at all.

    Args:
        index: Object exposing ``upsert(vectors=...)`` (here, the Pinecone
            index handle).
        docs: Iterable of vector records to upsert.
        batch_size: Maximum number of records per ``upsert`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    records = list(docs)
    for start in range(0, len(records), batch_size):
        index.upsert(vectors=records[start:start + batch_size])

In [28]:
# Upsert the text embeddings into the Pinecone index
upsert_embeddings_to_pinecone(index, documents)

In [None]:
# We can check on the Pinecone website whether the texts were added to the index

## Run a Query on this index

In [39]:
query = 'What is a dwarf planet?'

In [40]:
# Create the query embedding with GPT4AllEmbeddings, the same embedding class
# used for the document chunks, so the query and the chunks are comparable
query_embedding = GPT4AllEmbeddings().embed_query(query)
query_embedding

[0.030476681888103485,
 0.02562454715371132,
 -0.01982858031988144,
 0.05835477262735367,
 0.014078996144235134,
 -0.07983614504337311,
 -0.005926687736064196,
 -0.03590159863233566,
 0.02478088065981865,
 0.054306983947753906,
 -0.02395416796207428,
 -0.05509546399116516,
 0.014911163598299026,
 -0.06625864654779434,
 -0.012424777261912823,
 -0.04459112882614136,
 -0.056072529405355453,
 -0.02539048157632351,
 0.11098939180374146,
 0.0028586024418473244,
 0.004904838744550943,
 0.069004125893116,
 -0.034362759441137314,
 0.047841645777225494,
 -0.0020457031205296516,
 -0.04985664039850235,
 -0.07728971540927887,
 0.010616758838295937,
 -0.03679530695080757,
 -0.033445797860622406,
 0.0034777058754116297,
 0.11383455246686935,
 0.06131242960691452,
 0.027567269280552864,
 -0.07077920436859131,
 0.07647380232810974,
 0.004943009000271559,
 -0.008975512348115444,
 -0.059496331959962845,
 -0.006127191241830587,
 -0.016731323674321175,
 -0.0502578504383564,
 0.0687495768070221,
 -0.0152524

In [41]:
len(query_embedding) # 384-dimensional, the same length as each document embedding

384

In [42]:
# Ask the index for the top 3 matches for the query vector.
# NOTE(review): include_values=True also returns each match's full vector; the
# cells shown here only read the metadata text, so include_values=False may be
# enough. Confirm before changing.
results = index.query(
    # namespace="ns1", # we have not defined namespace here
    vector=query_embedding,
    top_k=3,
    include_values=True,
    include_metadata=True
)

In [43]:
results

{'matches': [{'id': 'item-65',
              'metadata': {'text': '3. Dwarf Planets: Dwarf planets are '
                                   'similar to planets but have not fully '
                                   'cleared their orbits of other debris. The '
                                   'most famous dwarf planet is Pluto. Other '
                                   'recognized dwarf planets include Eris, '
                                   'Haumea, Makemake, and Ceres (located in '
                                   'the asteroid belt). 4. Comets: Comets are '
                                   'icy bodies composed of dust, rock, and '
                                   'frozen gases. When they approach the'},
              'score': 0.664946795,
              'values': [0.0534347519,
                         -0.0244639106,
                         0.0397021249,
                         -0.0169078615,
                         0.0853501707,
                         -0.0564496368,

In [44]:
results['matches'][0]['metadata']['text']

'3. Dwarf Planets: Dwarf planets are similar to planets but have not fully cleared their orbits of other debris. The most famous dwarf planet is Pluto. Other recognized dwarf planets include Eris, Haumea, Makemake, and Ceres (located in the asteroid belt). 4. Comets: Comets are icy bodies composed of dust, rock, and frozen gases. When they approach the'