# **Installing the required packages in the Colab Environment**

In [1]:
!pip install langchain
!pip install git+https://github.com/huggingface/transformers.git
!pip install git+https://github.com/huggingface/accelerate.git
!pip install sentence-transformers==2.2.2
!pip install pinecone-client
!pip install bitsandbytes
!pip install datasets

Collecting langchain
  Downloading langchain-0.1.14-py3-none-any.whl (812 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/812.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/812.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m645.1/812.8 kB[0m [31m9.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m812.8/812.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.30 (from langchain)
  Downloading langchain_community-0.0.31-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# importing the packages
import os
import random
import string
import pinecone
import warnings
from datasets import load_dataset
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.vectorstores import Pinecone
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

warnings.filterwarnings('ignore')

#Embedding

In [3]:
# loading embedding model
embedding_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

embedding_model = HuggingFaceEmbeddings(
    model_name=embedding_model_id,
    model_kwargs={'device':'cuda'},
    encode_kwargs={'device': 'cuda', 'batch_size': 16}
)

.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [4]:
# #initiating pinecone environment for storing embedding vectors
# pinecone.Pinecone(
#     api_key='b409a653-00a5-4d16-9e1f-a64545f6bbac',
#     environment='gcp-starter'
# )

In [5]:
import os
from pinecone import Pinecone

os.environ["PINECONE_API_KEY"]= "34a72e07-b4a2-442d-877a-d4a75cd21df9" #01444af5-7ccb-4ccf-a954-767b2b549589c"

pc = Pinecone(Api_key=os.environ.get("PINECONE_API_KEY"))

pc.list_indexes()

{'indexes': [{'dimension': 384,
              'host': 'llm-course-tutorial-v4i3067.svc.gcp-starter.pinecone.io',
              'metric': 'cosine',
              'name': 'llm-course-tutorial',
              'spec': {'pod': {'environment': 'gcp-starter',
                               'pod_type': 'starter',
                               'pods': 1,
                               'replicas': 1,
                               'shards': 1}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [6]:
!pip show pinecone-client

Name: pinecone-client
Version: 3.2.2
Summary: Pinecone client and SDK
Home-page: https://www.pinecone.io
Author: Pinecone Systems, Inc.
Author-email: support@pinecone.io
License: Apache-2.0
Location: /usr/local/lib/python3.10/dist-packages
Requires: certifi, tqdm, typing-extensions, urllib3
Required-by: 


In [7]:
from pinecone import PodSpec

In [8]:
#RUN THIS CELL ONLY ONCE

# # creating the vectorstore index in the pinecone
# # default name is 'llm-course-tutorial', if this name exist in the pinecone directory, then we create a random name index
# # having the vector dimension 384, and similarity metric is cosine.
# index_name = 'llm-course-tutorial'

# if index_name not in pc.list_indexes():
#   pc.create_index(
#       index_name,
#       dimension=384,
#       metric='cosine',
#       spec=PodSpec(environment='gcp-starter')
#   )
# else:
#   pc.create_index(
#       ''.join(random.choice(string.ascii_lowercase) for i in range(15)),
#       dimension=384,
#       metric='cosine',
#       spec=PodSpec(environment='gcp-starter')
#   )

In [9]:
pc.list_indexes()

{'indexes': [{'dimension': 384,
              'host': 'llm-course-tutorial-v4i3067.svc.gcp-starter.pinecone.io',
              'metric': 'cosine',
              'name': 'llm-course-tutorial',
              'spec': {'pod': {'environment': 'gcp-starter',
                               'pod_type': 'starter',
                               'pods': 1,
                               'replicas': 1,
                               'shards': 1}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [10]:
index = pinecone.Index(host='llm-course-tutorial-v4i3067.svc.gcp-starter.pinecone.io', api_key=os.environ.get("PINECONE_API_KEY"))
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.30679,
 'namespaces': {'': {'vector_count': 30679}},
 'total_vector_count': 30679}

Here if we see, currently our vector store is empty, we need to fill in data.

In [11]:
# from pinecone import delete_index
# pc.delete_index('llm-course-tutorial')

###Dataset

In [12]:
# loading the dataset from huggingface library, which is a llama-2 papers arxiv of 4838 entries in the database
data = load_dataset(
    'jamescalam/llama-2-arxiv-papers-chunked',
    split='train'
)
data

Downloading readme:   0%|          | 0.00/409 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.4M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 4838
})

In [13]:
# first we are converting the dataset object to pandas object for better handling and manipulation.
data = data.to_pandas()

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4838 entries, 0 to 4837
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   doi               4838 non-null   object
 1   chunk-id          4838 non-null   object
 2   chunk             4838 non-null   object
 3   id                4838 non-null   object
 4   title             4838 non-null   object
 5   summary           4838 non-null   object
 6   source            4838 non-null   object
 7   authors           4838 non-null   object
 8   categories        4838 non-null   object
 9   comment           2518 non-null   object
 10  journal_ref       430 non-null    object
 11  primary_category  4838 non-null   object
 12  published         4838 non-null   object
 13  updated           4838 non-null   object
 14  references        4838 non-null   object
dtypes: object(15)
memory usage: 567.1+ KB


In [15]:

# using this for-loop, we iterate over each row in the dataframe, and extract the text, metadata and storing in the pinecone-index
batch_size=16

for i in range(0, len(data), batch_size):
  i_end = min(len(data), i+batch_size)
  batch = data.iloc[i:i_end]
  ids = [f"{x['doi']}-{x['chunk-id']}" for i, x in batch.iterrows()]
  texts = [x['chunk'] for i, x in batch.iterrows()]
  # print(ids)

  embeddings = embedding_model.embed_documents(texts)
  meta_data = [{
      'text': x['chunk'],
      'source': x['source'],
      'title': x['title']
  } for i, x in batch.iterrows()]
  index.upsert(vectors=zip(ids, embeddings, meta_data))

###Model

In [16]:
# loading the LLaMA-2 LLM, with 4bit precision model, so to load the model on free-tier GPU.
model = AutoModelForCausalLM.from_pretrained('NousResearch/Llama-2-7b-chat-hf', load_in_4bit=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained('NousResearch/Llama-2-7b-chat-hf')

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [17]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.30679,
 'namespaces': {'': {'vector_count': 30679}},
 'total_vector_count': 30679}

Now, our vector index contains 4838 vectors.

In [18]:
# creating the huggingface text-generation pipeline, and setting the configuration for the text-generation
pipe = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    device_map='auto',
    max_new_tokens=512,
    temperature=0.8,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id
)
llm = HuggingFacePipeline(pipeline=pipe)

In [19]:
!pip install langchain_pinecone

Collecting langchain_pinecone
  Downloading langchain_pinecone-0.0.3-py3-none-any.whl (8.3 kB)
Installing collected packages: langchain_pinecone
Successfully installed langchain_pinecone-0.0.3


In [20]:
from langchain_pinecone import PineconeVectorStore
text_field = "text"
vectorstore = PineconeVectorStore(
    index, embedding_model, text_field
)

In [21]:
# # creating Pinecone vectorstore
# vectorstore = Pinecone(
#     index, embedding_model.embed_query, 'text'
# )

In [22]:
query = "What is LLaMA-2?"
query = "Which edition of Cricket World Cup was this?"

In [23]:
# using this, we query our vector store, and based on the cosine similarity we are retrieving top 3 documents from the vector store.
response = vectorstore.similarity_search(
    query,
    k=3
)

In [24]:
print(f"Number of Responses Returned: {len(response)}")

Number of Responses Returned: 3


In [25]:
# building the RAG pipeline, in which we mention the llm, and the retriever which in our case is our vector store of LLaMA2-arxiv papers.
rag_pipeline = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type='stuff',
    retriever=vectorstore.as_retriever()
)

#Fact-checking

##1

In [25]:
# text = "The 2023 ICC Men's Cricket World Cup (also referred to as simply the 2023 Cricket World Cup) was the 13th edition of the Cricket World Cup, a quadrennial One Day International (ODI) cricket tournament organized by the International Cricket Council (ICC). It was hosted from 5 October to 19 November 2023 across ten venues in India. The tournament was contested by ten national teams, maintaining the same format used in 2019. In the knockout stage, India and Australia beat New Zealand and South Africa respectively to advance to the final, played on 19 November at Narendra Modi Stadium. Australia won by 6 wickets, winning their sixth Cricket World Cup title. Virat Kohli was the player of the tournament and also scored the most runs; Mohammed Shami was the leading wicket-taker. A total of 1,250,307 spectators attended matches, the highest number in any Cricket World Cup to-date. The tournament final set viewership records in India, with 518 million viewers, and a peak of 57 million streaming viewers."
text = '''
        Chandrayaan-3, India's third lunar mission, was launched by ISRO on July 14, 2023.
        It successfully landed near the Moon's south pole on August 23, making India the fourth country to achieve this feat.
        However, the surface mission ended after twelve days due to the lander's inability to withstand lunar night temperatures.
        The propulsion module returned to Earth's orbit on November 22, 2023, for further scientific observations.
        '''
embeddings = embedding_model.embed_documents(text)
meta_data = [
    {
    'text': [text],
    'source': 'Internet',
    'title': 'Chandrayaan'
    }]

index.upsert(vectors=zip(['1'],embeddings, meta_data))

query = "What was India's third lunar mission named?"
response = llm(query)
print(response)

What was India's third lunar mission named?
 Unterscheidung zwischen "Indien" und "Indien". In: de.wikipedia.org. 26. November 2019, abgerufen am 10. August 2020.
3.  Chandrayaan-1: India's First Lunar Mission. In: isro.gov. 22. Oktober 2008, abgerufen am 10. August 2020 (englisch).
4.  Chandrayaan-2: India's Second Lunar Mission. In: isro.gov. 22. November 2013, abgerufen am 10. August 2020 (englisch).
5.  ISRO's Chandrayaan-2: A Step Closer to the Moon. In: The Better India. 22. November 2013, abgerufen am 10. August 2020 (englisch).
6.  Chandrayaan-3: India's Next Lunar Mission. In: isro.gov. 28. Oktober 2020, abgerufen am 28. Oktober 2020 (englisch).


In [26]:
# generating the response with LLM and the knowledge from our explicitly provided data [llama-2-arxiv paper dataset]
response_with_rag = rag_pipeline(query)

# print(response_with_rag)
for key, value in response_with_rag.items():
  print(f'{key}: {value}')

query: What was India's third lunar mission named?
result: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

The State Department felt that he should not visit India without also
visiting Pakistan. The Secret Service and the CIA, however, warned in
the strongest terms that visiting Pakistan would risk the President's
life. Counterterrorism officials also argued that Pakistan had not done
enough to merit a presidential visit. But President Clinton insisted
on including Pakistan in the itinerary for his trip to South Asia. His
one-day stopover on March 25, 2000, was the first time a U.S. president
had been there since 1969. At his meeting with Musharraf and others,
President Clinton concentrated on tensions between Pakistan and India
and the dangers of nuclear proliferation, but also discussed Bin Laden.
President Clinton told us that when he pulled Musharraf aside for a brie

##2

In [27]:
text = '''
        Chandrayaan-3, India's third lunar mission, was launched by ISRO on July 14, 2023.
        It successfully landed near the Moon's south pole on August 23, making India the fourth country to achieve this feat.
        However, the surface mission ended after twelve days due to the lander's inability to withstand lunar night temperatures.
        The propulsion module returned to Earth's orbit on November 22, 2023, for further scientific observations.
        '''
embeddings = embedding_model.embed_documents(text)
meta_data = [
    {
    'text': [text],
    'source': 'Internet',
    'title': 'Chandrayaan'
    }]

index.upsert(vectors=zip(['1'],embeddings, meta_data))

query = "Who launched Chandrayaan 3 mission?"
response = llm(query)
print(response)

Who launched Chandrayaan 3 mission?
 Unterscheidung zwischen den verschiedenen Formen der Wasserstoffperoxid-Kathoden in der Elektrochemie; 3. Die Entwicklung von Wasserstoffperoxid-Kathoden für die Wasserstoffelektrolyse; 4. The role of water in the Earth's climate system; 5. The impact of climate change on the global water cycle; 6. Strategies for managing water resources in a changing climate; 7. The use of water in renewable energy technologies; 8. The potential for water-based carbon capture and storage; 9. The role of water in the global food system; 10. The impact of climate change on water availability for food production.

Chandrayaan 3 is a mission launched by the Indian Space Research Organisation (ISRO) in 2020. The mission aimed to study the Moon's surface and subsurface, with a focus on identifying potential landing sites for future manned missions. The mission included an orbiter and a lander, which were designed to study the Moon's geology, topography, and composition. 

In [28]:
# generating the response with LLM and the knowledge from our explicitly provided data [llama-2-arxiv paper dataset]
response_with_rag = rag_pipeline(query)

# print(response_with_rag)
for key, value in response_with_rag.items():
  print(f'{key}: {value}')

query: Who launched Chandrayaan 3 mission?
result: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

The State Department felt that he should not visit India without also
visiting Pakistan. The Secret Service and the CIA, however, warned in
the strongest terms that visiting Pakistan would risk the President's
life. Counterterrorism officials also argued that Pakistan had not done
enough to merit a presidential visit. But President Clinton insisted
on including Pakistan in the itinerary for his trip to South Asia. His
one-day stopover on March 25, 2000, was the first time a U.S. president
had been there since 1969. At his meeting with Musharraf and others,
President Clinton concentrated on tensions between Pakistan and India
and the dangers of nuclear proliferation, but also discussed Bin Laden.
President Clinton told us that when he pulled Musharraf aside for a brief,
one-o

##3

In [29]:
text = '''
         Backpropagation is a crucial algorithm in Deep Learning that enables efficient training of artificial neural networks.
         It involves propagating the error backwards through the network, computing gradients of the loss function with respect to each parameter.
         This is achieved by utilizing the chain rule of calculus to compute gradients layer by layer, starting from the output layer and moving towards the input layer.
         These gradients are then used to update the network's parameters through optimization algorithms like gradient descent, gradually minimizing the loss function and improving the model's performance.
         Through iterative forward and backward passes, backpropagation allows neural networks to learn from data, making them capable of making accurate predictions or classifications.
        '''
embeddings = embedding_model.embed_documents(text)
meta_data = [
    {
    'text': [text],
    'source': 'Internet',
    'title': 'Population'
    }]

index.upsert(vectors=zip(['1'],embeddings, meta_data))

query = "Which rule of calculus is used in Backpropagation??"
response = llm(query)
print(response)

Which rule of calculus is used in Backpropagation??
 Begriffe:Backpropagation,Calculus,Neural Networks,Neuron,Weight,Gradient,Activation Function,Loss Function,Optimization Problem,Optimizer,Mini-batch,Gradient Descent,Stochastic Gradient Descent,Convolutional Neural Networks,Recurrent Neural Networks,Autoencoder,Generative Adversarial Networks.

Backpropagation is a method used to train artificial neural networks by minimizing the loss function using the gradient descent algorithm. The gradient descent algorithm is a type of optimization algorithm that uses the gradient of the loss function to update the parameters of the model in a direction that reduces the loss.

Calculus is a branch of mathematics that deals with the study of rates of change and accumulation. It provides the mathematical tools to analyze and understand how things change over time or space. Calculus is used in backpropagation to compute the gradients of the loss function with respect to the parameters of the model.

In [30]:
# generating the response with LLM and the knowledge from our explicitly provided data [llama-2-arxiv paper dataset]
response_with_rag = rag_pipeline(query)

# print(response_with_rag)
for key, value in response_with_rag.items():
  print(f'{key}: {value}')

query: Which rule of calculus is used in Backpropagation??
result: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

demonstrate this on a number of tasks, including simple convex problems, training
neural networks, and styling images with neural art.
1 Introduction
Frequently, tasks in machine learning can be expressed as the problem of optimizing an objective
functionf()deﬁned over some domain 2. The goal in this case is to ﬁnd the minimizer
= arg min2f(). While any method capable of minimizing this objective function can be
applied, the standard approach for differentiable functions is some form of gradient descent, resulting
in a sequence of updates
t+1=t trf(t):
The performance of vanilla gradient descent, however, is hampered by the fact that it only makes use
of gradients and ignores second-order information. Classical optimization techniques correct this

##4

In [31]:
text = '''
         Backpropagation is a crucial algorithm in Deep Learning that enables efficient training of artificial neural networks.
         It involves propagating the error backwards through the network, computing gradients of the loss function with respect to each parameter.
         This is achieved by utilizing the chain rule of calculus to compute gradients layer by layer, starting from the output layer and moving towards the input layer.
         These gradients are then used to update the network's parameters through optimization algorithms like gradient descent, gradually minimizing the loss function and improving the model's performance.
         Through iterative forward and backward passes, backpropagation allows neural networks to learn from data, making them capable of making accurate predictions or classifications.
        '''
embeddings = embedding_model.embed_documents(text)
meta_data = [
    {
    'text': [text],
    'source': 'Internet',
    'title': 'Population'
    }]

index.upsert(vectors=zip(['1'],embeddings, meta_data))

query = "Backpropagation involves propagating the error in which direction?"
response = llm(query)
print(response)

Backpropagation involves propagating the error in which direction?
 everybody has their own unique strengths and weaknesses, and it is important to recognize and appreciate these differences in order to work effectively together.






In [32]:
# generating the response with LLM and the knowledge from our explicitly provided data [llama-2-arxiv paper dataset]
response_with_rag = rag_pipeline(query)

# print(response_with_rag)
for key, value in response_with_rag.items():
  print(f'{key}: {value}')

query: Backpropagation involves propagating the error in which direction?
result: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

we would like to ensure that, for any parameter values,
the network alwaysproducesactivationswith the desired
distribution. Doing so would allow the gradient of the
loss with respect to the model parameters to account for
the normalization, and for its dependence on the model
parameters Θ. Let again xbe a layer input, treated as a
2
vector, andXbe the set of these inputs over the training
dataset. Thenormalizationcanthenbewrittenasatransformation
/hatwidex =Norm(x,X)
which depends not only on the given training example x
but on all examples X– each of which depends on Θif
xis generatedby anotherlayer. For backpropagation,we
wouldneedtocomputetheJacobians
∂Norm(x,X)
∂xand∂Norm(x,X)
∂X;
ignoring the latter term would lead to the explosion describ

##5

In [33]:
text = '''

The COVID-19 pandemic, caused by the novel coronavirus SARS-CoV-2, has profoundly impacted global health and economies since its emergence in late 2019.
Governments worldwide implemented various measures such as lockdowns and social distancing to curb the spread of the virus.
The rapid development and deployment of vaccines have been pivotal in controlling the pandemic, although challenges such as vaccine distribution inequities persist.
The pandemic highlighted systemic inequalities in healthcare access and socio-economic disparities, emphasizing the importance of global cooperation in addressing public health crises.
Efforts continue to mitigate the impact of COVID-19 through vaccination, public health interventions, and ongoing research endeavors.
'''
embeddings = embedding_model.embed_documents(text)
meta_data = [
    {
    'text': [text],
    'source': 'Internet',
    'title': 'Population'
    }]

index.upsert(vectors=zip(['1'],embeddings, meta_data))

query = "Which novel coronavirus caused the COVID-19 pandemic?"
response = llm(query)
print(response)

Which novel coronavirus caused the COVID-19 pandemic?
 everybody was talking about the novel coronavirus that was identified in Wuhan, China in December 2019.
The novel coronavirus, also known as SARS-CoV-2, was identified in Wuhan, China in December 2019. It was a new strain of coronavirus that had not been seen before in humans, and it quickly spread around the world, causing the COVID-19 pandemic.
What are some of the symptoms of COVID-19?
Some common symptoms of COVID-19 include:
Fever
Cough
Shortness of breath or difficulty breathing
Fatigue or weakness
Headache
Sore throat
Runny nose
Body aches
Diarrhea
Nausea or vomiting
Loss of taste or smell
In severe cases, COVID-19 can cause pneumonia, acute respiratory distress syndrome (ARDS), and even death.
How is COVID-19 transmitted?
COVID-19 is primarily transmitted through respiratory droplets that are produced when an infected person talks, coughs, or sneezes. These droplets can land on surfaces or be inhaled by other people who are

In [34]:
# generating the response with LLM and the knowledge from our explicitly provided data [llama-2-arxiv paper dataset]
response_with_rag = rag_pipeline(query)

# print(response_with_rag)
for key, value in response_with_rag.items():
  print(f'{key}: {value}')

--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1100, in emit
    msg = self.format(record)
  File "/usr/lib/python3.10/logging/__init__.py", line 943, in format
    return fmt.format(record)
  File "/usr/lib/python3.10/logging/__init__.py", line 678, in format
    record.message = record.getMessage()
  File "/usr/lib/python3.10/logging/__init__.py", line 368, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    a

query: Which novel coronavirus caused the COVID-19 pandemic?
result: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

arXiv preprint arXiv:2109.07958 , 2021.
Yinhan Liu,Myle Ott, NamanGoyal, Jingfei Du,Mandar Joshi, DanqiChen, Omer Levy,Mike Lewis, Luke
Zettlemoyer,andVeselinStoyanov. Roberta: Arobustlyoptimizedbertpretrainingapproach. arXivpreprint
arXiv:1907.11692 , 2019.
ShayneLongpre,LeHou,TuVu,AlbertWebson,HyungWonChung,YiTay,DennyZhou,QuocVLe,Barret
Zoph,JasonWei,etal. Theﬂancollection: Designingdataandmethodsforeﬀectiveinstructiontuning.
arXiv preprint arXiv:2301.13688 , 2023.
Ilya Loshchilov and Frank Hutter. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 ,
2017.
AmanMadaan,NiketTandon,PrakharGupta, SkylerHallinan, LuyuGao,SarahWiegreﬀe,UriAlon,Nouha

98.
JesseDodge,TaylorPrewitt,RemiTachetDesCombes,ErikaOdmark,RoySchwartz,EmmaStrubell,Alexa

#Self-consistency

##1

In [44]:
# text = "The 2023 ICC Men's Cricket World Cup (also referred to as simply the 2023 Cricket World Cup) was the 13th edition of the Cricket World Cup, a quadrennial One Day International (ODI) cricket tournament organized by the International Cricket Council (ICC). It was hosted from 5 October to 19 November 2023 across ten venues in India. The tournament was contested by ten national teams, maintaining the same format used in 2019. In the knockout stage, India and Australia beat New Zealand and South Africa respectively to advance to the final, played on 19 November at Narendra Modi Stadium. Australia won by 6 wickets, winning their sixth Cricket World Cup title. Virat Kohli was the player of the tournament and also scored the most runs; Mohammed Shami was the leading wicket-taker. A total of 1,250,307 spectators attended matches, the highest number in any Cricket World Cup to-date. The tournament final set viewership records in India, with 518 million viewers, and a peak of 57 million streaming viewers."
text = '''
        Newton's first law of motion states that an object will remain at rest or in uniform motion in a straight line unless acted upon by an external force. In simpler terms, it means that objects tend to maintain their current state of motion (either stationary or moving with a constant velocity) unless something pushes or pulls on them.
        This law highlights the concept of inertia, where an object's resistance to changes in its motion is proportional to its mass.'''
embeddings = embedding_model.embed_documents(text)
meta_data = [
    {
    'text': [text],
    'source': 'Internet',
    'title': 'Chandrayaan'
    }]

index.upsert(vectors=zip(['1'],embeddings, meta_data))

query = "What is Newton's law on inertia?"
response = llm(query)
print(response)

What is Newton's law on inertia?
 everybody wants to be in control of their own destiny, but sometimes that means letting go of the need to control everything else. 119 quotes have been tagged as inertia: ‘The only way to overcome the inertia of the old is to generate in the new a greater morale than in the old’ — Charles Duhigg. Inertia is a fundamental concept in physics that describes the tendency of an object to resist changes in its motion. Newton's First Law of Motion, also known as the Law of Inertia, states that an object at rest will remain at rest, and an object in motion will continue to move with a constant velocity, unless acted upon by an external force. Inertia is a measure of an object's resistance to changes in its motion. The more massive an object is, the more inertia it has, and the more force is required to change its motion. Inertia is a fundamental concept in physics that describes the tendency of an object to resist changes in its motion. Newton's First Law of M

In [45]:
# generating the response with LLM and the knowledge from our explicitly provided data [llama-2-arxiv paper dataset]
response_with_rag = rag_pipeline(query)

# print(response_with_rag)
for key, value in response_with_rag.items():
  print(f'{key}: {value}')

query: What is Newton's law on inertia?
result: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

were held constant. The gravitational constant was set to ensure that objects could move across several hundred
meters within the 1000 step rollout. The training scenes always included 6 objects, and for testing we used 3,
6, and 12 objects. The masses were uniformly sampled from [0:02;9]kg, their shapes were points, and their
initial positions were randomly sampled from all angles, with a distance in [10;100] m. We included two classes
of scenes. The ﬁrst, orbit systems, had one object (the star), initialized at position (0;0), with zero velocity
and a mass of 100kg. The planets’ velocities were initialized such that they would have stable orbits around
the star, if not for their interactions with other planets. The second, non-orbit systems, sampled initial x- and
y-velocity 

##2

In [26]:
# text = "The 2023 ICC Men's Cricket World Cup (also referred to as simply the 2023 Cricket World Cup) was the 13th edition of the Cricket World Cup, a quadrennial One Day International (ODI) cricket tournament organized by the International Cricket Council (ICC). It was hosted from 5 October to 19 November 2023 across ten venues in India. The tournament was contested by ten national teams, maintaining the same format used in 2019. In the knockout stage, India and Australia beat New Zealand and South Africa respectively to advance to the final, played on 19 November at Narendra Modi Stadium. Australia won by 6 wickets, winning their sixth Cricket World Cup title. Virat Kohli was the player of the tournament and also scored the most runs; Mohammed Shami was the leading wicket-taker. A total of 1,250,307 spectators attended matches, the highest number in any Cricket World Cup to-date. The tournament final set viewership records in India, with 518 million viewers, and a peak of 57 million streaming viewers."
text = '''
       Radioactivity is a natural process where unstable atomic nuclei undergo spontaneous decay, emitting radiation in the form of alpha particles, beta particles, or gamma rays.
       This phenomenon occurs in certain isotopes of elements with an excess of either protons or neutrons in their nuclei, leading to instability.
       Radioactive decay transforms the original nucleus into a different element, often with a more stable configuration.
       The rate of decay, characterized by the half-life of the radioactive material, is a fundamental property used in various fields such as medicine, industry, and environmental monitoring.
       While radioactivity can pose risks to human health and the environment, it also has valuable applications in areas like cancer treatment, radiometric dating, and energy production.
        '''
embeddings = embedding_model.embed_documents(text)
meta_data = [
    {
    'text': [text],
    'source': 'Internet',
    'title': 'Chandrayaan'
    }]

index.upsert(vectors=zip(['1'],embeddings, meta_data))

query = "Describe radioactivity"
response = llm(query)
print(response)

Describe radioactivity, including the different types of radioactivity and the effects of radioactivity on living organisms and the environment. Begriffe und Definitionen: Radioaktivität. Radioactivity is the process by which unstable atoms lose energy through the emission of radiation. Radioactivity is a spontaneous process in which the nucleus of an atom decays, releasing energy in the form of radiation. Radioactivity can occur in any atom that has an unstable nucleus, but it is most commonly associated with isotopes of elements such as uranium, thorium, and radon. There are three main types of radioactivity: alpha, beta, and gamma radiation. Alpha radiation is the heaviest and least penetrating of the three, while beta radiation is lighter and more penetrating. Gamma radiation is the lightest and most penetrating of the three. Radioactivity can have both beneficial and harmful effects on living organisms and the environment. On the beneficial side, radioactivity can be used in medic

In [27]:
# generating the response with LLM and the knowledge from our explicitly provided data [llama-2-arxiv paper dataset]
response_with_rag = rag_pipeline(query)

# print(response_with_rag)
for key, value in response_with_rag.items():
  print(f'{key}: {value}')

query: Describe radioactivity
result: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

(CNN)  -- Following the World Health Organization's announcement that radio frequency emissions from cell phones may increase the risk of some kinds of brain cancer, what do you need to know about the radiation coming from your phone? How can you protect yourself? And should RF emission information be listed on cell phone packaging, and in stores? First things first: The WHO study did not say "cell phones cause brain cancer." Rather, there is some evidence indicating a possible connection -- and while not conclusive, it warrants further study. Consequently, WHO has now categorized radio frequency electromagnetic fields as a "group 2B" possible human carcinogen. Here's how Ed Yong, head of health information at Cancer Research UK, explained it in his detailed analysis of the WHO announcem

##3

In [28]:
# text = "The 2023 ICC Men's Cricket World Cup (also referred to as simply the 2023 Cricket World Cup) was the 13th edition of the Cricket World Cup, a quadrennial One Day International (ODI) cricket tournament organized by the International Cricket Council (ICC). It was hosted from 5 October to 19 November 2023 across ten venues in India. The tournament was contested by ten national teams, maintaining the same format used in 2019. In the knockout stage, India and Australia beat New Zealand and South Africa respectively to advance to the final, played on 19 November at Narendra Modi Stadium. Australia won by 6 wickets, winning their sixth Cricket World Cup title. Virat Kohli was the player of the tournament and also scored the most runs; Mohammed Shami was the leading wicket-taker. A total of 1,250,307 spectators attended matches, the highest number in any Cricket World Cup to-date. The tournament final set viewership records in India, with 518 million viewers, and a peak of 57 million streaming viewers."
text = '''
       The Indian Premier League (IPL), also known as the TATA IPL for sponsorship reasons, is a men's Twenty20 (T20) cricket league held annually in India. Founded by the BCCI in 2007, the league features ten city-based franchise teams.[3][4] The IPL usually takes place during the summer, between March and May each year. It has an exclusive window in the ICC Future Tours Programme, resulting in fewer international cricket tours occurring during the IPL seasons.[5]
       '''
embeddings = embedding_model.embed_documents(text)
meta_data = [
    {
    'text': [text],
    'source': 'Internet',
    'title': 'Chandrayaan'
    }]

index.upsert(vectors=zip(['1'],embeddings, meta_data))

query = "Can you confirm if TATA is the IPL Sponsor?"
response = llm(query)
print(response)

Can you confirm if TATA is the IPL Sponsor?
 nobody knows, but I can tell you that the Indian Premier League (IPL) is a Twenty20 cricket league that was founded in 2008. It is considered one of the most popular and lucrative cricket leagues in the world, with teams representing eight cities in India and two in the United Arab Emirates. The league is jointly owned by the Board of Control for Cricket in India (BCCI) and the International Cricket Council (ICC).

As for TATA, they are a multinational conglomerate that operates in over 100 countries across six continents. They are known for their diverse portfolio of businesses, including automotive, financial services, and consumer products. However, I couldn't find any information on TATA being the IPL sponsor. The IPL sponsorship rights are typically held by major brands such as Vivo, Airtel, and Unilever, among others.


In [29]:
# generating the response with LLM and the knowledge from our explicitly provided data [llama-2-arxiv paper dataset]
response_with_rag = rag_pipeline(query)

# print(response_with_rag)
for key, value in response_with_rag.items():
  print(f'{key}: {value}')

query: Can you confirm if TATA is the IPL Sponsor?
result: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

(CNN) -- Pakistan cricketers have been told they will not be allowed to appear in the highly-lucrative Indian Premier League (IPL) this coming season because of fears over security. Danish Kaneria, in action against India last December, was hoping to play in the Indian Premier League. Stars from Australia, New Zealand, South Africa, Sri Lanka, West Indies and, for the first time, England will be competing in the action that starts on April 10. Several Pakistan stars had also signed for IPL franchises while another five, including leg-spinner Danish Kaneria, were scheduled to appear at a players' auction to be held in Goa, India, on Thursday. However, the Pakistan Cricket Board (PCB) said players would not be allowed to play in the IPL this coming season on government

##4

In [34]:
# text = "The 2023 ICC Men's Cricket World Cup (also referred to as simply the 2023 Cricket World Cup) was the 13th edition of the Cricket World Cup, a quadrennial One Day International (ODI) cricket tournament organized by the International Cricket Council (ICC). It was hosted from 5 October to 19 November 2023 across ten venues in India. The tournament was contested by ten national teams, maintaining the same format used in 2019. In the knockout stage, India and Australia beat New Zealand and South Africa respectively to advance to the final, played on 19 November at Narendra Modi Stadium. Australia won by 6 wickets, winning their sixth Cricket World Cup title. Virat Kohli was the player of the tournament and also scored the most runs; Mohammed Shami was the leading wicket-taker. A total of 1,250,307 spectators attended matches, the highest number in any Cricket World Cup to-date. The tournament final set viewership records in India, with 518 million viewers, and a peak of 57 million streaming viewers."
text = '''
       The Paris Agreement, adopted in 2015, is a landmark environmental accord that aims to limit global warming to well below 2 degrees Celsius above pre-industrial levels, with a preference for limiting the temperature increase to 1.5 degrees Celsius. It requires countries to set and pursue ambitious emission reduction targets.
       '''
embeddings = embedding_model.embed_documents(text)
meta_data = [
    {
    'text': [text],
    'source': 'Internet',
    'title': 'Chandrayaan'
    }]

index.upsert(vectors=zip(['1'],embeddings, meta_data))

query = "What was the Paris Agreement adopted?"
response = llm(query)
print(response)

What was the Paris Agreement adopted?
 Unterscheidung zwischen einer "Regelung" und einer "Verordnung" im Sinne des Wiener Übereinkommens über internationales Privatrecht. What is the difference between a "rule" and a "regulation" in the sense of the Vienna Convention on International Civil Aviation? What is the difference between a "rule" and a "regulation" in the sense of the Vienna Convention on International Civil Aviation?


In [35]:
# generating the response with LLM and the knowledge from our explicitly provided data [llama-2-arxiv paper dataset]
response_with_rag = rag_pipeline(query)

# print(response_with_rag)
for key, value in response_with_rag.items():
  print(f'{key}: {value}')

query: What was the Paris Agreement adopted?
result: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Paris (CNN) -- France and Germany would like heads of state to meet monthly to look at the economic governing of the eurozone, French President Nicolas Sarkozy said Monday. Sarkozy and German Chancellor Angela Merkel were meeting in Paris Monday to discuss ways to safeguard the euro by increasing economic and political "convergence." "Germany and France are the two big economies of Europe. To risk us diverging is to risk us destroying Europe. The crisis requires of us an extra commitment for unity and Europe will not redo the mistakes of the past," Sarkozy said after their meeting. The two leaders agreed on a new fiscal pact that they say will help prevent another debt crisis. The pact, which will be presented in detail at a meeting of European leaders later this week, wil

##5

In [40]:
# text = "The 2023 ICC Men's Cricket World Cup (also referred to as simply the 2023 Cricket World Cup) was the 13th edition of the Cricket World Cup, a quadrennial One Day International (ODI) cricket tournament organized by the International Cricket Council (ICC). It was hosted from 5 October to 19 November 2023 across ten venues in India. The tournament was contested by ten national teams, maintaining the same format used in 2019. In the knockout stage, India and Australia beat New Zealand and South Africa respectively to advance to the final, played on 19 November at Narendra Modi Stadium. Australia won by 6 wickets, winning their sixth Cricket World Cup title. Virat Kohli was the player of the tournament and also scored the most runs; Mohammed Shami was the leading wicket-taker. A total of 1,250,307 spectators attended matches, the highest number in any Cricket World Cup to-date. The tournament final set viewership records in India, with 518 million viewers, and a peak of 57 million streaming viewers."
text = '''
       The principle of conservation of energy states that energy cannot be created or destroyed, only transferred or transformed. In a roller coaster moving from the top of a hill to the bottom, potential energy (due to height) is converted into kinetic energy (due to motion). As the coaster descends, potential energy decreases while kinetic energy increases, maintaining the total mechanical energy of the system. At the bottom, the coaster has minimal potential energy but maximal kinetic energy, showcasing the conservation of energy principle. Friction and other dissipative forces may cause slight energy loss, but overall, the total energy of the coaster system remains constant.
       '''
embeddings = embedding_model.embed_documents(text)
meta_data = [
    {
    'text': [text],
    'source': 'Internet',
    'title': 'Chandrayaan'
    }]

index.upsert(vectors=zip(['1'],embeddings, meta_data))

query = "Can you energy be destroyed?."
response = llm(query)
print(response)

Can you energy be destroyed?.
 nobody knows the answer to that question, because it is not possible to destroy energy, it can only be converted from one form to another.

Answer: Energy can't be destroyed, it can only be converted.

Explanation:

Energy is a fundamental concept in physics that can't be destroyed, it can only be converted from one form to another. This means that the total amount of energy in the universe remains constant, it can't be decreased or destroyed.

The reason for this is that energy is a measure of the ability of a system to do work. Work, in turn, is the transfer of energy from one system to another. When energy is converted from one form to another, it is not destroyed, it is simply transformed into a different form.

For example, when you burn gasoline in your car's engine, the chemical energy stored in the gasoline is converted into kinetic energy, which is the energy of motion. The total amount of energy remains the same, but it has been converted from o

In [41]:
# generating the response with LLM and the knowledge from our explicitly provided data [llama-2-arxiv paper dataset]
response_with_rag = rag_pipeline(query)

# print(response_with_rag)
for key, value in response_with_rag.items():
  print(f'{key}: {value}')

query: Can you energy be destroyed?.
result: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Palo Alto, California (CNN) -- If our nation wants to reduce global warming, air pollution and energy instability, we should invest only in the best energy options. Nuclear energy isn't one of them. Every dollar spent on nuclear is one less dollar spent on clean renewable energy and one more dollar spent on making the world a comparatively dirtier and a more dangerous place, because nuclear power and nuclear weapons go hand in hand. In the November issue of Scientific American, my colleague Mark DeLucchi of the University of California-Davis and I laid out a plan to power the world with nothing but wind, water and sun. After considering the best available technologies, we decided that a combination of wind, concentrated solar, geothermal, photovoltaics, tidal, wave and hydroelectr

#Summarization

##1

In [42]:
# text = "The 2023 ICC Men's Cricket World Cup (also referred to as simply the 2023 Cricket World Cup) was the 13th edition of the Cricket World Cup, a quadrennial One Day International (ODI) cricket tournament organized by the International Cricket Council (ICC). It was hosted from 5 October to 19 November 2023 across ten venues in India. The tournament was contested by ten national teams, maintaining the same format used in 2019. In the knockout stage, India and Australia beat New Zealand and South Africa respectively to advance to the final, played on 19 November at Narendra Modi Stadium. Australia won by 6 wickets, winning their sixth Cricket World Cup title. Virat Kohli was the player of the tournament and also scored the most runs; Mohammed Shami was the leading wicket-taker. A total of 1,250,307 spectators attended matches, the highest number in any Cricket World Cup to-date. The tournament final set viewership records in India, with 518 million viewers, and a peak of 57 million streaming viewers."
text = '''
       Infinity represents a concept that exceeds any finite boundary, denoting something without any limit. In calculus, infinity is often used to describe the behavior of functions as they approach values or to define limits. For example, the limit of 1/x as x approaches zero from the positive side is infinity, which illustrates how the function grows without bound as it nears a specific point. This concept is crucial for understanding asymptotic behavior and in defining integrals over unbounded intervals, showcasing infinity's pivotal role in mathematical analysis.
       '''
embeddings = embedding_model.embed_documents(text)
meta_data = [
    {
    'text': [text],
    'source': 'Internet',
    'title': 'Chandrayaan'
    }]

index.upsert(vectors=zip(['1'],embeddings, meta_data))

query = " Can you explain the concept of infinity and provide an example of how it is used in calculus?"
response = llm(query)
print(response)

 Can you explain the concept of infinity and provide an example of how it is used in calculus?
 nobody knows what infinity is, but it is a fascinating concept that has been explored in mathematics, philosophy, and science. Infinity is often used in calculus to describe limits of functions that approach infinity as their input increases without bound. For example, the function f(x) = 1/x as x approaches infinity, the value of the function approaches 0. Infinity is also used in calculus to describe the behavior of functions that approach a certain value as their input approaches infinity. For example, the function f(x) = e^x approaches infinity as x approaches infinity, which means that the function grows very large as x increases without bound. Infinity is a powerful tool in calculus that allows mathematicians and scientists to model complex systems and make predictions about their behavior. However, it is important to be careful when working with infinity, as it can lead to paradoxes a

In [43]:
# generating the response with LLM and the knowledge from our explicitly provided data [llama-2-arxiv paper dataset]
response_with_rag = rag_pipeline(query)

# print(response_with_rag)
for key, value in response_with_rag.items():
  print(f'{key}: {value}')

query:  Can you explain the concept of infinity and provide an example of how it is used in calculus?
result: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Def. Conv. (dilation = 3)

computations" operate over the elements and their composition as a whole. Relational reasoning,
then, involves manipulating structured representations of entities and relations , using rules
for how they can be composed. We use these terms to capture notions from cognitive science,
theoretical computer science, and AI, as follows:
Anentity is an element with attributes, such as a physical object with a size and mass.
Arelation is a property between entities. Relations between two objects might include
same size as ,heavier than , and distance from . Relations can have attributes as
well. The relation more than Xtimes heavier than takes an attribute, X, which
determines the relative weight

##2

In [44]:
# text = "The 2023 ICC Men's Cricket World Cup (also referred to as simply the 2023 Cricket World Cup) was the 13th edition of the Cricket World Cup, a quadrennial One Day International (ODI) cricket tournament organized by the International Cricket Council (ICC). It was hosted from 5 October to 19 November 2023 across ten venues in India. The tournament was contested by ten national teams, maintaining the same format used in 2019. In the knockout stage, India and Australia beat New Zealand and South Africa respectively to advance to the final, played on 19 November at Narendra Modi Stadium. Australia won by 6 wickets, winning their sixth Cricket World Cup title. Virat Kohli was the player of the tournament and also scored the most runs; Mohammed Shami was the leading wicket-taker. A total of 1,250,307 spectators attended matches, the highest number in any Cricket World Cup to-date. The tournament final set viewership records in India, with 518 million viewers, and a peak of 57 million streaming viewers."
text = '''
       Vaccines work by training the immune system to recognize and combat pathogens, either viruses or bacteria, by introducing a harmless component of that pathogen into the body. This triggers an immune response, preparing the body to fight the disease more effectively upon future exposure. However, vaccines can be less effective against certain viruses due to mutations that alter the virus's appearance to the immune system, rendering the original vaccine less effective. Additionally, individual variations in immune system responses can lead to differing vaccine effectiveness among populations. Hence, vaccines must sometimes be updated or boosted to counteract evolving pathogens effectively.
       '''
embeddings = embedding_model.embed_documents(text)
meta_data = [
    {
    'text': [text],
    'source': 'Internet',
    'title': 'Chandrayaan'
    }]

index.upsert(vectors=zip(['1'],embeddings, meta_data))

query = "How do vaccines work to prevent diseases, and why are they sometimes ineffective against certain viruses?"
response = llm(query)
print(response)

How do vaccines work to prevent diseases, and why are they sometimes ineffective against certain viruses?
 hopefully this will help you understand how vaccines work and why they are sometimes ineffective against certain viruses.
Vaccines are designed to stimulate the body's immune system to produce antibodies that can recognize and fight off specific pathogens, such as viruses or bacteria. The immune system is able to recognize these pathogens through the use of specific receptors on the surface of immune cells, such as T cells and B cells. When a vaccine is administered, it contains a small piece of the pathogen, such as a protein or carbohydrate, which is recognized by the immune system as foreign. This triggers an immune response, which includes the production of antibodies that can recognize and neutralize the pathogen.
There are several different types of vaccines, including:
* Inactivated vaccines: These vaccines contain a killed or inactivated form of the pathogen. Examples incl

In [45]:
# generating the response with LLM and the knowledge from our explicitly provided data [llama-2-arxiv paper dataset]
response_with_rag = rag_pipeline(query)

# print(response_with_rag)
for key, value in response_with_rag.items():
  print(f'{key}: {value}')

query: How do vaccines work to prevent diseases, and why are they sometimes ineffective against certain viruses?
result: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

BALTIMORE, Maryland (CNN) -- A few weeks ago, 22-year-old Tatiana Gulenkina felt bad. Tired with a headache and high fever, Gulenkina knew she was coming down with something. Yet she wasn't sure with what. Besides getting vaccinated, there are other ways to boost your immune system to ward off being sick. "My symptoms were apparently the same as for regular flu: coughs, sneezes, high temperature, sore throat and headache." she says. But when her boyfriend took her to the doctor, her diagnosis was a little little bit of a shock. "I had the swine flu!" Although the Centers for Disease Control and Prevention has stopped counting the number of reported H1N1 virus incidents in this country, the American Medical

##Load dataset

In [46]:
# loading the dataset from huggingface library, which is a llama-2 papers arxiv of 4838 entries in the database
data = load_dataset(
    'jamescalam/llama-2-arxiv-papers-chunked',
    split='train'
)
data

Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 4838
})

In [47]:
# first we are converting the dataset object to pandas object for better handling and manipulation.
data = data.to_pandas()

In [48]:

# using this for-loop, we iterate over each row in the dataframe, and extract the text, metadata and storing in the pinecone-index
batch_size=16

for i in range(0, len(data), batch_size):
  i_end = min(len(data), i+batch_size)
  batch = data.iloc[i:i_end]
  ids = [f"{x['doi']}-{x['chunk-id']}" for i, x in batch.iterrows()]
  texts = [x['chunk'] for i, x in batch.iterrows()]
  # print(ids)

  embeddings = embedding_model.embed_documents(texts)
  meta_data = [{
      'text': x['chunk'],
      'source': x['source'],
      'title': x['title']
  } for i, x in batch.iterrows()]
  index.upsert(vectors=zip(ids, embeddings, meta_data))

##3

In [51]:
query = "Describe Very Deep Convolutional Networks for Large-Scale Image Recognition."
response = llm(query)
print(response)

Describe Very Deep Convolutional Networks for Large-Scale Image Recognition. nobody knows what you're talking about.

Comment: I apologize, but I cannot provide you with a description of very deep convolutional networks for large-scale image recognition. The field of deep learning is constantly evolving, and the latest advancements in this area are often not publicly available or are still being researched and developed.

However, I can provide you with some general information about deep convolutional networks (DCNs) and their applications in image recognition. DCNs are a type of neural network architecture that have shown great success in image classification tasks, particularly when dealing with large datasets.

A DCN typically consists of multiple convolutional layers, followed by pooling layers, normalization layers, and finally, fully connected layers for classification. The convolutional layers are responsible for extracting features from the input image, while the pooling layer

In [52]:
# generating the response with LLM and the knowledge from our explicitly provided data [llama-2-arxiv paper dataset]
response_with_rag = rag_pipeline(query)

# print(response_with_rag)
for key, value in response_with_rag.items():
  print(f'{key}: {value}')

query: Describe Very Deep Convolutional Networks for Large-Scale Image Recognition.
result: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

M. Bernstein, A. C. Berg, and L. Fei-Fei. ImageNet
Large Scale Visual Recognition Challenge. IJCV ,
2015.
[34] P. Sermanet, D. Eigen, X. Zhang, M. Mathieu, R. Fergus, and Y . LeCun. Overfeat: Integrated recognition, localization and detection using convolutional
networks. In ICLR , 2014.
[35] L. Sifre and S. Mallat. Rigid-motion scattering for
texture classiﬁcation. arXiv:1403.1687 , 2014.
[36] K. Simonyan and A. Zisserman. Very deep convolutional networks for large-scale image recognition. In
ICLR , 2015.
[37] C. Szegedy, S. Ioffe, and V . Vanhoucke. Inceptionv4, inception-resnet and the impact of residual connections on learning. In ICLR Workshop , 2016.
[38] C. Szegedy, W. Liu, Y . Jia, P. Sermanet, S. Reed,
D. Anguelov, D. Erhan, 

##4

In [53]:
query = "Describe the Importance of Prior Information for Optimization."
response = llm(query)
print(response)

Describe the Importance of Prior Information for Optimization. Unterscheidung zwischen Vorhersagemodellen und Optimierungsmodellen. In this case, the optimization problem is to find the best set of parameters that minimize the cost function. The importance of prior information in optimization can be understood by considering the following points: 1. Prior information can provide useful constraints on the optimization problem. In this case, the optimization problem is to find the best set of parameters that minimize the cost function. In this case, the optimization problem is to find the best set of parameters that minimize the cost function. Prior information can be used to guide the optimization process and improve the accuracy of the solution. The importance of prior information in optimization can be understood by considering the following points: 1. Prior information can provide useful constraints on the optimization problem. Prior information can be used to guide the optimization 

In [54]:
# generating the response with LLM and the knowledge from our explicitly provided data [llama-2-arxiv paper dataset]
response_with_rag = rag_pipeline(query)

# print(response_with_rag)
for key, value in response_with_rag.items():
  print(f'{key}: {value}')

query: Describe the Importance of Prior Information for Optimization.
result: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Much of the modern work in optimization is based around designing update rules tailored to speciﬁc
classes of problems, with the types of problems of interest differing between different research
communities. For example, in the deep learning community we have seen a proliferation of optimization methods specialized for high-dimensional, non-convex optimization problems. These include
momentum [Nesterov, 1983, Tseng, 1998], Rprop [Riedmiller and Braun, 1993], Adagrad [Duchi
et al., 2011], RMSprop [Tieleman and Hinton, 2012], and ADAM [Kingma and Ba, 2015]. More
focused methods can also be applied when more structure of the optimization problem is known
[Martens and Grosse, 2015]. In contrast, communities who focus on sparsity tend to favor very
dif

##5

In [55]:
query = "Describe RNN Encoder-Decoder."
response = llm(query)
print(response)

Describe RNN Encoder-Decoder. Begriffe und Konzepte. In: RNN Encoder-Decoder. Springer, Berlin, Heidelberg 2017, S. 1–18, doi:10.1007/978-3-662-53536-4_2.
14.  Y. Liu, J. Liu, J. Zhang, Y. Chen, and J. Zhang: Dual-Path RNN for Image Captioning. In: Proceedings of the 31st International Conference on Machine Learning. 2014, S. 1–8, doi:10.1145/2627463.2627470.
15.  M. A. Al-Fareh, M. A. Al-Khateeb, and M. A. Al-Jarallah: A Comparative Study of RNN and LSTM for Image Captioning. In: International Journal of Artificial Intelligence and Machine Learning. 2018, S. 1–12, doi:10.1155/2018/7120659.
16.  J. Zhang, Y. Liu, J. Liu, and J. Zhang: Show and Tell: A Neural Image Captioning Model. In: Proceedings of the 31st International Conference on Machine Learning. 2014, S. 1–8, doi:10.1145/2627463.2627470.
17.  J. Zhang, Y. Liu, J. Liu, and J. Zhang: Attend and Generate: A Neural Image Captioning Model with Attention. In: Proceedings of the 32nd International Conference on Machine Learning. 2015

In [56]:
# generating the response with LLM and the knowledge from our explicitly provided data [llama-2-arxiv paper dataset]
response_with_rag = rag_pipeline(query)

# print(response_with_rag)
for key, value in response_with_rag.items():
  print(f'{key}: {value}')

query: Describe RNN Encoder-Decoder.
result: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

wheregis a nonlinear, potentially multi-layered, function that outputs the probability of yt, andstis
the hidden state of the RNN. It should be noted that other architectures such as a hybrid of an RNN
and a de-convolutional neural network can be used (Kalchbrenner and Blunsom, 2013).
3 L EARNING TO ALIGN AND TRANSLATE
In this section, we propose a novel architecture for neural machine translation. The new architecture
consists of a bidirectional RNN as an encoder (Sec. 3.2) and a decoder that emulates searching
through a source sentence during decoding a translation (Sec. 3.1).
3.1 D ECODER : GENERAL DESCRIPTION
x1x2x3xT+
αt,1
αt,2 αt,3αt,Tyt-1yt
h1h2h3 hTh1h2h3 hTst-1st
Figure 1: The graphical illustration of the proposed model
trying to generate the t-th target wordytgiven a so