In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install datasets

# 'datasets' is a package that provides easy access to many popular datasets for natural language processing (NLP) tasks.

In [None]:
!pip install pinecone-client

# 'pinecone-client' is a Python library that provides a client for Pinecone, a cloud-based vector database service. 

In [None]:
pip install sentence-transformers

# 'sentence-transformers' is a Python package that provides pre-trained models for generating fixed-length dense embeddings for sentences or paragraphs. These embeddings can be used to perform similarity searches, clustering, or other NLP tasks.

In [None]:
pip install torch

# 'torch' is a Python package that provides support for tensor computations and deep learning. It is widely used in the machine learning community for building neural networks and training models.

In [None]:
from datasets import load_dataset

# Load the SQuAD training split, keep only the title and passage text,
# and keep one row per distinct passage.
df = (
    load_dataset("squad", split="train")
    .to_pandas()[["title", "context"]]
    .drop_duplicates(subset="context")
)
df

#This code loads the squad dataset using the load_dataset function from the datasets package. The split="train" argument specifies that we want to load the training split of the dataset.

#The loaded dataset is then converted to a Pandas DataFrame using the .to_pandas() method, and only the "title" and "context" columns are kept.

#The next line of code removes any duplicate rows in the DataFrame, based on the values in the "context" column. This is done using the .drop_duplicates() method.

#Finally, the cleaned DataFrame is assigned back to the variable df.

In [None]:
import pinecone
# Never commit the API key to the notebook: read it from Kaggle secrets instead.
# Add a secret labelled PINECONE_KEY under "Add-ons > Secrets" (the same label the
# Streamlit cell further down reads). Any key that was previously pasted into this
# cell should be treated as leaked and revoked in the Pinecone console.
from kaggle_secrets import UserSecretsClient

pinecone.init(
    api_key=UserSecretsClient().get_secret("PINECONE_KEY"),
    environment="us-west1-gcp-free"
)

#This code initializes the Pinecone client by calling the init() function from the pinecone package.

#The api_key parameter is set to a string value that represents the API key for your Pinecone account. This API key is used to authenticate your requests to the Pinecone service.

#The environment parameter is set to a string value that represents the name of the environment where your Pinecone index will be created. In this case, it is set to "us-west1-gcp-free", which is Pinecone's free-tier environment hosted on Google Cloud Platform.

#Once initialized, you can use the Pinecone client to create an index and add vectors to it, as well as query the index and retrieve similar vectors.

In [None]:
index_name = "extractive-question-answering"

# Create the index only when it is missing, so re-running this cell is harmless.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    pinecone.create_index(index_name, dimension=384, metric="cosine")

# Handle used by the later cells for upserts and queries.
index = pinecone.Index(index_name)

#This code creates a Pinecone index for extractive question answering.

#First, it sets the index_name variable to a string value that represents the name of the index.

#Next, it checks if the index already exists by calling pinecone.list_indexes(). If the index does not exist, it creates a new index using pinecone.create_index(). The dimension parameter is set to 384, which represents the dimensionality of the vectors that will be added to the index. The metric parameter is set to "cosine", which specifies that cosine similarity will be used to measure distances between vectors in the index.

#Finally, it creates an instance of the Pinecone Index class for this index using pinecone.Index(), and assigns it to a variable named index. This instance can be used to add vectors to the index and query its contents.

In [None]:
import torch
from sentence_transformers import SentenceTransformer


# Integer device ordinal used by the transformers pipeline in a later cell:
# 0 selects the first GPU, -1 selects the CPU.
device = 0 if torch.cuda.is_available() else -1

# SentenceTransformer hands its `device` argument to torch, which rejects a
# negative index, so passing -1 would fail on CPU-only machines. Give it a
# torch device string instead.
retriever_device = "cuda" if device >= 0 else "cpu"

retriever = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1', device=retriever_device)
retriever

# This code imports the torch package and the SentenceTransformer class from the sentence_transformers package.

# The device variable is set to 0 if a GPU is available, otherwise it is set to -1. This will determine whether or not we use a GPU for computing sentence embeddings.

# Next, an instance of the SentenceTransformer class is created with the model name 'multi-qa-MiniLM-L6-cos-v1'. This model was trained on question-answer pairs for semantic search and produces 384-dimensional embeddings, which is why the Pinecone index above was created with dimension=384. The device parameter specifies whether to use a GPU or CPU for computation.

# Finally, the instance of the SentenceTransformer class is assigned to a variable named retriever. This instance can be used to generate embeddings for sentences or paragraphs.

In [None]:
from tqdm.auto import tqdm


batch_size = 64

# Embed the passages batch by batch and push each batch to the index.
for start in tqdm(range(0, len(df), batch_size)):
    stop = min(start + batch_size, len(df))
    chunk = df.iloc[start:stop]
    # One embedding per passage, converted to plain Python lists for the client.
    vectors = retriever.encode(chunk["context"].tolist()).tolist()
    # Title and context of every row travel along as metadata.
    records = chunk.to_dict(orient="records")
    # Vector ids are the row positions, as strings.
    to_upsert = [
        (str(position), vector, record)
        for position, vector, record in zip(range(start, stop), vectors, records)
    ]
    _ = index.upsert(vectors=to_upsert)

index.describe_index_stats()


# First, it imports the tqdm package for progress tracking during iteration.

# The batch_size variable is set to 64, which represents the number of sentences that will be processed in each batch.

# Next, a for loop is used to iterate through the rows of the DataFrame in batches. The tqdm() function is used to track progress during iteration.

# Inside the loop, a batch of sentences is extracted from the DataFrame using .iloc[i:i_end], where i and i_end represent the start and end indices of each batch.

# The retriever.encode() function is then called on this batch of sentences to generate their embeddings, which are converted to a list using .tolist().

# The metadata associated with each passage (its title and context) is extracted using .to_dict(orient="records"), and assigned to a variable named meta.

# A list of unique IDs is generated using a list comprehension, and assigned to a variable named ids.

# Finally, these IDs, embeddings, and metadata are combined into a list of tuples using zip(), and passed as an argument to the .upsert() method of the Pinecone index instance created earlier. This method inserts or updates vectors in bulk.

# After all vectors have been added to the index, the .describe_index_stats() method is called on the index instance to print out some statistics about its contents.

In [None]:
from transformers import pipeline

# The same checkpoint provides both the tokenizer and the model weights.
model_name = "deepset/electra-base-squad2"
reader = pipeline(
    task="question-answering",
    model=model_name,
    tokenizer=model_name,
    device=device,
)

# This code imports the pipeline function from the transformers package. This function is used to create a pipeline for a given NLP task, which can be used to process text data.

# The model_name variable is set to "deepset/electra-base-squad2", which represents a pre-trained Electra model fine-tuned on the SQuAD 2.0 dataset. This model can be used for question answering tasks.

# An instance of the pipeline is created by calling pipeline() with several arguments: the tokenizer name and model name, both set to model_name, the task parameter set to "question-answering", and the device parameter set to the value of the device variable (0 if GPU is available, -1 otherwise).

# Finally, this instance is assigned to a variable named reader. This instance can be used to perform question answering on text data.

In [None]:
def get_context(question, top_k):
    """Return the stored passages that best match a question.

    The question is encoded with the global ``retriever`` and the global
    Pinecone ``index`` is queried with that embedding.

    Args:
        question: Question text to search for.
        top_k: Number of matches to request from the index.

    Returns:
        A list with the ``context`` metadata value of every returned match.
    """
    query_vector = retriever.encode([question]).tolist()
    response = index.query(query_vector, top_k=top_k, include_metadata=True)
    contexts = []
    for match in response["matches"]:
        contexts.append(match["metadata"]["context"])
    return contexts

# Sample question: fetch the single top match from the index and display it.
question = "What is the capital of France?"
context = get_context(question, top_k=1)
context

# This code defines a function get_context() that takes a question and the number of top matches to return as input.

# Inside the function, the retriever.encode() function is called on the question to generate its embedding. This embedding is then used to query the Pinecone index using .query(), with top_k and include_metadata parameters set to top_k and True, respectively. This returns a dictionary containing information about the top k matches.

# The metadata associated with each match (i.e., its context) is extracted from this dictionary using a list comprehension, and assigned to a variable named c.

# Finally, this list of contexts is returned as output.

# Outside of the function, a sample question is defined in the variable question. The get_context() function is then called with this question and a value of 1 for top_k, and the resulting context(s) are assigned to a variable named context.

In [None]:
from pprint import pprint


def extract_answer(question, context):
    """Run the reader over every passage and rank the answers by score.

    Args:
        question: Question text handed to the global ``reader``.
        context: Iterable of passage strings to extract answers from.

    Returns:
        The reader's answer dicts, each extended with the passage under the
        ``"context"`` key, sorted by ``"score"`` in descending order. The
        ranking is also pretty-printed, as before.
    """
    results = []
    for passage in context:
        answer = reader(question=question, context=passage)
        # Keep the passage next to the answer so the output is self-contained.
        answer["context"] = passage
        results.append(answer)

    # pprint() returns None, so sort first and return the list itself rather
    # than the result of the print call.
    sorted_result = sorted(results, key=lambda x: x["score"], reverse=True)
    pprint(sorted_result)
    return sorted_result

# Rank the passages retrieved for the sample question; extract_answer pretty-prints the ranking itself.
extract_answer(question, context)

# This code defines a function named extract_answer() that takes a question and a list of contexts as input.

# Inside the function, an empty list named results is created to store the answers for each context. Then, a for loop is used to iterate through each context in the input list.

# For each context, the reader() function is called with the input question and context as arguments. This function uses the pre-trained Electra model fine-tuned on SQuAD 2.0 dataset to extract an answer from the given context. The extracted answer along with its score are stored in a dictionary and appended to the results list.

# After all contexts have been processed, the results list is sorted in descending order of score using a lambda function and printed using pprint().

# Finally, this sorted result is returned from the function.

# Outside of the function, the previously defined variables question and context are passed as arguments to extract_answer(). This will return a sorted list of answers along with their scores for each context in context.

In [None]:
# Request the single top match from the index, then extract an answer from it.
question = "What is the highest mountain in the world?"
context = get_context(question, top_k=1)
extract_answer(question, context)

In [None]:
# Request the single top match from the index, then extract an answer from it.
question = "What is the currency of Mexico?"
context = get_context(question, top_k=1)
extract_answer(question, context)

In [None]:
# Request the three top matches so the reader's answers can be compared by score.
question = "Who wrote the novel To Kill a Mockingbird?"
context = get_context(question, top_k=3)
extract_answer(question, context)

In [None]:
pip install streamlit

# Streamlit is installed for generation of frontend.

In [None]:
# Import required libraries
import streamlit as st
import pinecone
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from kaggle_secrets import UserSecretsClient

# Get Pinecone API key from Kaggle secrets (the secret must be stored under the label "PINECONE_KEY").
# secret_value_0 is read by init_pinecone() and by the reconnect path in run_query().
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("PINECONE_KEY")

# Define function that initializes Pinecone and returns the index handle; the singleton decorator caches the result so it is created only once
@st.experimental_singleton
def init_pinecone():
    """Connect to Pinecone and return the handle of the question-answering index.

    Uses the API key held in the module-level ``secret_value_0``.
    """
    pinecone.init(environment="us-west1-gcp-free", api_key=secret_value_0)
    index_handle = pinecone.Index("extractive-question-answering")
    return index_handle
    
# Define function that initializes the SentenceTransformer and Transformers models and returns them as a tuple; the singleton decorator caches the result so the models are loaded only once
@st.experimental_singleton
def init_models():
    """Load the retriever and reader models.

    Returns:
        A ``(retriever, reader)`` tuple: the SentenceTransformer used to embed
        queries and the question-answering pipeline used to extract answers.
    """
    # Both the tokenizer and the weights come from the same checkpoint.
    model_name = "deepset/electra-base-squad2"
    qa_reader = pipeline(task="question-answering", model=model_name, tokenizer=model_name)
    query_encoder = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
    return query_encoder, qa_reader

# Store the Pinecone index handle in Streamlit session state (run_query() replaces it there when a query fails),
# and keep the two models as module-level names used by run_query().
st.session_state.index = init_pinecone()
retriever, reader = init_models()

# Define function to create cards with search results 
def card(title, context, score):
    """Render one search result as an HTML card.

    Args:
        title: Heading shown in bold.
        context: Passage text; inserted as raw HTML, so markup inside it is rendered.
        score: Value shown after the passage.

    Returns:
        Whatever ``st.markdown`` returns for the rendered element.
    """
    markup = f"""
    <div class="container-fluid">
        <div class="row align-items-start">
             <div  class="col-md-12 col-sm-12">
                 <b>{title}</b>
                 <br>
                 <span style="color: #808080;">
                     <small>{context}</small>
                     [<b>Score: </b>{score}]
                 </span>
             </div>
        </div>
     </div>
        """
    return st.markdown(markup, unsafe_allow_html=True)

# Page header: the st.title() call is given an empty string, and the visible
# heading and description come from the Markdown passed to st.write().
st.title("")
st.write("""
# VectorFinder

Discover the power of semantic search with VectorFinder


Created By : Gourav Mohanty
""")

# Add Bootstrap CSS stylesheet to app; card() uses its container/row/col classes
st.markdown("""
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.0.0/dist/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
""", unsafe_allow_html=True)

# Define function to run search query and display results 
def run_query(query):
    """Search the index for a query and render the ranked answers as cards.

    The query is embedded with the global ``retriever``, the three closest
    passages are fetched from the Pinecone index held in session state, the
    global ``reader`` extracts an answer from each passage, and the results
    are displayed in descending score order with the answer highlighted.

    Args:
        query: Question text entered by the user.
    """
    # Encode query using SentenceTransformer model
    xq = retriever.encode([query]).tolist()
    try:
        # Query Pinecone index with encoded query and get top 3 matches along with metadata
        xc = st.session_state.index.query(xq, top_k=3, include_metadata=True)
    except Exception:
        # The cached handle may have gone stale: reconnect once and retry.
        # The reconnect must target the same environment init_pinecone() uses,
        # because that is where the index lives. A second failure propagates.
        pinecone.init(api_key=secret_value_0, environment="us-west1-gcp-free")
        st.session_state.index = pinecone.Index("extractive-question-answering")
        xc = st.session_state.index.query(xq, top_k=3, include_metadata=True)

    # Extract an answer from every matched passage and keep its metadata alongside
    results = []
    for match in xc['matches']:
        metadata = match["metadata"]
        answer = reader(question=query, context=metadata['context'])
        answer["title"] = metadata['title']
        answer["context"] = metadata['context']
        results.append(answer)

    # Sort search results by score in descending order
    sorted_result = sorted(results, key=lambda x: x['score'], reverse=True)

    # Create card for each search result and display it on the app
    for r in sorted_result:
        answer = r["answer"]
        context = r["context"]
        if answer:
            # str.replace with an empty needle would insert the markup between
            # every character, so only highlight a non-empty answer.
            context = context.replace(answer, f"<mark>{answer}</mark>")
        title = r["title"].replace("_", " ")
        score = round(r["score"], 4)
        card(title, context, score)

# Get user input query from text input field on app 
query = st.text_input("Search!", "")

# Only search once the user has typed something
if query:
    run_query(query)


In [None]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
    
# Installing ngrok

In [None]:
!unzip ngrok-stable-linux-amd64.zip

In [None]:
# Start the ngrok agent in the background, forwarding HTTP traffic to port 8501.
ngrok_command = './ngrok http 8501 &'
get_ipython().system_raw(ngrok_command)

In [None]:
!curl -s http://localhost:4040/api/tunnels | python3 -c \
    'import sys, json; print("Execute the next cell and the go to the following URL: " +json.load(sys.stdin)["tunnels"][0]["public_url"])'

In [None]:
import streamlit as st
from pyngrok import ngrok

# Set ngrok auth token (only needed once)
# The token is read from Kaggle secrets instead of being committed to the
# notebook: add a secret labelled NGROK_AUTH_TOKEN under "Add-ons > Secrets".
# Any token that was previously pasted into this cell should be treated as
# leaked and revoked in the ngrok dashboard.
from kaggle_secrets import UserSecretsClient

ngrok.set_auth_token(UserSecretsClient().get_secret("NGROK_AUTH_TOKEN"))

# Start ngrok tunnel
# public_url = ngrok.connect(addr="http://localhost:4040", proto="http", options={"bind_tls": True}).public_url


# Run Streamlit app
# if __name__ == '__main__':
#     main()

In [None]:
# NOTE(review): no visible cell writes my_app.py. Presumably the Streamlit app
# cell above is meant to be saved to that file (e.g. with a %%writefile
# my_app.py first line) before this command can work. TODO confirm.
!streamlit run ./my_app.py

# Running Streamlit application