In [2]:
# !pip install -U langchain-community
# !pip install sentence-transformers
# !pip install faiss-cpu
# !pip install --upgrade langchain
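# !pip install fitz  # NOTE(review): probably unnecessary - `import fitz` is provided by PyMuPDF (next line); the separate "fitz" package on PyPI is a different project and may conflict with it. Confirm before installing.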
# !pip install PyMuPDF

In [4]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

### Load PDF

In [55]:
import fitz  # PyMuPDF
from PIL import Image
import io

def extract_text(pdf_path):
    """
    Extracts the plain text of every page of a PDF file.

    Args:
        pdf_path (str): Path of the PDF file to read.

    Returns:
        str: The text of all pages, concatenated in page order.
    """
    # Open the document in a context manager so the underlying file handle is
    # released even if a page fails to load; previously it was never closed.
    with fitz.open(pdf_path) as document:
        # Collect the per-page text and join once instead of growing a string
        # page by page.
        return "".join(
            document.load_page(page_num).get_text("text")
            for page_num in range(len(document))
        )

# Load Sample PDF
# The path is relative, so the file is looked up in the current working directory.
pdf_path = "attention.pdf"

# Full text of the PDF as one string (all pages concatenated by extract_text).
text = extract_text(pdf_path)
# print(text)

In [57]:
# Wrap the extracted text in a single LangChain Document so it can be passed
# to the text splitter, which operates on a list of documents.
# No metadata is supplied here.
doc = Document(page_content=text)
docs = [doc]

### Chunk

In [48]:
def split_documents_into_chunks(docs, chunk_size=500, chunk_overlap=100):
    """
    Breaks documents into overlapping chunks with a recursive character splitter.

    Args:
        docs (list): Documents to be split.
        chunk_size (int): Chunk size passed to the splitter. Defaults to 500.
        chunk_overlap (int): Overlap between chunks passed to the splitter. Defaults to 100.

    Returns:
        list: The chunks produced by the splitter for all input documents.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = chunker.split_documents(docs)
    return chunks

In [49]:
def add_chunk_numbers_to_metadata(doc_splits):
    """
    Tags every split with its zero-based position under the "chunk" metadata key.

    The splits are modified in place; the same list object is handed back so the
    call can be used in an assignment.

    Args:
        doc_splits (list): Split documents, each exposing a ``metadata`` mapping.

    Returns:
        list: The input list, with ``metadata["chunk"]`` set on every element.
    """
    position = 0
    for piece in doc_splits:
        piece.metadata["chunk"] = position
        position += 1
    return doc_splits

In [50]:
# Split the documents into chunks
# (uses the function defaults: chunk_size=500, chunk_overlap=100)
doc_splits = split_documents_into_chunks(docs)
# Add chunk number to metadata
# (the helper sets metadata["chunk"] on each split in place and returns the same list)
doc_splits = add_chunk_numbers_to_metadata(doc_splits)

In [51]:
print(f'Created {len(doc_splits):,} splits')

Created 102 splits


In [52]:
for split in doc_splits[:3]:
    print(f'---Page Content---\n{split.page_content}')
    print(f'Metadata:\n{split.metadata}')
    print()

---Page Content---
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of Toronto
aidan@cs.toronto.edu
Łukasz Kaiser∗
Google Brain
lukaszkaiser@google.com
Illia Polosukhin∗‡
illia.polosukhin@gmail.com
Abstract
The dominant sequence transduction models are based on complex recurrent or
Metadata:
{'chunk': 0}

---Page Content---
Abstract
The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine trans

### Embedding

In [19]:
# Embedding model used to vectorize the chunks - a stronger model can be considered.

# SciBERT (Allen Institute for AI) - intended for academic (scientific) papers,
# including computer science - maximum 512 tokens.
# NOTE(review): presumably this checkpoint was not trained specifically for
# sentence similarity, which may affect retrieval quality - TODO confirm and
# compare against one of the alternatives below.
embeddings = HuggingFaceEmbeddings(model_name="allenai/scibert_scivocab_uncased")

# Alternatives considered:
# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2") # small model (MiniLM, Microsoft)
# embeddings = HuggingFaceEmbeddings(model_name="roberta-large") # RoBERTa large (Facebook) - "longer context" per original note; NOTE(review): verify the max sequence length
# embeddings = HuggingFaceEmbeddings(model_name="roberta-base") # RoBERTa base (Facebook) - "longer context" per original note; NOTE(review): verify the max sequence length

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

### Vector Store

In [62]:
def configure_faiss_vector_store(doc_splits, embeddings):
    """
    Builds a FAISS vector store from the given document splits.

    Args:
        doc_splits (list): Split documents to index.
        embeddings (Embeddings): Embedding model handed to FAISS for the documents.

    Returns:
        FAISS: The vector store created by ``FAISS.from_documents``.
    """
    faiss_store = FAISS.from_documents(doc_splits, embeddings)
    return faiss_store

In [63]:
# Configure FAISS as Vector Store
%%time
vector_db = configure_faiss_vector_store(doc_splits, embeddings)

CPU times: user 50.9 s, sys: 2.87 s, total: 53.8 s
Wall time: 54 s


In [64]:
print("Number of documents in the FAISS index:", vector_db.index.ntotal)

Number of documents in the FAISS index: 102


In [65]:
def create_retriever(vector_store, search_type="similarity", k=5):
    """
    Wraps a vector store in a retriever.

    Args:
        vector_store: Vector store exposing ``as_retriever`` (e.g., FAISS).
        search_type (str): Search strategy forwarded to the retriever. Defaults to "similarity".
        k (int): Number of documents to return per query. Defaults to 5.

    Returns:
        retriever: Whatever ``vector_store.as_retriever`` returns for these settings.
    """
    retriever_options = {
        "search_type": search_type,
        "search_kwargs": {"k": k},
    }
    return vector_store.as_retriever(**retriever_options)

In [66]:
 # Expose index to the retriever
retriever = create_retriever(vector_db)

### Retrieve contexts

In [67]:
def process_query(query: str, retriever) -> str:
    """
    Retrieves the document chunks relevant to a query and formats them as text.

    Args:
        query (str): The query string to search for relevant documents.
        retriever: Retriever object exposing ``invoke(query)`` or, on older
            LangChain versions, ``get_relevant_documents(query)``. The returned
            items must provide ``page_content`` and ``metadata``.

    Returns:
        str: One block per retrieved chunk (numbered from 1) containing its
        content and metadata; an empty string when nothing is retrieved.
    """
    # `get_relevant_documents` is deprecated in recent LangChain releases in
    # favour of `invoke`; prefer `invoke` and fall back for older retrievers.
    fetch = getattr(retriever, "invoke", None)
    if not callable(fetch):
        fetch = retriever.get_relevant_documents
    docs = fetch(query)

    # Build all chunk blocks and join once instead of repeatedly concatenating.
    # Note: the "..." after the content is part of the established output
    # format; the content itself is not truncated.
    return "".join(
        f"-----Chunk {i}------\n"
        f"Content: {doc.page_content}...\n"
        f"Metadata {doc.metadata}\n\n"
        for i, doc in enumerate(docs, 1)
    )

In [68]:
# Sample Query

# query = "What is the main hypothesis or research question addressed in the first academic article?"
# query = "What is the regularization addressed in the academic article?"
# query = "What is the main hypothesis or research question addressed in the first academic article?"
query = "What is the Attention addressed in the academic article?"

In [69]:
# Retrieve chunks
retrieved_output = process_query(query, retriever)

# Print chunks
print(retrieved_output)

-----Chunk 1------
Content: opinion
.
<EOS>
<pad>
The
Law
will
never
be
perfect
,
but
its
application
should
be
just
-
this
is
what
we
are
missing
,
in
my
opinion
.
<EOS>
<pad>
Figure 5: Many of the attention heads exhibit behaviour that seems related to the structure of the
sentence. We give two such examples above, from two different heads from the encoder self-attention
at layer 5 of 6. The heads clearly learned to perform different tasks.
15...
Metadata {'chunk': 101}

-----Chunk 2------
Content: opinion
.
<EOS>
<pad>
The
Law
will
never
be
perfect
,
but
its
application
should
be
just
-
this
is
what
we
are
missing
,
in
my
opinion
.
<EOS>
<pad>
Figure 4: Two attention heads, also in layer 5 of 6, apparently involved in anaphora resolution. Top:
Full attentions for head 5. Bottom: Isolated attentions from just the word ‘its’ for attention heads 5
and 6. Note that the attentions are very sharp for this word.
14
Input-Input Layer5
The
Law
will
never
be
perfect
,
but
its
application
shou

### Setup LLM

In [24]:
# pip install groq

In [78]:
from groq import Groq

def get_groq_response(client, prompt, model="llama3-70b-8192", max_tokens=2048, temperature=0.0):
    """
    Sends a single-turn user prompt to the chat completions API and returns the reply.

    Args:
        client: Client object exposing ``chat.completions.create``.
        prompt (str): Text sent as the only "user" message.
        model (str, optional): Model identifier. Default is "llama3-70b-8192".
        max_tokens (int, optional): Maximum number of tokens for the reply. Default is 2048.
        temperature (float, optional): Sampling temperature. Default is 0.0.

    Returns:
        tuple: ``(content, usage)`` of the first choice, or ``(None, None)`` when
        the request or the response handling raises; the error is printed.
    """
    conversation = [{"role": "user", "content": prompt}]
    try:
        completion = client.chat.completions.create(
            messages=conversation,
            model=model,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        # Kept inside the try block so a malformed response is also reported
        # as (None, None) rather than raising.
        reply_text = completion.choices[0].message.content
        return reply_text, completion.usage
    except Exception as e:
        print(f"An error occurred: {e}")
        return None, None

In [75]:
# Never hard-code the API key in the notebook: read it from the GROQ_API_KEY
# environment variable and only prompt for it (without echo) when it is unset.
import os
from getpass import getpass

client = Groq(
    api_key=os.environ.get("GROQ_API_KEY") or getpass("Groq API key: "),
)

In [77]:
prompt = "Hello"
response = get_groq_response(client, prompt)
print(response[0])

Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat?


### RAG - HyDE

In [81]:
# prompt for RAG - HyDE
instruction_hyde = """
### Instructions ###
You are an expert in scientific academic papers. Your task is to answer to "Users' query" below.　If the information in the "Context" below seems relevant to "Users' query", please refer to it.

### User’s query ###
{USER_QUERY}

### Context ###
{CONTEXT_HYDE}

### Output ###
"""

In [82]:
def generate_prompt_hyde(instruction, user_query, context_hyde):
    """
    Generates a prompt for HyDE by filling the placeholders of the instruction template.

    Both placeholders are substituted in a single pass over the template, so
    placeholder-like text inside the inserted values (for example a user query
    that literally contains "{CONTEXT_HYDE}") is left untouched instead of being
    replaced by a later substitution.

    Args:
        instruction (str): The template instruction containing the placeholders
            "{USER_QUERY}" and "{CONTEXT_HYDE}".
        user_query (str): The user's query to be inserted into the instruction.
        context_hyde (str): The context for creating a hypothetical answer to be inserted into the instruction.

    Returns:
        str: The generated instruction with the placeholders replaced by the user's query and context.
    """
    import re  # local import keeps this cell self-contained

    substitutions = {
        "{USER_QUERY}": user_query,
        "{CONTEXT_HYDE}": context_hyde,
    }
    placeholder_pattern = re.compile("|".join(re.escape(key) for key in substitutions))
    # A function replacement inserts the values verbatim (no backslash/group
    # interpretation, which a plain replacement string would trigger).
    return placeholder_pattern.sub(lambda match: substitutions[match.group(0)], instruction)

In [91]:
# user_query = "What is the regularization addressed in the academic article?"
user_query = "What is the main hypothesis or research question addressed in the first academic article?"
# user_query  = "What is the Attention addressed in the academic article?"

In [92]:
# Use the abstract of the academic paper as sample HyDE context. A summary may work better because it can cover a wider range of the document (slides, for example, have no abstract).
# TODO: obtain the appropriate paper and its summary with the extraction function instead of hard-coding the abstract.

context_hyde = """
Abstract
The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English- to-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.
"""

In [93]:
# Create prompt for HyDE
prompt_hyde = generate_prompt_hyde(instruction_hyde, user_query, context_hyde)
print(prompt_hyde)


### Instructions ###
You are an expert in scientific academic papers. Your task is to answer to "Users' query" below.　If the information in the "Context" below seems relevant to your response, please refer to it.

### User’s query ###
What is the main hypothesis or research question addressed in the first academic article?

### Context ###

Abstract
The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English- to-German translation task, 

In [95]:
# Get a hypothetical answer
response = get_groq_response(client, prompt_hyde)
print(response[0])

The main hypothesis or research question addressed in the first academic article is: Can a sequence transduction model based solely on attention mechanisms, without recurrence and convolutions, achieve superior performance and efficiency in machine translation tasks compared to traditional models that rely on complex recurrent or convolutional neural networks?


In [96]:
# Find chunks based on similarity
print(process_query(response[0], retriever))

-----Chunk 1------
Content: Abstract
The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine translation tasks show these models to...
Metadata {'chunk': 1}

-----Chunk 2------
Content: Attention mechanisms have become an integral part of compelling sequence modeling and transduc-
tion models in various tasks, allowing modeling of dependencies without regard to their distance in
the input or output sequences [2, 19]. In all but a few cases [27], however, such attention mechanisms
are used in conjunction with a recurrent network.
In this work we propose the Transformer, a model architecture eschewing recurrence and instead...


### Extract thesis/figure/table numbers from user's query and search descriptions based on numbers

In [97]:
# Please refer to another jupyter notebook for the detail

In [103]:
# sample figure/image description
context_figure_table = ['thesis1 description: --']

### Provide a response given retrieved contexts

In [104]:
instruction_final = """
### Instructions ###
You are an expert in scientific academic papers. Your task is to answer to "Users' query" below.
If the information in the "Figure/Table Context" and "Text Context" below seem relevant to "Users' query", please refer to them.
"Text Context" includes several chunks from different parts of an academic paper. "Figure/Table Context" includes the descriptions related to figures or tables in an academic paper.
Please refer only to the relevant contexts for your response. There is no need to include unrelated context in your response.
If the user asks about a specific figure or table and the information is contained in the Figure/Table Context, please ensure that this information is included in your response.
If you determine that the previous conversation history is relevant, please also refer to that information to answer the user's query.　
Additionally, if the conversation is continuing from the previous session and no additional information is needed, you may refer to the previous conversation history and might not need to use the contexts below. (e.g., User's query: Please make your response brief).
If the contexts and the previous conversation history do not contain the necessary information and it is difficult to answer even with general knowledge and previous context, please respond with 'The information provided is insufficient to answer your question.　Could you please clarify your question?'.

##### User’s query #####
{USER_QUERY}


##### Figure/Table Context #####
{CONTEXT_FIGURE_TABLE}

##### Text Context #####
{CONTEXT_RAG_HYDE}

{CONTEXT_RAG_GENERAL}


##### Output #####
"""

In [110]:
def generate_prompt_final(instruction, user_query, context_figure_table, context_rag_hyde, context_rag_general):
    """
    Generates the final prompt by filling the placeholders of the instruction template.

    All placeholders are substituted in a single pass over the template, so
    placeholder-like text inside an inserted value (for example a retrieved chunk
    or user query that literally contains "{CONTEXT_RAG_GENERAL}") is left
    untouched instead of being replaced by a later substitution.

    Args:
        instruction (str): The template instruction containing the placeholders
            "{USER_QUERY}", "{CONTEXT_FIGURE_TABLE}", "{CONTEXT_RAG_HYDE}" and
            "{CONTEXT_RAG_GENERAL}".
        user_query (str): The user's query to be inserted into the instruction.
        context_figure_table (str): The context (description) related to figures and tables to be inserted into the instruction.
        context_rag_hyde (str): The context retrieved from RAG HyDE to be inserted into the instruction.
        context_rag_general (str): The general context retrieved from RAG to be inserted into the instruction.

    Returns:
        str: The generated instruction with the placeholders replaced by the user's query and contexts.
    """
    import re  # local import keeps this cell self-contained

    substitutions = {
        "{USER_QUERY}": user_query,
        "{CONTEXT_FIGURE_TABLE}": context_figure_table,
        "{CONTEXT_RAG_HYDE}": context_rag_hyde,
        "{CONTEXT_RAG_GENERAL}": context_rag_general,
    }
    placeholder_pattern = re.compile("|".join(re.escape(key) for key in substitutions))
    # A function replacement inserts the values verbatim (no backslash/group
    # interpretation, which a plain replacement string would trigger).
    return placeholder_pattern.sub(lambda match: substitutions[match.group(0)], instruction)

In [111]:
# create contexts - RAG(General)
context_rag_general = process_query(user_query, retriever)
print(context_rag_general)

-----Chunk 1------
Content: opinion
.
<EOS>
<pad>
The
Law
will
never
be
perfect
,
but
its
application
should
be
just
-
this
is
what
we
are
missing
,
in
my
opinion
.
<EOS>
<pad>
Figure 5: Many of the attention heads exhibit behaviour that seems related to the structure of the
sentence. We give two such examples above, from two different heads from the encoder self-attention
at layer 5 of 6. The heads clearly learned to perform different tasks.
15...
Metadata {'chunk': 101}

-----Chunk 2------
Content: length n is smaller than the representation dimensionality d, which is most often the case with
sentence representations used by state-of-the-art models in machine translations, such as word-piece
[38] and byte-pair [31] representations. To improve computational performance for tasks involving
very long sequences, self-attention could be restricted to considering only a neighborhood of size r in
6
the input sequence centered around the respective output position. This would increase the ma

In [112]:
# create contexts - RAG(Hyde)
context_rag_hyde = process_query(response[0], retriever)
print(context_rag_hyde)

-----Chunk 1------
Content: Abstract
The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine translation tasks show these models to...
Metadata {'chunk': 1}

-----Chunk 2------
Content: Attention mechanisms have become an integral part of compelling sequence modeling and transduc-
tion models in various tasks, allowing modeling of dependencies without regard to their distance in
the input or output sequences [2, 19]. In all but a few cases [27], however, such attention mechanisms
are used in conjunction with a recurrent network.
In this work we propose the Transformer, a model architecture eschewing recurrence and instead...


In [116]:
# create prompt for a final response
prompt_final = generate_prompt_final(instruction_final, user_query, str(context_figure_table), context_rag_hyde, context_rag_general)
print(prompt_final)


### Instructions ###
You are an expert in scientific academic papers. Your task is to answer to "Users' query" below.
If the information in the "Figure/Table Context" and "Text Context" below seem relevant to "Users' query", please refer to them. 
"Text Context" includes several chunks from different parts of an academic paper. "Figure/Table Context" includes the descriptions related to figures or tables in an academic paper. 
Please refer only to the relevant contexts for your response. There is no need to include unrelated context in your response.
If the user asks about a specific figure or table and the information is contained in the Figure/Table Context, please ensure that this information is included in your response.
If the contexts do not contain the necessary information and it is difficult to answer even with general knowledge and previous context, please respond with 'The information provided is insufficient to answer your question.　Could you please clarify your question?'

In [117]:
# Get final response
# Stored under its own name so the HyDE answer held in `response` is not
# overwritten: the earlier cells that retrieve HyDE context read `response[0]`
# and would otherwise search with the final answer when they are re-run.
response_final = get_groq_response(client, prompt_final)
print(response_final[0])

The main hypothesis or research question addressed in the first academic article is: Can a sequence transduction model relying solely on attention mechanisms, without using recurrence or convolution, achieve state-of-the-art results in machine translation tasks?
