In [2]:
# !pip install -U langchain-community
# !pip install sentence-transformers
# !pip install faiss-cpu
# !pip install --upgrade langchain
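# NOTE: `import fitz` is provided by PyMuPDF; the separate PyPI package named `fitz` is a different project and should not be installed.
# !pip install PyMuPDF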

In [3]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

### Load PDF

In [4]:
import fitz  # PyMuPDF
from PIL import Image
import io

def extract_text(pdf_path):
    """
    Extract text from a single PDF file.

    The document is opened in a ``with`` block so it is closed again when the
    function returns or when reading a page raises.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Plain text of all pages, concatenated in page order with no
            separator added between pages.
    """
    with fitz.open(pdf_path) as document:
        # Gather the per-page text and join once instead of growing a string.
        return "".join(page.get_text("text") for page in document)

def extract_texts_from_pdfs(pdf_paths):
    """
    Build one Document per PDF, holding that file's full extracted text.

    Args:
        pdf_paths (list of str): Paths of the PDF files to read.

    Returns:
        list of Document: One Document per input path, in input order. Each
            carries the extracted text as ``page_content`` and the path under
            the ``"source"`` metadata key.
    """
    return [
        Document(page_content=extract_text(path), metadata={"source": path})
        for path in pdf_paths
    ]

In [5]:
# PDF files to index. Order matters: later cells pick a paper by position (pdf_paths[0]).
pdf_paths = ["attention.pdf", "Multimodal.pdf"]

# Extract the full text of each PDF into one Document per file
docs = extract_texts_from_pdfs(pdf_paths)

### Chunk

In [6]:
def split_documents_into_chunks(docs, chunk_size=500, chunk_overlap=100):
    """
    Split every document into overlapping chunks, grouped by source.

    Args:
        docs (list): Documents to split; each must carry a ``"source"`` entry
            in its metadata.
        chunk_size (int): Size of each chunk. Default is 500 characters.
        chunk_overlap (int): Overlap between consecutive chunks. Default is
            100 characters.

    Returns:
        dict: Maps each document's ``metadata["source"]`` to the list of
            chunks produced from that document.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # Each document is split on its own so its chunks stay grouped under its source.
    return {
        document.metadata["source"]: splitter.split_documents([document])
        for document in docs
    }

In [7]:
def add_chunk_numbers_to_metadata(doc_chunks):
    """
    Record each chunk's position within its document under ``metadata["chunk"]``.

    Numbering starts at 0 and restarts for every document. The chunks are
    modified in place.

    Args:
        doc_chunks (dict): Dictionary of lists containing split documents.

    Returns:
        dict: The same dictionary object that was passed in, with the chunk
            metadata updated.
    """
    for source in doc_chunks:
        position = 0
        for piece in doc_chunks[source]:
            piece.metadata["chunk"] = position
            position += 1
    return doc_chunks

In [8]:
# Split each document into overlapping chunks (function defaults: 500 characters, 100 overlap)
doc_splits = split_documents_into_chunks(docs)
# Tag every chunk with its position inside its source document (metadata["chunk"]);
# the helper updates the chunks in place and returns the same dict
doc_splits = add_chunk_numbers_to_metadata(doc_splits)

In [9]:
# Report how many chunks each source PDF produced
for doc_source, chunks in doc_splits.items():
    split_count = len(chunks)
    print(f'Document {doc_source} has {split_count:,} splits')

Document attention.pdf has 102 splits
Document Multimodal.pdf has 232 splits


In [10]:
# Preview the first three chunks of every document together with their metadata
for doc_source, chunks in doc_splits.items():
    print(f'--- Document: {doc_source} ---')
    for split in chunks[:3]:
        # One print call per chunk; the trailing '' yields the blank separator line.
        print(
            f'---Page Content---\n{split.page_content}',
            f'Metadata:\n{split.metadata}',
            '',
            sep='\n',
        )

--- Document: attention.pdf ---
---Page Content---
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of Toronto
aidan@cs.toronto.edu
Łukasz Kaiser∗
Google Brain
lukaszkaiser@google.com
Illia Polosukhin∗‡
illia.polosukhin@gmail.com
Abstract
The dominant sequence transduction models are based on complex recurrent or
Metadata:
{'source': 'attention.pdf', 'chunk': 0}

---Page Content---
Abstract
The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanisms, dispensing with recurrence 

### Embedding

In [11]:
# Embedding model used for every chunk and every query. A stronger model can be considered.

# SciBERT (Allen Institute for AI): pretrained on scientific text, maximum 512 tokens.
# NOTE(review): this checkpoint does not appear to be a sentence-embedding model; the first
# retrieval below returns an unrelated figure-caption chunk, so embedding quality is worth re-checking.
embeddings = HuggingFaceEmbeddings(model_name="allenai/scibert_scivocab_uncased")

# Alternatives tried / to try:
# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2") # small sentence-transformers model
# embeddings = HuggingFaceEmbeddings(model_name="roberta-large") # RoBERTa large (Facebook) - NOTE(review): "longer context" claim unverified, confirm token limit
# embeddings = HuggingFaceEmbeddings(model_name="roberta-base") # RoBERTa base (Facebook) - NOTE(review): "longer context" claim unverified, confirm token limit

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

### Vector Store

In [12]:
def configure_faiss_vector_store(doc_splits, embeddings):
    """
    Build one FAISS vector store per source document.

    Args:
        doc_splits (dict): Maps a document source to its list of chunks.
        embeddings (Embeddings): Embedding model handed to FAISS for every store.

    Returns:
        dict: Maps each document source to the FAISS store built from that
            document's chunks.
    """
    return {
        source: FAISS.from_documents(pieces, embeddings)
        for source, pieces in doc_splits.items()
    }

In [13]:
# Configure FAISS as Vector Store
%%time
vector_db = configure_faiss_vector_store(doc_splits, embeddings)

CPU times: user 4min 13s, sys: 13.6 s, total: 4min 26s
Wall time: 4min 29s


In [14]:
# Show the mapping of source PDF -> FAISS store
vector_db

{'attention.pdf': <langchain_community.vectorstores.faiss.FAISS at 0x7fbf517e2620>,
 'Multimodal.pdf': <langchain_community.vectorstores.faiss.FAISS at 0x7fbf517e3a90>}

In [15]:
# vector_db is a dict of stores, so the vector count is read from each store's index in turn
# Print number of documents in the FAISS index for each document
for doc_source, faiss_index in vector_db.items():
    print(f"Number of documents in the FAISS index for {doc_source}: {faiss_index.index.ntotal}")

Number of documents in the FAISS index for attention.pdf: 102
Number of documents in the FAISS index for Multimodal.pdf: 232


In [16]:
def create_retrievers(vector_stores, search_type="similarity", k=5):
    """
    Wrap each per-document vector store in a retriever.

    Args:
        vector_stores (dict): Maps a document source to its FAISS vector store.
        search_type (str): The type of search to perform. Default is "similarity".
        k (int): The number of documents each retriever returns. Default is 5.

    Returns:
        dict: Maps each document source to the retriever built on its store.
    """
    def _to_retriever(store):
        # A fresh search_kwargs dict per store, so retrievers never share one object.
        return store.as_retriever(search_type=search_type, search_kwargs={"k": k})

    return {source: _to_retriever(store) for source, store in vector_stores.items()}

In [17]:
# Create one retriever per document (function defaults: similarity search, k=5), keyed by PDF path
retrievers = create_retrievers(vector_db)

### Retrieve contexts

In [18]:
def process_query(query: str, retriever):
    """
    Retrieve the chunks relevant to a query and format them as one text block.

    Args:
        query (str): The query string to search for relevant documents.
        retriever: Retriever built on a vector store. Its ``invoke`` method is
            used when available; otherwise the older ``get_relevant_documents``
            method is called.

    Returns:
        str: One section per retrieved chunk, numbered from 1, containing the
            chunk's content and metadata. Empty string when nothing is
            retrieved. Note that a literal "..." is appended after every
            chunk's content even though the content is not truncated.
    """
    # Prefer the current retriever entry point and fall back to the deprecated
    # one, so retrievers that only expose the old method keep working.
    retrieve = getattr(retriever, "invoke", None)
    if not callable(retrieve):
        retrieve = retriever.get_relevant_documents
    docs = retrieve(query)

    # Build every section and join once instead of growing a string in a loop.
    return "".join(
        f"-----Chunk {i}------\n"
        f"Content: {doc.page_content}...\n"
        f"Metadata {doc.metadata}\n\n"
        for i, doc in enumerate(docs, 1)
    )

In [19]:
# Sample query used for plain (non-HyDE) retrieval

# Alternative queries:
# query = "What is the regularization addressed in the academic article?"
query = "What is the main hypothesis or research question addressed in the first academic article?"
# query = "What is the Attention addressed in the academic article?"

In [20]:
# Retrieve chunks from the first document in pdf_paths
# retriever = retrievers["attention.pdf"]
retriever = retrievers[pdf_paths[0]]
retrieved_output = process_query(query, retriever)

# Print the formatted chunks
print(retrieved_output)

-----Chunk 1------
Content: opinion
.
<EOS>
<pad>
The
Law
will
never
be
perfect
,
but
its
application
should
be
just
-
this
is
what
we
are
missing
,
in
my
opinion
.
<EOS>
<pad>
Figure 5: Many of the attention heads exhibit behaviour that seems related to the structure of the
sentence. We give two such examples above, from two different heads from the encoder self-attention
at layer 5 of 6. The heads clearly learned to perform different tasks.
15...
Metadata {'source': 'attention.pdf', 'chunk': 101}

-----Chunk 2------
Content: length n is smaller than the representation dimensionality d, which is most often the case with
sentence representations used by state-of-the-art models in machine translations, such as word-piece
[38] and byte-pair [31] representations. To improve computational performance for tasks involving
very long sequences, self-attention could be restricted to considering only a neighborhood of size r in
6
the input sequence centered around the respective output position.

  warn_deprecated(


### Setup LLM

In [24]:
# pip install groq

In [25]:
from groq import Groq

def get_groq_response(client, prompt, model="llama3-70b-8192", max_tokens=2048, temperature=0.0):
    """
    Send a single-turn user prompt to the chat completion API and return the reply.

    Args:
        client: The client object to interact with the API.
        prompt (str): Text sent as the only (user) message.
        model (str, optional): Model identifier. Default is "llama3-70b-8192".
        max_tokens (int, optional): Maximum number of tokens in the reply. Default is 2048.
        temperature (float, optional): Sampling temperature. Default is 0.0.

    Returns:
        tuple: ``(content, usage)`` taken from the first choice of the
            completion. If the request raises, the error is printed and
            ``(None, None)`` is returned instead, so callers should check for
            ``None`` before using the content.
    """
    user_message = {"role": "user", "content": prompt}
    try:
        completion = client.chat.completions.create(
            messages=[user_message],
            model=model,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        first_choice = completion.choices[0]
        result = (first_choice.message.content, completion.usage)
    except Exception as e:
        # Best-effort: report the failure and hand back a placeholder pair.
        print(f"An error occurred: {e}")
        result = (None, None)
    return result

In [26]:
import os

# Read the API key from the GROQ_API_KEY environment variable so that no
# credential is ever stored in (or committed with) the notebook.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

In [27]:
# Smoke test: send a trivial prompt and print the reply text (first element of the returned pair)
prompt = "Hello"
response = get_groq_response(client, prompt)
print(response[0])

Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat?


### RAG - HyDE

In [28]:
# Prompt template for the HyDE step: the model answers the query from a paper summary,
# and that hypothetical answer is later used as the retrieval query.
# Placeholders filled by generate_prompt_hyde: {USER_QUERY}, {CONTEXT_HYDE}.
# NOTE(review): the instructions refer to "Users' query" while the section header reads
# "User’s query", and the text contains a full-width space; left unchanged here because
# this string is sent to the model verbatim.
instruction_hyde = """
### Instructions ###
You are an expert in scientific academic papers. Your task is to answer to "Users' query" below.　If the information in the "Context" below seems relevant to "Users' query", please refer to it.

### User’s query ###
{USER_QUERY}

### Context ###
{CONTEXT_HYDE}

### Output ###
"""

In [29]:
def generate_prompt_hyde(instruction, user_query, context_hyde):
    """
    Fill the HyDE prompt template with the user's query and the context.

    Both placeholders are substituted in a single pass over the template, so
    placeholder-like text inside ``user_query`` (for example a literal
    "{CONTEXT_HYDE}") is kept verbatim instead of being replaced by a later
    substitution.

    Args:
        instruction (str): Template containing the "{USER_QUERY}" and
            "{CONTEXT_HYDE}" placeholders.
        user_query (str): The user's query to be inserted into the instruction.
        context_hyde (str): The context for creating a hypothetical answer.

    Returns:
        str: The instruction with every placeholder occurrence replaced.
    """
    import re  # local import keeps this cell self-contained

    replacements = {
        "{USER_QUERY}": user_query,
        "{CONTEXT_HYDE}": context_hyde,
    }
    pattern = re.compile("|".join(re.escape(token) for token in replacements))
    # A callable replacement inserts the values literally (no backslash processing).
    return pattern.sub(lambda match: replacements[match.group(0)], instruction)

In [30]:
# Query used for the HyDE pipeline and the final answer (alternatives kept for quick switching)
# user_query = "What is the regularization addressed in the academic article?"
user_query = "What is the main hypothesis or research question addressed in the first academic article?"
# user_query  = "What is the Attention addressed in the academic article?"

In [33]:
# Read the sample summary table and use the 'description' of its first row as the HyDE context.
# NOTE(review): assumes the first row of summaries.csv is the summary of attention.pdf — confirm.
import pandas as pd
table_summary = pd.read_csv('summaries.csv')
context_hyde = table_summary['description'].iloc[0]

print(context_hyde)
# sample - abstract
# context_hyde = """
# Abstract
# The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English- to-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.
# """

The paper "Attention Is All You Need" presents a new neural network architecture called the Transformer, designed for sequence transduction tasks like machine translation. The key innovation of the Transformer is that it relies entirely on attention mechanisms to draw global dependencies between input and output, eschewing the recurrence and convolution operations used in previous state-of-the-art models.

The authors motivate the use of self-attention by highlighting three desirable characteristics: 1) reduced sequential computation, allowing more parallelization, 2) shorter path lengths between long-range dependencies in the network, and 3) more interpretable attention patterns. 

The Transformer architecture consists of stacked self-attention and point-wise, fully connected layers for both the encoder and decoder. The attention mechanism used is "scaled dot-product attention", which the authors found to be faster and more space-efficient than additive attention. They also introduce 

In [34]:
# Create the HyDE prompt by filling the template with the query and the summary context
prompt_hyde = generate_prompt_hyde(instruction_hyde, user_query, context_hyde)
print(prompt_hyde)


### Instructions ###
You are an expert in scientific academic papers. Your task is to answer to "Users' query" below.　If the information in the "Context" below seems relevant to "Users' query", please refer to it.

### User’s query ###
What is the main hypothesis or research question addressed in the first academic article?

### Context ###
The paper "Attention Is All You Need" presents a new neural network architecture called the Transformer, designed for sequence transduction tasks like machine translation. The key innovation of the Transformer is that it relies entirely on attention mechanisms to draw global dependencies between input and output, eschewing the recurrence and convolution operations used in previous state-of-the-art models.

The authors motivate the use of self-attention by highlighting three desirable characteristics: 1) reduced sequential computation, allowing more parallelization, 2) shorter path lengths between long-range dependencies in the network, and 3) more 

In [35]:
# Get a hypothetical answer; response[0] is the answer text and is reused below as the retrieval query.
# get_groq_response returns (None, None) on failure, in which case response[0] is None.
response = get_groq_response(client, prompt_hyde)
print(response[0])

The main hypothesis or research question addressed in the first academic article "Attention Is All You Need" is whether attention mechanisms alone can replace the recurrent and convolutional layers commonly used in previous sequence transduction models, and if so, what benefits this approach would bring in terms of training efficiency and performance.


In [36]:
# Retrieve chunks from the first document using the hypothetical answer (not the raw query) as the search text
print(process_query(response[0], retrievers[pdf_paths[0]]))

-----Chunk 1------
Content: Attention mechanisms have become an integral part of compelling sequence modeling and transduc-
tion models in various tasks, allowing modeling of dependencies without regard to their distance in
the input or output sequences [2, 19]. In all but a few cases [27], however, such attention mechanisms
are used in conjunction with a recurrent network.
In this work we propose the Transformer, a model architecture eschewing recurrence and instead...
Metadata {'source': 'attention.pdf', 'chunk': 10}

-----Chunk 2------
Content: To the best of our knowledge, however, the Transformer is the ﬁrst transduction model relying
entirely on self-attention to compute representations of its input and output without using sequence-
aligned RNNs or convolution. In the following sections, we will describe the Transformer, motivate
self-attention and discuss its advantages over models such as [17, 18] and [9].
3
Model Architecture
Most competitive neural sequence transduction mode

### Extract thesis/figure/table numbers from user's query and search descriptions based on numbers

See the companion Jupyter notebook for details:  
https://github.com/daichi6/llm-hackathon-insightai/blob/main/notebooks/extract_query.ipynb

In [37]:
# Sample figure/table descriptions: a list of strings, here a single placeholder entry
context_figure_table = ['description: --']

### Provide a response given retrieved contexts

In [38]:
# Prompt template for the final answer. Placeholders filled by generate_prompt_final:
# {USER_QUERY}, {CONTEXT_FIGURE_TABLE}, {CONTEXT_RAG_HYDE}, {CONTEXT_RAG_GENERAL}.
# NOTE(review): the instructions mention a "previous conversation history", but this template
# has no placeholder for it; presumably it is supplied as chat messages elsewhere — confirm.
# NOTE(review): the text contains a "the the" typo and full-width spaces; left unchanged here
# because this string is sent to the model verbatim.
instruction_final = """
### Instructions ###
You are an expert in scientific academic papers. Your task is to answer to "Users' query" below.
If the information in the "Figure/Table Context" and "Text Context" below seem relevant to "Users' query", please refer to them.
"Text Context" includes several chunks from different parts of an academic paper. "Figure/Table Context" includes the descriptions related to figures or tables in an academic paper.
Please refer only to the relevant contexts for your response. There is no need to include unrelated context in your response.
If the user asks about a specific figure or table and the information is contained in the Figure/Table Context, please ensure that this information is included in your response.
If you determine that the previous conversation history is relevant, please also refer to that information to answer the user's query.　Especially when the the contexts below are empty, please answer the user's most recent query　based on the conversation history(the user's previous queries and your responses).
If the conversation is continuing from the previous session and no additional information is needed, you may refer to the previous conversation history and might not need to use the contexts below. (e.g., User's query: Please make your response brief).
If the contexts and the previous conversation history do not contain the necessary information and it is difficult to answer even with general knowledge and previous context, please respond with 'The information provided is insufficient to answer your question.　Could you please clarify your question?'.

##### User’s query #####
{USER_QUERY}


##### Figure/Table Context #####
{CONTEXT_FIGURE_TABLE}

##### Text Context #####
{CONTEXT_RAG_HYDE}

{CONTEXT_RAG_GENERAL}


##### Output #####
"""

In [39]:
def generate_prompt_final(instruction, user_query, context_figure_table, context_rag_hyde, context_rag_general):
    """
    Fill the final-answer prompt template with the user's query and all contexts.

    All placeholders are substituted in a single pass over the template, so
    placeholder-like text inside any inserted value (for example a query that
    contains "{CONTEXT_RAG_HYDE}") is kept verbatim instead of being replaced
    by a later substitution.

    Args:
        instruction (str): Template containing the "{USER_QUERY}",
            "{CONTEXT_FIGURE_TABLE}", "{CONTEXT_RAG_HYDE}" and
            "{CONTEXT_RAG_GENERAL}" placeholders.
        user_query (str): The user's query to be inserted into the instruction.
        context_figure_table (str): Descriptions of figures and tables.
        context_rag_hyde (str): The context retrieved with the HyDE answer.
        context_rag_general (str): The context retrieved with the raw query.

    Returns:
        str: The instruction with every placeholder occurrence replaced.
    """
    import re  # local import keeps this cell self-contained

    replacements = {
        "{USER_QUERY}": user_query,
        "{CONTEXT_FIGURE_TABLE}": context_figure_table,
        "{CONTEXT_RAG_HYDE}": context_rag_hyde,
        "{CONTEXT_RAG_GENERAL}": context_rag_general,
    }
    pattern = re.compile("|".join(re.escape(token) for token in replacements))
    # A callable replacement inserts the values literally (no backslash processing).
    return pattern.sub(lambda match: replacements[match.group(0)], instruction)

In [40]:
# Create contexts - RAG (general): retrieve from the first document using the raw user query
context_rag_general = process_query(user_query, retrievers[pdf_paths[0]])
print(context_rag_general)

-----Chunk 1------
Content: opinion
.
<EOS>
<pad>
The
Law
will
never
be
perfect
,
but
its
application
should
be
just
-
this
is
what
we
are
missing
,
in
my
opinion
.
<EOS>
<pad>
Figure 5: Many of the attention heads exhibit behaviour that seems related to the structure of the
sentence. We give two such examples above, from two different heads from the encoder self-attention
at layer 5 of 6. The heads clearly learned to perform different tasks.
15...
Metadata {'source': 'attention.pdf', 'chunk': 101}

-----Chunk 2------
Content: length n is smaller than the representation dimensionality d, which is most often the case with
sentence representations used by state-of-the-art models in machine translations, such as word-piece
[38] and byte-pair [31] representations. To improve computational performance for tasks involving
very long sequences, self-attention could be restricted to considering only a neighborhood of size r in
6
the input sequence centered around the respective output position.

In [41]:
# Create contexts - RAG (HyDE): retrieve from the first document using the hypothetical answer.
# This repeats the retrieval that an earlier cell only printed, and relies on `response`
# still holding the HyDE answer at this point.
context_rag_hyde = process_query(response[0], retrievers[pdf_paths[0]])
print(context_rag_hyde)

-----Chunk 1------
Content: Attention mechanisms have become an integral part of compelling sequence modeling and transduc-
tion models in various tasks, allowing modeling of dependencies without regard to their distance in
the input or output sequences [2, 19]. In all but a few cases [27], however, such attention mechanisms
are used in conjunction with a recurrent network.
In this work we propose the Transformer, a model architecture eschewing recurrence and instead...
Metadata {'source': 'attention.pdf', 'chunk': 10}

-----Chunk 2------
Content: To the best of our knowledge, however, the Transformer is the ﬁrst transduction model relying
entirely on self-attention to compute representations of its input and output without using sequence-
aligned RNNs or convolution. In the following sections, we will describe the Transformer, motivate
self-attention and discuss its advantages over models such as [17, 18] and [9].
3
Model Architecture
Most competitive neural sequence transduction mode

In [42]:
# Create the prompt for the final response.
# The figure/table descriptions are joined with newlines; calling str() on the list
# would put Python list syntax (brackets and quotes) into the prompt text.
prompt_final = generate_prompt_final(
    instruction_final,
    user_query,
    "\n".join(map(str, context_figure_table)),
    context_rag_hyde,
    context_rag_general,
)
print(prompt_final)


### Instructions ###
You are an expert in scientific academic papers. Your task is to answer to "Users' query" below.
If the information in the "Figure/Table Context" and "Text Context" below seem relevant to "Users' query", please refer to them.
"Text Context" includes several chunks from different parts of an academic paper. "Figure/Table Context" includes the descriptions related to figures or tables in an academic paper.
Please refer only to the relevant contexts for your response. There is no need to include unrelated context in your response.
If the user asks about a specific figure or table and the information is contained in the Figure/Table Context, please ensure that this information is included in your response.
If you determine that the previous conversation history is relevant, please also refer to that information to answer the user's query.　Especially when the the contexts below are empty, please answer the user's most recent query　based on the conversation history(the 

In [43]:
# Get the final response.
# Dedicated names are used so that `response` keeps holding the HyDE answer that the
# retrieval cells read via response[0]; overwriting it would change their result on re-run.
final_answer, final_usage = get_groq_response(client, prompt_final)
print(final_answer)

The main hypothesis or research question addressed in the first academic article is: Can a model architecture that relies entirely on self-attention mechanisms, without using recurrent networks or convolution, be effective for sequence modeling and transduction tasks?
