## 1. Install packages

In [5]:
# !pip install -U langchain-community
# !pip install sentence-transformers
# !pip install faiss-cpu
# !pip install --upgrade langchain
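# !pip install fitz  # NOTE: probably not needed - `import fitz` is provided by PyMuPDF (next line); the PyPI package named "fitz" is a different project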
# !pip install PyMuPDF
# !pip install groq
# !pip install anthropic

In [6]:
# Import packages
import ast
import io
import json
import os
import pickle
import re

import anthropic
import faiss
import fitz
import pandas as pd
from groq import Groq
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from PIL import Image

## 2. Functions

In [82]:
# Functions

def extract_text(pdf_path):
    """
    Extract text from a single PDF file.

    The document is opened as a context manager so it is closed as soon as the
    text has been read, including when reading a page raises.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Plain text of every page, concatenated in page order.
    """
    with fitz.open(pdf_path) as document:
        # Join once rather than growing a string with repeated `+=`.
        return "".join(page.get_text("text") for page in document)

def extract_texts_from_pdfs(pdf_paths):
    """
    Build one Document per PDF, holding that PDF's full extracted text.

    Args:
        pdf_paths (list of str): Paths of the PDF files to read.

    Returns:
        list of Document: One Document per input path, in input order. Each
        carries the extracted text as ``page_content`` and the path itself
        under the ``"source"`` metadata key.
    """
    return [
        Document(page_content=extract_text(path), metadata={"source": path})
        for path in pdf_paths
    ]

def split_documents_into_chunks(docs, chunk_size=500, chunk_overlap=100):
    """
    Split every document into overlapping chunks, grouped by source.

    Args:
        docs (list): Documents to split; each must carry a ``"source"`` metadata entry.
        chunk_size (int): Maximum chunk size passed to the splitter. Default is 500.
        chunk_overlap (int): Overlap between consecutive chunks. Default is 100.

    Returns:
        dict: Maps each document's ``"source"`` to the list of chunks produced
        from it. Documents sharing a source overwrite one another (last wins).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    # A single splitter instance is reused for every document.
    return {
        doc.metadata["source"]: splitter.split_documents([doc])
        for doc in docs
    }

def add_chunk_numbers_to_metadata(doc_chunks):
    """
    Stamp every chunk with its zero-based position within its own document.

    The chunks are modified in place: each chunk's metadata gains (or has
    overwritten) a ``"chunk"`` entry.

    Args:
        doc_chunks (dict): Maps a document source to its list of chunks.

    Returns:
        dict: The same dictionary object that was passed in.
    """
    for _source, pieces in doc_chunks.items():
        # Numbering restarts at 0 for every document.
        for position, piece in enumerate(pieces):
            piece.metadata.update(chunk=position)
    return doc_chunks

def configure_faiss_vector_store(doc_splits, embeddings):
    """
    Build one FAISS vector store per document from its chunks.

    Args:
        doc_splits (dict): Maps a document source to its list of chunks.
        embeddings (Embeddings): Embedding model handed to ``FAISS.from_documents``.

    Returns:
        dict: Maps each document source to the FAISS vector store built from
        that document's chunks.
    """
    return {
        doc_source: FAISS.from_documents(chunks, embeddings)
        for doc_source, chunks in doc_splits.items()
    }

def save_faiss_vector_store(vector_db, directory):
    """
    Persist every FAISS vector store in ``vector_db`` under ``directory``.

    Two files are written per entry, both named after the dictionary key:
    ``<key>.index`` (the raw FAISS index) and ``<key>.metadata`` (a pickle
    holding the docstore and the index-to-docstore-id mapping).

    Args:
        vector_db (dict): Maps a document source to its FAISS vector store.
        directory (str): Target directory; created, with parents, if missing.

    Returns:
        None
    """
    # exist_ok avoids the check-then-create race of `if not exists: makedirs`.
    os.makedirs(directory, exist_ok=True)

    # NOTE(review): keys are used verbatim as file names, so a key containing a
    # path separator would point into a sub-directory that is not created here.
    for doc_source, vector_store in vector_db.items():
        index_path = os.path.join(directory, f"{doc_source}.index")
        faiss.write_index(vector_store.index, index_path)

        # The index alone cannot be queried through LangChain; the docstore and
        # the id mapping are needed to rebuild the wrapper on load.
        metadata_path = os.path.join(directory, f"{doc_source}.metadata")
        with open(metadata_path, 'wb') as f:
            pickle.dump({
                'docstore': vector_store.docstore,
                'index_to_docstore_id': vector_store.index_to_docstore_id
            }, f)

def load_faiss_vector_store(directory, embedding):
    """
    Rebuild the per-document FAISS vector stores saved in ``directory``.

    Every ``*.index`` file is paired with the ``.metadata`` pickle of the same
    stem; a missing pickle raises from ``open``.

    Security: ``pickle.load`` can execute arbitrary code, so only point this
    at directories whose contents you trust.

    Args:
        directory (str): Directory containing the ``.index`` / ``.metadata`` pairs.
        embedding: Embedding function attached to each rebuilt store.

    Returns:
        dict: Maps each file stem (the name without ``.index``) to its FAISS
        vector store.
    """
    restored = {}

    for entry in os.listdir(directory):
        if not entry.endswith('.index'):
            continue

        doc_source, _extension = os.path.splitext(entry)

        # Raw FAISS index first, then the pickled LangChain bookkeeping.
        index = faiss.read_index(os.path.join(directory, entry))
        with open(os.path.join(directory, f"{doc_source}.metadata"), 'rb') as handle:
            saved = pickle.load(handle)

        restored[doc_source] = FAISS(
            embedding_function=embedding,
            index=index,
            docstore=saved['docstore'],
            index_to_docstore_id=saved['index_to_docstore_id'],
        )

    return restored

def create_retrievers(vector_stores, search_type="similarity", k=5):
    """
    Wrap each per-document vector store in a retriever.

    Args:
        vector_stores (dict): Maps a document source to its FAISS vector store.
        search_type (str): Search strategy passed to ``as_retriever``. Default is "similarity".
        k (int): Number of chunks each retriever returns per query. Default is 5.

    Returns:
        dict: Maps each document source to the retriever built on its store.
    """
    return {
        doc_source: store.as_retriever(search_type=search_type, search_kwargs={"k": k})
        for doc_source, store in vector_stores.items()
    }

def process_query(query: str, retriever):
    """
    Retrieve the chunks relevant to ``query`` and render them as one text block.

    Args:
        query (str): The search query.
        retriever: Object exposing ``get_relevant_documents(query)``; each returned
            item must provide ``page_content`` and ``metadata``.

    Returns:
        str: One section per retrieved chunk, numbered from 1 in retrieval order,
        showing the chunk's content and metadata. Empty string when nothing is
        retrieved.
    """
    docs = retriever.get_relevant_documents(query)

    # Build every section first, then join them once.
    sections = [
        f"-----Chunk {i}------\n"
        f"Content: {doc.page_content}...\n"
        f"Metadata {doc.metadata}\n\n"
        for i, doc in enumerate(docs, start=1)
    ]
    return "".join(sections)

def get_groq_response(client, prompt, model="llama3-70b-8192", max_tokens=2048, temperature=0.0):
    """
    Send ``prompt`` as a single user message and return the model's reply.

    Args:
        client: Client object exposing ``chat.completions.create``.
        prompt (str): Text of the user message.
        model (str, optional): Model identifier. Default is "llama3-70b-8192".
        max_tokens (int, optional): Upper bound on generated tokens. Default is 2048.
        temperature (float, optional): Sampling temperature. Default is 0.0.

    Returns:
        tuple: ``(content, usage)`` of the first choice on success. If anything
        raises, the error is printed and ``(None, None)`` is returned.
    """
    request = {
        "messages": [{"role": "user", "content": prompt}],
        "model": model,
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    try:
        completion = client.chat.completions.create(**request)
        first_choice = completion.choices[0]
        return first_choice.message.content, completion.usage
    except Exception as e:
        # Best effort: report the failure and let the caller see the (None, None) pair.
        print(f"An error occurred: {e}")
        return None, None

def generate_prompt_hyde(instruction, user_query, context_hyde, table_summary):
    """
    Fill the HyDE instruction template with the user's query and its contexts.

    All placeholders are substituted in a single pass over the template, so
    placeholder-looking text inside an inserted value (for example a user query
    that contains ``{SUMMARY_TABLE}``) is left as-is instead of being expanded
    by a later replacement.

    Args:
        instruction (str): Template containing the placeholders below.
        user_query (str): Replaces ``{USER_QUERY}``.
        context_hyde (str): Replaces ``{CONTEXT_HYDE}``; context for drafting the hypothetical answer.
        table_summary (str): Replaces ``{SUMMARY_TABLE}``; the summary table text.

    Returns:
        str: The template with every placeholder occurrence replaced.
    """
    values = {
        "{USER_QUERY}": user_query,
        "{CONTEXT_HYDE}": context_hyde,
        "{SUMMARY_TABLE}": table_summary,
    }
    pattern = re.compile("|".join(re.escape(placeholder) for placeholder in values))
    # A function replacement inserts the value verbatim (no backslash/group expansion).
    return pattern.sub(lambda match: values[match.group(0)], instruction)

def generate_prompt_final(instruction, user_query, context_figure_table, context_rag_hyde, context_rag_general, table_insight):
    """
    Fill the final-answer instruction template with the query and all contexts.

    All placeholders are substituted in a single pass over the template, so
    placeholder-looking text inside an inserted value (for example a retrieved
    chunk or a user query containing ``{SUMMARY_TABLE}``) is left as-is instead
    of being expanded by a later replacement.

    Args:
        instruction (str): Template containing the placeholders below.
        user_query (str): Replaces ``{USER_QUERY}``.
        context_figure_table (str): Replaces ``{CONTEXT_FIGURE_TABLE}``; figure/table descriptions.
        context_rag_hyde (str): Replaces ``{CONTEXT_RAG_HYDE}``; context retrieved via the HyDE answer.
        context_rag_general (str): Replaces ``{CONTEXT_RAG_GENERAL}``; context retrieved via the plain query.
        table_insight (str): Replaces ``{SUMMARY_TABLE}``.

    Returns:
        str: The template with every placeholder occurrence replaced.
    """
    values = {
        "{USER_QUERY}": user_query,
        "{CONTEXT_FIGURE_TABLE}": context_figure_table,
        "{CONTEXT_RAG_HYDE}": context_rag_hyde,
        "{CONTEXT_RAG_GENERAL}": context_rag_general,
        "{SUMMARY_TABLE}": table_insight,
    }
    pattern = re.compile("|".join(re.escape(placeholder) for placeholder in values))
    # A function replacement inserts the value verbatim (no backslash/group expansion).
    return pattern.sub(lambda match: values[match.group(0)], instruction)

def generate_prompt_extract_query(instruction, user_query):
    """
    Build the key-extraction prompt by inserting the user's query into the template.

    Args:
        instruction (str): Template containing the ``{USER_QUERY}`` placeholder.
        user_query (str): Text substituted for every ``{USER_QUERY}`` occurrence.

    Returns:
        str: The template with the placeholder replaced.
    """
    return instruction.replace("{USER_QUERY}", user_query)

def parse_and_convert_keys(json_string):
    """
    Parse the extraction LLM's JSON answer into typed key dictionaries.

    Args:
    json_string (str): JSON text for a list of objects, each with string values
        under 'thesis', 'figure' and 'table'.

    Returns:
    list: One dictionary per object with 'thesis', 'figure' and 'table' converted
        to int, or None where the value is empty. Any failure (invalid JSON,
        missing key, non-numeric value, wrong input type) yields an empty list.
    """
    def _int_or_none(raw):
        # Empty strings (and other falsy values) mean "not specified".
        return int(raw) if raw else None

    try:
        parsed = json.loads(json_string)
        if not parsed:
            return []
        return [
            {
                "thesis": _int_or_none(entry["thesis"]),
                "figure": _int_or_none(entry["figure"]),
                "table": _int_or_none(entry["table"]),
            }
            for entry in parsed
        ]
    except Exception:
        # Deliberately all-or-nothing: one malformed entry discards the whole answer.
        return []

def extract_descriptions(df, keys):
    """
    Look up the stored description for every figure/table key.

    A key's figure number takes precedence over its table number. A key with
    neither still contributes an entry: the bare text "Description not found".

    Args:
    df (DataFrame): Table with 'thesis_num', 'figure_num', 'table_num' and 'description' columns.
    keys (list): Dictionaries with 'thesis', 'figure' and 'table' entries (int or None).

    Returns:
    list: One formatted string per key, in input order. When several rows match,
        the first one is used.
    """
    formatted_descriptions = []

    for key in keys:
        thesis_num, figure_num, table_num = key["thesis"], key["figure"], key["table"]

        # Decide which column to search and how to label the result.
        if figure_num is not None:
            column, wanted = "figure_num", figure_num
            prefix = f"thesis{thesis_num} figure{figure_num} description: "
        elif table_num is not None:
            column, wanted = "table_num", table_num
            prefix = f"thesis{thesis_num} table{table_num} description: "
        else:
            column, wanted = None, None
            prefix = ""

        if column is None:
            matches = []
        else:
            same_thesis = df["thesis_num"] == thesis_num
            matches = df[same_thesis & (df[column] == wanted)]["description"].values

        text = matches[0] if len(matches) > 0 else "Description not found"
        formatted_descriptions.append(prefix + text)

    return formatted_descriptions

def extract_thesis_numbers(converted_keys):
    """
    Collect the 'thesis' value of every key dictionary.

    Args:
    converted_keys (list): Dictionaries with 'thesis', 'figure' and 'table' keys.

    Returns:
    list: The 'thesis' values in input order (duplicates and None are kept).
        An empty list if any item cannot be read.
    """
    thesis_numbers = []
    try:
        for item in converted_keys:
            thesis_numbers.append(item['thesis'])
    except Exception:
        # All-or-nothing: a single unreadable item discards the whole result.
        return []
    return thesis_numbers

def get_descriptions_for_thesis_summary(thesis_numbers, table_summary):
    """
    Retrieve the summary description of each requested thesis.

    A thesis number with no row in ``table_summary`` (including None, which is
    what an unspecified thesis is converted to) is skipped, so one unknown
    number no longer throws away the summaries found for the others.

    Args:
    thesis_numbers (list): Thesis numbers to look up.
    table_summary (pd.DataFrame): Table with 'thesis_num' and 'description' columns.

    Returns:
    list: One formatted string per thesis number that was found, in input order.
        An empty list if the lookup itself fails (e.g. a required column is missing).
    """
    try:
        result = []
        for thesis_num in thesis_numbers:
            matches = table_summary.loc[table_summary['thesis_num'] == thesis_num, 'description'].values
            if len(matches) == 0:
                # Unknown thesis number: skip it instead of raising IndexError.
                continue
            result.append(f"Summary description for thesis {thesis_num}: '{matches[0]}'")
        return result
    except Exception:
        # Structural problems (wrong columns, non-iterable input) still yield [].
        return []


def generate_prompt_extract_thesis_numbers(instruction, user_query):
    """
    Build the thesis-number extraction prompt from its template.

    Args:
        instruction (str): Template containing the ``{USER_QUERY}`` placeholder.
        user_query (str): Text substituted for every ``{USER_QUERY}`` occurrence.

    Returns:
        str: The template with the placeholder replaced.
    """
    return instruction.replace("{USER_QUERY}", user_query)

In [119]:
# Claude
# Prefer the ANTHROPIC_API_KEY environment variable so the secret never has to be
# pasted into the notebook; the literal placeholder remains the fallback.
client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY", "YOUR_API_KEY"))

def get_response_claude(prompt, max_tokens=4096, temperature=0):
    """
    Send ``prompt`` to Claude as a single user message and return the reply.

    Uses the module-level ``client``.

    Args:
        prompt (str): Text of the user message.
        max_tokens (int, optional): Upper bound on generated tokens. Default is 4096.
        temperature (float, optional): Sampling temperature. Default is 0.

    Returns:
        tuple: ``(text, usage)`` of the first content block on success. If anything
        raises, the error is printed and ``(None, None)`` is returned, the same
        shape ``get_groq_response`` uses, so callers can always unpack or index
        the result.
    """
    try:
        message = client.messages.create(
            model= "claude-3-5-sonnet-20240620",
            # model= "claude-3-haiku-20240307",
            max_tokens=max_tokens,
            temperature=temperature,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        }
                    ]
                }
            ]
        )
        return message.content[0].text, message.usage
    except Exception as e:
        print(f"An error occurred: {e}")
        # Previously fell through and returned a bare None, which broke unpacking.
        return None, None

## 3. Prompts

In [83]:
# Prompts(Instructions)

# Template for the HyDE step (drafting a hypothetical answer to the query).
# Placeholders, filled by generate_prompt_hyde:
#   {USER_QUERY}    - the user's question
#   {CONTEXT_HYDE}  - context for drafting the answer
#   {SUMMARY_TABLE} - the summary table text
instruction_hyde = """
### Instructions ###
You are an expert in scientific academic papers. Your task is to answer to "Users' query" below.　If the information in the "Context" or "Summary Table" below seem relevant to "Users' query", please refer to it.

### User’s query ###
{USER_QUERY}

### Context ###
{CONTEXT_HYDE}

### Summary Table ###
{SUMMARY_TABLE}

### Output ###
"""

# Template for the final answer.
# Placeholders, filled by generate_prompt_final:
#   {USER_QUERY}           - the user's question
#   {SUMMARY_TABLE}        - summary of all papers
#   {CONTEXT_FIGURE_TABLE} - figure/table descriptions
#   {CONTEXT_RAG_HYDE}     - chunks retrieved via the HyDE answer
#   {CONTEXT_RAG_GENERAL}  - chunks retrieved via the plain query
# NOTE(review): the text refers to "previous conversation history", but this
# template has no placeholder for it - presumably it is supplied elsewhere
# (e.g. as earlier chat messages); confirm against the caller.
instruction_final = """
### Instructions ###
You are an expert in scientific academic papers. Your task is to answer to "Users' query" below.

If the information in the "Summary Table", "Figure/Table Context" and "Text Context" below seem relevant to "Users' query", please refer to them.
The "Summary Table" summarizes the key points of all academic papers. "Text Context" includes several chunks from different parts of an academic paper.　Each chunk also includes the name of the PDF. "Figure/Table Context" includes the descriptions related to figures or tables in an academic paper.
Please refer only to the relevant contexts for your response. There is no need to include unrelated context in your response. If you refer to the Text Context for your answer, please include the PDF name. There is no need to include the chunk number.
If the user asks about a specific figure or table and the information is contained in the Figure/Table Context, please ensure that this information is included in your response.
If you determine that the previous conversation history is relevant, please also refer to that information to answer the user's query, especially when the the contexts below are empty.
If the contexts and the previous conversation history do not contain the necessary information and it is difficult to answer even with general knowledge and previous context, please respond with 'The information provided is insufficient to answer your question.　Could you please clarify your question?'.

##### User’s query #####
{USER_QUERY}


##### Summary Table #####
{SUMMARY_TABLE}

##### Figure/Table Context #####
{CONTEXT_FIGURE_TABLE}

##### Text Context #####
{CONTEXT_RAG_HYDE}

{CONTEXT_RAG_GENERAL}


##### Output #####
"""

# Template asking the LLM which thesis / figure / table a query refers to.
# The only placeholder is {USER_QUERY}, filled by generate_prompt_extract_query.
# The reply is expected to be a JSON list of objects with string values under
# "thesis", "figure" and "table", which parse_and_convert_keys converts to ints.
# The few-shot examples are numbered 1-8 so that each label is unique.
instruction_extract_query = """
### Instructions ###
You are an NLP engineer. Your task is to extract the "numbers" from the user's query below.
The "numbers" mean 1) which academic paper the user is referring to, 2) which figure the user is referring to, and 3) which table the user is referring to.
There may be cases where all, some, or none of these are specified. Enter the number only for the specified fields, and return an empty string "" for fields that are not specified.
Interpret "figure" for terms such as "Chart," "Diagram," or "Image." Interpret "thesis" for terms such as "Academic Paper," "Paper," or "Document."
Please provide your response as a list of objects, each containing thesis, figure, and table.　Please provide your response strictly in the specified format, without including any additional text for formatting. I will use your response directly.
If it is unclear which thesis, figure, or table is being referred to, it is okay to return an empty string. Please do not make any assumptions.

### Output Format ###
Format: a list of objects

### Example user's query1 ###
What is the main hypothesis or research question addressed in the first academic article?

### Example Output1 ###
[
  {
  "thesis": "1",
  "figure": "",
  "table": ""
  }
]

### Example user's query2 ###
Summarize the methodology used in the third academic article. Highlight any unique approaches or techniques employed.

### Example Output2 ###
[
  {
  "thesis": "3",
  "figure": "",
  "table": ""
  }
]


### Example user's query3 ###
Q. From the images and figures in the second article, describe the trend shown in Figure 2. What does it indicate about the research findings?

### Example Output3 ###
[
  {
  "thesis": "2",
  "figure": "2",
  "table": ""
  }
]

### Example user's query4 ###
Q. What can be understood from Image 3 in the third paper?

### Example Output4 ###
[
  {
  "thesis": "3",
  "figure": "3",
  "table": ""
  }
]

### Example user's query5 ###
Q. Please explain Figure 3 and Table 2 of the second academic paper. What do these indicate about the research findings?

### Example Output5 ###
[
  {
  "thesis": "2",
  "figure": "3",
  "table": ""
  },
  {
  "thesis": "2",
  "figure": "",
  "table": "2"
  }
]

### Example user's query6 ###
Q. Please compare table 3 and chart 4 from the second and third theses, respectively.

### Example Output6 ###
[
  {
  "thesis": "2",
  "figure": "",
  "table": "3"
  },
  {
  "thesis": "3",
  "figure": "4",
  "table": ""
  }
]

### Example user's query7 ###
Do you like an apple?

### Example Output7 ###
[
  {
  "thesis": "",
  "figure": "",
  "table": ""
  }
]

### Example user's query8 ###
Considering the previous conversations, please propose a new research direction or hypothesis.

### Example Output8 ###
[
  {
  "thesis": "",
  "figure": "",
  "table": ""
  }
]


### User’s query ###
{USER_QUERY}

### Output ###
"""


# Template asking the LLM which papers a query refers to; the reply is expected
# to be a bare list literal such as [2,3].
# The only placeholder is {USER_QUERY}, filled by
# generate_prompt_extract_thesis_numbers.
# NOTE(review): the "no paper specified" fallback [1,2,3,4,5,6,7,8,9] is
# hard-coded for nine papers, matching the nine entries of pdf_paths; it has to
# be edited by hand whenever the number of PDFs changes.
instruction_thesis_numbers_extract_query = """

### Instructions ###
You are an NLP engineer. Your task is to extract the academic paper numbers from the user's query below.
The "numbers" mean which academic papers the user is referring to. Interpret "Academic Paper" for terms such as "Thesis," "Paper," or "Document."
The figure or table numbers may be included in the user's query, but please ignore them.
If the user’s query does not specify a particular academic paper, respond with [1,2,3,4,5,6,7,8,9].
Please provide your response as a list format, without any additional text for formatting. I will use your response directly.


### Output Format ###
Format: a list

### Example user's query1 ###
What is the main hypothesis or research question addressed in the first academic article?

### Example Output1 ###
[1]

### Example user's query2 ###
Summarize the methodology used in the third academic article. Highlight any unique approaches or techniques employed.

### Example Output2 ###
[3]

### Example user's query3 ###
From the images and figures in the second article, describe the trend shown in Figure 3. What does it indicate about the research findings?

### Example Output3 ###
[2]

### Example user's query4 ###
Please compare table 3 and chart 4 from the second and third theses, respectively.

### Example Output4 ###
[2,3]

### Example user's query5 ###
What are the encoder and decoder mentioned in these papers?

### Example Output5 ###
[1,2,3,4,5,6,7,8,9]

### Example user's query6 ###
Which paper explains Llama?

### Example Output6 ###
[1,2,3,4,5,6,7,8,9]


### User’s query ###
{USER_QUERY}

### Output ###

"""

## 4. Prepare data

### 4-1. Creating VectorDB

In [86]:
## Creating VectorDB
# Source PDFs. Each path also becomes that document's key in the vector DB,
# because it is stored as the "source" metadata the later steps group by.
pdf_paths = [
    "attention.pdf",
    "Multimodal.pdf",
    "Performance Evaluation.pdf",
    "RAG Agent Resource-1.pdf",
    "Continual_Pretraining.pdf",
    "Challenges LLM July 19_23.pdf",
    "llm_review 2.pdf",
    "cs224n-2023-lecture11-prompting-rlhf.pdf",
    "Mistral.pdf",
]

# One Document per PDF, holding the full extracted text
docs = extract_texts_from_pdfs(pdf_paths)

## Chunk
# Split each document into chunks, then number the chunks within each document
doc_splits = add_chunk_numbers_to_metadata(split_documents_into_chunks(docs))

## Embedding
# Alternative model: HuggingFaceEmbeddings(model_name="allenai/scibert_scivocab_uncased")
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

## Vector Store
# One FAISS store per document
vector_db = configure_faiss_vector_store(doc_splits, embeddings)

# Persist the stores to disk (change vectordb_path to YOUR PATH)
vectordb_path = "vectordb_faiss"
save_faiss_vector_store(vector_db, vectordb_path)

MuPDF error: syntax error: cannot find ExtGState resource 'a0'

MuPDF error: syntax error: cannot find ExtGState resource 'a0'

MuPDF error: syntax error: cannot find ExtGState resource 'a0'

MuPDF error: syntax error: cannot find ExtGState resource 'a0'

MuPDF error: syntax error: cannot find ExtGState resource 'a0'

MuPDF error: syntax error: cannot find ExtGState resource 'a0'

MuPDF error: syntax error: cannot find ExtGState resource 'a0'

MuPDF error: syntax error: cannot find ExtGState resource 'a0'

MuPDF error: syntax error: cannot find ExtGState resource 'a0'

MuPDF error: syntax error: cannot find ExtGState resource 'a0'

MuPDF error: syntax error: cannot find ExtGState resource 'a0'

MuPDF error: syntax error: cannot find ExtGState resource 'a0'

MuPDF error: syntax error: cannot find ExtGState resource 'a0'

MuPDF error: syntax error: cannot find ExtGState resource 'a0'

MuPDF error: syntax error: cannot find ExtGState resource 'a0'

MuPDF error: syntax error: cannot find E

### 4-2. Creating Image/Table Description Table

https://github.com/daichi6/llm-hackathon-insightai/blob/main/notebooks/image_description_generation.ipynb

- Creating Figure/Table Descriptions
- Extracting Figure/Table Numbers from Images using LLM
- Combining them into a Final Table

### 4-3. Creating Summary Table

https://github.com/daichi6/llm-hackathon-insightai/blob/main/notebooks/summary_table_generation.ipynb

- Creating summaries for each academic paper

## 5. Load prepared data

### 5-1. Load VectorDB

In [87]:
## Load prepared vectorDB
# Rebuild the per-document FAISS stores from the files under vectordb_path.
# Both vectordb_path and embeddings come from the "Creating VectorDB" cell, so
# that cell has to be run first.
vector_db = load_faiss_vector_store(vectordb_path, embeddings)
# One retriever per document (defaults: similarity search, k=5), keyed like vector_db
retrievers = create_retrievers(vector_db)

### 5-2. Load Image/Table Description Table

In [89]:
## Load prepared tables
# Figure/table description table, searched by extract_descriptions via its
# thesis_num, figure_num, table_num and description columns.
table_figure_table = pd.read_csv("image_analysis_results.csv") # Change it to YOUR PATH
# Preview the first rows
table_figure_table.head()

Unnamed: 0,thesis_num,figure_num,table_num,description
0,1,,,"I apologize, but the image you've provided doe..."
1,1,,,"I apologize, but the image you've provided doe..."
2,1,,1.0,1. Image Type and Overview:\nThis image is a s...
3,1,,2.0,1. Image Type and Overview:\nThis image is a s...
4,1,,3.0,1. Image Type and Overview:\nThis image contai...


### 5-3. Load Summary Table

In [90]:
# Load the per-paper summary table (thesis_num, description), read by
# get_descriptions_for_thesis_summary.
table_summary = pd.read_csv("summaries.csv") # Change it to YOUR PATH
# Preview the first rows
table_summary.head()

Unnamed: 0,thesis_num,description
0,1,"The paper ""Attention Is All You Need"" introduc..."
1,2,This text describes the development and capabi...
2,3,This paper presents a comprehensive evaluation...
3,4,The provided text is a comprehensive guide cov...
4,5,This paper explores how to effectively adapt l...


### 5-4. Load Insight Table

In [91]:
# Load insight table; the recorded preview shows PDF Name, Paper Name,
# Author Names and Short Description columns.
table_insight = pd.read_csv("insights.csv") # Change it to YOUR PATH
# Preview the first rows
table_insight.head()

Unnamed: 0,PDF Name,Paper Name,Author Names,Short Description
0,attention.pdf,Attention Is All You Need,Ashish Vaswani,Attention Is All You Need Ashish Vaswani Goog...
1,Multimodal.pdf,Visual Instruction Tuning,"Haotian Liu1∗, Chunyuan Li2∗, Qingyang Wu3, Yo...","Visual Instruction Tuning Haotian Liu1∗, Chuny..."
2,Performance Evaluation.pdf,"A Multitask, Multilingual, Multimodal Evaluati...","on Reasoning, Hallucination, and Interactivity","A Multitask, Multilingual, Multimodal Evaluati..."
3,RAG Agent Resource-1.pdf,Part-I: What is an Agent?,Short Answer: Text-to-Task An LLM agent is an ...,Part-I: What is an Agent? Short Answer: Text-t...
4,Continual_Pretraining.pdf,ADAPTING LARGE LANGUAGE MODELS VIA,READING COMPREHENSION,ADAPTING LARGE LANGUAGE MODELS VIA READING COM...


## 6. Main function

In [96]:
## User selection
# PDFs the user has selected before asking questions (change to YOUR PATHs)
pdf_paths_user_selected = [
    "attention.pdf",
    "Multimodal.pdf",
    "Performance Evaluation.pdf",
    "RAG Agent Resource-1.pdf",
    "Continual_Pretraining.pdf",
    "Challenges LLM July 19_23.pdf",
    "llm_review 2.pdf",
    "cs224n-2023-lecture11-prompting-rlhf.pdf",
    "Mistral.pdf",
]

In [100]:
# Groq clients. Each one prefers the GROQ_API_KEY environment variable so the
# secret never has to be pasted into the notebook; the literal placeholder
# remains the fallback. Pass a different key to an individual client here if
# the three roles should not share one.

# LLM for the main flow
client_main = Groq(
    api_key=os.environ.get("GROQ_API_KEY", "YOUR_API_KEY"),
)
# LLM for keys (thesis/figure/table) extraction
client_extract = Groq(
    api_key=os.environ.get("GROQ_API_KEY", "YOUR_API_KEY"),
)
# LLM for HyDE
client_hyde = Groq(
    api_key=os.environ.get("GROQ_API_KEY", "YOUR_API_KEY"),
)


In [105]:
# Main

def chat_main(user_query):
  """
  Answer a user query from figure/table descriptions plus HyDE and plain retrieval.

  Relies on notebook-level names defined in other cells: the instruction_* prompt
  templates, table_figure_table, table_summary, table_insight,
  pdf_paths_user_selected, retrievers, client_extract and client_hyde.

  Args:
      user_query (str): Question asked by the user.

  Returns:
      The value returned by get_response_claude for the final prompt. The notebook
      cells that call this function print element [0] of it.
  """
  import warnings

  ## 1. extract keys from the user's query and look up figure/table and summary descriptions ##
  # generate prompt to extract keys from the user's query
  prompt_extract_query = generate_prompt_extract_query(instruction_extract_query, user_query)
  # get keys from the extraction LLM and parse them
  response_keys = get_groq_response(client_extract, prompt_extract_query)
  keys = parse_and_convert_keys(response_keys[0])

  # extract figure/table descriptions
  descriptions_figure_table = extract_descriptions(table_figure_table, keys)

  # extract thesis numbers from keys and get the matching summary descriptions
  keys_thesis = extract_thesis_numbers(keys)
  descriptions_summary = get_descriptions_for_thesis_summary(keys_thesis, table_summary)

  ## 2. get a hypothetical answer (HyDE) ##
  # The summary table is passed as extra context for HyDE (not the insight table,
  # because the summaries are more detailed than the insights).
  # to_string() is used instead of str(): str() on a DataFrame follows the pandas
  # display options and cuts long cell text off with "...", so the LLM would only
  # see the beginning of each description.
  prompt_hyde = generate_prompt_hyde(
      instruction_hyde, user_query, str(descriptions_summary), table_summary.to_string()
  )
  response_hyde = get_groq_response(client_hyde, prompt_hyde)

  ## 3. build the retrieval contexts ##
  context_rag_hyde = ""
  context_rag_general = ""

  # thesis numbers used ONLY for RAG (not for figure/table descriptions); the
  # extraction prompt is expected to yield all numbers when the user names none
  prompt_extract_thesis_numbers = generate_prompt_extract_thesis_numbers(instruction_thesis_numbers_extract_query, user_query)
  response_thesis_numbers = get_groq_response(client_extract, prompt_extract_thesis_numbers)
  try:
    keys_thesis_case2 = ast.literal_eval(response_thesis_numbers[0])
  except (ValueError, TypeError, SyntaxError):
    # the LLM answered with something that is not a Python literal
    warnings.warn("Could not parse thesis numbers from the LLM response; skipping retrieval.")
    keys_thesis_case2 = []
  if not isinstance(keys_thesis_case2, (list, tuple, set)):
    # e.g. a bare number or a string instead of a list of numbers
    warnings.warn("Thesis numbers from the LLM are not a list; skipping retrieval.")
    keys_thesis_case2 = []

  # retrieve from every requested document; nothing is retrieved when the list is
  # empty or contains None
  num_documents = len(pdf_paths_user_selected)
  if keys_thesis_case2 and all(key is not None for key in keys_thesis_case2):
    for key in keys_thesis_case2:
      if not isinstance(key, int):
        continue
      # thesis numbers are 1-based; without this check 0 or a negative number would
      # silently pick a document from the end of the list and a number above
      # num_documents would raise IndexError
      if not 1 <= key <= num_documents:
        warnings.warn(f"Thesis number {key} is outside 1..{num_documents}; skipped.")
        continue

      doc_source = pdf_paths_user_selected[key - 1]  # document source FROM THE USER'S SELECTED LIST
      retriever = retrievers[doc_source]  # retriever built for that document

      # process query for RAG (HyDE): search with the hypothetical answer
      result_hyde = process_query(response_hyde[0], retriever)
      context_rag_hyde += f"Document {key}:\n{result_hyde}\n"

      # process query for RAG (general): search with the original query
      result_general = process_query(user_query, retriever)
      context_rag_general += f"Document {key}:\n{result_general}\n"

  ## 4. get a final response ##
  # to_string() for the same reason as above: str() would truncate the insight table
  prompt_final = generate_prompt_final(instruction_final, user_query, str(descriptions_figure_table), context_rag_hyde, context_rag_general, table_insight.to_string())
  # the final answer is produced via get_response_claude; client_main is not used here
  response_final = get_response_claude(prompt = prompt_final)

  return response_final

## 7. Test questions and answers

In [106]:
# Ad-hoc test: short definition-style question that names no specific paper.
# chat_main's return value is indexed with [0] before printing.
user_query = 'What is object index?'
response = chat_main(user_query)
print(response[0])

Based on the provided context, the term "object index" is not explicitly defined or discussed in detail. However, I can provide some relevant information that may be related to the concept of object indexing in the context of language models and information retrieval:

1. In the context of retrieval-augmented language models, there is mention of using search indexes to store and retrieve relevant information. For example, from the "Challenges LLM July 19_23.pdf":

"We can decouple (i) memory storage of knowledge (e.g., databases or search indexes) and (ii) processing of the knowledge to arrive at a more modular architecture. For (i), a retriever module retrieves the top-k relevant documents (or passages) for a query from a large corpus of text."

2. In the context of visual tasks, there is mention of using API functions to locate and interact with objects. From the "Challenges LLM July 19_23.pdf":

"The Codex model is prompted with the query text and an API specification to do this. Th

In [107]:
# Ad-hoc test: question about one model's results (duplicated word "across" removed
# from the query text). chat_main's return value is indexed with [0] before printing.
user_query = 'What is the performance of LLaVa across multiple image domains / subjects?'
response = chat_main(user_query)
print(response[0])

Based on the information provided in the context, LLaVA (Large Language and Vision Assistant) demonstrates strong performance across multiple image domains and subjects:

1. Out-of-domain performance: LLaVA is able to understand scenes and follow question instructions to provide reasonable responses even for images that are out of its training domain. This suggests good generalization capabilities across different visual domains (Multimodal.pdf).

2. Quantitative evaluation: A systematic evaluation was conducted to assess LLaVA's performance across various aspects, including accuracy, concept coverage, reasoning ability, and creativity (Multimodal.pdf).

3. Benchmark performance: LLaVA significantly outperforms other models like BLIP-2 and OpenFlamingo on visual instruction following tasks (Multimodal.pdf).

4. Emergent behavior: LLaVA demonstrates the ability to understand visual contents not covered in its training data. For example, it can recognize individuals like Elon Musk in dif

In [108]:
# Ad-hoc test: asks for the list of processed papers rather than for the content of one paper.
user_query = 'Can you provide a list of papers that have been processed?'
response = chat_main(user_query)
print(response[0])

Based on the information provided in the Summary Table, the following papers have been processed:

1. attention.pdf - "Attention Is All You Need" by Ashish Vaswani
2. Multimodal.pdf - "Visual Instruction Tuning" by Haotian Liu et al.
3. Performance Evaluation.pdf - "A Multitask, Multilingual, Multimodal Evaluation on Reasoning, Hallucination, and Interactivity"
4. RAG Agent Resource-1.pdf - "Part-I: What is an Agent?"
5. Continual_Pretraining.pdf - "ADAPTING LARGE LANGUAGE MODELS VIA READING COMPREHENSION"
6. Challenges LLM July 19_23.pdf - "Challenges and Applications of Large Language Models" by Jean Kaddour et al.
7. llm_review 2.pdf - "ANOVERVIEW ON LANGUAGE MODELS : RECENT DEVELOPMENTS AND OUTLOOK"
8. cs224n-2023-lecture11-prompting-rlhf.pdf - "Natural Language Processing with Deep Learning"
9. Mistral.pdf - "Mistral 7B" by Albert Q. Jiang et al.

These 9 papers cover a range of topics related to language models, including attention mechanisms, visual instruction tuning, performan

In [109]:
# Ad-hoc test: factual question that explicitly asks for citations.
user_query = 'What is the largest GPT model provided in the papers? please cite.'
response = chat_main(user_query)
print(response[0])

Based on the information provided in the context, the largest GPT model mentioned is GPT-3 with 175 billion parameters. This is cited in multiple documents:

1. From the "Performance Evaluation.pdf":
"Large Language Models (LLMs) are language models with parameter sizes over a hundred billion, beginning with the introduction of GPT-3. Examples of LLMs include, but are not limited to, GPT-3, Gopher (Rae et al., 2021b), Megatron (Shoeybi et al., 2019), GPT-Jurassic (Lieber et al., 2021), OPT-175B Zhang et al. (2022)."

2. From "llm_review 2.pdf":
"GPT-3 2020 175B 45TB of text data $12 million"

3. From "cs224n-2023-lecture11-prompting-rlhf.pdf":
"GPT-3 (175B parameters; Brown et al., 2020)"

These sources consistently mention GPT-3 as having 175 billion parameters, which is the largest GPT model size provided in the given context.


In [112]:
# Ad-hoc test: asks which paper covers a given topic.
user_query = 'Which paper covers how to process multiple data for RAG?'
response = chat_main(user_query)
print(response[0])

Based on the provided context, the paper that most directly covers how to process multiple data for RAG (Retrieval-Augmented Generation) is the "RAG Agent Resource-1.pdf". This document discusses advanced RAG techniques for handling multiple documents. Specifically:

1. It mentions "Advanced RAG – Multi Documents Agent with LlamaIndex" which is directly relevant to processing multiple data sources for RAG.

2. The document provides a sample implementation link for multi-document RAG using LlamaIndex: https://github.com/sugarforever/Advanced-RAG/blob/main/03_llama_index_multi_doc_agent.ipynb

3. It discusses the limitations of standard RAG when dealing with multiple documents, stating that RAG can't answer questions that require "Scanning, Comparing, and Reasoning across all documents in your knowledge base simultaneously."

4. The paper proposes an architecture to overcome these limitations:
   - Set up a "document agent" for each document that can do QA/summarization within its doc
  

In [110]:
# Ad-hoc test: multi-topic question that also asks for author names.
user_query = 'Integrate the findings from efficient attention, continual pretraining, and performance evaluation. List the names of the authors'
response = chat_main(user_query)
print(response[0])

Based on the provided context, I can integrate the findings from efficient attention, continual pretraining, and performance evaluation, and list the relevant authors:

1. Efficient Attention:
- Ashish Vaswani et al. (authors of "Attention Is All You Need")
- Tri Dao, Daniel Y. Fu, Stefano Ermon, Atri Rudra, and Christopher Ré (authors of "FlashAttention")

2. Continual Pretraining:
- The specific authors are not mentioned, but the concept is discussed in the Continual_Pretraining.pdf document. The approach involves converting raw texts into reading comprehension tasks and including general instructions to improve domain-specific knowledge while preserving prompting performance.

3. Performance Evaluation:
- Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt (authors of "Measuring massive multitask language understanding")
- The Performance Evaluation.pdf document discusses evaluations of ChatGPT, but specific author names are not prov

In [111]:
# Ad-hoc test: asks for a definition and for the paper that focuses on the topic.
user_query = 'What is continued pretraining and which paper focuses on this topic?'
response = chat_main(user_query)
print(response[0])

Based on the provided context, the paper that focuses on continued pretraining is "Continual_Pretraining.pdf". This paper investigates the concept of continued pre-training for large language models.

Continued pretraining, also referred to as domain-adaptive pretraining, involves further training a pre-trained language model on domain-specific corpora. The goal is to adapt the model to specific domains while leveraging its general abilities acquired during initial pre-training.

Key points about continued pretraining from this paper include:

1. It's an approach to adapt large language models to specific domains like biomedicine, finance, and law.

2. The authors found that continued training on domain-specific raw corpora can endow the model with domain knowledge, but it can also hurt the model's prompting ability.

3. To address this issue, they propose a method of converting raw corpora into reading comprehension texts for continued pre-training, which helps preserve prompting perf

## 8. Sample questions (on Canvas) and answers

In [113]:
# Sample question: refers to a paper by position ("first") instead of by title.
user_query = 'What is the main hypothesis or research question addressed in the first academic article?'
response = chat_main(user_query)
print(response[0])

Based on the information provided in the Text Context from the "attention.pdf" document, the main hypothesis or research question addressed in the first academic article is:

The paper proposes a new network architecture called the Transformer, which is based solely on attention mechanisms, without using recurrence or convolutions for sequence transduction tasks.

This can be inferred from several key points in the context:

1. From the abstract (attention.pdf, chunk 1):
"We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely."

2. Further elaboration (attention.pdf, chunk 3):
"In this work we propose the Transformer, a model architecture eschewing recurrence and instead relying entirely on an attention mechanism to draw global dependencies between input and output."

3. The novelty of the approach is highlighted (attention.pdf, chunk 4):
"To the best of our knowledge, however, the Transf

In [114]:
# Sample question: refers to a paper by position ("second") instead of by title.
user_query = 'Identify one key finding from the second academic article.'
response = chat_main(user_query)
print(response[0])

Based on the information provided in the Text Context from the Multimodal.pdf, one key finding from this academic article is:

LLaVA (Large Language and Vision Assistant) demonstrates impressive multimodal chat abilities and yields a 85.1% relative score compared to GPT-4 on a synthetic multimodal instruction-following benchmark.

This finding is significant because it shows that LLaVA, the model developed in this study, performs well in tasks that combine visual and language understanding. The high relative score compared to GPT-4, which is considered a state-of-the-art model, indicates that LLaVA is capable of handling complex multimodal tasks effectively.


In [115]:
# Sample question: methodology summary for the paper referred to as "third".
user_query = 'Summarize the methodology used in the third academic article. Highlight any unique approaches or techniques employed.'
response = chat_main(user_query)
print(response[0])

Based on the information provided in the Text Context from the "Performance Evaluation.pdf", I can summarize the methodology used in this academic article as follows:

1. Multitask Evaluation: The researchers conducted experiments on ChatGPT using samples from standard public test sets across various NLP tasks. These tasks included question answering, reasoning, summarization, machine translation, automatic post-editing, sentiment analysis, and language identification.

2. Multilingual Evaluation: The study assessed ChatGPT's performance across multiple languages, though specific details about this aspect are not provided in the given context.

3. Multimodal Evaluation: While mentioned in the title, specific details about the multimodal evaluation are not present in the provided context.

4. Interactive Evaluation: The researchers explored the impact of multi-turn interactivity on performance in various NLP tasks. This approach showed significant improvements in some areas, such as an 

In [116]:
# Sample question: asks about a specific figure ("Figure 2") of the paper referred to as "first".
user_query = 'From the images and figures in the first article, describe the trend shown in Figure 2. What does it indicate about the research findings?'
response = chat_main(user_query)
print(response[0])

Based on the Figure/Table Context provided, I can describe the trend shown in Figure 2 of the paper "Attention Is All You Need" (attention.pdf). 

Figure 2 illustrates two key components of the Transformer architecture: Scaled Dot-Product Attention and Multi-Head Attention.

The left side of Figure 2 shows the Scaled Dot-Product Attention mechanism. The trend it indicates is a sequential flow of operations:

1. It starts with input matrices Q (Query), K (Key), and V (Value).
2. These inputs go through a series of operations:
   - Matrix multiplication (MatMul) of Q and K
   - Scaling
   - Optional masking
   - Softmax operation
   - Another matrix multiplication with V

The right side of Figure 2 depicts Multi-Head Attention, which shows multiple attention mechanisms operating in parallel.

This figure indicates several key research findings:

1. The Transformer model uses a novel attention mechanism that doesn't rely on recurrent or convolutional neural networks.

2. The attention mec

In [117]:
# Sample question: critical evaluation of the paper referred to as "first".
user_query = 'Critically evaluate the statistical methods used in the first article. Are there any limitations or strengths worth noting?'
response = chat_main(user_query)
print(response[0])

Based on the information provided in the text context from attention.pdf, I can critically evaluate the statistical methods used in the article "Attention Is All You Need" by Ashish Vaswani et al. Here are the key points:

1. Performance Metric:
The primary statistical method used to evaluate the model's performance is the BLEU score. This is a standard metric in machine translation tasks, which allows for comparison with other models. However, it's worth noting that while BLEU is widely used, it has limitations in capturing the full quality of translations.

Strength: Using a standardized metric allows for direct comparison with other models in the field.
Limitation: BLEU scores alone may not capture all aspects of translation quality, such as fluency or semantic accuracy.

2. Comparative Analysis:
The authors compare their model's performance against previous state-of-the-art models, including both single models and ensembles. This provides a clear benchmark for evaluating the Transf

In [118]:
# Sample question: asks to combine "the three articles" without naming them.
user_query = 'Integrate the findings from the three articles to propose a new research direction or hypothesis. Justify your proposal based on the evidence provided in the articles.'
response = chat_main(user_query)
print(response[0])

Based on the findings from the three articles, I propose a new research direction that integrates multimodal learning, attention mechanisms, and advanced reasoning capabilities in large language models. This proposal aims to develop more versatile and intelligent AI systems capable of handling complex tasks across different modalities while maintaining interpretability and efficiency. Here's the justification based on the evidence provided:

1. Multimodal Integration:
The "Visual Instruction Tuning" paper (Multimodal.pdf) demonstrates the potential of combining visual and language models to create more capable AI systems. They showed that instruction tuning using multimodal data can improve zero-shot capabilities on new tasks. Building on this, we can explore ways to seamlessly integrate multiple modalities (text, image, audio, video) into a single model architecture.

2. Advanced Attention Mechanisms:
The "Attention Is All You Need" paper (attention.pdf) introduced the transformer arc