# Extract thesis/figure/table numbers from the user's query and look up the matching descriptions by those numbers

### Setup LLM

In [2]:
# pip install groq

In [53]:
from groq import Groq

def get_groq_response(client, prompt, model="llama3-70b-8192", max_tokens=2048, temperature=0.0):
    """
    Send a single-turn user prompt through the client's chat completion API.

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        prompt (str): Text sent as the content of the only ("user") message.
        model (str, optional): Model identifier passed to the API. Default is "llama3-70b-8192".
        max_tokens (int, optional): Upper bound on generated tokens. Default is 2048.
        temperature (float, optional): Sampling temperature. Default is 0.0.

    Returns:
        tuple: ``(content, usage)`` taken from the first choice of the completion,
        or ``(None, None)`` if anything raised while calling the API or reading the reply.
    """
    user_message = {"role": "user", "content": prompt}

    try:
        completion = client.chat.completions.create(
            messages=[user_message],
            model=model,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        # Only the first choice is used; usage statistics are passed through untouched.
        reply_text = completion.choices[0].message.content
        usage = completion.usage
    except Exception as e:
        # Best-effort boundary: report the failure and signal it with a (None, None) pair.
        print(f"An error occurred: {e}")
        return None, None

    return reply_text, usage

In [55]:
import os

# Prefer the GROQ_API_KEY environment variable so a real secret never has to be
# saved inside the notebook; the original placeholder stays as the fallback so
# the cell behaves exactly as before when the variable is not set.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY", "YOUR_API_KEY"),
)

In [73]:
# Smoke test: send a trivial prompt to confirm the client and model are reachable.
prompt = "Hello"
# get_groq_response returns a (content, usage) tuple; (None, None) on failure.
response = get_groq_response(client, prompt)
# Print only the reply text (index 0 of the tuple).
print(response[0])

Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat?


### Extract thesis and figure/table numbers from user's query

In [96]:
# Few-shot prompt template that asks the LLM to return, as a JSON-style list of
# objects, the thesis / figure / table numbers mentioned in a user's query.
# The literal "{USER_QUERY}" marker is substituted with the actual query by
# generate_prompt_extract_query (plain str.replace, not str.format, so the
# braces in the JSON examples need no escaping).
# Every example carries a unique number so the query/output pairs are unambiguous.
instruction_extract_query = """
### Instructions ###
You are an NLP engineer. Your task is to extract the "numbers" from the user's query below.
The "numbers" mean 1) which academic paper the user is referring to, 2) which figure the user is referring to, and 3) which table the user is referring to.
There may be cases where all, some, or none of these are specified. Enter the number only for the specified fields, and return an empty string "" for fields that are not specified. Interpret "figure" for terms such as "Chart," "Diagram," or "Image."
Please provide your response as a list of objects, each containing thesis, figure, and table. Please provide your response strictly in the specified format, without including any additional text for formatting. I will use your response directly.
If it is unclear which thesis, figure, or table is being referred to, it is okay to return an empty string. Please do not make any assumptions.

### Output Format ###
Format: a list of objects

### Example user's query1 ###
What is the main hypothesis or research question addressed in the first academic article?

### Example Output1 ###
[
  {
  "thesis": "1",
  "figure": "",
  "table": ""
  }
]

### Example user's query2 ###
Summarize the methodology used in the third academic article. Highlight any unique approaches or techniques employed.

### Example Output2 ###
[
  {
  "thesis": "3",
  "figure": "",
  "table": ""
  }
]


### Example user's query3 ###
Q. From the images and figures in the second article, describe the trend shown in Figure 2. What does it indicate about the research findings?

### Example Output3 ###
[
  {
  "thesis": "2",
  "figure": "2",
  "table": ""
  }
]

### Example user's query4 ###
Q. What can be understood from Image 3 in the third paper?

### Example Output4 ###
[
  {
  "thesis": "3",
  "figure": "3",
  "table": ""
  }
]

### Example user's query5 ###
Q. Please explain Figure 3 and Table 2 of the second academic paper. What do these indicate about the research findings?

### Example Output5 ###
[
  {
  "thesis": "2",
  "figure": "3",
  "table": ""
  },
  {
  "thesis": "2",
  "figure": "",
  "table": "2"
  }
]

### Example user's query6 ###
Q. Please compare table 3 and chart 4 from the second and third theses, respectively.

### Example Output6 ###
[
  {
  "thesis": "2",
  "figure": "",
  "table": "3"
  },
  {
  "thesis": "3",
  "figure": "4",
  "table": ""
  }
]

### Example user's query7 ###
Do you like an apple?

### Example Output7 ###
[
  {
  "thesis": "",
  "figure": "",
  "table": ""
  }
]


### User’s query ###
{USER_QUERY}

### Output ###
"""



In [75]:
def generate_prompt_extract_query(instruction, user_query):
    """
    Build the extraction prompt by substituting the user's query into the template.

    Args:
        instruction (str): Prompt template containing the literal marker "{USER_QUERY}".
        user_query (str): Text that takes the place of every occurrence of the marker.

    Returns:
        str: The template with the marker replaced; unchanged if the marker is absent.
    """
    placeholder = "{USER_QUERY}"
    # Plain substring replacement (not str.format), so other braces in the template are left alone.
    return instruction.replace(placeholder, user_query)

In [82]:
# Example: the query names a paper ("the first article") but no figure or table.
user_query = 'Critically evaluate the statistical methods used in the first article. Are there any limitations or strengths worth noting?'
# Fill the {USER_QUERY} marker of the instruction template with this query.
prompt_extract_query = generate_prompt_extract_query(instruction_extract_query, user_query)
# Ask the LLM for the numbers; the result is a (content, usage) tuple.
response = get_groq_response(client, prompt_extract_query)
# Show only the reply text, i.e. the raw list-of-objects string from the model.
print(response[0])

[
  {
  "thesis": "1",
  "figure": "",
  "table": ""
  }
]


In [83]:
# Example: the query refers to a chart in one thesis and a table in another,
# so the model is expected to return one object per reference.
user_query = 'What is the relationship between chart2 in thesis3 and table4 in the first thesis?'
# Fill the {USER_QUERY} marker of the instruction template with this query.
prompt_extract_query = generate_prompt_extract_query(instruction_extract_query, user_query)
# Ask the LLM for the numbers; the result is a (content, usage) tuple.
response = get_groq_response(client, prompt_extract_query)
# Show only the reply text.
print(response[0])

[
  {
  "thesis": "3",
  "figure": "2",
  "table": ""
  },
  {
  "thesis": "1",
  "figure": "",
  "table": "4"
  }
]


In [84]:
# Example: the query mentions no thesis, figure, or table at all,
# so every field of the reply is expected to be an empty string.
user_query = 'Please make the findings brief'
# Fill the {USER_QUERY} marker of the instruction template with this query.
prompt_extract_query = generate_prompt_extract_query(instruction_extract_query, user_query)
# Ask the LLM for the numbers; the result is a (content, usage) tuple.
response = get_groq_response(client, prompt_extract_query)
# Show only the reply text.
print(response[0])

[
  {
  "thesis": "",
  "figure": "",
  "table": ""
  }
]


In [87]:
import json

def parse_and_convert_keys(json_string):
    """
    Parse the LLM's JSON reply and convert its string fields to integers.

    The reply is expected to be a JSON list of objects, each holding the keys
    'thesis', 'figure', and 'table' with string values ("" meaning "not specified").
    Two common deviations are tolerated: a reply wrapped in a Markdown code fence
    (``` or ```json) and a single object that is not wrapped in a list.

    Args:
    json_string (str): The raw reply text. May be None, which is what
        get_groq_response yields as content when the API call fails.

    Returns:
    list: A list of dictionaries whose 'thesis', 'figure', and 'table' values are
        each an int, or None when the field was empty. An empty list is returned
        when the input is None, empty, not valid JSON, lacks one of the three keys,
        or holds a value that cannot be converted to int (a message is printed
        describing the problem).
    """
    if json_string is None:
        # Nothing came back from the LLM call; report it as a decoding problem
        # rather than letting json.loads fail with an opaque TypeError.
        print("JSON decoding error: no response to parse (got None)")
        return []

    payload = json_string
    if isinstance(payload, str):
        payload = payload.strip()
        if payload.startswith("```"):
            # Drop the opening fence line (it may carry a language tag such as
            # ```json) and the closing fence, keeping only the JSON in between.
            _, _, payload = payload.partition("\n")
            payload = payload.rstrip()
            if payload.endswith("```"):
                payload = payload[:-3]

    try:
        keys = json.loads(payload)
        if not keys:
            return []
        if isinstance(keys, dict):
            # A lone object is treated as a one-element list.
            keys = [keys]

        # Empty strings (and other falsy values) mean the field was not specified.
        return [
            {
                field: int(entry[field]) if entry[field] else None
                for field in ("thesis", "figure", "table")
            }
            for entry in keys
        ]
    except json.JSONDecodeError as e:
        print(f"JSON decoding error: {e}")
        return []
    except KeyError as e:
        print(f"Missing key in JSON data: {e}")
        return []
    except ValueError as e:
        print(f"Value error: {e}")
        return []
    except Exception as e:
        # The reply comes from an LLM and can have any shape; stay best-effort
        # and never let a malformed reply crash the caller.
        print(f"An unexpected error occurred: {e}")
        return []

In [126]:
# End-to-end check: extract the numbers with the LLM, then parse the reply into typed keys.
user_query = 'Critically evaluate the statistical methods used in the first article. Are there any limitations or strengths worth noting?'
prompt_extract_query = generate_prompt_extract_query(instruction_extract_query, user_query)
response = get_groq_response(client, prompt_extract_query)
# response[0] is the reply text; its string fields are converted to int / None.
keys = parse_and_convert_keys(response[0])
print(keys)

[{'thesis': 1, 'figure': None, 'table': None}]


In [128]:
# End-to-end check for a query that references two theses (one chart, one table).
user_query = 'What is the relationship between chart2 in thesis3 and table4 in the first thesis?'
prompt_extract_query = generate_prompt_extract_query(instruction_extract_query, user_query)
response = get_groq_response(client, prompt_extract_query)
# response[0] is the reply text; its string fields are converted to int / None.
keys = parse_and_convert_keys(response[0])
print(keys)

[{'thesis': 3, 'figure': 2, 'table': None}, {'thesis': 1, 'figure': None, 'table': 4}]


In [117]:
# End-to-end check for a query with no thesis/figure/table reference:
# every field of the single returned key is expected to be None.
user_query = 'Please make the findings brief'
prompt_extract_query = generate_prompt_extract_query(instruction_extract_query, user_query)
response = get_groq_response(client, prompt_extract_query)
# response[0] is the reply text; its string fields are converted to int / None.
keys = parse_and_convert_keys(response[0])
print(keys)

[{'thesis': None, 'figure': None, 'table': None}]


### Find the description from the table

In [64]:
import pandas as pd

In [65]:
# Toy lookup table used to exercise extract_descriptions: one row per figure or
# table, keyed by thesis number. Each row describes either a figure
# (table_num is None) or a table (figure_num is None), never both.
table_sample = [
    {"thesis_num": 1, "figure_num": 1, "table_num": None, "description": "This is description for figure1 in thesis 1"},
    {"thesis_num": 1, "figure_num": 2, "table_num": None, "description": "This is description for figure2 in thesis 1"},
    {"thesis_num": 1, "figure_num": 3, "table_num": None, "description": "This is description for figure3 in thesis 1"},
    {"thesis_num": 1, "figure_num": 4, "table_num": None, "description": "This is description for figure4 in thesis 1"},
    {"thesis_num": 2, "figure_num": 1, "table_num": None, "description": "This is description for figure1 in thesis 2"},
    {"thesis_num": 2, "figure_num": 2, "table_num": None, "description": "This is description for figure2 in thesis 2"},
    {"thesis_num": 2, "figure_num": 3, "table_num": None, "description": "This is description for figure3 in thesis 2"},
    {"thesis_num": 2, "figure_num": None, "table_num": 1, "description": "This is description for table1 in thesis 2"},
    {"thesis_num": 2, "figure_num": None, "table_num": 2, "description": "This is description for table2 in thesis 2"},
    {"thesis_num": 2, "figure_num": None, "table_num": 3, "description": "This is description for table3 in thesis 2"},
    {"thesis_num": 3, "figure_num": 1, "table_num": None, "description": "This is description for figure1 in thesis 3"},
    {"thesis_num": 3, "figure_num": 2, "table_num": None, "description": "This is description for figure2 in thesis 3"},
    {"thesis_num": 3, "figure_num": None, "table_num": 1, "description": "This is description for table1 in thesis 3"},
    {"thesis_num": 3, "figure_num": None, "table_num": 2, "description": "This is description for table2 in thesis 3"},
    {"thesis_num": 3, "figure_num": None, "table_num": 3, "description": "This is description for table3 in thesis 3"},
    {"thesis_num": 4, "figure_num": 1, "table_num": None, "description": "This is description for figure1 in thesis 4"},
    {"thesis_num": 4, "figure_num": 2, "table_num": None, "description": "This is description for figure2 in thesis 4"},
    {"thesis_num": 4, "figure_num": 3, "table_num": None, "description": "This is description for figure3 in thesis 4"},
    {"thesis_num": 4, "figure_num": None, "table_num": 1, "description": "This is description for table1 in thesis 4"},
    {"thesis_num": 4, "figure_num": None, "table_num": 2, "description": "This is description for table2 in thesis 4"},
    {"thesis_num": 4, "figure_num": None, "table_num": 3, "description": "This is description for table3 in thesis 4"}
]

# Rebind the same name to a DataFrame. The None entries show up as missing values,
# and figure_num / table_num are rendered as floats (1.0, 2.0, ...) in the output below.
table_sample  = pd.DataFrame(table_sample)
# Bare expression so the notebook displays the frame.
table_sample

Unnamed: 0,thesis_num,figure_num,table_num,description
0,1,1.0,,This is description for figure1 in thesis 1
1,1,2.0,,This is description for figure2 in thesis 1
2,1,3.0,,This is description for figure3 in thesis 1
3,1,4.0,,This is description for figure4 in thesis 1
4,2,1.0,,This is description for figure1 in thesis 2
5,2,2.0,,This is description for figure2 in thesis 2
6,2,3.0,,This is description for figure3 in thesis 2
7,2,,1.0,This is description for table1 in thesis 2
8,2,,2.0,This is description for table2 in thesis 2
9,2,,3.0,This is description for table3 in thesis 2


In [118]:
def extract_descriptions(df, keys):
    """
    Look up the description of each requested figure or table and label it.

    Args:
    df (DataFrame): Lookup table with 'thesis_num', 'figure_num', 'table_num', and 'description' columns.
    keys (list): Dictionaries with 'thesis', 'figure', and 'table' entries (int or None).

    Returns:
    list: One string per key, in order. A key with a figure number is looked up as a
        figure; otherwise a key with a table number is looked up as a table. The string is
        "thesis<T> figure<N> description: <text>" (or "table<N>"), with
        "Description not found" as the text when no row matches. A key with neither
        figure nor table yields the bare string "Description not found".
    """
    not_found = "Description not found"
    results = []

    for entry in keys:
        thesis = entry["thesis"]
        figure = entry["figure"]
        table = entry["table"]

        # The figure takes precedence when both a figure and a table are given.
        if figure is not None:
            kind, number, column = "figure", figure, "figure_num"
        elif table is not None:
            kind, number, column = "table", table, "table_num"
        else:
            # Nothing to look up for this key.
            results.append(not_found)
            continue

        label = f"thesis{thesis} {kind}{number} description: "
        row_mask = (df["thesis_num"] == thesis) & (df[column] == number)
        matches = df[row_mask]["description"].values

        # Only the first matching row is used.
        text = matches[0] if len(matches) > 0 else not_found
        results.append(label + text)

    return results

In [129]:
# Look up the figure/table descriptions for the parsed keys.
# NOTE(review): `keys` is whatever the most recently executed parsing cell left in the
# notebook namespace; the output shown below matches the chart2/table4 query, not the
# cell that precedes this one in file order — confirm execution order before re-running.
descriptions = extract_descriptions(table_sample, keys)
print(descriptions)

['thesis3 figure2 description: This is description for figure2 in thesis 3', 'thesis1 table4 description: Description not found']


### Get the thesis numbers and find their summaries

In [120]:
def extract_thesis_numbers(converted_keys):
    """
    Extracts the thesis numbers from a list of dictionaries.

    Entries whose 'thesis' value is None (the query did not name a thesis) are
    skipped: None is not a thesis number, and passing it on would make the
    summary lookup fail for the whole list.

    Args:
    converted_keys (list): A list of dictionaries with 'thesis', 'figure', and 'table' keys.

    Returns:
    list: The non-None thesis numbers, in input order (duplicates are kept).
        An empty list is returned when the input is not an iterable of
        dictionaries or an entry has no 'thesis' key.
    """
    try:
        return [item['thesis'] for item in converted_keys if item['thesis'] is not None]
    except (KeyError, TypeError):
        # Malformed input (None, non-dict entries, missing 'thesis') keeps the
        # original best-effort contract of yielding no thesis numbers.
        return []

In [130]:
# Collect the thesis numbers referenced by the parsed keys (one per key, in order).
# NOTE(review): relies on the `keys` value left by the most recently executed parsing
# cell; the output below corresponds to the chart2/table4 query.
keys_thesis = extract_thesis_numbers(keys)
print(keys_thesis)

[3, 1]


In [70]:
# Toy summary table: one row per thesis, holding its thesis_num and a summary
# text in the 'description' column. Used to exercise get_descriptions_for_thesis_summary.
table_sample_summary = pd.DataFrame([
    {"thesis_num": 1, "description": "This is summary description in thesis 1"},
    {"thesis_num": 2, "description": "This is summary description in thesis 2"},
    {"thesis_num": 3, "description": "This is summary description in thesis 3"},
    {"thesis_num": 4, "description": "This is summary description in thesis 4"}
])

# Bare expression so the notebook displays the frame.
table_sample_summary

Unnamed: 0,thesis_num,description
0,1,This is summary description in thesis 1
1,2,This is summary description in thesis 2
2,3,This is summary description in thesis 3
3,4,This is summary description in thesis 4


In [122]:
def get_descriptions_for_thesis_summary(thesis_numbers, table_summary):
    """
    Retrieves the descriptions for the given thesis numbers from the table_summary DataFrame.

    A thesis number with no matching row (including None) is skipped, so one
    unknown number no longer discards the summaries found for the others.

    Args:
    thesis_numbers (list): A list of thesis numbers.
    table_summary (pd.DataFrame): The DataFrame containing a 'thesis_num' column and a
        'description' column.

    Returns:
    list: One string per thesis number that has a summary, in input order, formatted as
        "Summary description for thesis <N>: '<description>'". An empty list is
        returned when nothing matches or when the inputs are malformed (for example
        table_summary is None or lacks the expected columns).
    """
    result = []
    try:
        for thesis_num in thesis_numbers:
            matches = table_summary.loc[
                table_summary['thesis_num'] == thesis_num, 'description'
            ].values
            if len(matches) == 0:
                # No summary stored for this thesis: skip it and keep going.
                continue
            # Only the first matching row is used.
            result.append(f"Summary description for thesis {thesis_num}: '{matches[0]}'")
    except (AttributeError, KeyError, TypeError, ValueError):
        # Malformed inputs keep the original best-effort contract of an empty result.
        return []
    return result

In [131]:
# Fetch the summary text for each thesis number collected above and print the
# formatted strings ("Summary description for thesis N: '...'").
descriptions_summary = get_descriptions_for_thesis_summary(keys_thesis, table_sample_summary)
print(descriptions_summary)

["Summary description for thesis 3: 'This is summary description in thesis 3'", "Summary description for thesis 1: 'This is summary description in thesis 1'"]
