In [1]:
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.schema import HumanMessage, SystemMessage, AIMessage
from langchain.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
    PromptTemplate,
)
from langchain.output_parsers import PydanticOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from pprint import pp
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
from utils import secrets

In [2]:
llm = ChatOpenAI(
    model="gpt-4-turbo-preview",
    api_key=secrets.get("OPENAI_API_KEY"),
    max_tokens=4096,
    temperature=0.0,
    model_kwargs={"response_format": {"type": "json_object"}},
)
embedder = OpenAIEmbeddings(openai_api_key=secrets.get("OPENAI_API_KEY"), model="text-embedding-3-large")

In [None]:
top_n = 10
dataset_name = "medquad"

In [None]:
df = pd.read_csv(f"{dataset_name}_ambiguous_with_top{top_n}_merged.csv", index_col=0)
n_queries = len(df)

## Taxonomy Example Query Generation

In [None]:
from models import Queries


output_parser = PydanticOutputParser(pydantic_object=Queries)

In [None]:
sys_message = """User is developing a taxonomy using Nickerson method. His meta-characteristic is "Large Language Model's interaction with the environment through the exchange of questions and answers, independent of internal knowledge or external actions beyond providing information or insights in response to queries.".

He wants you to perform some tasks. Your task is to satisfy the needs of him as good as possible."""

user_message = """I want you to give me {number_of_queries} different queries from mostly office/business kinds of scenarios/needs which an office worker or manager may face. Queries may be a question, a request, a task to be performed by a human. They need to be non-physical things as it is not allowed by the meta-characteristic. They should be diverse and apply to many kinds of scenarios that people can face in business. They may or may not include but definitely not limited to the question_categories provided below. They must be longer than {greater_than} words and less than {less_than} words.

question_categories = {{
    "Subject": ["Math", "History", "Physics", "Linguistic", "Science", "Finance"],
    "Style": ["Factual", "Generative", "Analysis", "Evaluation"],
    "Complexity": ["Simple", "Complex", "Multi-step"],
    "Answer Format": ["True/False", "Short Answer", "Long answer"],
    "Context": ["Context-dependent", "Context-independent"],
    "Reasoning": ["Deductive", "Inductive", "Abductive"]
    "Ambiguity": ["Implicit", "Explicit"]
}}

They should be different from already existing ones that are provided below.
{previous_generated_queries}

Be careful about providing {number_of_queries} **diverse** and **unique** queries with more than **{greater_than}** words and less than **{less_than}** words.
{format_instructions}"""

In [None]:
messages = [
    SystemMessagePromptTemplate(prompt=PromptTemplate(template=sys_message, input_variables=[])),
    HumanMessagePromptTemplate(
        prompt=PromptTemplate(
            template=user_message,
            input_variables=["number_of_queries", "greater_than", "less_than", "previous_generated_queries"],
            partial_variables={"format_instructions": output_parser.get_format_instructions()},
        )
    ),
]
prompt = ChatPromptTemplate.from_messages(messages=messages)

In [None]:
chain = prompt | llm | output_parser

In [None]:
questions = []

In [None]:
lengths = [5, 15, 25, 35, 45]
for i in tqdm(range(len(lengths))):
    resp = chain.invoke(
        {
            "number_of_queries": 20,
            "greater_than": lengths[i],
            "less_than": lengths[i] + 10,
            "previous_generated_queries": "\n".join([q for _, q in questions]),
        }
    )
    questions.extend(resp.questions)

100%|██████████| 5/5 [02:25<00:00, 29.10s/it]


In [None]:
len(questions)

100

In [None]:
questions

In [None]:
df = pd.DataFrame(data=questions, columns=["length_index", "question"])

In [None]:
df = df.drop(columns=["length_index"])
df["length"] = df["question"].apply(len)

In [None]:
df

In [None]:
df.to_csv("ai-generated-queries.csv")

## Classifier

In [None]:
df = pd.read_csv("ai-generated-queries.csv", index_col=0)

In [None]:
system_prompt = """User will give you a query and your task is to classify it with respect to following class descriptions.

{class_description}

Use class labels as in the square brackets, and output only the label within those square brackets, not something else. Just pure string which is the labels provided above. Do not try to chat, welcome, etc. the user in any way. Do not interact with him.
"""

In [None]:
user_message = """Query: {query}
Answer:"""

In [None]:
datatype = """Datatype: Two labels as [SIMP] and [COMP].
1. [SIMP] Simple Datatypes: Can be any of the following types.
    - Boolean: Questions that require a straightforward "yes" or "no" answer or a binary response.
    - Integer: Questions that expect numerical values without decimal points or fractional components.
    - Float: Questions that require numerical values with decimal points or fractional components.
    - String: Questions that call for textual responses or character-based information.
2. [COMP] Compound Datatypes: Can be any of the following types.
    - List/Array: Questions that necessitate responses containing multiple items or a collection of values.
    - Dictionary/Map: Questions that require structured responses with key-value pairs or associative data.
    - Object/Structure: Questions that expect responses formatted as complex data structures or objects with attributes and properties.
    - Multimedia: Queries involving audio, video, or other multimedia formats.
    - and any combination of above."""

In [None]:
contextual_sensitivity = """Contextual Sensitivity: Three labels as [IND], [DEP], and [SENS].
- [IND] Context Independent: These questions typically seek information about general concepts or fundamental principles that remain consistent over time/context. The context in which they're asked may change, but the core concepts remain unchanged.
- [DEP] Context Dependent: Questions falling under this category involve topics subject to periodic changes or updates according the context they are in, yet the core elements of the answer remain stable. The context may shift over time, requiring occasional updates to reflect evolving standards or practices.
- [SENS] Context Sensitive: These questions pertain to topics experiencing frequent or unpredictable changes with respect to different contexts, demanding continuous updates to the answer to keep pace with shifting conditions or emerging trends. The context surrounding these questions is dynamic and requires ongoing monitoring.
"""

In [None]:
query_understanding = """Query Understanding Classification: Two labels as [EXP] and [IMP].
- [EXP] Explicit: This category is reserved for queries that are articulated with precise language and directly reference specific concepts or entities relevant to the user's query. The queries under this classification convey the information request in a straightforward manner, enabling a direct response without the need for inferring context or deciphering ambiguous references. The focus is on the clarity and directness of the query's phrasing, ensuring immediate understanding without extrapolation.
- [IMP] Implicit: Implicit queries are less clear or direct, requiring interpretation of the context to provide a relevant response. This classification is applied to queries that are not directly articulated, often involving references to unknown concepts or entities that are not explicitly mentioned. Such queries require the Large Language Model to infer the missing pieces or to understand the context of the query without direct guidance from the user. The challenge lies in identifying and understanding these elements that are implied rather than clearly stated, to fulfill the query's requirements.
"""

In [None]:
retrieval_synthesis = """Retrieval & Synthesis Dimension: Three labels as [NOR], [SIR], [SRI]
1. [NOR] No Retrieval Inquiry: Interactions where the LLM relies solely on its internal knowledge base or generative capabilities to respond to queries. This involves situations where the LLM does not need to access or retrieve information from specific data sources or synthesize across different pieces of information. Instead, it can generate responses based on the patterns, concepts, and information it has learned during its training phase. This could involve generating creative content, predicting outcomes based on learned patterns, or applying generalized knowledge to hypothesize answers for hypothetical scenarios. This category represents the model's ability to utilize learned information without the explicit need for current, real-world data retrieval. These interactions showcase the model's capacity for abstraction, creativity, and application of general knowledge in generating responses.
2. [SIR] Simple Retrieval Inquiry: Queries involving straightforward retrieval of information without complex search or processing requirements. Simple Retrieval Inquiries involve accessing information that can be readily retrieved from one or more sources without processing. These inquiries typically pertain to factual information or well-defined data points that are easily accessible. Information can be readily retrieved from one or more sources. (think of it as a puzzle pieces, that can fit each other without any effort) This involves instances where the LLM fetches and delivers information directly as it is stored in its data sources, without needing to reframe, interpret, or significantly process the information beyond recognizing the inquiry's context. This is akin to quoting facts or data directly from the dataset. Such inquiries do not require the model to understand or synthesize information across contexts — it is essentially retrieving and presenting information as-is.
3. [SRI] Synthetic Retrieval Inquiry: Queries requiring advanced search techniques, multiple sequential steps, or processing to retrieve relevant information effectively. Synthetic Retrieval Inquiries involve accessing information that requires synthesis or integration of data from multiple sources or perspectives to generate a comprehensive response. These inquiries often involve complex concepts, interrelated data points, or nuanced interpretations that cannot be easily obtained from a single source. Synthesis of information to provide a comprehensive response. (think of it as wood, to make a table you need to process it) This category applies when the LLM must contextually process information from different segments or times, integrate diverse information or data sources, or apply reasoning to generate a response. This goes beyond mere retrieval, involving understanding contexts, drawing inferences, or synthesizing information in ways that werent explicitly outlined in the source material. This can include cross-referencing facts, identifying relationships between concepts, or integrating knowledge from various contexts to develop a comprehensive, coherent response.
"""

In [None]:
intent = """Query Intent: Six labels as [INF], [CONV], [DIV], [INS], [GEN], [COMP].
  1. [INF] Informational: Encompasses seeking factual information, definitions, and explanations. This category is for queries that aim directly at understanding specific facts or data.
    Example: "What is the population of Canada?”
  2. [CONV] Analytical/Evaluative (Convergent Thinking): This merged category is for queries that involve dissecting complex information or data to understand underlying patterns, reasons, or to make informed judgements. Queries in this category seek a deeper understanding that supports decision-making, comparison of options, or evaluating alternatives. The process involves both analysis (to break down and understand) and evaluation (to assess, compare, and decide), acknowledging that these processes often occur in tandem. This category involves queries where the goal is to converge upon a specific answer or decision through analysis and evaluation.
    Example: "Which laptop model offers the best performance for graphic design applications within a $1500 budget?”
  3. [DIV] Exploratory(Divergent Thinking): Queries that are inherently about seeking new areas of knowledge, understanding emerging trends, or identifying unknown opportunities without a specific end goal of making a decision or judgment. Exploratory queries are characterized by their openness and the lack of a predefined objective, differentiating them from evaluative queries which are directed towards forming a judgment or assessment. Exploratory queries encourage branching out into various directions to explore a wide array of possibilities without necessarily aiming to converge on a single answer or outcome.
    Example: "What are the emerging trends in renewable energy technology?”
  4. [INS] Instructional: Queries specifically seeking step-by-step guidance, instructions, or procedures to perform a task. it focuses on "how" to do something rather than "what," "why," or "which.
    Example: "How do I change a car tire?”
  5. [GEN] Generative: Generative queries should be distinctly categorized to emphasize their creative and output-generating nature. Unlike exploratory queries that ***diverge*** in the search for new knowledge or trends, generative queries specifically seek the creation of new content, ideas, or solutions. Generative queries implicate a divergent thought process focused on originality and creation, clearly setting it apart from exploratory intent.
    Example: "Write a blog post on Argentinian coffee”
  6. [COMP] Composite: This category includes queries that do not fit neatly into the above categories or have unique/multiple intents not covered by the previous categories.
    Example: "Considering the current trends in climate change, what are some sustainable business opportunities for the next decade, and how can one get started in these areas?"
"""

In [None]:
messages = [
    SystemMessagePromptTemplate(prompt=PromptTemplate(template=system_prompt, input_variables=["class_description"])),
    HumanMessagePromptTemplate(prompt=PromptTemplate(template=user_message, input_variables=["query"])),
]
prompt = ChatPromptTemplate.from_messages(messages=messages)

In [None]:
chain = prompt | llm

In [None]:
for cls_name, cls in [
    ("datatype", datatype),
    ("query_understanding", query_understanding),
    ("intent", intent),
    ("retrieval_synthesis", retrieval_synthesis),
    ("contextual_sensitivity", contextual_sensitivity),
]:
    for row in df.iterrows():
        query = row[1]["question"]
        answer = chain.invoke({"class_description": cls, "query": query})
        df.loc[row[0], f"label_{cls_name}"] = answer.content
        print(answer.content)

In [None]:
df

In [None]:
df.to_csv("labeled_queries.csv")

In [None]:
df["label_datatype"].unique()

array(['[COMP]', '[SIMP]', '[STRING]'], dtype=object)

In [None]:
df["label_query_understanding"].unique()

array(['[IMP]', '[EXP]'], dtype=object)

In [None]:
df["label_intent"].unique()

array(['[GEN]', '[CONV]', '[DIV]', '[INS]', '[INF]', '[COMP]'],
      dtype=object)

In [None]:
df["label_retrieval_synthesis"].unique()

array(['[NOR]', '[SRI]', '[SIR]'], dtype=object)

In [None]:
df["label_contextual_sensitivity"].unique()

array(['[SENS]', '[DEP]', '[IND]'], dtype=object)

In [None]:
df

Unnamed: 0,question,length,label_datatype,label_query_understanding,label_intent,label_retrieval_synthesis,label_contextual_sensitivity
0,Calculate quarterly sales projections.,38,[COMP],[IMP],[GEN],[NOR],[SENS]
1,Draft a new marketing strategy outline.,39,[COMP],[IMP],[GEN],[NOR],[DEP]
2,Compare last year's performance report.,39,[COMP],[IMP],[CONV],[SRI],[SENS]
3,Summarize findings from customer feedback surv...,50,[COMP],[IMP],[CONV],[SRI],[SENS]
4,Identify trends in recent market research.,42,[COMP],[IMP],[DIV],[SRI],[SENS]
...,...,...,...,...,...,...,...
95,Given the increasing importance of data securi...,192,[COMP],[IMP],[CONV],[SRI],[SENS]
96,Can you investigate the impact of recent regul...,165,[COMP],[IMP],[CONV],[SRI],[SENS]
97,Looking at our sales team's performance metric...,182,[COMP],[IMP],[CONV],[SRI],[SENS]
98,Can you assess the effectiveness of our intern...,177,[COMP],[IMP],[CONV],[SRI],[DEP]
