In [1]:
import sys
sys.path.append('..')
import os
from importlib import reload

from utils import qa_advanced_scorer, qa_basic_scorer

import json
import pandas as pd

from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_openai import AzureChatOpenAI
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser

# Raw, unprocessed dataset kept for ad-hoc inspection; load_data() reads the same file again.
data = pd.read_json('train.json')

In [2]:
# display(data.loc[data['qa'].isna()]['qa_0'].iloc[0])
# display(data.loc[data['qa'].isna()]['qa_1'].iloc[0])
# display(data.loc[data['qa'].isna()]['annotation'].iloc[0])

In [3]:
# display(data.head(1)['qa'][0]['question'])
# display(data.head(1)['annotation'][0])

In [2]:
import math

def get_source_text(x):
    """Assemble the plain-text context for one dataset row.

    The ``pre_text`` lines, the string form of ``table`` and the ``post_text``
    lines are stitched together, separated by blank lines, with the table
    introduced by a ``table:`` marker.

    Args:
        x: Mapping-like row exposing ``pre_text``, ``table`` and ``post_text``.

    Returns:
        str: The combined document text.
    """
    leading = '\n'.join(x['pre_text'])
    table_repr = str(x['table'])
    trailing = '\n'.join(x['post_text'])
    sections = [leading, 'table:', table_repr, trailing]
    return '\n\n'.join(sections)

def split_qa(x):
    """Collect the question/answer entries attached to one dataset row.

    ``qa`` takes precedence: when it is present it is the only entry returned.
    Otherwise ``qa_0`` and ``qa_1`` are returned (in that order) for whichever
    of them is present.

    Args:
        x: Mapping-like row exposing ``qa``, ``qa_0`` and ``qa_1``.

    Returns:
        list: The present entries; empty when none of the three is present.
    """
    def _is_missing(value):
        # Missing cells show up as float NaN (which stringifies to 'nan').
        # None is treated the same way so it is never emitted as a QA entry
        # that downstream code would try to index.
        return value is None or str(value) == 'nan'

    if not _is_missing(x['qa']):
        return [x['qa']]

    return [entry for entry in (x['qa_0'], x['qa_1']) if not _is_missing(entry)]

def load_data(path='train.json'):
    """Load the dataset and flatten it to one row per question/answer entry.

    Adds four columns to the frame read from ``path``:
    ``source_text`` (via ``get_source_text``), ``qa_exploded`` (one entry from
    ``split_qa`` per row after exploding), and the ``question`` / ``answer``
    values pulled out of each entry.

    Args:
        path: JSON file to read. Defaults to ``'train.json'`` so existing
            zero-argument calls behave as before.

    Returns:
        pandas.DataFrame: The flattened frame with a fresh integer index.
    """
    def _field(entry, key):
        # Rows without any QA entry explode to NaN; anything that is not a
        # dict yields None instead of raising on the subscript.
        return entry[key] if isinstance(entry, dict) else None

    frame = pd.read_json(path)
    frame['source_text'] = frame.apply(get_source_text, axis=1)
    frame['qa_exploded'] = frame.apply(split_qa, axis=1)
    frame = frame.explode('qa_exploded', ignore_index=True)
    frame['question'] = frame['qa_exploded'].apply(lambda entry: _field(entry, 'question'))
    frame['answer'] = frame['qa_exploded'].apply(lambda entry: _field(entry, 'answer'))
    return frame

# Flattened frame: one row per question/answer entry, with source_text/question/answer columns.
data_clean = load_data()

In [5]:
# Sanity check: frame dimensions, then the first two rows as the cell output.
print(data_clean.shape)
data_clean.head(2)

(3965, 14)


Unnamed: 0,pre_text,post_text,filename,table_ori,table,qa,id,annotation,qa_0,qa_1,source_text,qa_exploded,question,answer
0,"[26 | 2009 annual report in fiscal 2008 , reve...","[year ended june 30 , cash provided by operati...",JKHY/2009/page_28.pdf,"[[, Year ended June 30, 2009], [2008, 2007], [...","[[2008, year ended june 30 2009 2008, year end...",{'question': 'what was the percentage change i...,Single_JKHY/2009/page_28.pdf-3,{'amt_table': '<table class='wikitable'><tr><t...,,,"26 | 2009 annual report in fiscal 2008 , reven...",{'question': 'what was the percentage change i...,what was the percentage change in the net cash...,14.1%
1,[substantially all of the goodwill and other i...,[the above unaudited pro forma financial infor...,RSG/2008/page_114.pdf,"[[, Year Ended December 31, 2008 (Unaudited), ...","[[, year ended december 31 2008 ( unaudited ),...",{'question': 'what was the percent of the grow...,Single_RSG/2008/page_114.pdf-2,{'amt_table': '<table class='wikitable'><tr><t...,,,substantially all of the goodwill and other in...,{'question': 'what was the percent of the grow...,what was the percent of the growth in the reve...,1.3%


## Simple RAG

In [243]:
def qa_simple_rag(text,question):
    """Answer ``question`` from ``text`` with a single prompt -> LLM -> string pipeline.

    Args:
        text: Document text inserted into the prompt.
        question: Question the model should answer from that text.

    Returns:
        str: The model's reply as parsed by ``StrOutputParser``.
    """
    template = """
    You are reading a financial document. 
    Provide an answer to the following question based on the information provided in the text.
    Give the answer only.
    \n\n{text}\n\n
    Question: {question}
    """
    prompt = PromptTemplate(
        input_variables=["text", "question"],
        template=template,
    )

    # Connection settings are read from the environment on every call.
    model = AzureChatOpenAI(
        api_key=os.getenv('AZURE_OPENAI_API_KEY'),
        azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
        api_version=os.getenv('AZURE_OPENAI_API_VERSION'),
        temperature=0.15,
        max_tokens=None,
        timeout=None,
        # max_retries=3,
    )

    pipeline = prompt | model | StrOutputParser()
    payload = {'text': text, 'question': question}
    return pipeline.invoke(payload)

# NOTE(review): `text` and `question` are not assigned anywhere above this cell
# (they are only assigned in a later cell), so this call relies on leftover
# kernel state and fails under Restart & Run All.
qa_simple_rag(text,question)
# qa_simple_rag('in 2022, the company made 200 pounds after spending 100','what is the total profit made in 2022?')

'$12.0 million'

## RAG With Math Tool

#### (LLMMathChain is deprecated...)

In [6]:
from langchain_openai import OpenAI
from langchain.chains import LLMMathChain, LLMChain
from langchain.prompts import PromptTemplate
from langchain.agents.agent_types import AgentType
from langchain.agents import Tool, initialize_agent
from langchain_experimental.utilities import PythonREPL
import numexpr

def qa_maths_reasoning(text,question):
    """Answer a question about ``text`` with a zero-shot ReAct agent that has maths tools.

    The agent is given four tools: an ``LLMMathChain`` calculator, a
    ``numexpr``-based calculator, an LLM "reasoning" chain and a Python REPL.
    The rendered prompt (document + question) is piped straight into the agent.

    Args:
        text: Source document the answer must be based on.
        question: Question to answer.

    Returns:
        Whatever the agent executor returns for the rendered prompt.
    """
    # Imported locally: the nested @tool decorator below needs it, and the
    # notebook only imports `tool` at module level in a later cell, so calling
    # this function in cell order would otherwise raise NameError.
    from langchain_core.tools import tool

    llm = AzureChatOpenAI(
        api_key=os.getenv('AZURE_OPENAI_API_KEY'),
        azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
        api_version=os.getenv('AZURE_OPENAI_API_VERSION'),
        temperature=0.15,
        max_tokens=None,
        timeout=None,
        # max_retries=3,
    )

    # Tool 1: LLM-backed maths chain.
    problem_chain = LLMMathChain.from_llm(llm=llm)
    math_tool = Tool.from_function(name="Calculator",
                    func=problem_chain.run,
                    description="""Useful for when you need to answer questions 
                    about math. This tool is only for math questions and nothing else. Only input
                    math expressions.""")

    # Tool 2: direct numexpr evaluation. The docstring is left untouched
    # because the @tool decorator exposes it to the model as the description.
    @tool
    def calculator(expression: str) -> str:
        """Calculate expression using Python's numexpr library.

        Expression should be a single line mathematical expression
        that solves the problem.

        Examples:
            "37593 * 67" for "37593 times 67"
            "37593**(1/5)" for "37593^(1/5)"
        """
        local_dict = {"pi": math.pi, "e": math.e}
        return str(
            numexpr.evaluate(
                expression.strip(),
                global_dict={},  # restrict access to globals
                local_dict=local_dict,  # expose the pi and e constants
            )
        )

    # Tool 3: free-form step-by-step reasoning through the same LLM.
    word_problem_template = """You are a reasoning agent tasked with solving 
    the user's logic-based questions. Logically arrive at the solution, and be 
    factual. In your answers, clearly detail the steps involved and give the 
    final answer. Provide the response in bullet points. 
    Question  {question}"""

    math_assistant_prompt = PromptTemplate(input_variables=["question"],
                                        template=word_problem_template
                                        )
    word_problem_chain = LLMChain(llm=llm,
                                prompt=math_assistant_prompt)
    word_problem_tool = Tool.from_function(name="Reasoning Tool",
                                        func=word_problem_chain.run,
                                        description="Useful for when you need to answer logic-based/reasoning questions.",
                                        )

    # Tool 4: Python REPL.
    # NOTE(review): this runs model-generated Python in the current process.
    python_repl = PythonREPL()
    repl_tool = Tool(
        name="python_repl",
        description="A Python shell. Use this to execute python commands. Input should be a valid python command. If you want to see the output of a value, you should print it out with `print(...)`.",
        func=python_repl.run,
    )

    agent = initialize_agent(
        tools=[math_tool,calculator,word_problem_tool,repl_tool],
        # tools=[math_tool],
        llm=llm,
        agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        verbose=False,
        handle_parsing_errors=True
    )

    prompt_str = """
    You are reading a financial document. 
    Provide an answer to the following question based on the information provided in the text.
    Give the answer only without repeating the question or statement.
    \n\n{text}\n\n
    Question: {question}
    """

    prompt_template = PromptTemplate(
        template=prompt_str,
        input_variables=["text", "question"],
    )

    # The rendered prompt becomes the agent's input.
    chain = prompt_template | agent 

    response = chain.invoke({
        'text': text,
        'question':question
    })

    return response

# qa_maths_reasoning(text,question)


For example, replace imports like: `from langchain.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_experimental.utilities.python import PythonREPL


## RAG With Math Tool (Langgraph)

In [51]:
import math
from typing import Annotated, Sequence

from IPython.display import Image, display
from langchain_core.messages import BaseMessage
from langchain_core.runnables import RunnableConfig
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI
from langgraph.graph import END, StateGraph
from langgraph.graph.message import add_messages
from langgraph.prebuilt.tool_node import ToolNode
import numexpr
from typing_extensions import TypedDict

from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_openai import AzureChatOpenAI
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser

@tool
def calculator(expression: str) -> str:
    """Calculate expression using Python's numexpr library.

    Expression should be a single line mathematical expression
    that solves the problem.
    If proportion or portion is mentioned, give the answer as a percentage.

    Examples:
        "37593 * 67" for "37593 times 67"
        "37593**(1/5)" for "37593^(1/5)"
    """
    # The docstring above is kept verbatim: the @tool decorator hands it to
    # the model as the tool description.
    constants = {"pi": math.pi, "e": math.e}  # names an expression may reference
    cleaned = expression.strip()
    value = numexpr.evaluate(
        cleaned,
        global_dict={},  # no access to surrounding globals
        local_dict=constants,
    )
    return str(value)

# Shared chat model used by every graph node; key, endpoint and API version
# are read from environment variables.
llm = AzureChatOpenAI(
    api_key=os.getenv('AZURE_OPENAI_API_KEY'),
    azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
    api_version=os.getenv('AZURE_OPENAI_API_VERSION'),
    temperature=0,
    max_tokens=None,
    timeout=None,
)
# Tools the model may call; the same list is handed to ToolNode when the graph is built.
tools = [calculator]
# Remove the tool_choice parameter for older API versions
llm_with_tools = llm.bind_tools(tools)

class ChainState(TypedDict):
    """LangGraph state shared by the answer-and-score graph nodes."""
    # Conversation so far; node updates are merged through the add_messages reducer.
    messages: Annotated[Sequence[BaseMessage], add_messages]
    # Reference answer supplied by the caller when the graph is invoked.
    ground_truth: str
    # Free-text verdict written by llm_score.
    score_reasoning: str
    # Final verdict written by clean_response_score.
    # NOTE(review): that node stores the raw LLM string here, not a bool.
    score: bool
    # Cleaned final answer written by clean_response.
    response: str

def call_chain(state: ChainState, config: RunnableConfig):
    """Graph node: run the tool-enabled model on the conversation so far.

    Returns a state update that appends the model's reply to ``messages`` and
    carries ``ground_truth`` through unchanged.
    """
    reply = llm_with_tools.invoke(state["messages"], config)
    return {"messages": [reply], "ground_truth": state["ground_truth"]}

def call_model(state: ChainState, config: RunnableConfig):
    """Graph node: run the plain model (no tools bound) on the conversation so far.

    Returns a state update that appends the model's reply to ``messages`` and
    carries ``ground_truth`` through unchanged.
    """
    reply = llm.invoke(state["messages"], config)
    return {"messages": [reply], "ground_truth": state["ground_truth"]}

def clean_response(state: ChainState, config: RunnableConfig) -> dict:
    """Graph node: reduce the model's last reply to a bare final answer.

    The text of the last message and the first message's content (used as the
    question) are sent to the LLM with an instruction to return only the final
    answer.

    Returns:
        dict: State update with the cleaned answer under ``response`` (also
        appended to ``messages``) and ``ground_truth`` carried through.
    """
    # NOTE(review): assumes the first message carries the user's question - confirm against the caller.
    question = state['messages'][0].content
    last_message = state["messages"][-1]
    ground_truth = state["ground_truth"]
    prompt_str = """
    Give only the final answer without re-stating the question.
    \nResponse:\n{messages}\n
    Question:\n{question}\n
    """

    # Both placeholders used by the template are declared.
    prompt_template = PromptTemplate(
        template=prompt_str,
        input_variables=["messages", "question"],
    )

    answer_chain = prompt_template | llm | StrOutputParser()

    # Send only the reply text; formatting the whole message object would also
    # dump its metadata fields into the prompt.
    reply_text = getattr(last_message, "content", last_message)
    response = answer_chain.invoke({'question':question,'messages':reply_text})

    return {"response":response, "messages": [response],"ground_truth":ground_truth}

def llm_score(state: ChainState, config: RunnableConfig) -> dict:
    """Graph node: ask the LLM whether the cleaned answer matches the ground truth.

    Reads ``state['response']`` and ``state['ground_truth']`` and stores the
    model's free-text verdict under ``score_reasoning``.

    Returns:
        dict: State update carrying ``response``, ``messages``,
        ``ground_truth`` and the new ``score_reasoning``.
    """
    prompt_str = """
    Compare the numerical values in the ground truth and model answer to decide if they are the same answer.
    Follow these steps:
    1. Extract the numerical value from both answers. Ignore the units.
    2. Return True if the absolute values are equivalent within a reasonable margin, False otherwise
    
    Ground truth: {ground_truth}
    Model answer: {answer}
    
    Let's solve this step by step:
    1. Ground truth number: [extract number]
    2. Compare with tolerance: [comparison result]
    
    Final answer (True/False): """

    prompt_template = PromptTemplate(
        template=prompt_str,
        input_variables=["answer", "ground_truth"],
    )

    ground_truth = state["ground_truth"]
    answer = state['response']

    # The prompt only needs the answer and the ground truth; the original
    # question is not part of it, so no message is read here.
    scoring_chain = prompt_template | llm | StrOutputParser()

    score = scoring_chain.invoke({
        'answer':answer,
        'ground_truth':ground_truth
        })

    return {
        "response":state['response'],
        "messages": state['messages'],
        "ground_truth":ground_truth, 
        'score_reasoning':score
        }

def clean_response_score(state: ChainState, config: RunnableConfig) -> dict:
    """Graph node: boil the scoring explanation down to a bare True/False verdict.

    Sends ``state['score_reasoning']`` to the LLM and stores its reply under
    ``score``.

    Returns:
        dict: State update carrying ``response``, ``messages``,
        ``ground_truth``, ``score_reasoning`` and the new ``score``.
    """
    prompt_str = """
    Give only the final answer (True/False) from the score response.
    \nResponse:\n{score_reasoning}\n
    """

    # The declared variable matches the placeholder actually used in the template.
    prompt_template = PromptTemplate(
        template=prompt_str,
        input_variables=["score_reasoning"],
    )

    verdict_chain = prompt_template | llm | StrOutputParser()

    response = verdict_chain.invoke({'score_reasoning':state['score_reasoning'],})

    # NOTE(review): `score` is the model's raw text (e.g. "True"), not a bool.
    return {
        "response":state['response'],
        "messages": state['messages'],
        "ground_truth":state['ground_truth'], 
        'score_reasoning':state['score_reasoning'],
        'score':response
        }

# Assemble the linear answer-and-score pipeline:
# call_tool -> execute_tool -> call_model -> clean_response -> llm_score -> clean_response_score -> END
graph_builder = StateGraph(ChainState)
graph_builder.add_node("call_tool", call_chain)
graph_builder.add_node("execute_tool", ToolNode(tools))
graph_builder.add_node("call_model", call_model)
graph_builder.add_node("clean_response", clean_response)
graph_builder.add_node("llm_score", llm_score)
graph_builder.add_node("clean_response_score", clean_response_score)
graph_builder.set_entry_point("call_tool")
# NOTE(review): the call_tool -> execute_tool edge is unconditional, so the tool
# node runs even when the model replied without requesting a tool - confirm intended.
graph_builder.add_edge("call_tool", "execute_tool")
graph_builder.add_edge("execute_tool", "call_model")
graph_builder.add_edge("call_model", "clean_response")
graph_builder.add_edge("clean_response", 'llm_score')
graph_builder.add_edge('llm_score','clean_response_score')
graph_builder.add_edge('clean_response_score',END)
# Module-level compiled graph used by qa_maths_reasoning_langgraph.
chain = graph_builder.compile()

def qa_maths_reasoning_langgraph(text: str, question: str, ground_truth):
    """Run the compiled graph on one document/question pair.

    Args:
        text: Source document inserted into the user query.
        question: Question to answer from that document.
        ground_truth: Reference answer, passed into the graph state.

    Returns:
        The final graph state, with the original ``question`` added under the
        ``question`` key.
    """
    query = f"""read the following text:\n---{text}\n---\nQuestion: {question}"""
    # A (role, content) tuple yields ONE user message. A flat ['user', query]
    # list is read as two separate human messages, the first one being the
    # literal text "user".
    result = chain.invoke({'messages': [('user', query)], 'ground_truth': ground_truth})
    result['question'] = question
    return result

# display(Image(chain.get_graph().draw_mermaid_png()))
# result = qa_maths_reasoning_langgraph(text, question, answer)

In [107]:
import math
import numexpr
from typing import Annotated, Sequence
from typing_extensions import TypedDict

from IPython.display import Image, display
from langchain_core.messages import BaseMessage
from langchain_core.runnables import RunnableConfig
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI
from langgraph.graph import END, StateGraph
from langgraph.graph.message import add_messages
from langgraph.prebuilt.tool_node import ToolNode
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_openai import AzureChatOpenAI
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser

@tool
def calculator(expression: str) -> str:
    """Calculate expression using Python's numexpr library.

    Expression should be a single line mathematical expression
    that solves the problem.
    If proportion or portion is mentioned, give the answer as a percentage.

    Examples:
        "37593 * 67" for "37593 times 67"
        "37593**(1/5)" for "37593^(1/5)"
    """
    # The docstring above is kept verbatim: the @tool decorator hands it to
    # the model as the tool description.
    constants = {"pi": math.pi, "e": math.e}  # names an expression may reference
    cleaned = expression.strip()
    value = numexpr.evaluate(
        cleaned,
        global_dict={},  # no access to surrounding globals
        local_dict=constants,
    )
    return str(value)

# Shared chat model used by every graph node; key, endpoint and API version
# are read from environment variables.
llm = AzureChatOpenAI(
    api_key=os.getenv('AZURE_OPENAI_API_KEY'),
    azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
    api_version=os.getenv('AZURE_OPENAI_API_VERSION'),
    temperature=0,
    max_tokens=None,
    timeout=None,
)
# Tools the model may call; the same list is handed to ToolNode when the graph is built.
tools = [calculator]
# Remove the tool_choice parameter for older API versions
llm_with_tools = llm.bind_tools(tools)

# NOTE(review): ChainState is declared a second time later in this cell; the
# later declaration is the one in effect when the graph is built.
class ChainState(TypedDict):
    """LangGraph state with enhanced scoring."""
    # Conversation so far; node updates are merged through the add_messages reducer.
    messages: Annotated[Sequence[BaseMessage], add_messages]
    # Reference answer supplied by the caller when the graph is invoked.
    ground_truth: str
    # Free-text verdict read by consensus_scorer as the "original" score.
    score_reasoning: str
    # Parsed JSON produced by advanced_scorer.
    detailed_score: dict
    # Final True/False decision written by consensus_scorer.
    consensus_score: bool
    # Not written by any node in this cell; the wrapper function adds it to the result afterwards.
    overall_score: float
    # Cleaned final answer written by clean_response.
    response: str

def call_chain(state: ChainState, config: RunnableConfig):
    """Graph node: run the tool-enabled model on the conversation so far.

    Returns a state update that appends the model's reply to ``messages`` and
    carries ``ground_truth`` through unchanged.
    """
    reply = llm_with_tools.invoke(state["messages"], config)
    return {"messages": [reply], "ground_truth": state["ground_truth"]}

def call_model(state: ChainState, config: RunnableConfig):
    """Graph node: run the plain model (no tools bound) on the conversation so far.

    Returns a state update that appends the model's reply to ``messages`` and
    carries ``ground_truth`` through unchanged.
    """
    reply = llm.invoke(state["messages"], config)
    return {"messages": [reply], "ground_truth": state["ground_truth"]}

def clean_response(state: ChainState, config: RunnableConfig) -> dict:
    """Graph node: reduce the model's last reply to a bare final answer (with unit).

    The text of the last message and the first message's content (used as the
    question) are sent to the LLM with an instruction to return only the final
    answer.

    Returns:
        dict: State update with the cleaned answer under ``response`` (also
        appended to ``messages``) and ``ground_truth`` carried through.
    """
    # NOTE(review): assumes the first message carries the user's question - confirm against the caller.
    question = state['messages'][0].content
    last_message = state["messages"][-1]
    ground_truth = state["ground_truth"]
    prompt_str = """
    Give only the final answer with its unit where relevant without re-stating the question.
    \nResponse:\n{messages}\n
    Question:\n{question}\n
    """

    # Both placeholders used by the template are declared.
    prompt_template = PromptTemplate(
        template=prompt_str,
        input_variables=["messages", "question"],
    )

    answer_chain = prompt_template | llm | StrOutputParser()

    # Send only the reply text; formatting the whole message object would also
    # dump its metadata fields into the prompt.
    reply_text = getattr(last_message, "content", last_message)
    response = answer_chain.invoke({'question':question,'messages':reply_text})

    return {"response":response, "messages": [response],"ground_truth":ground_truth}

# Redeclaration of ChainState within the same cell. It lists `overall_score`
# as well, so it does not silently narrow the schema declared earlier.
class ChainState(TypedDict):
    """LangGraph state with enhanced scoring."""
    # Conversation so far; node updates are merged through the add_messages reducer.
    messages: Annotated[Sequence[BaseMessage], add_messages]
    # Reference answer supplied by the caller when the graph is invoked.
    ground_truth: str
    # Free-text verdict read by consensus_scorer as the "original" score.
    score_reasoning: str
    # Parsed JSON produced by advanced_scorer.
    detailed_score: dict
    # Final True/False decision written by consensus_scorer.
    consensus_score: bool
    # Overall score taken from detailed_score by the wrapper function.
    overall_score: float
    # Cleaned final answer written by clean_response.
    response: str

def advanced_scorer(state: ChainState, config: RunnableConfig) -> dict:
    """Graph node: score the answer on several aspects and return structured JSON.

    The first message's content (used as the question), the cleaned answer and
    the ground truth are sent to the LLM, whose reply is parsed with
    ``JsonOutputParser``.

    Returns:
        dict: State update carrying ``response``, ``messages``,
        ``ground_truth`` and the parsed ``detailed_score``.
    """
    
    prompt_str = """
    Perform a detailed analysis of the model's answer compared to the ground truth.
    Consider multiple aspects in your evaluation:
    
    1. Numerical Accuracy:
       - Extract and compare numerical values
       - Consider acceptable margin of error (±1% or correct to the whole number)
       - Check for unit consistency
    
    2. Conceptual Correctness:
       - Verify if the approach/methodology is correct
       - Check if all required components are present
    
    3. Context Relevance:
       - Ensure the answer addresses the specific question
       - Verify if any contextual requirements are met
    
    Question: {query}
    Ground truth: {ground_truth}
    Model answer: {answer}
    
    Analyze the response and return a JSON object with this exact structure:
    {{
        "numerical_accuracy": {{
            "score": <float between 0 and 1>,
            "reasoning": "<explanation>"
        }},
        "conceptual_correctness": {{
            "score": <float between 0 and 1>,
            "reasoning": "<explanation>"
        }},
        "context_relevance": {{
            "score": <float between 0 and 1>,
            "reasoning": "<explanation>"
        }},
        "overall_score": <float between 0 and 1>,
        "is_correct": <boolean>
    }}
    
    Ensure your response is a valid JSON object matching this structure exactly.
    """

    prompt_template = PromptTemplate(
        template=prompt_str,
        input_variables=["query", "answer", "ground_truth"],
    )

    scoring_chain = prompt_template | llm | JsonOutputParser()

    # NOTE(review): assumes the first message carries the user's question - confirm against the caller.
    detailed_score = scoring_chain.invoke({
        'query': state['messages'][0].content,
        'answer': state['response'],
        'ground_truth': state['ground_truth']
    })

    return {
        "response": state['response'],
        "messages": state['messages'],
        "ground_truth": state['ground_truth'],
        "detailed_score": detailed_score
    }

def _parse_consensus_decision(consensus: str) -> bool:
    """Extract the True/False verdict from the consensus reply."""
    import re

    # Accept the verdict on any line, in any letter case, with or without the
    # square brackets shown in the prompt's format example.
    match = re.search(r"DECISION:\s*\[?\s*(true|false)\b", consensus, flags=re.IGNORECASE)
    if match is not None:
        return match.group(1).lower() == 'true'
    # No recognisable marker: fall back to the strict first-line check.
    return consensus.split('\n')[0].replace('DECISION:', '').strip() == 'True'


def consensus_scorer(state: ChainState, config: RunnableConfig) -> dict:
    """Graph node: combine the original score with the detailed score.

    Sends ``state['score_reasoning']`` and ``state['detailed_score']`` to the
    LLM, asks for a ``DECISION:`` / ``REASONING:`` reply and stores the parsed
    decision as a bool under ``consensus_score``.

    Returns:
        dict: State update carrying ``response``, ``messages``,
        ``ground_truth``, ``score_reasoning``, ``detailed_score`` and the new
        ``consensus_score``.
    """
    
    prompt_str = """
    Analyze both scoring approaches and provide a final consensus.
    
    Original scoring reasoning:
    {original_score}
    
    Detailed scoring analysis:
    {detailed_score}
    
    Based on both scoring methods, make a final decision.
    Consider:
    1. The original binary score
    2. The detailed numerical accuracy score
    3. The conceptual correctness score
    4. The context relevance score
    
    Provide your response in exactly this format:
    DECISION: [True/False]
    REASONING: [Your brief explanation]
    """

    prompt_template = PromptTemplate(
        template=prompt_str,
        input_variables=["original_score", "detailed_score"],
    )

    consensus_chain = prompt_template | llm | StrOutputParser()

    consensus = consensus_chain.invoke({
        'original_score': state['score_reasoning'],
        'detailed_score': str(state['detailed_score'])
    })

    final_decision = _parse_consensus_decision(consensus)

    return {
        "response": state['response'],
        "messages": state['messages'],
        "ground_truth": state['ground_truth'],
        "score_reasoning": state['score_reasoning'],
        "detailed_score": state['detailed_score'],
        "consensus_score": final_decision
    }

# Update the graph with new scoring nodes
# Rebuild the graph with the extra scoring nodes:
# call_tool -> execute_tool -> call_model -> clean_response -> llm_score -> advanced_scorer -> consensus_scorer -> END
graph_builder = StateGraph(ChainState)
graph_builder.add_node("call_tool", call_chain)
graph_builder.add_node("execute_tool", ToolNode(tools))
graph_builder.add_node("call_model", call_model)
graph_builder.add_node("clean_response", clean_response)
# NOTE(review): llm_score is not defined in this cell; it must already exist in
# the kernel from the previous cell.
graph_builder.add_node("llm_score", llm_score)
graph_builder.add_node("advanced_scorer", advanced_scorer)
graph_builder.add_node("consensus_scorer", consensus_scorer)
graph_builder.set_entry_point("call_tool")

# Define the enhanced flow
graph_builder.add_edge("call_tool", "execute_tool")
graph_builder.add_edge("execute_tool", "call_model")
graph_builder.add_edge("call_model", "clean_response")
graph_builder.add_edge("clean_response", "llm_score")
graph_builder.add_edge("llm_score", "advanced_scorer")
graph_builder.add_edge("advanced_scorer", "consensus_scorer")
graph_builder.add_edge("consensus_scorer", END)

# Rebinds the module-level `chain` name to the new compiled graph.
chain = graph_builder.compile()

def qa_maths_reasoning_langgraph_advanced_scorer(text: str, question: str, ground_truth):
    """Run the compiled graph (with advanced scoring) on one document/question pair.

    Args:
        text: Source document inserted into the user query.
        question: Question to answer from that document.
        ground_truth: Reference answer, passed into the graph state.

    Returns:
        The final graph state, with ``question`` added and ``overall_score``
        copied up from ``detailed_score``.
    """
    query = f"""read the following text:\n---{text}\n---\nQuestion: {question}"""
    # A (role, content) tuple yields ONE user message. A flat ['user', query]
    # list is read as two separate human messages, the first one being the
    # literal text "user".
    result = chain.invoke({'messages': [('user', query)], 'ground_truth': ground_truth})
    result['question'] = question
    result['overall_score'] = result['detailed_score']['overall_score']
    return result

# display(Image(chain.get_graph().draw_mermaid_png()))
# result = qa_maths_reasoning_langgraph(text, question, answer)

AttributeError: module 'utils.qa_advanced_scorer' has no attribute 'qa_maths_reasoning_langgraph'

In [10]:
# Re-import the utils.qa_advanced_scorer module so the call below uses its current code.
reload(qa_advanced_scorer)

# text = """
# gain or loss on ownership change in map results from contributions to map of certain environmental capital expenditures and leased property acquisitions funded by marathon and ashland .
# in accordance with map 2019s limited liability company agreement , in certain instances , environmental capital expenditures and acquisitions of leased properties are funded by the original contributor of the assets , but no change in ownership interest may result from these contributions .
# an excess of ashland funded improvements over marathon funded improvements results in a net gain and an excess of marathon funded improvements over ashland funded improvements results in a net loss .
# cost of revenues increased by $ 5.822 billion in 2004 from 2003 and by $ 6.040 billion in 2003 from 2002 .
# the increases are primarily in the rm&t segment and result from higher acquisition costs for crude oil , refined products , refinery charge and blend feedstocks and increased manufacturing expenses .
# selling , general and administrative expenses increased by $ 105 million in 2004 from 2003 and by $ 97 million in 2003 from 2002 .
# the increase in 2004 was primarily due to increased stock-based compensation and higher costs associated with business transformation and outsourcing .
# our 2004 results were also impacted by start-up costs associated with the lng project in equatorial guinea and the increased cost of complying with governmental regulations .
# the increase in 2003 was primarily due to increased employee benefit expenses ( caused by increased pension expense resulting from changes in actuarial assumptions and a decrease in realized returns on plan assets ) and other employee related costs .
# additionally , during 2003 , we recorded a charge of $ 24 million related to organizational and business process changes .
# inventory market valuation reserve ( 2018 2018imv 2019 2019 ) is established to reduce the cost basis of inventories to current market value .
# generally , we will establish an imv reserve when crude oil prices fall below $ 22 per barrel .
# the 2002 results of operations include credits to income from operations of $ 71 million , reversing the imv reserve at december 31 , 2001 .
# net interest and other financial costs decreased by $ 25 million in 2004 from 2003 and by $ 82 million in 2003 from 2002 .
# the decrease in 2004 is primarily due to an increase in interest income .
# the decrease in 2003 is primarily due to an increase in capitalized interest related to increased long-term construction projects , the favorable effect of interest rate swaps , the favorable effect of a reduction in interest on tax deficiencies and increased interest income on investments .
# additionally , included in net interest and other financing costs are foreign currency gains of $ 9 million , $ 13 million and $ 8 million for 2004 , 2003 and 2002 .
# loss from early extinguishment of debt in 2002 was attributable to the retirement of $ 337 million aggregate principal amount of debt , resulting in a loss of $ 53 million .
# minority interest in income of map , which represents ashland 2019s 38 percent ownership interest , increased by $ 230 million in 2004 from 2003 and by $ 129 million in 2003 from 2002 .
# map income was higher in 2004 compared to 2003 and in 2003 compared to 2002 as discussed below in the rm&t segment .
# minority interest in loss of equatorial guinea lng holdings limited , which represents gepetrol 2019s 25 percent ownership interest , was $ 7 million in 2004 , primarily resulting from gepetrol 2019s share of start-up costs associated with the lng project in equatorial guinea .
# provision for income taxes increased by $ 143 million in 2004 from 2003 and by $ 215 million in 2003 from 2002 , primarily due to $ 388 million and $ 720 million increases in income before income taxes .
# the effective tax rate for 2004 was 36.6 percent compared to 36.6 percent and 42.1 percent for 2003 and 2002 .
# the higher rate in 2002 was due to the united kingdom enactment of a supplementary 10 percent tax on profits from the north sea oil and gas production , retroactively effective to april 17 , 2002 .
# in 2002 , we recognized a one-time noncash deferred tax adjustment of $ 61 million as a result of the rate increase .
# the following is an analysis of the effective tax rate for the periods presented: .

# table:

# [['', '2004', '2003', '2002'], ['statutory tax rate', '35.0% ( 35.0 % )', '35.0% ( 35.0 % )', '35.0% ( 35.0 % )'], ['effects of foreign operations ( a )', '1.3', '-0.4 ( 0.4 )', '5.6'], ['state and local income taxes after federal income tax effects', '1.6', '2.2', '3.9'], ['other federal tax effects', '-1.3 ( 1.3 )', '-0.2 ( 0.2 )', '-2.4 ( 2.4 )'], ['effective tax rate', '36.6% ( 36.6 % )', '36.6% ( 36.6 % )', '42.1% ( 42.1 % )']]

# ( a ) the deferred tax effect related to the enactment of a supplemental tax in the u.k .
# increased the effective tax rate 7.0 percent in .
# """

text = """
( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options .
shareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing .
the following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 index , and the dow jones transportation average .
the comparison of the total cumulative return on investment , which is the change in the quarterly stock price plus reinvested dividends for each of the quarterly periods , assumes that $ 100 was invested on december 31 , 2004 in the s&p 500 index , the dow jones transportation average , and our class b common stock .
comparison of five year cumulative total return $ 40.00 $ 60.00 $ 80.00 $ 100.00 $ 120.00 $ 140.00 $ 160.00 2004 20092008200720062005 s&p 500 ups dj transport .

table:

[['', '12/31/04', '12/31/05', '12/31/06', '12/31/07', '12/31/08', '12/31/09'], ['united parcel service inc .', '$ 100.00', '$ 89.49', '$ 91.06', '$ 87.88', '$ 70.48', '$ 75.95'], ['s&p 500 index', '$ 100.00', '$ 104.91', '$ 121.48', '$ 128.15', '$ 80.74', '$ 102.11'], ['dow jones transportation average', '$ 100.00', '$ 111.65', '$ 122.61', '$ 124.35', '$ 97.72', '$ 115.88']]

.
"""

# Question and reference answer for the sample `text` above; the commented-out
# values belong to the commented-out sample text.
# question = "by what percent did effects of foreign operations decrease from 2002 to 2004?"
question = "what was the difference in percentage cumulative return on investment for united parcel service inc . compared to the s&p 500 index for the five year period ended 12/31/09?"

# ground_truth = "-76.8%"
ground_truth = "-26.16%"

# Runs the scorer from the imported utils module, not the functions defined in the cells above.
# result = qa_maths_reasoning_langgraph(text, question, ground_truth)
result = qa_advanced_scorer.qa_maths_reasoning_langgraph_advanced_scorer(text, question, ground_truth)





[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to calculate the cumulative return on investment for United Parcel Service Inc. and the S&P 500 index over the specified five-year period, and then find the difference in percentage between the two.

The cumulative return for United Parcel Service Inc. is calculated as:
\[
\text{Cumulative Return (UPS)} = \frac{75.95 - 100}{100} \times 100
\]

The cumulative return for the S&P 500 index is calculated as:
\[
\text{Cumulative Return (S&P 500)} = \frac{102.11 - 100}{100} \times 100
\]

Next, I will find the difference in percentage between the two returns.

Action:
```
{
  "action": "calculator",
  "action_input": "(75.95 - 100) / 100 * 100 - (102.11 - 100) / 100 * 100"
}
```[0m[36;1m[1;3m-26.159999999999997[0m[32;1m[1;3mAction:
```
{
  "action": "Final Answer",
  "action_input": "The difference in percentage cumulative return on investment for United Parcel Service Inc. compared to the S&P 500 index for th

In [9]:
result

{'messages': [HumanMessage(content='user', additional_kwargs={}, response_metadata={}, id='72c1e249-0b5a-410f-b191-f1738114280b'),
  HumanMessage(content="read the following text:\n---\n( 1 ) includes shares repurchased through our publicly announced share repurchase program and shares tendered to pay the exercise price and tax withholding on employee stock options .\nshareowner return performance graph the following performance graph and related information shall not be deemed 201csoliciting material 201d or to be 201cfiled 201d with the securities and exchange commission , nor shall such information be incorporated by reference into any future filing under the securities act of 1933 or securities exchange act of 1934 , each as amended , except to the extent that the company specifically incorporates such information by reference into such filing .\nthe following graph shows a five-year comparison of cumulative total shareowners 2019 returns for our class b common stock , the s&p 500 

## Test

In [14]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def score_data(data, verbose=False, convert_to_df=True, advanced=True):
    """Score every QA pair in ``data`` one row at a time.

    Args:
        data: DataFrame whose rows expose ``source_text`` and ``qa_exploded``.
            ``qa_exploded`` is either a mapping with ``question`` and ``answer``
            keys or a missing value whose ``str()`` is ``'nan'``.
        verbose: When true, print the question, ground truth, model answer and
            score for each scored row.
        convert_to_df: When true, return the responses as a DataFrame;
            otherwise return them as a plain list.
        advanced: When true, use
            ``qa_advanced_scorer.qa_maths_reasoning_langgraph_advanced_scorer``;
            otherwise use ``qa_basic_scorer.qa_maths_reasoning_langgraph``.

    Returns:
        The scorer responses for all rows that had a QA pair, in input order,
        as a DataFrame or a list depending on ``convert_to_df``.
    """
    result_list = []
    for _, row in data.iterrows():
        text = row['source_text']
        qa = row['qa_exploded']

        # Rows without a QA pair carry a missing value; report and skip them.
        if str(qa) == 'nan':
            print('no qa')
            continue

        question = qa['question']
        answer = qa['answer']
        if advanced:
            response = qa_advanced_scorer.qa_maths_reasoning_langgraph_advanced_scorer(text, question, answer)
            score = response['detailed_score']['overall_score']
        else:
            # No display() of the raw response here: the per-row report is
            # controlled by `verbose` only, exactly as in the advanced branch.
            response = qa_basic_scorer.qa_maths_reasoning_langgraph(text, question, answer)
            score = response['score']
        answer_llm = response['response']

        if verbose:
            print(f'Question: {question}')
            print(f'Answer (Ground Truth): {answer}')
            print(f'Answer (LLM): {answer_llm}')
            print(f'Score: {score}')
            print('\n')

        result_list.append(response)

    if convert_to_df:
        return pd.DataFrame(result_list)

    return result_list

def process_row(row, verbose=False, advanced=False):
    """Score the QA pair held by a single row.

    Args:
        row: Mapping-like row exposing ``source_text`` and ``qa_exploded``.
        verbose: When true, print the question, ground truth, model answer
            and score after scoring.
        advanced: Selects the advanced scorer when true, the basic one otherwise.

    Returns:
        The scorer's response, or ``None`` when the row has no QA pair
        (``str(qa_exploded) == 'nan'``).
    """
    source = row['source_text']
    qa_pair = row['qa_exploded']

    # Guard clause: nothing to score for rows with a missing QA pair.
    if str(qa_pair) == 'nan':
        print('no qa')
        return None

    question, answer = qa_pair['question'], qa_pair['answer']
    # Echo the question before the scorer call.
    print(question)

    if not advanced:
        response = qa_basic_scorer.qa_maths_reasoning_langgraph(source, question, answer)
        answer_llm = response['response']
        score = response['score']
    else:
        response = qa_advanced_scorer.qa_maths_reasoning_langgraph_advanced_scorer(source, question, answer)
        answer_llm = response['response']
        score = response['detailed_score']['overall_score']

    if verbose:
        report_lines = (
            f'Question: {question}',
            f'Answer (Ground Truth): {answer}',
            f'Answer (LLM): {answer_llm}',
            f'Score: {score}',
            '\n',
        )
        for line in report_lines:
            print(line)

    return response

def score_data_concurrently(data, verbose=False, convert_to_df=True, max_workers=5, advanced=False):
    """Score every row of ``data`` with ``process_row`` on a thread pool.

    Args:
        data: DataFrame whose rows are passed one by one to ``process_row``.
        verbose: Forwarded to ``process_row``.
        convert_to_df: When true, return the responses as a DataFrame;
            otherwise return them as a plain list.
        max_workers: Size of the thread pool.
        advanced: Forwarded to ``process_row`` to select the scorer.

    Returns:
        The non-``None`` results of ``process_row`` in the same order as the
        input rows, as a DataFrame or a list depending on ``convert_to_df``.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_row, row, verbose, advanced=advanced)
            for _, row in data.iterrows()
        ]
        # Read the futures in submission order (not completion order) so the
        # output is deterministic and lines up with the input rows. The work
        # itself still runs concurrently.
        responses = [future.result() for future in futures]

    # process_row returns None for rows without a QA pair; drop those.
    result_list = [response for response in responses if response is not None]

    if convert_to_df:
        return pd.DataFrame(result_list)

    return result_list

# df_scores = score_data(data_clean[:1], advanced=True)
# df_scores = score_data_concurrently(data_clean[:5], advanced=True)

In [20]:
import numpy as np
from sklearn.metrics import accuracy_score
reload(qa_advanced_scorer)

def process_sample(sample_df, score_col='overall_score'):
    """Summarise one scored sample.

    Args:
        sample_df: DataFrame of scorer responses containing ``score_col``.
        score_col: Name of the column holding the per-question score.
            Defaults to ``'overall_score'``.

    Returns:
        A ``(metrics, sample_df)`` tuple. ``metrics`` is a dict whose
        ``'accuracy'`` entry is the mean of ``score_col`` (a plain average of
        the scores, not an F1 score); ``sample_df`` is returned unchanged.
    """
    metrics = {
        'accuracy': np.mean(sample_df[score_col]),
    }
    return metrics, sample_df

def evaluate_samples(data, k, n, max_workers=5, advanced=False, concurrent=False, random_state=None):
    """Score ``k`` random samples of ``n`` rows each and average their accuracy.

    Args:
        data: DataFrame to sample rows from.
        k: Number of random samples to draw; must be at least 1.
        n: Number of rows in each sample.
        max_workers: Number of samples scored at the same time. When
            ``concurrent`` is true each sample additionally uses the thread
            pool inside ``score_data_concurrently``.
        advanced: Forwarded to the scoring function to select the scorer.
        concurrent: Score the rows of a sample with ``score_data_concurrently``
            when true, with ``score_data`` otherwise.
        random_state: Optional integer seed. Sample ``i`` is drawn with seed
            ``random_state + i`` so a seeded run is repeatable. ``None`` keeps
            pandas' default unseeded sampling.

    Returns:
        ``(aggregated_metrics, concatenated_df)``: a dict with the mean
        ``'accuracy'`` over all samples, and the scored rows of every sample
        concatenated in sample order.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError('k must be at least 1')

    def score_sample(sample):
        # Runs inside a worker thread, so the expensive scoring calls of
        # different samples actually overlap.
        if concurrent:
            df_scores = score_data_concurrently(sample, advanced=advanced)
        else:
            df_scores = score_data(sample, advanced=advanced)
        return process_sample(df_scores)

    # Draw the samples up front in the calling thread: sampling is cheap and
    # keeping it out of the workers makes seeded runs deterministic.
    samples = [
        data.sample(n=n, random_state=None if random_state is None else random_state + i)
        for i in range(k)
    ]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(score_sample, sample) for sample in samples]
        # Read results in submission order so the concatenated frame is stable.
        results = [future.result() for future in futures]

    metrics_list = [metrics for metrics, _ in results]
    sample_dfs = [sample_df for _, sample_df in results]

    # Aggregate results by averaging each metric across all samples
    aggregated_metrics = {
        'accuracy': np.mean([metrics['accuracy'] for metrics in metrics_list]),
    }

    concatenated_df = pd.concat(sample_dfs, ignore_index=True)

    return aggregated_metrics, concatenated_df

# Example usage: score k random samples of n rows each with the advanced scorer
# and report the aggregated metrics (mean accuracy across samples).
k = 1  # Number of random samples to test
n = 5  # Number of rows in each sample
metrics, df_scores = evaluate_samples(data_clean, k, n, advanced=True, concurrent=True)
# The dict holds accuracy, not F1, so label it neutrally.
print("Metrics:", metrics)

in december 2012 what was the percentage difference in the carrying values of the long-term debt excluding current portion
what percentage of total non-recourse debt as of december 31 , 2010 is due in 2012?
what percent of the muilti asset value is from the asset allocation and balanced section?
what was the total collateral of all types december 31 , 2009?
what was the rate of the income tax benefit based on the stock compensation




F1 Scores: {'accuracy': 0.72}


In [21]:
# Show summary statistics of the per-question overall score next to the aggregated metrics.
display(
    df_scores['overall_score'].describe(),
    metrics,
)

count    5.000000
mean     0.720000
std      0.438178
min      0.000000
25%      0.600000
50%      1.000000
75%      1.000000
max      1.000000
Name: overall_score, dtype: float64

{'accuracy': 0.72}

In [19]:
# Walk through every scored question whose overall score is exactly 0 and print
# the ground truth, question, model response and the detailed score breakdown.
zero_scored = df_scores.loc[df_scores['overall_score'] == 0]
for _, failed in zero_scored.iterrows():
    truth, asked = failed['ground_truth'], failed['question']
    breakdown, overall = failed['detailed_score'], failed['overall_score']
    given = failed['response']
    print(f'Ground Truth: {truth}')
    print(f'Question: {asked}')
    print(f'Response: {given}')
    display(breakdown)
    print(f'Score: {overall}')
    print('\n-----------')

Ground Truth: 127
Question: what was the change in unrecognized tax benefits between 2008 and 2009?
Response: -$127 million


{'numerical_accuracy': {'score': 0.0,
  'reasoning': 'The model answer of -$127 million does not match the ground truth of 127. The absolute values do not match closely, and the negative sign indicates a decline, which is not equivalent to the positive ground truth value.'},
 'conceptual_correctness': {'score': 0.0,
  'reasoning': "The model's answer does not correctly represent the numerical value requested. The methodology of providing a negative value when a positive value was expected is incorrect."},
 'context_relevance': {'score': 0.0,
  'reasoning': 'The model answer does not address the specific question as it provides a negative value instead of the expected positive value. The context of the question is not met.'},
 'overall_score': 0.0,
 'is_correct': False}

Score: 0.0

-----------
Ground Truth: 0%
Question: now much of the net increase in aro during the period was due to accretion , in millions?
Response: 0 million


{'numerical_accuracy': {'score': 0.0,
  'reasoning': 'The ground truth is 0%, while the model answer is 0 million. These values are not numerically equivalent, as 0% represents a proportion and 0 million represents a quantity. They do not match within the acceptable margin of error.'},
 'conceptual_correctness': {'score': 0.0,
  'reasoning': "The model's answer does not correctly address the question as it provides a quantity (0 million) instead of a percentage (0%). The methodology of interpreting the question is incorrect."},
 'context_relevance': {'score': 0.0,
  'reasoning': "The model's answer does not address the specific question regarding a percentage. The context of the question requires a percentage response, which the model fails to provide."},
 'overall_score': 0.0,
 'is_correct': False}

Score: 0.0

-----------


In [123]:
# Pull the source passage for one specific question. 316 is presumably the index
# label of the matching row in data_clean; verify if the data changes.
data_clean.loc[
    data_clean['question'] == 'considering the years 2015-2016 , how bigger is the growth of the third-party sales for the engineered products and solutions segment in comparison with the transportation and construction solutions one?',
    'source_text',
][316]

"third-party sales for the engineered products and solutions segment improved 7% ( 7 % ) in 2016 compared with 2015 , primarily attributable to higher third-party sales of the two acquired businesses ( $ 457 ) , primarily related to the aerospace end market , and increased demand from the industrial gas turbine end market , partially offset by lower volumes in the oil and gas end market and commercial transportation end market as well as pricing pressures in aerospace .\nthird-party sales for this segment improved 27% ( 27 % ) in 2015 compared with 2014 , largely attributable to the third-party sales ( $ 1310 ) of the three acquired businesses ( see above ) , and higher volumes in this segment 2019s legacy businesses , both of which were primarily related to the aerospace end market .\nthese positive impacts were slightly offset by unfavorable foreign currency movements , principally driven by a weaker euro .\natoi for the engineered products and solutions segment increased $ 47 , or 8