In [2]:
import sys
import pprint

sys.path.append('../')

In [None]:
from langchain_tavily import TavilySearch
from langchain.chat_models import init_chat_model
from langgraph.graph import StateGraph
from langchain.prompts import ChatPromptTemplate
from config import settings
from src import prompts
from typing import TypedDict
from pydantic import BaseModel
import ast
from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, AIMessage, ChatMessage
import pandas as pd
from pydantic import BaseModel
from typing import List
from datetime import datetime

# Local sales data to use.
# Forward slashes resolve on both Windows and POSIX. A backslash-separated
# literal depends on invalid escape sequences (which newer Python versions
# warn about) and only resolves on Windows.
df = pd.read_excel('../data/Superstore.xlsx')


# Shared chat model for the LLM-backed graph nodes.
model = init_chat_model("gemini-2.0-flash", model_provider="google_genai", temperature=0)


# Web-search tool available to the graph; the number of results returned per
# query is taken from the project settings.
tavily = TavilySearch(topic="general", max_results=settings.MAX_SEARCH_RESULTS_PER_QUESTION)


# define main state object
class AgentState(TypedDict):
    """State dictionary shared by the graph nodes.

    Each node reads the keys it needs from this mapping and returns a dict
    holding only the keys it produces.
    """

    # Campaign request text supplied by the caller.
    initial_request: str
    # Web-search queries produced by query_generator_node.
    search_queries: list[str]
    # 'content' strings collected from the search tool by search_executor_node.
    search_results: list[str]
    # Sales data analysed by data_summarizer_node.
    df:pd.DataFrame
    # LLM-written year-over-year summary produced by data_summarizer_node.
    insight_summary: str
    # Final report text produced by insight_aggregator_node.
    final_report: str
    # TODO: expand state object as needed for other nodes


# define structured output formats
class SearchQueries(BaseModel):
    # Schema passed to model.with_structured_output() in query_generator_node.
    # Documented with `#` comments rather than a docstring so the schema
    # derived from this class is left untouched.
    queries: list[str]

class SubcategoryList(BaseModel):
    # Schema passed to model.with_structured_output() in data_summarizer_node.
    # Documented with `#` comments rather than a docstring so the schema
    # derived from this class is left untouched.
    # Uses the builtin generic, matching the other schemas in this file.
    subcategories: list[str]

# define graph nodes
def query_generator_node(state: AgentState):
    """Ask the model for web-search queries that fit the initial request.

    Reads ``initial_request`` from the state and returns a partial state
    update containing the generated ``search_queries``.
    """
    planner = model.with_structured_output(SearchQueries)
    conversation = [
        SystemMessage(content=prompts.QUERY_PLANNER_PROMPT),
        HumanMessage(content=state['initial_request']),
    ]
    plan = planner.invoke(conversation)
    return {'search_queries': plan.queries}


def search_executor_node(state: AgentState):
    """Run every query in ``search_queries`` through the search tool.

    Returns a partial state update whose ``search_results`` is a flat list of
    the ``content`` field of every result, in query order.
    """
    snippets = [
        hit['content']
        for query in state['search_queries']
        for hit in tavily.invoke({'query': query})['results']
    ]
    return {'search_results': snippets}

def _shift_dates_to_today(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with "Order Date" shifted so its latest date is today."""
    # Work on a copy: the DataFrame in the graph state belongs to the caller
    # and must not be modified by this node.
    shifted = df.copy()
    shifted["Order Date"] = pd.to_datetime(shifted["Order Date"])

    # Whole-day offset between the newest order and today's date.
    today = pd.Timestamp(datetime.today().date())
    day_shift = (today - shifted["Order Date"].max()).days

    shifted["Order Date"] = shifted["Order Date"] + pd.to_timedelta(day_shift, unit='D')
    return shifted


def _match_subcategories(df: pd.DataFrame, initial_request: str) -> list[str]:
    """Ask the model which sub-categories present in *df* the request refers to."""
    all_categories = df["Category"].dropna().unique().tolist()
    all_subcategories = df["Sub-Category"].dropna().unique().tolist()

    category_prompt = ChatPromptTemplate.from_template("""
    You are helping analyze a sales dataset.

    The dataset includes:
    Main Categories: {categories}
    Sub-Categories: {subcategories}

    Given this campaign prompt:
    "{initial_request}"

    Only return a JSON object in the following format (and nothing else):
    {{"subcategories": ["Sub-Category-1", "Sub-Category-2", ...]}}

    Do NOT explain your reasoning or include any extra text. Just return the JSON object.
    """)

    messages = category_prompt.format_messages(
        categories=all_categories,
        subcategories=all_subcategories,
        initial_request=initial_request
    )

    response = model.with_structured_output(SubcategoryList).invoke(messages)
    return response.subcategories


def _yoy_by_subcategory_region(df_filtered: pd.DataFrame, year_new: int, year_old: int) -> pd.DataFrame:
    """Aggregate Sales/Profit per (Sub-Category, Region) and compute YoY % change.

    Only (Sub-Category, Region) pairs present in both years are kept.
    """
    grouped = (
        df_filtered.groupby(["Sub-Category", "Region", "year"])
        .agg({"Sales": "sum", "Profit": "sum"})
        .reset_index()
    )

    df_new = grouped[grouped["year"] == year_new].set_index(["Sub-Category", "Region"])
    df_old = grouped[grouped["year"] == year_old].set_index(["Sub-Category", "Region"])
    yoy = df_new.join(df_old, lsuffix=f"_{year_new}", rsuffix=f"_{year_old}", how="inner")

    for metric in ("Sales", "Profit"):
        current = yoy[f"{metric}_{year_new}"]
        previous = yoy[f"{metric}_{year_old}"]
        # Divide by the magnitude of the baseline so that the sign of the
        # percentage always follows the direction of the change (a loss that
        # shrinks is reported as an improvement). A zero baseline has no
        # meaningful percentage, so it is left as NaN instead of +/-inf.
        baseline = previous.abs()
        baseline = baseline.where(baseline != 0)
        yoy[f"{metric.lower()}_yoy"] = (current - previous) / baseline * 100

    return yoy.reset_index()


def data_summarizer_node(state: AgentState):
    """Summarise year-over-year sales and profit for the requested sub-categories.

    Reads ``df`` and ``initial_request`` from the state. The sales data is
    copied and date-shifted, filtered to the sub-categories the model matches
    to the request, aggregated per sub-category and region for the two latest
    calendar years, and the resulting YoY table is summarised by the model.

    Returns a partial state update containing ``insight_summary``. When the
    filtered data covers fewer than two calendar years, the summary is a fixed
    "not enough data" message and the model is not asked for a summary.
    """
    # Step 1: make the order dates recent (on a copy of the state's data)
    df = _shift_dates_to_today(state['df'])

    # Step 2: use the LLM to map the request to sub-categories
    matched_subcategories = _match_subcategories(df, state['initial_request'])

    print("\n🔎 Matched Sub-Categories from LLM:")
    print(matched_subcategories)

    # Step 3: filter data ("Order Date" is already datetime after step 1)
    df_filtered = df[df["Sub-Category"].isin(matched_subcategories)].copy()
    df_filtered["year"] = df_filtered["Order Date"].dt.year

    # Step 4: identify the latest two calendar years in the filtered data
    # NOTE(review): after the date shift the newest order falls on today, so
    # the newest calendar year only holds data up to today while the previous
    # one is a full year; the YoY figures compare a partial year to a full one.
    latest_years = sorted(df_filtered["year"].dropna().unique())[-2:]
    if len(latest_years) < 2:
        return {"insight_summary": "Not enough years of data for YoY comparison."}

    year_old, year_new = latest_years

    # Step 5: YoY aggregation
    yoy = _yoy_by_subcategory_region(df_filtered, year_new, year_old)

    # Step 6: generate insight summary with the LLM
    summary_prompt = ChatPromptTemplate.from_template("""
You are a BI analyst. Summarize the following YoY data in clear bullet points.
Focus on major % increases or decreases in sales or profit by sub-category and region.

Data:
{yoy_data}
""")
    summary_input = summary_prompt.format_messages(yoy_data=yoy.to_csv(index=False))
    insight_summary = model.invoke(summary_input).content

    return {
        "insight_summary": insight_summary
    }

def insight_aggregator_node(state: AgentState):
    """Combine web-search snippets and the sales summary into the final report.

    Reads ``search_results``, ``insight_summary`` and ``initial_request`` from
    the state, sends a single prompt to the model, and returns a partial state
    update containing the model's text as ``final_report``.
    """
    # One "- ..." bullet per search snippet, joined with newlines.
    web_bullets = "\n".join(f"- {snippet}" for snippet in state['search_results'])
    sales_summary = state['insight_summary']
    request_text = state['initial_request']

    report_prompt = f"""
You're a marketing analyst. Create a one-page summary report combining web insights and sales data insight summary for a {request_text}.
Focus on trends, regional patterns, and recommended actions. 

Web insights:
{web_bullets}

Sales data summaries:
- {sales_summary}

Return output as:
1. Executive Summary
2. Key Insights
3. Recommended Actions
"""
    reply = model.invoke([HumanMessage(content=report_prompt)])
    return {"final_report": reply.content}



# build graph
graph_builder = StateGraph(AgentState)

# Nodes in execution order; each node is wired to the one listed after it.
_PIPELINE = [
    ('query_generator', query_generator_node),
    ('search_executor', search_executor_node),
    ('data_summarizer', data_summarizer_node),
    ('insight_aggregator', insight_aggregator_node),
]

for _node_name, _node_fn in _PIPELINE:
    graph_builder.add_node(_node_name, _node_fn)

_node_names = [name for name, _fn in _PIPELINE]
for _upstream, _downstream in zip(_node_names, _node_names[1:]):
    graph_builder.add_edge(_upstream, _downstream)

# Execution starts at the first node of the pipeline.
graph_builder.set_entry_point(_node_names[0])
graph = graph_builder.compile()

#execute
if __name__ == "__main__":
    # Forward slashes resolve on both Windows and POSIX. A backslash-separated
    # literal depends on invalid escape sequences and only resolves on Windows.
    df = pd.read_excel('../data/Superstore.xlsx')

    # Only the inputs are supplied; the remaining state keys are filled in by
    # the nodes as the graph runs.
    initial_state = {
        "initial_request": "plan a campaign on school Furniture in northeast America",
        "df": df
    }

    final_state = graph.invoke(initial_state)

    print(final_state["final_report"])


🔎 Matched Sub-Categories from LLM:
['Chairs', 'Bookcases', 'Tables']
## School Furniture Campaign Plan: Northeast America

**Marketing Analysis Summary Report**

**Date:** October 26, 2023

**1. Executive Summary**

The U.S. school furniture market is experiencing steady growth, projected to reach USD 2.06 billion in 2024 and grow at a CAGR of 6.2% from 2025 to 2030. This growth is fueled by increasing investments in educational infrastructure, a rising demand for ergonomic and flexible learning environments, and a focus on student well-being. While the national trend is positive, sales data reveals regional disparities. This report focuses on developing a targeted campaign for the Northeast, leveraging national trends while addressing specific regional challenges and opportunities. The campaign will prioritize ergonomic seating, flexible learning solutions, and address the need for modernization of school facilities.

**2. Key Insights**

**A. Market Trends (Web Insights):**

*   **O

: 