In [1]:
# Enable IPython's autoreload extension (mode 2) so edits to the imported
# project modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

Imports

In [2]:
import sys
import os
import numpy as np
import pandas as pd
from scripts.utils.db import engine
from scripts.agent_script import chatbot_interaction

### Loading the df

In [3]:
# Load the whole marketing_data table; this DataFrame is the ground truth that
# the agent's answers are compared against in the checks below.
# parse_dates forces `date` to datetime64 whatever the database backend returns,
# so date-based operations on the column do not depend on the driver's typing.
df = pd.read_sql("SELECT * FROM marketing_data", engine, parse_dates=["date"])
df.head(3)

Unnamed: 0,year,quarter,month,week,date,country,media_category,media_name,communication,campaign_category,product,campaign_name,revenue,cost,profit,roi,margin,quarter_number,month_number,month_name
0,2020,2020 Q3,2020M08,35,2020-08-24,DK,online,Interscroller,Tactical,Category 3,Product 1,Campaign 10,411.522496,9778.544045,-9367.021548,-0.957916,-22.76187,3,8,August
1,2020,2020 Q3,2020M08,35,2020-08-24,DK,online,Interscroller,Tactical,Category 3,Product 1,Campaign 11,228.913948,5083.77,-4854.856052,-0.954972,-21.208214,3,8,August
2,2020,2020 Q3,2020M08,35,2020-08-24,DK,online,Interscroller,Branding,Category 5,Product 1,Campaign 12,71.051079,294.96,-223.908921,-0.759116,-3.15138,3,8,August


In [4]:
# Start with an empty log; each check below rebinds it to the chatbot's result.
interaction_log = []

# First test, checking if the total revenue for 2021 is the same as the one I calculate

In [5]:
# Scripted conversation: one revenue question, then 'exit' to end the session.
questions = ["Revenue for 2021", 'exit'] # i mean total revenue for 2021

# Ground truth computed straight from the DataFrame, rounded to cents.
total_revenue_2021 = round(df[df['year'] == 2021]['revenue'].sum(), 2)

interaction_log = chatbot_interaction(questions) # chatbot call with predefined questions

print(f"The tool used is: {interaction_log[0]['tools_used'][0]['tool_name']}")

# Compare with a one-cent tolerance rather than `==` on two rounded floats:
# the agent's value comes from a SQL SUM and ours from pandas, so tiny
# accumulation differences can land on opposite sides of a rounding boundary.
agent_revenue_2021 = float(interaction_log[0]['agent_response'])
assert np.isclose(agent_revenue_2021, total_revenue_2021, rtol=0, atol=0.01), (
    f"agent returned {agent_revenue_2021}, expected {total_revenue_2021}"
)

interaction_log.clear()

User: Revenue for 2021
SELECT sum(revenue) FROM marketing_data WHERE year = :param_year {'param_year': 2021}
Agent response for your question 'Revenue for 2021' is : 15173138.681923429
User: exit
Agent: Conversation ended.
The tool used is: aggregate_metric_structured


# Second test, checking the same idea but with a follow-up question

In [6]:
# Scripted conversation: a revenue question, a follow-up that relies on the
# previous turn's context, then 'exit' to end the session.
questions = ["Revenue for 2021", "same but for 2022", 'exit'] # i mean total revenue for 2021 and 2022

# Ground truth for the follow-up; total_revenue_2021 is reused from the first test.
total_revenue_2022 = round(df[df['year'] == 2022]['revenue'].sum(), 2)

interaction_log = chatbot_interaction(questions)

print(f"The tool used is: {interaction_log[0]['tools_used'][0]['tool_name']} and {interaction_log[1]['tools_used'][0]['tool_name']}")

# Check each turn with a one-cent tolerance instead of `==` on rounded floats,
# since SQL and pandas sums may differ by floating-point accumulation noise.
# Explicit indexing (not zip) so a missing turn raises instead of being skipped.
for turn_index, expected_revenue in enumerate((total_revenue_2021, total_revenue_2022)):
    agent_revenue = float(interaction_log[turn_index]['agent_response'])
    assert np.isclose(agent_revenue, expected_revenue, rtol=0, atol=0.01), (
        f"turn {turn_index}: agent returned {agent_revenue}, expected {expected_revenue}"
    )

interaction_log.clear()

User: Revenue for 2021
Agent response for your question 'Revenue for 2021' is : 15173138.681923429
User: same but for 2022
Agent response for your question 'same but for 2022' is : 30357817.329384245
User: exit
Agent: Conversation ended.
The tool used is: aggregate_metric_simple_where and aggregate_metric_simple_where


# Third test, should use the grouping tool

In [7]:
# Only the winning media category itself is compared, not any further detail.
questions = ["Which media categories had the highest profit in Q2 2023?", "exit"]

interaction_log = chatbot_interaction(questions)
print(f"The tool used is: {interaction_log[0]['tools_used'][0]['tool_name']}")

# Rows belonging to the second quarter of 2023.
in_q2_2023 = (df["year"] == 2023) & (df["quarter_number"] == 2)
q2_2023 = df[in_q2_2023]

# Total profit per media category, highest first.
profit_totals = q2_2023.groupby("media_category", as_index=False)["profit"].sum()
profit_by_category = profit_totals.rename(columns={"profit": "total_profit"}).sort_values(
    "total_profit", ascending=False
)

# The top row's category must match the agent's answer exactly.
top_category = profit_by_category.iloc[0]['media_category']
assert top_category == interaction_log[0]['agent_response']

interaction_log.clear()

User: Which media categories had the highest profit in Q2 2023?
Agent response for your question 'Which media categories had the highest profit in Q2 2023?' is : online
User: exit
Agent: Conversation ended.
The tool used is: aggregate_with_grouping


# Checking if a plot is the same, looking at its ticks

In [23]:
# The question must target the same metric as the exported data checked below
# (cost); asking for revenue while asserting on cost validates unrelated data.
questions = ["monthly cost for 2023", "exit"]

# Run the conversation before reading the export so the file on disk comes
# from this run rather than from a stale earlier session.
interaction_log = chatbot_interaction(questions)

# NOTE(review): assumes the agent writes the plotted data to this path for the
# question above - confirm against the agent's export naming scheme.
df_plot = pd.read_csv("scripts/plots_output/cost_month_name_year___2023.csv")

# Expected monthly cost for 2023, ordered chronologically by month number.
expected_df = (
    df[df["year"] == 2023]
    .groupby(["month_number", "month_name"])["cost"]
    .sum()
    .reset_index()
    .sort_values("month_number")
    .reset_index(drop=True)
)

# Month labels first: this also catches a length mismatch before the numeric
# comparison, which would otherwise fail with a broadcasting error.
assert list(df_plot["month_name"]) == list(expected_df["month_name"]), (
    "month labels in the exported plot data differ from the expected ones"
)
assert np.allclose(df_plot["cost"], expected_df["cost"]), (
    "monthly cost values in the exported plot data differ from the expected ones"
)

interaction_log.clear()

# Checked query result

In [29]:

# Scripted conversation: one ranking question, then 'exit' to end the session.
questions = ["Top 5 campaign names by revenue in 2023.", "exit"]

interaction_log = chatbot_interaction(questions)
print(f"The tool used is: {interaction_log[0]['tools_used'][0]['tool_name']}")

# Filter on the `year` column, as the other checks do and as the agent's
# logged SQL does (WHERE year = ...), instead of df['date'].dt.year, which
# additionally requires `date` to have a datetime dtype.
df_2023 = df[df['year'] == 2023]

# Total revenue per campaign, highest first.
campaign_revenue = (
    df_2023
    .groupby('campaign_name')['revenue']
    .sum()
    .reset_index()
    .sort_values(by='revenue', ascending=False)
)

# Names of the five best-earning campaigns, in rank order.
top5 = campaign_revenue.head(5)['campaign_name'].tolist()

# The agent answers with a comma-separated string; split it into clean names.
agent_output = interaction_log[0]['agent_response']
agent_output_str = str(agent_output)
agent_list = [x.strip() for x in agent_output_str.split(",")]

# Order matters: both lists must rank the campaigns identically.
assert top5 == agent_list, f"expected {top5}, agent returned {agent_list}"

interaction_log.clear()

User: Top 5 campaign names by revenue in 2023.
Agent response for your question 'Top 5 campaign names by revenue in 2023.' is : Campaign 331, Campaign 317, Campaign 314, Campaign 328, Campaign 304
User: exit
Agent: Conversation ended.
The tool used is: aggregate_with_grouping
