In [1]:
from dotenv import load_dotenv

# Load environment variables from a local .env file.
# Add a GOOGLE_API_KEY entry to .env; the key is obtained from the Google dashboard.
# If you do not have a key, the data has already been generated and is available
# in ../data/raw and ../data/processed.
load_dotenv()

True

In [2]:
import random
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import BaseModel, Field
from typing import Optional, List

In [1]:
import warnings
# Install a global "ignore" filter so no warnings are printed in later cell output.
# NOTE(review): this also hides deprecation warnings from the libraries used below;
# consider narrowing the filter to specific categories or modules.
warnings.filterwarnings("ignore") 

In [10]:
from pathlib import Path
import time

def find_git_root(path: "Path | None" = None) -> Path:
    """Return the nearest directory at or above ``path`` that contains ``.git``.

    Args:
        path: Directory to start searching from. When omitted, the current
            working directory is looked up at call time (not when the function
            is defined), so the result follows any earlier ``os.chdir`` call.
            Relative paths are made absolute first so their parents can be walked.

    Returns:
        The first directory, walking from ``path`` up to the filesystem root,
        that has a ``.git`` entry.

    Raises:
        FileNotFoundError: If no directory on that walk contains ``.git``.
    """
    # A default of Path.cwd() in the signature would be frozen at definition time.
    start = Path.cwd() if path is None else Path(path).absolute()
    for candidate in (start, *start.parents):
        if (candidate / ".git").exists():
            return candidate
    raise FileNotFoundError("No .git directory found in any parent folders.")

# Repository root; later cells build the data/raw and data/processed paths from it.
root_dir = find_git_root()


In [4]:
# Banking products a simulated client can be interested in; one is drawn at random
# per generated conversation. The category comments are for readers only; item
# order is unchanged.
product_list = [
    # Loans
    "personal loan", "home loan", "home loan refinancing", "car loan",
    # Cards
    "credit card application", "balance transfer",
    # Deposits and investments
    "fixed deposit account", "unit trust investment", "retirement savings plan",
    "foreign currency investment", "wealth management advisory",
    # Accounts
    "savings account", "high-interest savings account", "current account", "joint account",
    # Insurance
    "life insurance", "health insurance", "mortgage protection insurance", "travel insurance",
    # Services
    "mobile banking setup", "credit card limit increase", "online payment assistance",
    "foreign remittance services",
]


In [5]:
# Schema for one chat message in the generated conversation.
class Turn(BaseModel):
    # Who sent the message: 'BU' (business user) or 'C' (client), per the description.
    speaker: str = Field(description="Either 'BU' for Business User or 'C' for Client")
    # Text of that single message.
    message: str = Field(description="A single chat message from the speaker")

# Schema for one generated record: the chat log plus the sales fields extracted from it.
# Optional fields carry default=None so a reply may omit them; without a default,
# Pydantic v2 treats an Optional[...] field as required (nullable but mandatory),
# which contradicts the "if any" / "if mentioned" wording of the descriptions.
class SalesConversationExtraction(BaseModel):
    # Ordered list of chat turns between the client and the business user.
    conversation: List[Turn] = Field(description="Structured chat log between the client and business user")
    product_interest: str = Field(description="Product the customer is interested in")
    customer_certainty: str = Field(description="Level of certainty: 'explicit', 'rephrased', or 'unsure'")
    budget_or_amount: Optional[str] = Field(default=None, description="Budget or amount mentioned by the customer, if any")
    urgency: Optional[str] = Field(default=None, description="Timeframe or urgency, if mentioned")
    # Free-form mapping; the flattening step later turns each key into a pref_<key> column.
    key_preferences: dict = Field(description="Dictionary of preferences like risk, repayment type, rate_type")
    first_time_buyer: Optional[bool] = Field(default=None, description="Whether this is the customer's first time buying")
    emotions: List[str] = Field(description="List of customer emotions detected from the conversation")
    cross_sell_opportunity: Optional[str] = Field(default=None, description="Other product that could be recommended")
    question_types: List[str] = Field(description="Types of customer questions")


# JSON output parser tied to the record schema; its get_format_instructions() text
# is injected into the prompt, and it is the last stage of the generation chain.
parser = JsonOutputParser(pydantic_object=SalesConversationExtraction)

In [6]:
# Prompt that asks the model to simulate a short BU/client chat about one product and
# then extract the sales fields from it. {product_interest} is supplied on each call;
# {format_instructions} is pre-filled once from the parser.
prompt = PromptTemplate(
    template="""
You are a helpful assistant that simulates a short, realistic conversation between a business user (BU) and a client (C)
over enterprise instant messaging.

The client is interested in the following banking product: **{product_interest}**

The client might:
- Explicitly mention the product,
- Use a rephrased or indirect reference,
- Be unsure or need help figuring out what they need.

The business user should respond to gather relevant information to assist with the sale.

Generate the conversation, and then extract the following key sales information from the conversation.

{format_instructions}
""",
    input_variables=["product_interest"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

In [None]:
from tqdm.notebook import trange, tqdm

# Chat model used for generation.
model = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.8,
    convert_system_message_to_human=True,
)

# Pipeline: fill the prompt, call the model, parse the reply as JSON.
chain = prompt | model | parser

# Number of conversations to attempt in this run.
n = 5
results = []

for i in trange(n):
    # Each attempt targets one randomly chosen product.
    product_interest = random.choice(product_list)
    try:
        result = chain.invoke({"product_interest": product_interest})
    except Exception as e:
        # Best effort: report the failed attempt and carry on with the next one.
        print(f"Error generating conversation {i+1}: {e}")
    else:
        results.append(result)

In [19]:
import json

# The file name carries the current Unix time in whole seconds, so each run writes
# to its own file (at one-second resolution).
timestamp = int(time.time())
output_path = root_dir / "data" / "raw" / f"conversations-{timestamp}.json"
output_path.parent.mkdir(parents=True, exist_ok=True)

# Write every non-None result as a plain dict, pretty-printed.
with output_path.open("w") as f:
    json.dump([dict(r) for r in results if r is not None], f, indent=2)

print(f"Saved to {output_path}")

Saved to /Users/jackyangara/Workspace/tmp/stat8306/data/raw/conversations-1750493143.json


In [20]:
def _join_labels(value) -> str:
    """Render a collection of labels as a comma-separated string.

    ``None`` becomes an empty string and a bare string is returned unchanged
    (joining it would split it into characters).
    """
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    return ", ".join(str(item) for item in value)


def flatten_conversation_record(record: dict) -> dict:
    """Flatten one generated conversation record into a single-level dict.

    Args:
        record: Parsed model output. Expected keys are ``conversation`` (list of
            dicts with ``speaker`` and ``message``), the scalar sales fields,
            ``emotions`` / ``question_types`` (lists) and ``key_preferences``
            (dict). Any of them may be missing or ``None``.

    Returns:
        A dict with ``conversation_text`` (one ``"speaker: message"`` line per
        turn), the scalar fields copied as-is (``None`` when absent),
        ``emotions`` and ``question_types`` as comma-separated strings, and one
        ``pref_<key>`` entry per item of ``key_preferences``.

    Raises:
        KeyError: If a turn in ``conversation`` lacks ``speaker`` or ``message``.
    """
    flat = {}

    # `or []` covers both a missing key and an explicit null from the model.
    turns = record.get("conversation") or []
    flat["conversation_text"] = "\n".join(
        f"{turn['speaker']}: {turn['message']}" for turn in turns
    )

    # Scalar fields are copied in a fixed order, which sets the output column order.
    for field in (
        "product_interest",
        "customer_certainty",
        "budget_or_amount",
        "urgency",
        "first_time_buyer",
        "cross_sell_opportunity",
    ):
        flat[field] = record.get(field)

    # dict.get's default only applies to missing keys, so a present-but-null value
    # must be handled explicitly.
    flat["emotions"] = _join_labels(record.get("emotions"))
    flat["question_types"] = _join_labels(record.get("question_types"))

    preferences = record.get("key_preferences") or {}
    for k, v in preferences.items():
        flat[f"pref_{k}"] = v

    return flat


In [21]:
import pandas as pd

# Read back the raw JSON for this run; the path uses the same timestamp as the save step.
raw_path = root_dir / "data" / "raw" / f"conversations-{timestamp}.json"
with raw_path.open("r") as f:
    raw_records = json.load(f)

# One flat dict per conversation becomes one DataFrame row.
flat_records = list(map(flatten_conversation_record, raw_records))
df = pd.DataFrame(flat_records)

# Write the flattened table next to the raw data, without the index column.
processed_path = root_dir / "data" / "processed" / f"conversations-{timestamp}.csv"
processed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_path, index=False)

print(f"CSV saved to: {processed_path}")

CSV saved to: /Users/jackyangara/Workspace/tmp/stat8306/data/processed/conversations-1750493143.csv


In [22]:
# Display the flattened conversations table.
# NOTE(review): the saved output below shows row labels up to 454, more than a single
# run with n = 5 would yield — presumably df came from a larger or different run;
# confirm and make the notebook reproduce it from a fresh kernel.
df

Unnamed: 0,conversation_text,product_interest,customer_certainty,budget_or_amount,urgency,first_time_buyer,cross_sell_opportunity,emotions,question_types,pref_vehicle_type,...,pref_investment_focus,pref_cash_out_amount,pref_current_interest_rate,pref_estimated_home_value,pref_exchange_rate_sensitivity,pref_target_currency,pref_fee_clarity,pref_withdrawal_limits_clarity,pref_current_saving_habits,pref_retirement_timeframe
0,"C: Hi, I'm looking for some information. I nee...",car loan,rephrased,"$45,000",within the next 4-6 weeks,True,,,information_seeking,new SUV,...,,,,,,,,,,
1,"C: Hi, I'm looking for a way to better manage ...",credit card application,unsure,,,True,,"uncertain, exploratory","clarification, informational",,...,,,,,,,,,,
2,"C: Hi, I need some help with something. I'm lo...",foreign remittance services,rephrased,"$5,000 USD",within 2-3 business days,True,foreign currency accounts,"curious, focused, concerned","informational, preference-based",,...,,,,,,,,,,
3,"C: Hi there, I'm looking into options to bette...",health insurance,rephrased,$300-$500 per month,within the next month or so,True,,overwhelmed,information seeking,,...,,,,,,,,,,
4,"C: Hi, I'm looking to get a car loan. Can you ...",,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451,"C: Hi there, I'm looking for some advice on my...",high-interest savings account,rephrased,"$25,000","long-term, maybe 3-5 years",True,checking account,"curious, proactive, unsure","information-seeking, clarification",,...,,,,,,,True,True,,
452,"C: Hi, I'm looking for some information on fin...",home loan,rephrased,"$450,000 - $500,000",next 3-4 months,True,home insurance,overwhelmed,informational,,...,,,,,,,,,,
453,"C: Hi, I'm looking for somewhere to put some e...",savings account,rephrased,,,,investment product,,,,...,,,,,,,,,,
454,"C: Hi, I'm looking to open a savings account. ...",savings account,explicit,"$10,000 to start",3-5 years,False,mortgage,"goal-oriented, eager","informational, clarification",,...,,,,,,,,,,
