
# Customer Text Sentiment Analysis

In [None]:
#!pip install OpenAI
#!pip install langchain
#!pip install langchain_community
#!pip install Cohere
#!pip install langchain-openai langchain-cohere python-dotenv


In [None]:
import os
import warnings
warnings.filterwarnings('ignore')
from google.colab import userdata

In [None]:

import os, json, re
import pandas as pd
import matplotlib.pyplot as plt
from langchain_openai import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage

# --------------------
# CONFIG - EDIT HERE
# --------------------
# Remote CSV; it is handed straight to pandas.read_csv, which accepts URLs.
CSV_PATH = r"https://github.com/giridhar276/genai/raw/refs/heads/main/datasets/Bank_Customer_conversations.csv"
TEXT_COL = "customer_text"   # column holding the customer utterance to classify
MODEL = "gpt-4o-mini"        # chat model name passed to ChatOpenAI
TEMPERATURE = 0
BATCH_SIZE = 40              # number of rows sent per llm.batch call
TIMEOUT = 60                 # passed to ChatOpenAI as its request timeout
BINARY_OUTPUT = False        # True collapses labels to positive/negative only

# NOTE(review): while CSV_PATH is a URL this is a URL too, so it cannot be used
# as-is for a local write; point it at a local file name before saving results.
OUTPUT_PATH = CSV_PATH.replace(".csv", "_with_sentiment.csv")

# The key is read from the Colab secrets store (google.colab.userdata) and then
# exported so client libraries can pick it up from the environment.
openai_key = userdata.get("OPENAI_API_KEY")
if not openai_key:
    # Check before touching os.environ: assigning None there raises an opaque
    # TypeError, and an assert would be stripped when Python runs with -O.
    raise RuntimeError("Please set OPENAI_API_KEY in your environment.")
os.environ["OPENAI_API_KEY"] = openai_key


In [None]:

def strict_json_parse(s):
    """Parse a model reply into a JSON object (dict), tolerating surrounding noise.

    The reply is first parsed as-is. If that fails, or yields valid JSON that is
    not an object (a list, number, string, ...), the outermost ``{...}`` span in
    the text is tried instead, which covers replies wrapped in prose or code
    fences.

    Args:
        s: Raw model output. Expected to be a string; bytes are decoded as UTF-8
            and any other type is treated as unparseable.

    Returns:
        The parsed dict, or a neutral fallback dict with the keys ``"label"``,
        ``"confidence"`` and ``"reason"`` when no JSON object can be recovered.
        The result is always a dict, so callers can use ``.get`` safely.
    """
    parse_fallback = {"label": "neutral", "confidence": 0.33, "reason": "fallback: parse error"}

    if isinstance(s, (bytes, bytearray)):
        s = s.decode("utf-8", errors="replace")
    if not isinstance(s, str):
        return parse_fallback

    try:
        parsed = json.loads(s)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError subclass.
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # Greedy match on purpose: grab from the first "{" to the last "}".
    m = re.search(r"\{.*\}", s, flags=re.DOTALL)
    if not m:
        return parse_fallback

    try:
        parsed = json.loads(m.group(0))
    except (ValueError, RecursionError):
        parsed = None
    if isinstance(parsed, dict):
        return parsed
    return {"label": "neutral", "confidence": 0.33, "reason": "fallback: json error"}




In [None]:
def postprocess_label(label: str, binary: bool = False) -> str:
    """Normalise a model-supplied sentiment label to one of the allowed values.

    Args:
        label: Label as returned by the model. Surrounding whitespace and case
            are ignored; ``None``, non-string values and unknown labels are all
            treated as ``"neutral"``.
        binary: When True, collapse the result to two classes: ``"positive"``
            stays positive and everything else (including neutral) becomes
            ``"negative"``.

    Returns:
        ``"positive"``, ``"neutral"`` or ``"negative"`` (never neutral when
        ``binary`` is True).
    """
    # Model JSON may carry a number, list or null here; only strings have
    # .strip(), so anything else is mapped to the unknown-label path.
    normalized = label.strip().lower() if isinstance(label, str) else ""
    if normalized not in {"positive", "neutral", "negative"}:
        normalized = "neutral"
    if binary:
        return "positive" if normalized == "positive" else "negative"
    return normalized



In [None]:
def clean_text(x: str) -> str:
    """Flatten a text value onto one line with single spaces between words.

    Carriage returns and newlines become spaces, leading/trailing whitespace is
    trimmed, and every run of whitespace is collapsed to one space. Anything
    that is not a string yields an empty string.
    """
    if isinstance(x, str):
        single_line = x.replace("\r", " ").replace("\n", " ")
        trimmed = single_line.strip()
        return re.sub(r"\s+", " ", trimmed)
    return ""

In [None]:

# Load the conversations and normalise the text column before classification.
df = pd.read_csv(CSV_PATH)

# Clean the configured column in place (not a hard-coded "customer_text"), so
# the filter below and the later classification see the cleaned text whatever
# TEXT_COL is set to. Missing values are mapped to "" explicitly: str(NaN) would
# give the literal text "nan", which would pass the emptiness filter.
df[TEXT_COL] = df[TEXT_COL].map(lambda v: "" if pd.isna(v) else clean_text(str(v)))

# Drop rows with no text left after cleaning and renumber the index from 0.
df = df[df[TEXT_COL].str.len() > 0].copy()
df.reset_index(drop=True, inplace=True)

print("Rows:", len(df))
df.head(3)


In [None]:

# System prompt: fixes the classifier role and the one-line JSON reply format
# (keys "label", "confidence", "reason"). Ends with a trailing newline.
SYSTEM = "\n".join([
    'You are a strict sentiment classifier for short customer utterances from bank call transcripts.',
    'Return ONLY a compact JSON object on a SINGLE LINE with keys:',
    '- "label": one of "positive", "neutral", "negative"',
    '- "confidence": a number in [0,1]',
    '- "reason": a brief rationale (<= 15 words)',
    'Judge tone + wording; ignore bank-specific facts. Prefer "neutral" if mixed.',
    'No extra text before/after JSON.',
    '',
])

# Per-utterance user prompt; the {text} placeholder is filled via str.format.
USER_TMPL = "\n".join([
    'Classify the sentiment of the CUSTOMER text below.',
    'Rules:',
    '- Output strictly ONE LINE of JSON only.',
    '- Labels: "positive" | "neutral" | "negative".',
    '- Keep "reason" short (<= 15 words).',
    '- Consider tone: polite, frustrated, aggressive, harsh.',
    'CUSTOMER:',
    '""" {text} """',
])



In [None]:

# Single chat-model client reused for every request; its settings come from the
# MODEL, TEMPERATURE and TIMEOUT constants.
llm = ChatOpenAI(
    model=MODEL,
    temperature=TEMPERATURE,
    timeout=TIMEOUT,
)

def build_messages(txt: str):
    """Return the two-message prompt (system, then human) for one utterance.

    The system message carries SYSTEM unchanged; the human message is USER_TMPL
    with ``txt`` substituted for its ``{text}`` placeholder.
    """
    system_msg = SystemMessage(content=SYSTEM)
    human_msg = HumanMessage(content=USER_TMPL.format(text=txt))
    return [system_msg, human_msg]


In [None]:

# Per-row outputs, filled in DataFrame row order.
labels, confs, reasons = [], [], []


def classify_batch(msgs):
    """Send a list of prompts to the model in one batch and parse each reply.

    Args:
        msgs: List of message lists, one per utterance.

    Returns:
        One parsed result per prompt, in the order ``llm.batch`` returns them.
    """
    results = llm.batch(msgs)
    return [strict_json_parse(x.content) for x in results]


def _safe_confidence(value, default=0.5):
    """Coerce a model-supplied confidence to float, using ``default`` if it is not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        # e.g. null or a word such as "high" in the model's JSON.
        return default


def _record_results(objs):
    """Append label, confidence and reason from each parsed reply to the result lists."""
    for o in objs:
        if not isinstance(o, dict):
            # Guard against a parsed reply that is not a JSON object; with an
            # empty dict the defaults below apply instead of .get crashing.
            o = {}
        labels.append(postprocess_label(o.get("label"), BINARY_OUTPUT))
        confs.append(_safe_confidence(o.get("confidence", 0.5)))
        reasons.append(o.get("reason", ""))


texts = df[TEXT_COL].tolist()
total = len(texts)
for start in range(0, total, BATCH_SIZE):
    chunk = texts[start:start + BATCH_SIZE]
    _record_results(classify_batch([build_messages(t) for t in chunk]))
    processed = start + len(chunk)
    # Report progress once every five full batches.
    if processed % (BATCH_SIZE * 5) == 0:
        print(f"Processed {processed}/{total} rows...")

assert len(labels) == len(df), "Batching length mismatch—check logic."


In [None]:
# Stop here if there are no rows to annotate.
if df.empty:
    raise RuntimeError("DataFrame is empty—upstream step failed (CSV read or LLM).")

# Attach the model outputs as new columns, one value per row.
for column_name, column_values in (
    ("sentiment_label", labels),
    ("sentiment_confidence", confs),
    ("sentiment_reason", reasons),
):
    df[column_name] = column_values


In [None]:
# Show the DataFrame as this cell's output (a bare expression is rendered by the notebook).
df

In [None]:
# Tally the rows per sentiment label, ordered by label name.
counts = (
    df["sentiment_label"]
    .value_counts()
    .sort_index()
)
print("\nLabel counts:\n", counts)


In [None]:
# Bar chart of how many rows fall under each sentiment label.
plt.figure()
ax = counts.plot.bar()
ax.set_title("Sentiment label distribution")
ax.set_xlabel("label")
ax.set_ylabel("count")
plt.show()
