
# Customer Text Summarization — Concurrency + Batching

- **Python concurrency**: `ThreadPoolExecutor` processes multiple batches in parallel
- **Batch processing**: `llm.batch(prompts)` per chunk
- **Single, simple prompt** (no complex fallbacks)


In [None]:
!pip install OpenAI
!pip install langchain
!pip install langchain_community
!pip install Cohere
!pip install langchain-openai langchain-cohere python-dotenv

In [None]:
import os
import warnings
warnings.filterwarnings('ignore')
from google.colab import userdata

In [None]:
# Read the OpenAI key from the Colab secret store and publish it through the
# process environment under the same name.
openai_key = userdata.get("OPENAI_API_KEY")
os.environ.update({"OPENAI_API_KEY": openai_key})

In [None]:
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain_openai import ChatOpenAI

In [None]:

# --- Input ---
# Source CSV handed to pandas.read_csv; edit this to point at your own data.
CSV_PATH = "https://raw.githubusercontent.com/giridhar276/genai/refs/heads/main/datasets/Bank_Customer_conversations.csv"
# Column that holds the raw customer message.
TEXT_COL = "customer_text"

# --- Model settings (forwarded to ChatOpenAI) ---
MODEL = "gpt-4o-mini"
TEMPERATURE = 0
TIMEOUT = 60

# --- Throughput ---
# Number of rows sent in a single batch call.
BATCH_SIZE = 40
# Number of batches processed in parallel.
MAX_WORKERS = 4

# Derived by textual substitution on CSV_PATH, so when CSV_PATH is a URL this
# value is a URL as well.
OUTPUT_PATH = CSV_PATH.replace(".csv", "_with_summary_concurrent.csv")




In [None]:

# One fixed summarization prompt. The `{text}` placeholder is filled with the
# customer message via str.format and is wrapped in triple double-quotes.
PROMPT_TMPL = "".join(
    [
        "Summarize the customer's message in ONE clear sentence focusing on intent/issue and requested action. ",
        "Do not add details that are not present. Return only the summary text on a single line.",
        "\n\n",
        "CUSTOMER TEXT:\n",
        '"""{text}"""\n',
        "Summary:",
    ]
)


In [None]:

# Read the CSV, fail fast if the expected text column is absent, then pull
# that column out as a plain list of strings.
df = pd.read_csv(CSV_PATH)
if not (TEXT_COL in df.columns):
    raise ValueError(f"Column '{TEXT_COL}' not found in CSV.")
text_column = df[TEXT_COL]
texts = text_column.astype(str).tolist()
# Notebook display: row count and a preview of the first three rows.
len(texts), df.head(3)


In [None]:

# Chat model client used by the batch summarization calls; configured from the
# MODEL / TEMPERATURE / TIMEOUT constants.
llm = ChatOpenAI(
    model=MODEL,
    temperature=TEMPERATURE,
    timeout=TIMEOUT,
)

def chunk_list(seq, size):
    """Yield ``(start, chunk)`` pairs that walk ``seq`` in slices of ``size``.

    ``start`` is the index in ``seq`` where the chunk begins; the final chunk
    may be shorter than ``size``. An empty ``seq`` yields nothing.
    """
    starts = range(0, len(seq), size)
    yield from ((begin, seq[begin:begin + size]) for begin in starts)




In [None]:
def summarize_batch(texts):
    """Summarize a list of customer messages with a single ``llm.batch`` call.

    Each message is substituted into ``PROMPT_TMPL``; the return value is a
    list holding the whitespace-stripped ``content`` of every response, in
    the order ``llm.batch`` returns them (expected to match prompt order).
    """
    filled_prompts = [PROMPT_TMPL.format(text=message) for message in texts]
    summaries = []
    for reply in llm.batch(filled_prompts):
        summaries.append(reply.content.strip())
    return summaries

In [None]:

# Fan the batches out over a thread pool and write each result back into the
# slot range it came from, so the output stays in row order even though
# batches finish in arbitrary order.
chunks = list(chunk_list(texts, BATCH_SIZE))
summaries_out = [None] * len(texts)

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
    # Map each future to the (start offset, batch) it was submitted with.
    pending = {}
    for start, batch in chunks:
        pending[pool.submit(summarize_batch, batch)] = (start, batch)

    for done in as_completed(pending):
        start, batch = pending[done]
        stop = start + len(batch)
        # .result() re-raises any exception raised inside the worker.
        summaries_out[start:stop] = done.result()
        print(f"Completed rows {start}–{stop-1}")

# Notebook display: total number of summary slots.
len(summaries_out)


In [None]:

# Save
# Attach the summaries as a new column (one entry per row, in row order).
df["summary"] = summaries_out

# Write to a local file and report the path that was actually written.
# OUTPUT_PATH is not used here: it is derived from CSV_PATH, which in this
# notebook is an https:// URL, so it does not name the file written below.
saved_path = "batchprocessing.csv"
df.to_csv(saved_path, index=False)
print(f"Saved: {saved_path}")

# Notebook display: preview original text next to its summary.
df[[TEXT_COL, "summary"]].head(10)
