# Bitext Customer Support LLM Chatbot Training Dataset — EDA

**Dataset:** `bitext/Bitext-customer-support-llm-chatbot-training-dataset`  
**Source:** Hugging Face Hub (Datasets)

## 1. Setup & Download

In [None]:
# All dependencies are managed via pyproject.toml + uv
# To install/sync:  uv sync
# To register the Jupyter kernel:
#   uv run python -m ipykernel install --user --name contact-center --display-name "Contact Center (Python 3.13)"
# Then select the "Contact Center (Python 3.13)" kernel in Jupyter.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from collections import Counter
import textwrap
import warnings

# Notebook-wide display setup: silence every warning, then apply one seaborn
# theme and default figure resolution/size to all plots that follow.
warnings.filterwarnings("ignore")
sns.set_theme(style="whitegrid", palette="muted")
plt.rcParams.update({
    "figure.dpi": 120,
    "figure.figsize": (12, 5),
})

print("Libraries loaded.")

In [None]:
# Load the "train" split of the Bitext dataset through `load_dataset`
# and convert it to a pandas DataFrame for the rest of the analysis.
dataset = load_dataset(
    "bitext/Bitext-customer-support-llm-chatbot-training-dataset",
    split="train",
)
df = dataset.to_pandas()
n_rows, n_cols = df.shape
print(f"Dataset downloaded: {n_rows:,} rows, {n_cols} columns")

## 2. Basic Overview

In [None]:
# Preview the first three rows of the frame
df.iloc[:3]

In [None]:
# Dimensions, total deep memory footprint (reported in MB), then the
# per-column summary printed by DataFrame.info()
n_bytes = df.memory_usage(deep=True).sum()
print(f"Shape: {df.shape}")
print(f"Memory usage: {n_bytes / 1e6:.2f} MB")
print()
df.info()

In [None]:
# Null count for every column, plus the same count as a percentage of all rows
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100).round(2)
missing_df = pd.concat([missing, missing_pct], axis=1, keys=["Missing", "Pct (%)"])
print("Missing values per column:")
missing_df

In [None]:
# Duplicates at two levels: whole rows, and the instruction text on its own
n_dup_rows = df.duplicated().sum()
n_dup_instructions = df.duplicated(subset=["instruction"]).sum()
print(f"Fully duplicate rows:        {n_dup_rows:,}")
print(f"Duplicate instructions only:  {n_dup_instructions:,}")

In [None]:
# Number of distinct values in each column
for col, n_unique in df.nunique().items():
    print(f"{col:20s}  unique={n_unique:,}")

## 3. Category Analysis

In [None]:
# Row count for each category, and how many categories there are
cat_counts = df["category"].value_counts()
print(f"Number of categories: {len(cat_counts)}")
print()
print(cat_counts.to_string())

In [None]:
# Horizontal bar chart of samples per category, one viridis colour per bar.
fig, ax = plt.subplots(figsize=(12, 6))
cat_counts.plot.barh(ax=ax, color=sns.color_palette("viridis", len(cat_counts)))
ax.set_xlabel("Number of Samples")
ax.set_title("Samples per Category")
ax.invert_yaxis()
# Annotate every bar with its count. bar_label pads in points, so the gap
# between bar and label does not depend on the scale of the counts (a fixed
# data-unit offset such as "+ 50" only looks right for one dataset size).
ax.bar_label(
    ax.containers[0],
    labels=[f"{v:,}" for v in cat_counts.values],
    padding=3,
    fontsize=9,
)
plt.tight_layout()
plt.show()

## 4. Intent Analysis

In [None]:
# Intent frequencies: number of distinct intents, then the head and the tail
# of the value_counts() ranking.
intent_counts = df["intent"].value_counts()
print(f"Number of unique intents: {len(intent_counts)}")
for heading, subset in (
    ("Top 20 intents:", intent_counts.head(20)),
    ("Bottom 10 intents:", intent_counts.tail(10)),
):
    print()
    print(heading)
    print(subset.to_string())

In [None]:
# Horizontal bar chart of the first 25 entries of intent_counts
top_intents = intent_counts.head(25)
fig, ax = plt.subplots(figsize=(12, 8))
top_intents.plot.barh(ax=ax, color=sns.color_palette("crest", 25))
ax.set(xlabel="Number of Samples", title="Top 25 Intents by Frequency")
ax.invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
# How many distinct intents each category contains, largest first
intents_per_cat = (
    df.groupby("category")["intent"]
    .nunique()
    .sort_values(ascending=False)
)
print("Unique intents per category:")
print(intents_per_cat.to_string())

In [None]:
# Intent balance: summary statistics of the per-intent sample counts, followed
# by a histogram of those counts (how many intents have how many samples).
print(f"Samples per intent — min: {intent_counts.min()}, max: {intent_counts.max()}, "
      f"mean: {intent_counts.mean():.1f}, median: {intent_counts.median():.1f}")

fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(intent_counts.values, bins=30, edgecolor="white", color="steelblue")
ax.set_xlabel("Samples per Intent")
ax.set_ylabel("Number of Intents")
ax.set_title("Distribution of Samples per Intent (Class Balance)")
plt.tight_layout()
plt.show()

## 5. Flags / Language Tags Analysis

In [None]:
# Row count for each distinct value of the "flags" column
flags_counts = df["flags"].value_counts()
print(f"Unique flag values: {len(flags_counts)}")
print()
print(flags_counts.to_string())

In [None]:
# Vertical bar chart of rows per flag value, one Set2 colour per bar.
fig, ax = plt.subplots(figsize=(10, 5))
flags_counts.plot.bar(ax=ax, color=sns.color_palette("Set2", len(flags_counts)))
ax.set_ylabel("Count")
ax.set_title("Distribution of Language Flags")
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
# Annotate every bar with its count. bar_label pads in points, so the gap
# above each bar does not depend on the scale of the counts (a fixed
# data-unit offset such as "+ 100" only looks right for one dataset size).
ax.bar_label(
    ax.containers[0],
    labels=[f"{v:,}" for v in flags_counts.values],
    padding=3,
    fontsize=9,
)
plt.tight_layout()
plt.show()

In [None]:
# Contingency table of category (rows) against flags (columns), drawn as an
# annotated heatmap with integer cell labels.
ct = pd.crosstab(index=df["category"], columns=df["flags"])
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(
    ct,
    annot=True,
    fmt="d",
    cmap="YlOrRd",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Category × Flags Heatmap")
plt.tight_layout()
plt.show()

## 6. Text Length Analysis

In [None]:
# Add character-length and whitespace-split word-count columns for both text
# fields. Two passes keep the new columns in the order: both *_len columns
# first, then both *_word_count columns.
text_cols = ("instruction", "response")
for col in text_cols:
    df[f"{col}_len"] = df[col].str.len()
for col in text_cols:
    df[f"{col}_word_count"] = df[col].str.split().str.len()

length_cols = [
    "instruction_len",
    "response_len",
    "instruction_word_count",
    "response_word_count",
]
len_stats = df[length_cols].describe()
len_stats.round(1)

In [None]:
# Side-by-side histograms of character length: instructions on the left,
# responses on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

char_panels = (
    ("instruction_len", "Instruction Length (chars)", "#4C72B0"),
    ("response_len", "Response Length (chars)", "#DD8452"),
)
for panel_ax, (column, title, colour) in zip(axes, char_panels):
    panel_ax.hist(df[column], bins=50, edgecolor="white", color=colour, alpha=0.8)
    panel_ax.set_title(title)
    panel_ax.set_xlabel("Characters")
    panel_ax.set_ylabel("Frequency")

plt.suptitle("Character Length Distributions", fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Side-by-side histograms of word count: instructions on the left, responses
# on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

word_panels = (
    ("instruction_word_count", "Instruction Word Count", "#4C72B0"),
    ("response_word_count", "Response Word Count", "#DD8452"),
)
for panel_ax, (column, title, colour) in zip(axes, word_panels):
    panel_ax.hist(df[column], bins=40, edgecolor="white", color=colour, alpha=0.8)
    panel_ax.set_title(title)
    panel_ax.set_xlabel("Words")
    panel_ax.set_ylabel("Frequency")

plt.suptitle("Word Count Distributions", fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Scatter of instruction word count against response word count, followed by
# the correlation between the two columns.
instr_wc = df["instruction_word_count"]
resp_wc = df["response_word_count"]

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(instr_wc, resp_wc, alpha=0.15, s=10, color="steelblue")
ax.set(
    xlabel="Instruction Word Count",
    ylabel="Response Word Count",
    title="Instruction vs Response Word Count",
)
plt.tight_layout()
plt.show()

corr = instr_wc.corr(resp_wc)
print(f"Pearson correlation (instruction words vs response words): {corr:.3f}")

In [None]:
# Mean instruction/response word counts per category, ordered by the response
# mean (largest first) and drawn as grouped horizontal bars.
len_by_cat = (
    df.groupby("category")[["instruction_word_count", "response_word_count"]]
    .mean()
    .rename(columns={
        "instruction_word_count": "avg_instr_words",
        "response_word_count": "avg_resp_words",
    })
    .sort_values("avg_resp_words", ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 6))
len_by_cat.plot.barh(ax=ax)
ax.set(
    xlabel="Average Word Count",
    title="Average Instruction & Response Length by Category",
)
ax.invert_yaxis()
ax.legend(["Instruction", "Response"])
plt.tight_layout()
plt.show()

In [None]:
# Mean instruction/response word counts per flag value, ordered by the
# instruction mean (largest first) and drawn as grouped vertical bars.
len_by_flag = (
    df.groupby("flags")[["instruction_word_count", "response_word_count"]]
    .mean()
    .rename(columns={
        "instruction_word_count": "avg_instr_words",
        "response_word_count": "avg_resp_words",
    })
    .sort_values("avg_instr_words", ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 5))
len_by_flag.plot.bar(ax=ax)
ax.set(
    ylabel="Average Word Count",
    title="Average Instruction & Response Length by Flag",
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
ax.legend(["Instruction", "Response"])
plt.tight_layout()
plt.show()

## 7. Word Frequency & Word Clouds

In [None]:
from wordcloud import WordCloud

# Lower-cased corpora: all instructions (and, separately, all responses)
# joined into one space-separated string.
instr_text = " ".join(df["instruction"].str.lower())
resp_text = " ".join(df["response"].str.lower())

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Settings shared by both clouds; only the colormap differs between them.
cloud_kwargs = dict(width=800, height=400, background_color="white", max_words=100)
wc_instr = WordCloud(colormap="viridis", **cloud_kwargs).generate(instr_text)
wc_resp = WordCloud(colormap="magma", **cloud_kwargs).generate(resp_text)

# Left panel: instructions. Right panel: responses.
for panel_ax, cloud, title in (
    (axes[0], wc_instr, "Instructions — Word Cloud"),
    (axes[1], wc_resp, "Responses — Word Cloud"),
):
    panel_ax.imshow(cloud, interpolation="bilinear")
    panel_ax.set_title(title, fontsize=14)
    panel_ax.axis("off")

plt.tight_layout()
plt.show()

In [None]:
# 20 most frequent words in instructions and in responses. Tokens come from a
# plain whitespace split of the lower-cased corpora built for the word clouds;
# tokens of two characters or fewer and the stop words below are discarded.
from collections import Counter

stop_words = {"i", "me", "my", "the", "a", "an", "is", "to", "of", "and", "in",
              "it", "for", "on", "with", "that", "this", "can", "you", "do",
              "be", "have", "was", "are", "has", "or", "at", "by", "if", "not",
              "from", "but", "so", "we", "they", "what", "how", "about", "there"}


def _content_words(text):
    """Return the whitespace tokens of *text* that are longer than 2 chars and not stop words."""
    return [tok for tok in text.split() if len(tok) > 2 and tok not in stop_words]


instr_words = _content_words(instr_text)
top_instr = Counter(instr_words).most_common(20)

resp_words = _content_words(resp_text)
top_resp = Counter(resp_words).most_common(20)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for panel_ax, ranking, title, colour in (
    (axes[0], top_instr, "Top 20 Words in Instructions", "#4C72B0"),
    (axes[1], top_resp, "Top 20 Words in Responses", "#DD8452"),
):
    words, counts = zip(*ranking)
    # Reverse both sequences so the most frequent word is drawn last.
    panel_ax.barh(words[::-1], counts[::-1], color=colour)
    panel_ax.set_title(title)
    panel_ax.set_xlabel("Frequency")

plt.tight_layout()
plt.show()

## 8. Response Diversity Analysis

In [None]:
# Distinct response texts per intent: summary statistics, then the head and
# the tail of the descending ranking.
resp_per_intent = (
    df.groupby("intent")["response"]
    .nunique()
    .sort_values(ascending=False)
)
print(f"Unique responses per intent — min: {resp_per_intent.min()}, max: {resp_per_intent.max()}, "
      f"mean: {resp_per_intent.mean():.1f}")
for heading, subset in (
    ("Top 10 intents by response diversity:", resp_per_intent.head(10)),
    ("Bottom 10 intents by response diversity:", resp_per_intent.tail(10)),
):
    print()
    print(heading)
    print(subset.to_string())

In [None]:
# Histogram of the per-intent unique-response counts
fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(resp_per_intent.values, bins=30, edgecolor="white", color="#55A868")
ax.set(
    xlabel="Unique Responses",
    ylabel="Number of Intents",
    title="Distribution of Unique Responses per Intent",
)
plt.tight_layout()
plt.show()

In [None]:
# Distinct instruction texts per intent: summary statistics and a histogram
instr_per_intent = (
    df.groupby("intent")["instruction"]
    .nunique()
    .sort_values(ascending=False)
)
print(f"Unique instructions per intent — min: {instr_per_intent.min()}, max: {instr_per_intent.max()}, "
      f"mean: {instr_per_intent.mean():.1f}")

fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(instr_per_intent.values, bins=30, edgecolor="white", color="#4C72B0")
ax.set(
    xlabel="Unique Instructions",
    ylabel="Number of Intents",
    title="Distribution of Unique Instructions per Intent (Paraphrase Richness)",
)
plt.tight_layout()
plt.show()

## 9. Sample Inspection

In [None]:
# Print up to two sampled rows (fixed seed, so reruns match) for each of the
# first five distinct values that df["category"].unique() returns. Only those
# five categories are shown, not every category.
sample_cats = df["category"].unique()[:5]
for cat in sample_cats:
    print(f"\n{'='*80}")
    print(f"CATEGORY: {cat}")
    print(f"{'='*80}")
    cat_rows = df[df["category"] == cat]
    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so cap the request at the size of the category.
    samples = cat_rows.sample(min(2, len(cat_rows)), random_state=42)
    for _, row in samples.iterrows():
        print(f"\n  Intent: {row['intent']}")
        print(f"  Flag:   {row['flags']}")
        print(f"  User:   {row['instruction']}")
        # Wrap the response at 80 columns with a hanging indent under "Bot:".
        wrapped = textwrap.fill(row['response'], width=80, initial_indent='  Bot:    ', subsequent_indent='          ')
        print(wrapped)

## 10. Summary & Key Findings

In [None]:
# Final summary of the headline numbers.
# Column listing, duplicate check and missing-value total are restricted to
# the dataset's own columns: the length/word-count columns added during the
# text-length analysis are derived from the text fields, so a null text would
# otherwise be counted once per derived column as well.
derived_cols = {
    "instruction_len",
    "response_len",
    "instruction_word_count",
    "response_word_count",
}
source_cols = [c for c in df.columns if c not in derived_cols]
source_df = df[source_cols]

print("=" * 60)
print("  EDA SUMMARY")
print("=" * 60)
print(f"  Total samples:             {len(df):,}")
print(f"  Columns:                   {source_cols}")
print(f"  Categories:                {df['category'].nunique()}")
print(f"  Intents:                   {df['intent'].nunique()}")
print(f"  Unique flags:              {df['flags'].nunique()}")
print(f"  Duplicate rows:            {source_df.duplicated().sum():,}")
print(f"  Missing values:            {source_df.isnull().sum().sum():,}")
print(f"  Avg instruction words:     {df['instruction_word_count'].mean():.1f}")
print(f"  Avg response words:        {df['response_word_count'].mean():.1f}")
print(f"  Unique instructions:       {df['instruction'].nunique():,}")
print(f"  Unique responses:          {df['response'].nunique():,}")
print("=" * 60)