## Strategy Summary

This notebook follows an optimal strategy for preprocessing customer-agent dialogues for sentiment classification:

- Retains the `customer` and `agent` speaker tags to preserve speaker roles (the trailing colon is stripped along with other punctuation).
- Removes a generic agent greeting when it is the opening line of the conversation, to avoid introducing bias.
- Strips noise: punctuation, emails, phone numbers, URLs, and excess whitespace.
- Removes known polite/boilerplate phrases that add no sentiment signal.
- Fixes common spelling issues that harm tokenization.
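- Splits the dataset using stratified sampling so that the train and validation sets keep the same sentiment class proportions (this does not rebalance the classes; the positive class remains rare).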
        

In [1]:
import pandas as pd
import re
import string
import os
from sklearn.model_selection import train_test_split

In [2]:
# Load only the two columns used downstream and drop rows missing either one
df = pd.read_csv("data/train.csv")[["customer_sentiment", "conversation"]].dropna()

# Encode the sentiment string as an integer class id
sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
df['label'] = df['customer_sentiment'].map(sentiment_map)

# Series.map yields NaN for any value that is not a key of the mapping.
# Fail fast here rather than letting NaN labels reach the stratified split.
unmapped_sentiments = df.loc[df['label'].isna(), 'customer_sentiment'].unique()
if len(unmapped_sentiments) > 0:
    raise ValueError(
        f"Unrecognised customer_sentiment values: {sorted(map(str, unmapped_sentiments))}"
    )

In [3]:
# Courtesy / boilerplate wording to delete from transcripts.
# The patterns are written for text that is already lower-cased and has had
# punctuation removed (hence "youre" and "thats").
# Keep the order: the patterns are applied one after another, so a longer
# phrase must come before any shorter phrase it contains.
custom_phrases = [
    r'\bis there anything else i can (assist|help) you with\b',
    r'\bthank you for choosing brownbox\b',
    r'\bthank you\b',
    r'\byoure welcome\b',
    r'\btake care\b',
    r'\bgoodbye\b',
    r'\bplease\b',
    r'\bthanks\b',
    r'\bsure\b',
    r'\bno thats all\b',
    r'\bhave a (nice|great|good) day\b',
    r'\bappreciate\b',
    r'\bfor contacting brownbox customer support\b',
]

# Broken spelling -> intended wording.
# Entries are applied in insertion order, so do not reorder them.
misspellings = {
    'ts': 'this',
    'witn': 'within',
    'anytng': 'anything',
    'ithis': 'it has',
    'thathis': 'that is',
    'as you': 'assure you',
    'en that': 'ensure that',
}

def clean_conversation(text, phrases=None, corrections=None):
    """Normalise one customer/agent transcript for sentiment classification.

    Steps, in order:
      1. Lower-case the text and delete e-mail addresses, phone-like digit
         runs and URLs.
      2. Drop the first line when it is an agent line containing a generic
         greeting (only if more lines follow).
      3. Join the lines, strip punctuation and collapse whitespace.
      4. Delete every boilerplate phrase matched by ``phrases``.
      5. Apply ``corrections`` to whole tokens only.
      6. Remove a dangling ``customer``/``agent`` tag at the very end and
         collapse whitespace again.

    Args:
        text: Raw transcript; lines are expected to be separated by ``\\n``
            and prefixed with ``agent:`` / ``customer:``.
        phrases: Iterable of regex patterns to delete. Defaults to the
            module-level ``custom_phrases``.
        corrections: Mapping of wrong -> right wording, applied in iteration
            order. Defaults to the module-level ``misspellings``.

    Returns:
        The cleaned transcript as a single-line string with single spaces
        and no leading/trailing whitespace.
    """
    if phrases is None:
        phrases = custom_phrases
    if corrections is None:
        corrections = misspellings

    text = text.lower()
    # Contact details and links are removed while '@', '.', '-' and '/' are
    # still present; after punctuation stripping they would be unrecognisable.
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\+?\d[\d\s\-().]{8,}\d', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)

    lines = text.strip().split('\n')

    # Whole-word matching: a plain substring test for "hi" also fires on
    # "this", "shipping", "which", ... and would drop almost every opening line.
    greeting = re.compile(
        r'\b(?:thank you for calling|hi|hello|this is|my name is|'
        r'how can i help you|how may i assist you)\b'
    )
    # Require more than one line so that a transcript stored on a single line
    # is never reduced to an empty string.
    if len(lines) > 1 and lines[0].startswith("agent:") and greeting.search(lines[0]):
        lines = lines[1:]

    text = ' '.join(lines)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Multi-word phrase patterns assume single spaces between words.
    text = re.sub(r'\s+', ' ', text).strip()

    for phrase in phrases:
        text = re.sub(phrase, '', text)

    for wrong, right in corrections.items():
        # Match complete tokens only; substring replacement would rewrite
        # "products" -> "producthis" or "was your" -> "wassure your".
        pattern = r'(?<!\w)' + re.escape(wrong) + r'(?!\w)'
        # A callable replacement keeps any backslash in `right` literal.
        text = re.sub(pattern, lambda _match, fixed=right: fixed, text)

    # Remove a speaker tag left dangling at the very end of the transcript.
    text = re.sub(r'\b(customer|agent)\b\s*$', '', text)

    # Phrase removal leaves gaps behind; collapse them.
    return re.sub(r'\s+', ' ', text).strip()

In [4]:
# Run every transcript through clean_conversation, overwriting the raw text.
# The cast to str guards against non-string cells before the function is called.
conversations_as_text = df['conversation'].astype(str)
df['conversation'] = conversations_as_text.apply(clean_conversation)

In [5]:
# Hold out 20% for validation; stratifying on the label keeps the class
# proportions of both splits the same, and the fixed seed makes the split repeatable.
model_frame = df[['conversation', 'label']]
train_df, val_df = train_test_split(
    model_frame,
    test_size=0.2,
    random_state=42,
    stratify=model_frame['label'],
)

In [6]:
# Persist both splits next to the source data
os.makedirs("data", exist_ok=True)
train_df.to_csv("data/train_cleaned.csv", index=False)
val_df.to_csv("data/val_cleaned.csv", index=False)

# Report the class distribution of each split: absolute counts first,
# then proportions rounded to three decimals.
for heading, split_df in (
    ("📊 Train set distribution:", train_df),
    ("\n📊 Validation set distribution:", val_df),
):
    split_labels = split_df['label']
    print(heading)
    print(split_labels.value_counts())
    print(split_labels.value_counts(normalize=True).round(3))

📊 Train set distribution:
label
1    434
0    329
2     13
Name: count, dtype: int64
label
1    0.559
0    0.424
2    0.017
Name: proportion, dtype: float64

📊 Validation set distribution:
label
1    108
0     82
2      4
Name: count, dtype: int64
label
1    0.557
0    0.423
2    0.021
Name: proportion, dtype: float64
