# Support Ticket Multi-Class Classifier
**Categories:** Billing | Technical | Account  
**Method:** TF-IDF + Logistic Regression

**Key Improvements:**
- ✅ Label leakage mitigation: exclude weak supervision keywords from TF-IDF features
- ✅ Cross-validation for robust evaluation
- ✅ Train/Validation/Test split
- ✅ Stopword removal
- ✅ Stemming (Porter stemmer; no lemmatization is applied)
- ✅ Emoji and non-English text handling

In [None]:
import pandas as pd
import numpy as np
import re
import emoji
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import joblib
import warnings
# Silence all warnings for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings that can be
# useful while tuning (e.g. solver convergence) — confirm it is intended.
warnings.filterwarnings('ignore')

# Download NLTK data
# (the English stopword corpus read by stopwords.words('english') below)
nltk.download('stopwords', quiet=True)
print("Libraries loaded successfully!")

## 1. Load and Filter Data

In [None]:
# Read the raw ticket dump; every later cell works on `df`.
df = pd.read_csv('twcs.csv')
print(f"Total records: {len(df)}")

# Keep only rows flagged as inbound (customer-authored messages). The result
# is copied so later column assignments act on an independent frame.
is_inbound = df['inbound'] == True
df = df.loc[is_inbound].copy()
print(f"Inbound messages: {len(df)}")

## 2. Weak Supervision Labeling

**Label Leakage Mitigation Strategy:**
- Use diverse, contextual keywords for labeling
- Later exclude the stemmed forms of these labeling keywords from the TF-IDF feature space (the cleaned text is stemmed, so the keywords are stemmed to match)
- This forces the model to learn from context and patterns, not just keyword matching

In [None]:
# Define labeling keywords (will be excluded from features later)
LABELING_KEYWORDS = {
    'billing': ['bill', 'charge', 'payment', 'invoice', 'refund', 'subscription',
                'pay', 'card', 'credit', 'price', 'cost', 'fee', 'receipt', 'paid',
                'transaction', 'billing', 'overcharge'],

    'technical': ['error', 'crash', 'bug', 'issue', 'problem', 'broken',
                  'slow', 'loading', 'update', 'ios', 'android', 'app', 'website',
                  'battery', 'freeze', 'lag', 'download', 'install', 'version',
                  'device', 'glitch', 'wifi', 'connection'],

    'account': ['account', 'password', 'login', 'username', 'profile',
                'reset', 'verify', 'access', 'locked', 'deactivate', 'email',
                'register', 'authentication', 'security', 'settings'],
}

# Inflectional endings accepted after a keyword, so "charged", "crashes" and
# "installing" still count while "happy" (app) or "feel" (fee) do not.
_KEYWORD_SUFFIX = r'(?:s|es|d|ed|ing)?'

# Display label returned for each LABELING_KEYWORDS category.
_CATEGORY_LABELS = {
    'billing': 'Billing',
    'technical': 'Technical',
    'account': 'Account',
}


def _compile_keyword_pattern(keywords):
    """Build one whole-word regex that captures whichever keyword matched."""
    # Longest first, so "billing" is captured as itself and not as "bill" + "ing".
    alternatives = '|'.join(
        re.escape(kw) for kw in sorted(keywords, key=len, reverse=True)
    )
    return re.compile(rf'\b({alternatives}){_KEYWORD_SUFFIX}\b')


# Compiled once at import time; assign_label runs once per row of the dataset.
_KEYWORD_PATTERNS = {
    category: _compile_keyword_pattern(keywords)
    for category, keywords in LABELING_KEYWORDS.items()
}


def assign_label(text):
    """Assign a weak-supervision label from keyword hits.

    Each category's score is the number of *distinct* keywords that occur in
    ``text`` as whole words (optionally followed by a simple inflection such
    as ``s``, ``ed`` or ``ing``). Matching on word boundaries avoids the false
    hits of plain substring search ("happy" containing "app", "feel"
    containing "fee") and stops overlapping keywords such as "bill"/"billing"
    from counting one word twice.

    Args:
        text: Raw message text. Anything that is not a ``str`` (``None``,
            ``NaN``, numbers) is treated as unlabeled.

    Returns:
        ``'Billing'``, ``'Technical'`` or ``'Account'`` when exactly one
        category has the highest non-zero score; ``None`` when nothing
        matches or the top score is tied.
    """
    if not isinstance(text, str):
        return None

    text_lower = text.lower()
    scores = {
        category: len({match.group(1) for match in pattern.finditer(text_lower)})
        for category, pattern in _KEYWORD_PATTERNS.items()
    }

    max_score = max(scores.values())
    if max_score == 0:
        return None

    # A tie carries no usable signal, so the row is left unlabeled.
    winners = [category for category, score in scores.items() if score == max_score]
    if len(winners) != 1:
        return None

    return _CATEGORY_LABELS[winners[0]]

# Label every message; assign_label returns None when no category wins
# (no keyword hit, or a tie between categories).
df['label'] = df['text'].apply(assign_label)

# Drop rows with no label
# (.copy() gives later cells an independent frame to add columns to)
df = df[df['label'].notna()].copy()
print(f"\nLabeled records: {len(df)}")
print(f"\nLabel distribution:\n{df['label'].value_counts()}")

## 3. Enhanced Text Cleaning

**Improvements:**
- Convert emojis to text descriptions
- Remove every character that is not an ASCII letter, digit, or whitespace (this also strips punctuation and accented letters)
- Remove stopwords
- Apply stemming

In [None]:
# Shared NLP helpers: a set of English stopwords (for fast membership tests)
# and a Porter stemmer. Both are reused when the TF-IDF stop list is built.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


def clean_text(text):
    """Normalize a raw message into a space-joined string of stemmed tokens.

    Missing values (per ``pd.isna``) yield ``""``. Otherwise emojis are
    replaced by their space-delimited textual names, URLs, @mentions and
    #hashtags are removed, every character other than ASCII letters, digits
    and whitespace is deleted, and the text is lowercased. Tokens that are
    stopwords or shorter than three characters are dropped; the rest are
    stemmed.
    """
    if pd.isna(text):
        return ""

    # Order matters: emoji names first, then URLs and handles while their
    # punctuation is still present, and only then the catch-all character strip.
    stripped = emoji.demojize(text, delimiters=(" ", " "))
    for pattern in (r'http\S+|www\S+', r'@\w+|#\w+', r'[^a-zA-Z0-9\s]'):
        stripped = re.sub(pattern, '', stripped)

    kept = []
    for token in stripped.lower().strip().split():
        if token in stop_words or len(token) <= 2:
            continue
        kept.append(stemmer.stem(token))

    # Joining on single spaces also collapses any runs of whitespace.
    return ' '.join(kept)

# Clean every message, then drop rows whose cleaned text came out empty
# (clean_text returns "" for missing input or when no token survives).
df['text_clean'] = df['text'].apply(clean_text)
df = df[df['text_clean'].str.len() > 0].copy()
print(f"Records after cleaning: {len(df)}")
print(f"\nExample cleaned text:")
print(df[['text', 'text_clean', 'label']].head(3))

## 4. Train/Validation/Test Split

**Improved data split:**
- Use larger sample (50k instead of 20k)
- Three-way split: 70% train, 15% validation, 15% test
- Stratified sampling

In [None]:
# Use larger sample for better generalization
# (capped at the number of labeled, cleaned rows actually available)
SAMPLE_SIZE = min(50000, len(df))
df_sample = df.sample(n=SAMPLE_SIZE, random_state=42)

# Features are the cleaned/stemmed text; targets are the weak-supervision labels.
X = df_sample['text_clean']
y = df_sample['label']

# First split: train+val (85%) vs test (15%)
# stratify=y keeps the class proportions the same on both sides.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

# Second split: train (70% total) vs validation (15% total)
# test_size is a fraction of the remaining 85%, hence 0.176 rather than 0.15.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.176, random_state=42, stratify=y_temp  # 0.176 * 0.85 ≈ 0.15
)

# Report each split's size as a share of the sampled data.
print(f"Training:   {len(X_train)} ({len(X_train)/len(X)*100:.1f}%)")
print(f"Validation: {len(X_val)} ({len(X_val)/len(X)*100:.1f}%)")
print(f"Test:       {len(X_test)} ({len(X_test)/len(X)*100:.1f}%)")

## 5. Custom TF-IDF Vectorizer (Label Leakage Mitigation)

**Critical fix:** Exclude labeling keywords from TF-IDF features to prevent circular reasoning

In [None]:
# Flatten all labeling keywords
# They are stemmed here because text_clean is already stemmed, so the raw
# keyword ("charge") would not equal the token found in the text ("charg").
all_labeling_keywords = set()
for keywords in LABELING_KEYWORDS.values():
    all_labeling_keywords.update([stemmer.stem(kw) for kw in keywords])

print(f"Excluding {len(all_labeling_keywords)} labeling keywords from features")
# NOTE: this is a slice of a set, so which ten are shown can differ between runs.
print(f"Sample excluded keywords: {list(all_labeling_keywords)[:10]}")

# Custom stop words: English stopwords + labeling keywords
# Only tokens equal to a stemmed keyword are removed; other tokens that merely
# co-occur with the keywords remain available to the model.
custom_stop_words = list(stop_words) + list(all_labeling_keywords)

# TF-IDF with bigrams, excluding labeling keywords
# min_df=3 drops terms seen in fewer than 3 documents; max_df=0.7 drops terms
# seen in more than 70% of documents; at most 5000 features are kept.
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.7,
    stop_words=custom_stop_words
)

# Fit vocabulary and IDF weights on the training split only; validation and
# test are transformed with that fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

print(f"\nTF-IDF shape: {X_train_tfidf.shape}")
print(f"Features extracted: {len(vectorizer.get_feature_names_out())}")

## 6. Train Model with Cross-Validation

In [None]:
# Train model
# class_weight='balanced' re-weights classes inversely to their frequency so
# the minority categories are not swamped by the majority one.
model = LogisticRegression(
    max_iter=2000,
    solver='lbfgs',
    class_weight='balanced',
    random_state=42
)

print("Training model...")
model.fit(X_train_tfidf, y_train)
print("Training complete!")

# Cross-validation on training set
# The folds are scored on the cleaned text through a vectorizer + classifier
# pipeline rather than on X_train_tfidf. X_train_tfidf was built from a
# vocabulary and IDF weights fitted on ALL training rows, so scoring it
# directly lets each held-out fold influence its own features. Passing the
# pipeline makes cross_val_score refit the vectorizer inside every fold.
# cross_val_score clones the estimator it is given, so the fitted
# `vectorizer` and `model` above are left untouched.
print("\nRunning 5-Fold Cross-Validation...")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_pipeline = Pipeline([
    ('tfidf', vectorizer),
    ('clf', model)
])
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=cv, scoring='f1_macro')

print(f"\nCross-Validation F1-Scores: {cv_scores}")
# Spread is reported as two standard deviations across the five folds.
print(f"Mean CV F1: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

## 7. Validation Set Performance

In [None]:
# Predict on the validation split with the fitted model.
y_val_pred = model.predict(X_val_tfidf)

# Macro F1 averages the per-class F1 scores with equal weight per class.
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred, average='macro')

print("="*60)
print("VALIDATION SET PERFORMANCE")
print("="*60)
print(f"Accuracy:  {val_accuracy:.4f}")
print(f"Macro F1:  {val_f1:.4f}")
print("\n" + classification_report(y_val, y_val_pred))

## 8. Test Set Evaluation (Final Metrics)

In [None]:
# Score the held-out test split with the already-fitted model.
y_test_pred = model.predict(X_test_tfidf)

# Pin the class order so the metrics, report rows and matrix axes line up.
label_order = ['Billing', 'Technical', 'Account']
macro_args = dict(average='macro', labels=label_order)

# Headline metrics; the macro variants weight every class equally.
accuracy = accuracy_score(y_test, y_test_pred)
macro_precision = precision_score(y_test, y_test_pred, **macro_args)
macro_recall = recall_score(y_test, y_test_pred, **macro_args)
macro_f1 = f1_score(y_test, y_test_pred, **macro_args)

rule = "="*60

print(rule)
print("TEST SET PERFORMANCE (FINAL METRICS)")
print(rule)
print(f"Accuracy:           {accuracy:.4f}")
print(f"Macro Precision:    {macro_precision:.4f}")
print(f"Macro Recall:       {macro_recall:.4f}")
print(f"Macro F1-Score:     {macro_f1:.4f}")

# Per-class breakdown.
print("\n" + rule)
print("CLASSIFICATION REPORT")
print(rule)
print(classification_report(y_test, y_test_pred, labels=label_order))

# Rows are true labels and columns are predictions, both in label_order.
print(rule)
print("CONFUSION MATRIX")
print(rule)
print(confusion_matrix(y_test, y_test_pred, labels=label_order))
print(f"\nLabel order: {label_order}")

## 9. Export Model

In [None]:
# Create pipeline for deployment
# Both steps are already fitted, so the pipeline can predict without refitting.
# NOTE(review): the vectorizer was fitted on text_clean, so callers of the
# saved pipeline must run their input through clean_text first — that step is
# not part of the exported object.
pipeline = Pipeline([
    ('tfidf', vectorizer),
    ('clf', model)
])

# model.pkl holds the whole pipeline (vectorizer + classifier); the vectorizer
# is additionally saved on its own.
joblib.dump(pipeline, 'model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
print("✓ Model saved as model.pkl")
print("✓ Vectorizer saved as tfidf_vectorizer.pkl")

## 10. Demo: Predict on sample.csv

In [None]:
# Read the demo file and clean it exactly like the training data.
df_demo = pd.read_csv('sample.csv')
df_demo['text_clean'] = df_demo['text'].apply(clean_text)
has_content = df_demo['text_clean'].str.len() > 0
df_demo = df_demo[has_content]

# Take the first ten usable rows: raw text for display, cleaned text for the model.
first_rows = df_demo.head(10)
demo_texts = first_rows['text'].values
demo_clean = first_rows['text_clean'].values
demo_tfidf = vectorizer.transform(demo_clean)
predictions = model.predict(demo_tfidf)

wide_rule = "="*80

print(wide_rule)
print("DEMO PREDICTIONS (sample.csv - First 10 rows)")
print(wide_rule)
# Show a 70-character preview of each message, numbered from 1, with its label.
for i, (text, pred) in enumerate(zip(demo_texts, predictions), start=1):
    print(f"\n[{i}] {text[:70]}...")
    print(f"    → {pred}")
print("\n" + wide_rule)

## Summary of Improvements

✅ **Label Leakage Mitigation**: Excluded all weak supervision keywords from TF-IDF features

✅ **Validation Set**: Train/Val/Test split (70/15/15) for proper model monitoring

✅ **Cross-Validation**: 5-fold stratified CV for robust performance estimation

✅ **Larger Sample**: Increased from 20k to 50k samples

✅ **Stopword Removal**: Using NLTK stopwords

✅ **Stemming**: Porter Stemmer for normalization

✅ **Emoji Handling**: Convert emojis to text descriptions

✅ **Non-English Text**: Stripping every character other than ASCII letters, digits, and whitespace (no language detection is performed)