In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
import joblib
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load dataset
df = pd.read_csv('twcs.csv')
print(f"Total records: {len(df)}")

# Filter inbound messages (customer messages only)
df = df[df['inbound'] == True].copy()
print(f"Inbound messages: {len(df)}")

Total records: 2811774
Inbound messages: 1537843


In [3]:
def assign_label(text):
    """Assign label using keyword-based weak supervision. Returns None for ties."""
    if pd.isna(text):
        return None
    
    text_lower = text.lower()
    
    billing_kw = ['bill', 'charge', 'payment', 'invoice', 'refund', 'subscription', 
                  'pay', 'card', 'credit', 'price', 'cost', 'fee', 'receipt']
    
    technical_kw = ['error', 'crash', 'bug', 'issue', 'problem', 'not working', 'broken',
                    'slow', 'loading', 'update', 'ios', 'android', 'app', 'website',
                    'battery', 'freeze', 'lag', 'download', 'install', 'version']
    
    account_kw = ['account', 'password', 'login', 'sign in', 'username', 'profile',
                  'reset', 'verify', 'access', 'locked', 'deactivate', 'email',
                  'register', 'authentication', 'security']
    
    billing_score = sum(1 for kw in billing_kw if kw in text_lower)
    technical_score = sum(1 for kw in technical_kw if kw in text_lower)
    account_score = sum(1 for kw in account_kw if kw in text_lower)
    
    scores = [billing_score, technical_score, account_score]
    max_score = max(scores)
    
    # No match or tie -> None
    if max_score == 0 or scores.count(max_score) > 1:
        return None
    
    if billing_score == max_score:
        return 'Billing'
    elif technical_score == max_score:
        return 'Technical'
    else:
        return 'Account'

df['label'] = df['text'].apply(assign_label)

# Drop rows with no label
df = df[df['label'].notna()].copy()
print(f"\nLabeled records: {len(df)}")
print(f"\nLabel distribution:\n{df['label'].value_counts()}")


Labeled records: 533572

Label distribution:
label
Technical    325085
Billing      139556
Account       68931
Name: count, dtype: int64


In [4]:
def clean_text(text):
    if pd.isna(text):
        return ""
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+|#\w+', '', text)  # Remove mentions/hashtags
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special chars
    text = text.lower().strip()
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text

df['text_clean'] = df['text'].apply(clean_text)
df = df[df['text_clean'].str.len() > 0].copy()
print(f"Records after cleaning: {len(df)}")

Records after cleaning: 531781


In [5]:
# Sample for faster training
SAMPLE_SIZE = min(20000, len(df))
df_sample = df.sample(n=SAMPLE_SIZE, random_state=42)

X = df_sample['text_clean']
y = df_sample['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training: {len(X_train)} | Test: {len(X_test)}")

Training: 16000 | Test: 4000


In [6]:
# Create Pipeline with TF-IDF + Logistic Regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=2, max_df=0.8)),
    ('clf', LogisticRegression(max_iter=2000, solver='lbfgs', class_weight='balanced', random_state=42))
])

print("Training model...")
pipeline.fit(X_train, y_train)
print("Training complete!")

Training model...
Training complete!


In [7]:
y_pred = pipeline.predict(X_test)

In [8]:
# Fixed label order
label_order = ['Billing', 'Technical', 'Account']

# Metrics
accuracy = accuracy_score(y_test, y_pred)
macro_precision = precision_score(y_test, y_pred, average='macro', labels=label_order)
macro_recall = recall_score(y_test, y_pred, average='macro', labels=label_order)
macro_f1 = f1_score(y_test, y_pred, average='macro', labels=label_order)

print("="*60)
print("CLASSIFICATION METRICS")
print("="*60)
print(f"Accuracy:           {accuracy:.4f}")
print(f"Macro Precision:    {macro_precision:.4f}")
print(f"Macro Recall:       {macro_recall:.4f}")
print(f"Macro F1-Score:     {macro_f1:.4f}")
print("\n" + "="*60)
print("CLASSIFICATION REPORT")
print("="*60)
print(classification_report(y_test, y_pred, labels=label_order))
print("="*60)
print("CONFUSION MATRIX")
print("="*60)
print(confusion_matrix(y_test, y_pred, labels=label_order))
print(f"\nLabel order: {label_order}")

CLASSIFICATION METRICS
Accuracy:           0.9457
Macro Precision:    0.9386
Macro Recall:       0.9390
Macro F1-Score:     0.9388

CLASSIFICATION REPORT
              precision    recall  f1-score   support

     Billing       0.91      0.93      0.92      1062
   Technical       0.96      0.96      0.96      2434
     Account       0.95      0.93      0.94       504

    accuracy                           0.95      4000
   macro avg       0.94      0.94      0.94      4000
weighted avg       0.95      0.95      0.95      4000

CONFUSION MATRIX
[[ 983   67   12]
 [  90 2329   15]
 [  11   22  471]]

Label order: ['Billing', 'Technical', 'Account']


In [9]:
joblib.dump(pipeline, 'model.pkl')
print("âœ“ Model saved as model.pkl")

âœ“ Model saved as model.pkl


In [10]:
# Load sample
df_demo = pd.read_csv('sample.csv')
df_demo['text_clean'] = df_demo['text'].apply(clean_text)
df_demo = df_demo[df_demo['text_clean'].str.len() > 0]

# Predict first 10
demo_texts = df_demo['text'].head(10).values
demo_clean = df_demo['text_clean'].head(10).values
predictions = pipeline.predict(demo_clean)

print("="*80)
print("DEMO PREDICTIONS (sample.csv - First 10 rows)")
print("="*80)
for i, (text, pred) in enumerate(zip(demo_texts, predictions), 1):
    print(f"\n[{i}] {text[:70]}...")
    print(f"    â†’ {pred}")
print("\n" + "="*80)

DEMO PREDICTIONS (sample.csv - First 10 rows)

[1] @AppleSupport causing the reply to be disregarded and the tapped notif...
    â†’ Technical

[2] @105835 Your business means a lot to us. Please DM your name, zip code...
    â†’ Technical

[3] @76328 I really hope you all change but I'm sure you won't! Because yo...
    â†’ Technical

[4] @105836 LiveChat is online at the moment - https://t.co/SY94VtU8Kq or ...
    â†’ Technical

[5] @VirginTrains see attached error message. I've tried leaving a voicema...
    â†’ Technical

[6] @105836 Have you tried from another device, Miriam ^MM...
    â†’ Technical

[7] @VirginTrains yep, I've tried laptop too several times over the past w...
    â†’ Technical

[8] @105836 It's working OK from here, Miriam. Does this link help https:/...
    â†’ Technical

[9] @VirginTrains I still haven't heard &amp; the number I'm directed to b...
    â†’ Account

[10] @105836 That's what we're here for Miriam ðŸ˜Š  The team should send you ...
    â†’ Account
