# False Positive Brand Classifier

This notebook develops a classifier to detect whether an article mentioning a brand name is actually about the sportswear company or something else (e.g., "Puma" the animal, "Patagonia" the region).

## Objective
Replace Claude's false positive detection with a cost-efficient ML classifier that can filter articles before expensive LLM labeling.

## Contents
1. [Data Loading & Exploration](#1-data-loading--exploration)
2. [Target Variable Analysis](#2-target-variable-analysis)
3. [Text Analysis & EDA](#3-text-analysis--eda)
4. [Data Preprocessing](#4-data-preprocessing)
5. [Train/Validation/Test Split](#5-trainvalidationtest-split)
6. [Baseline Models](#6-baseline-models)
7. [Hyperparameter Tuning](#7-hyperparameter-tuning)
8. [Model Selection & Final Evaluation](#8-model-selection--final-evaluation)
9. [Conclusions](#9-conclusions)

## Setup

In [None]:
# Standard imports
import sys
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Sklearn imports
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV

# Make the `src` package importable: the project root is taken to be the parent
# of the current working directory, so this presumably expects the notebook to
# be run from a first-level subdirectory of the repository (TODO confirm).
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

# Project imports
from src.fp1_nb.data_utils import (
    load_jsonl_data,
    analyze_target_stats,
    plot_target_distribution,
    split_train_val_test,
)
from src.fp1_nb.eda_utils import (
    analyze_text_length_stats,
    plot_text_length_distributions,
    analyze_brand_distribution,
    plot_brand_distribution,
    analyze_word_frequencies,
)
from src.fp1_nb.preprocessing import (
    clean_text,
    create_text_features,
    build_tfidf_pipeline,
)
from src.fp1_nb.modeling import (
    create_search_object,
    tune_with_logging,
    get_best_params_summary,
    compare_models,
    get_best_model,
    evaluate_model,
    compare_val_test_performance,
)

# Fixed seed, reused below for sampling, splitting and model construction
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Notebook-wide display configuration.
# NOTE: the filter below silences every warning category for the whole session,
# including any convergence warnings emitted during model fitting.
warnings.filterwarnings('ignore')
for option_name, option_value in (
    ('display.max_columns', 50),
    ('display.max_colwidth', 100),
):
    pd.set_option(option_name, option_value)
plt.style.use('seaborn-v0_8-whitegrid')

print("Setup complete!")

## 1. Data Loading & Exploration

In [None]:
# Load the FP (false positive) training data from <project_root>/data/.
# `load_jsonl_data` is a project helper; the cells below use its result like a
# pandas DataFrame (head/info/isnull, column indexing).
DATA_PATH = project_root / 'data' / 'fp_training_data.jsonl'
df = load_jsonl_data(DATA_PATH)

In [None]:
# Preview the first rows as a quick sanity check of the loaded columns
df.head()

In [None]:
# Print the column / dtype summary reported by `df.info()`
df.info()

In [None]:
# Report the number of null entries in every column
print("Missing values:")
null_counts = df.isna().sum()
print(null_counts)

In [None]:
# Count how many records each value of the 'source' column contributes
print("\nSource distribution:")
source_counts = df['source'].value_counts()
print(source_counts)

## 2. Target Variable Analysis

In [None]:
# Define target column and labels.
# The rest of the notebook treats target value 0 as "not sportswear" (a false
# positive) and 1 as "sportswear"; LABEL_NAMES is presumably indexed by that
# class value.
TARGET_COL = 'is_sportswear'
LABEL_NAMES = ['Not Sportswear (FP)', 'Sportswear']

# Analyze target distribution via the project helper.
# NOTE(review): the meaning of `imbalance_threshold` and the handling of
# `save_path` are defined in src.fp1_nb.data_utils; the path is relative, so the
# figure presumably lands in an `images/` folder under the working directory —
# confirm that the folder exists or is created by the helper.
target_stats = analyze_target_stats(
    df, 
    TARGET_COL, 
    label_names=LABEL_NAMES,
    imbalance_threshold=5.0,
    save_path='images/fp_target_distribution.png'
)

## 3. Text Analysis & EDA

### 3.1 Text Length Analysis

In [None]:
# Analyze text lengths of the 'content' column. The target column is passed to
# the helper as well, presumably to break the statistics down by class (see
# src.fp1_nb.eda_utils).
text_stats = analyze_text_length_stats(df, 'content', TARGET_COL)

In [None]:
# Plot text length distributions by class.
# The label mapping is keyed by target value (0 = not sportswear, 1 = sportswear).
plot_text_length_distributions(
    df, 'content', TARGET_COL,
    label_names={0: 'Not Sportswear', 1: 'Sportswear'},
    save_path='images/fp_text_length_dist.png'
)

### 3.2 Brand Distribution

In [None]:
# Analyze brand distribution from the 'brands' column against the target.
# `top_n=15` presumably limits the report to the 15 most frequent brands —
# confirm in src.fp1_nb.eda_utils.
brand_counts = analyze_brand_distribution(df, 'brands', TARGET_COL, top_n=15)

In [None]:
# Plot brand distribution by class.
# The label mapping is keyed by target value (0 = not sportswear, 1 = sportswear).
# Note that the plot uses top_n=12 while the tabular analysis uses top_n=15.
plot_brand_distribution(
    df, 'brands', TARGET_COL,
    label_names={0: 'Not Sportswear', 1: 'Sportswear'},
    top_n=12,
    figsize=(14, 5),
    save_path='images/fp_brand_dist.png'
)

### 3.3 Word Frequency Analysis

In [None]:
# Analyze word frequencies of the 'content' column by class.
# `top_n=15` presumably caps the number of words reported per class — confirm in
# src.fp1_nb.eda_utils.
word_freqs = analyze_word_frequencies(df, 'content', TARGET_COL, top_n=15)

In [None]:
# Look at some false positive examples.
# Rows whose target is 0 are the false positives: articles that mention a brand
# name but are not about the sportswear company.
print("\n" + "="*60)
print("SAMPLE FALSE POSITIVE ARTICLES")
print("="*60)

fp_rows = df[df[TARGET_COL] == 0]
# Cap the sample size so the cell still runs when fewer than 3 FP rows exist
# (DataFrame.sample raises ValueError when n exceeds the population size).
fp_samples = fp_rows.sample(min(3, len(fp_rows)), random_state=RANDOM_STATE)
for _, row in fp_samples.iterrows():
    # A missing title is NaN/None and cannot be sliced; show it as empty instead.
    title = row['title'] if isinstance(row['title'], str) else ''
    print(f"\nBrands: {row['brands']}")
    print(f"Title: {title[:100]}...")
    # `fp_reason` is optional: print it only when the column exists and is filled.
    if 'fp_reason' in row and pd.notna(row['fp_reason']):
        print(f"Reason: {row['fp_reason']}")
    print("-" * 40)

## 4. Data Preprocessing

In [None]:
# Create combined text features.
# The helper receives the content, title and brands columns plus `clean_text` as
# the cleaning function; how the parts are combined (e.g. any title weighting)
# is defined in src.fp1_nb.preprocessing, not in this cell.
df['text_features'] = create_text_features(
    df,
    text_col='content',
    title_col='title',
    brands_col='brands',
    clean_func=clean_text
)

print("Text features created!")
# Spot check: first 500 characters of the first record's combined text
print(f"Sample:\n{df['text_features'].iloc[0][:500]}...")

In [None]:
# Check for empty text features.
# A record counts as empty when its combined text is missing (NaN/None) or
# contains nothing but whitespace; a plain length check would let both through
# to the vectorizer.
is_empty_text = df['text_features'].fillna('').str.strip().eq('')
empty_texts = int(is_empty_text.sum())
print(f"Records with empty text features: {empty_texts}")

# Remove if any
if empty_texts > 0:
    # .copy() gives an independent frame, so later column assignments do not
    # trigger pandas' chained-assignment warning.
    df = df[~is_empty_text].copy()
    print(f"Remaining records: {len(df)}")

## 5. Train/Validation/Test Split

In [None]:
# Split with stratification.
# 60% train / 20% validation / 20% test. The target column is handed to the
# project helper (presumably as the stratification key) together with the
# notebook seed — see src.fp1_nb.data_utils for the exact procedure.
train_df, val_df, test_df = split_train_val_test(
    df,
    target_col=TARGET_COL,
    train_ratio=0.6,
    val_ratio=0.2,
    test_ratio=0.2,
    random_state=RANDOM_STATE
)

In [None]:
# Pull the model input (combined text) and the label out of each split
X_train, y_train = train_df['text_features'], train_df[TARGET_COL]
X_val, y_val = val_df['text_features'], val_df[TARGET_COL]
X_test, y_test = test_df['text_features'], test_df[TARGET_COL]

# Report the number of records in each split
for split_label, split_features in (
    ('X_train', X_train),
    ('X_val', X_val),
    ('X_test', X_test),
):
    print(f"{split_label} shape: {len(split_features)}")

## 6. Baseline Models

In [None]:
# Build TF-IDF vectorizer (fit on training data only).
# The settings are handed to the project helper `build_tfidf_pipeline`; see
# src.fp1_nb.preprocessing for how they are applied.
tfidf_pipeline = build_tfidf_pipeline(
    max_features=10000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95
)

# Fit on the training split, then only *transform* validation and test with the
# already-fitted pipeline so those splits do not influence the learned features.
X_train_tfidf = tfidf_pipeline.fit_transform(X_train)
X_val_tfidf = tfidf_pipeline.transform(X_val)
X_test_tfidf = tfidf_pipeline.transform(X_test)

# Number of columns in the training matrix = number of TF-IDF features kept
print(f"TF-IDF features: {X_train_tfidf.shape[1]}")

In [None]:
# Baseline candidates, keyed by display name. Insertion order is the order in
# which they are later trained and reported.
# Every estimator that accepts them gets balanced class weights and the
# notebook seed; MultinomialNB takes neither.
balanced_seeded = {'class_weight': 'balanced', 'random_state': RANDOM_STATE}

baseline_models = {}
baseline_models['Logistic Regression'] = LogisticRegression(
    max_iter=1000, **balanced_seeded
)
baseline_models['Naive Bayes'] = MultinomialNB()
# LinearSVC has no predict_proba; wrapping it in a 3-fold calibrator adds one
# (presumably so the evaluation helper can compute ROC-AUC / PR-AUC).
linear_svm = LinearSVC(max_iter=2000, **balanced_seeded)
baseline_models['Linear SVM'] = CalibratedClassifierCV(linear_svm, cv=3)
baseline_models['Random Forest'] = RandomForestClassifier(
    n_estimators=100, n_jobs=-1, **balanced_seeded
)

In [None]:
# Fit every baseline on the training matrix and score it on the validation
# matrix, collecting one metrics result per model.
baseline_results = []

for model_label, estimator in baseline_models.items():
    print(f"\nTraining {model_label}...")
    estimator.fit(X_train_tfidf, y_train)

    # Validation-set metrics for this model (printed by the helper, no plots)
    validation_scores = evaluate_model(
        estimator, X_val_tfidf, y_val,
        model_name=model_label,
        dataset_name='Validation',
        verbose=True,
        plot=False,
    )
    baseline_results.append(validation_scores)

In [None]:
# Compare baseline models side by side using the validation metrics collected
# in `baseline_results`; the helper is also given a path for the comparison figure.
baseline_comparison = compare_models(
    baseline_results,
    metrics_to_display=['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'pr_auc'],
    title='Baseline Model Comparison (Validation Set)',
    save_path='images/fp_baseline_comparison.png'
)

## 7. Hyperparameter Tuning

In [None]:
# Cross-validation strategy: 5 shuffled, stratified folds with a fixed seed,
# shared by all three hyperparameter searches below
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

### 7.1 Logistic Regression Tuning

In [None]:
# Logistic Regression parameter grid (4 x 2 x 1 x 2 = 16 combinations).
# 'saga' is the only solver listed because it supports both l1 and l2 penalties.
lr_param_grid = {
    'C': [0.01, 0.1, 1.0, 10.0],
    'penalty': ['l1', 'l2'],
    'solver': ['saga'],
    'class_weight': ['balanced', None],
}

# Build the search through the project helper.
# NOTE(review): `refit='average_precision'` presumably makes PR-AUC the metric
# used to pick and refit the best estimator — confirm in src.fp1_nb.modeling.
lr_search = create_search_object(
    search_type='grid',
    estimator=LogisticRegression(max_iter=2000, random_state=RANDOM_STATE),
    param_grid=lr_param_grid,
    cv=cv,
    refit='average_precision'
)

# Run the search on the training matrix; the helper returns the fitted search
# plus two further values (presumably log and CSV outputs — TODO confirm).
# NOTE(review): X_train_tfidf was vectorized with a TF-IDF pipeline fitted on the
# whole training split, so each CV validation fold has already influenced the
# vocabulary/IDF weights. Searching over a Pipeline(vectorizer + classifier) on
# the raw text would remove this mild leakage.
# NOTE(review): warnings are globally suppressed in the setup cell, so any saga
# convergence warnings will not be visible here.
lr_search, lr_log, lr_csv = tune_with_logging(
    lr_search, X_train_tfidf, y_train,
    model_name='logistic_regression'
)

In [None]:
# Get best LR parameters from the fitted search via the project helper
lr_summary = get_best_params_summary(lr_search, 'Logistic Regression')

### 7.2 Random Forest Tuning

In [None]:
# Random Forest parameter grid (2 x 3 x 2 x 2 x 2 = 48 combinations)
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'class_weight': ['balanced', 'balanced_subsample'],
}

# Build the search through the project helper, with the same CV folds and refit
# metric as the other searches. The forest itself uses all cores (n_jobs=-1).
rf_search = create_search_object(
    search_type='grid',
    estimator=RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1),
    param_grid=rf_param_grid,
    cv=cv,
    refit='average_precision'
)

# Run the search on the training matrix; the helper returns the fitted search
# plus two further values (presumably log and CSV outputs — TODO confirm).
rf_search, rf_log, rf_csv = tune_with_logging(
    rf_search, X_train_tfidf, y_train,
    model_name='random_forest'
)

In [None]:
# Get best RF parameters from the fitted search via the project helper
rf_summary = get_best_params_summary(rf_search, 'Random Forest')

### 7.3 Gradient Boosting Tuning

In [None]:
# Gradient Boosting parameter grid (smaller for speed):
# two values for each of five parameters = 32 combinations
gb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5],
    'subsample': [0.8, 1.0],
}

# Build the search through the project helper, with the same CV folds and refit
# metric as the other searches. No class weighting is configured for this model.
gb_search = create_search_object(
    search_type='grid',
    estimator=GradientBoostingClassifier(random_state=RANDOM_STATE),
    param_grid=gb_param_grid,
    cv=cv,
    refit='average_precision'
)

# Run the search on the training matrix; the helper returns the fitted search
# plus two further values (presumably log and CSV outputs — TODO confirm).
gb_search, gb_log, gb_csv = tune_with_logging(
    gb_search, X_train_tfidf, y_train,
    model_name='gradient_boosting'
)

In [None]:
# Get best GB parameters from the fitted search via the project helper
gb_summary = get_best_params_summary(gb_search, 'Gradient Boosting')

### 7.4 Compare Tuned Models

In [None]:
# Best estimator from each search, keyed by the name used in the comparison
tuned_models = {
    'LR_tuned': lr_search.best_estimator_,
    'RF_tuned': rf_search.best_estimator_,
    'GB_tuned': gb_search.best_estimator_,
}

# Score every tuned model on the validation matrix (quietly, without plots)
tuned_results = [
    evaluate_model(
        tuned_estimator, X_val_tfidf, y_val,
        model_name=tuned_label,
        dataset_name='Validation',
        verbose=False,
        plot=False,
    )
    for tuned_label, tuned_estimator in tuned_models.items()
]

In [None]:
# Compare tuned models side by side using the validation metrics collected in
# `tuned_results`; the result feeds the model selection step that follows.
tuned_comparison = compare_models(
    tuned_results,
    metrics_to_display=['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'pr_auc'],
    title='Tuned Model Comparison (Validation Set)',
    save_path='images/fp_tuned_comparison.png'
)

## 8. Model Selection & Final Evaluation

In [None]:
# Select best model based on PR-AUC (important for imbalanced data).
# The helper returns a (name, metrics) pair picked from the tuned-model
# comparison; the name is used below as a key into `tuned_models`.
best_model_name, best_model_metrics = get_best_model(tuned_comparison, 'pr_auc')

In [None]:
# Get the best model.
# `best_model_name` must be one of the `tuned_models` keys (e.g. 'LR_tuned');
# otherwise this lookup raises KeyError.
best_model = tuned_models[best_model_name]

# Final evaluation on validation set (with plots)
val_metrics = evaluate_model(
    best_model, X_val_tfidf, y_val,
    model_name=best_model_name,
    dataset_name='Validation',
    verbose=True,
    plot=True,
    save_path='images/fp_best_model_validation.png'
)

In [None]:
# Final evaluation on TEST set.
# This is the only place the test matrix is scored: model selection above relied
# on cross-validation and the validation split.
test_metrics = evaluate_model(
    best_model, X_test_tfidf, y_test,
    model_name=best_model_name,
    dataset_name='Test',
    verbose=True,
    plot=True,
    save_path='images/fp_best_model_test.png'
)

In [None]:
# Compare validation vs test performance of the selected model; the helper
# receives both metric results (see src.fp1_nb.modeling for what it reports).
generalization_check = compare_val_test_performance(val_metrics, test_metrics)

## 9. Conclusions

In [None]:
# Print a fixed-width summary of the selected model's test-set metrics
banner = "=" * 60

# (display label, key in test_metrics), in the order they are printed
summary_rows = (
    ("Accuracy", "accuracy"),
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1 Score", "f1"),
    ("ROC-AUC", "roc_auc"),
    ("PR-AUC", "pr_auc"),
)

print(banner)
print("FINAL RESULTS SUMMARY")
print(banner)
print(f"\nBest Model: {best_model_name}")
print("\nTest Set Performance:")
for metric_label, metric_key in summary_rows:
    # "<label>:" is left-aligned in an 11-character field so the values line up
    print(f"  {metric_label + ':':<11}{test_metrics[metric_key]:.4f}")
print(banner)

In [None]:
# Persist the selected classifier and the fitted TF-IDF pipeline side by side
# under <project_root>/models so both can be loaded together at inference time
import joblib

models_dir = project_root / 'models'
models_dir.mkdir(exist_ok=True)

model_path = models_dir / 'fp_classifier_model.joblib'
tfidf_path = models_dir / 'fp_classifier_tfidf.joblib'

# Write the model first, then the vectorizer, reporting each destination
for artifact, destination, description in (
    (best_model, model_path, "Model"),
    (tfidf_pipeline, tfidf_path, "TF-IDF vectorizer"),
):
    joblib.dump(artifact, destination)
    print(f"{description} saved to {destination}")

### Next Steps

1. **Deploy Model**: Integrate the classifier into the labeling pipeline to filter false positives before Claude labeling
2. **Monitor Performance**: Track precision/recall on new data to detect drift
3. **Retrain Periodically**: Update model as more labeled data becomes available
4. **Consider Deep Learning**: If more data is collected, try transformer-based models (DistilBERT)