# False Positive Brand Classifier - Model Selection & Tuning

This notebook performs model selection and hyperparameter tuning for the FP classifier. It loads the feature transformer created in fp1_EDA_FE.ipynb and focuses on finding the optimal model.

## Objective
Select the best classification model and tune hyperparameters to maximize F2 score (recall-weighted).

## Contents
1. [Setup](#setup)
2. [Data Loading & Split](#1-data-loading--split)
3. [Feature Transformation](#2-feature-transformation)
4. [Baseline Model Comparison](#3-baseline-model-comparison)
5. [Hyperparameter Tuning](#4-hyperparameter-tuning)
6. [Overfitting Analysis](#5-overfitting-analysis)
7. [Final Model & Threshold Selection](#6-final-model--threshold-selection)
8. [Export for Deployment](#7-export-for-deployment)

## Setup

In [None]:
# Standard imports
import json
import sys
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sp
import seaborn as sns

# Sklearn imports
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    fbeta_score, make_scorer, precision_recall_curve,
    classification_report, confusion_matrix
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Add project root to path
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

# Project imports
from src.fp1_nb.data_utils import load_jsonl_data, split_train_val_test
from src.fp1_nb.preprocessing import clean_text, create_text_features
from src.fp1_nb.feature_transformer import FPFeatureTransformer
from src.fp1_nb.modeling import (
    create_search_object,
    tune_with_logging,
    get_best_params_summary,
    compare_models,
    evaluate_model,
    compare_val_test_performance,
)

# Experiment constants: seed, label column and number of CV folds
RANDOM_STATE = 42
TARGET_COL = 'is_sportswear'
N_FOLDS = 3
np.random.seed(RANDOM_STATE)

# Input data and artifact directories, all anchored at the project root
DATA_PATH = project_root / 'data' / 'fp_training_data.jsonl'
MODELS_DIR = project_root / 'models'
IMAGES_DIR = project_root / 'images'

# Notebook-wide behaviour: silence warnings, widen DataFrame display, set plot style.
# NOTE(review): the blanket 'ignore' filter also hides any warnings raised by the
# estimators fitted later in the notebook.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)
plt.style.use('seaborn-v0_8-whitegrid')

print("Setup complete!")

## 1. Data Loading & Split

Load data and apply identical preprocessing and split as fp1 to ensure consistency.

In [None]:
# Read the raw FP training records from DATA_PATH
df = load_jsonl_data(DATA_PATH)

# Build the combined text column from content, title and brands, cleaning with
# `clean_text`. (Presumably the same arguments fp1 uses — verify against fp1.)
text_feature_kwargs = dict(
    text_col='content',
    title_col='title',
    brands_col='brands',
    clean_func=clean_text,
)
df['text_features'] = create_text_features(df, **text_feature_kwargs)

print(f"\nText features created!")
# Show only the first 300 characters of the first row as a spot check
first_sample = df['text_features'].iloc[0]
print(f"Sample:\n{first_sample[:300]}...")

In [None]:
# Stratified 60/20/20 train/val/test split with a fixed seed.
# NOTE(review): the split is only identical to fp1's if fp1 passes the same
# ratios and random_state to the same helper — confirm against fp1.
split_ratios = {'train_ratio': 0.6, 'val_ratio': 0.2, 'test_ratio': 0.2}
train_df, val_df, test_df = split_train_val_test(
    df,
    target_col=TARGET_COL,
    random_state=RANDOM_STATE,
    **split_ratios,
)

## 2. Feature Transformation

Load the fitted feature transformer from fp1 and transform all splits.

In [None]:
# Restore the already-fitted feature transformer saved under MODELS_DIR.
# joblib.load unpickles the file, so it must only be pointed at trusted artifacts.
transformer_path = MODELS_DIR / 'fp_feature_transformer.joblib'
transformer = joblib.load(transformer_path)

# Read the JSON config stored next to the transformer
config_path = MODELS_DIR / 'fp_feature_config.json'
transformer_config = json.loads(config_path.read_text())

print(f"Loaded transformer: {transformer}")
print(f"\nTransformer config:")
for setting_name in transformer_config:
    print(f"  {setting_name}: {transformer_config[setting_name]}")

In [None]:
# Transform all splits using the fitted transformer (transform only; nothing is
# re-fitted in this cell)
X_train = transformer.transform(train_df['text_features'])
X_val = transformer.transform(val_df['text_features'])
X_test = transformer.transform(test_df['text_features'])

# Extract targets
y_train = train_df[TARGET_COL].values
y_val = val_df[TARGET_COL].values
y_test = test_df[TARGET_COL].values

# Combine train+val for hyperparameter tuning. The stacking routine is chosen to
# match the feature container: scipy sparse matrix vs. dense ndarray.
# (`sp` is scipy.sparse, imported with the other dependencies in the setup cell.)
stack_rows = sp.vstack if sp.issparse(X_train) else np.vstack
X_trainval = stack_rows([X_train, X_val])
y_trainval = np.concatenate([y_train, y_val])

# Fail fast if features and labels ever get out of step
for split_name, split_X, split_y in [
    ('train', X_train, y_train),
    ('val', X_val, y_val),
    ('test', X_test, y_test),
    ('trainval', X_trainval, y_trainval),
]:
    assert split_X.shape[0] == len(split_y), (
        f"{split_name}: {split_X.shape[0]} feature rows vs {len(split_y)} labels"
    )

print(f"Feature dimensionality: {X_train.shape[1]}")
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"X_trainval shape: {X_trainval.shape} (for hyperparameter tuning)")

## 3. Baseline Model Comparison

Train and evaluate multiple classifiers on the transformed features to identify the best performing models for hyperparameter tuning.

In [None]:
# F-beta scorer with beta=2, i.e. recall counts more than precision
f2_scorer = make_scorer(fbeta_score, beta=2)

# Shuffled, stratified, seeded K-fold splitter shared by all searches below
cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
print(f"Using {N_FOLDS}-fold stratified CV")

In [None]:
# Keyword arguments shared by every baseline that supports seeding and class weighting
balanced_seeded = dict(random_state=RANDOM_STATE, class_weight='balanced')

# Candidate baselines, keyed by display name (insertion order is the training order)
baseline_models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, **balanced_seeded),
    'Naive Bayes': MultinomialNB(),
    # LinearSVC is wrapped in CalibratedClassifierCV (3 internal folds)
    'Linear SVM': CalibratedClassifierCV(
        LinearSVC(max_iter=2000, **balanced_seeded),
        cv=3,
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        n_jobs=-1,
        **balanced_seeded,
    ),
}

In [ ]:
# Fit each baseline on the training split and score it on the validation split
baseline_results = []

for model_label in baseline_models:
    estimator = baseline_models[model_label]
    print(f"\nTraining {model_label}...")
    estimator.fit(X_train, y_train)

    # Metrics returned by the project helper for the validation split
    val_metrics = evaluate_model(
        estimator, X_val, y_val,
        model_name=model_label,
        dataset_name='Validation',
        verbose=True,
        plot=False
    )

    # F2 is computed here and added to the helper's metrics under the 'f2' key
    val_predictions = estimator.predict(X_val)
    val_metrics['f2'] = fbeta_score(y_val, val_predictions, beta=2)
    print(f"  F2 Score:  {val_metrics['f2']:.4f} (recall-weighted)")

    baseline_results.append(val_metrics)

In [None]:
# Compare baseline models (with F2 as primary metric).
# The figure is written to the configured IMAGES_DIR (project_root / 'images')
# instead of a path relative to the notebook's working directory, which is one
# level below project_root and may have no 'images' folder.
IMAGES_DIR.mkdir(parents=True, exist_ok=True)
baseline_comparison = compare_models(
    baseline_results,
    metrics_to_display=['f2', 'recall', 'precision', 'f1', 'accuracy', 'pr_auc'],
    title='Baseline Model Comparison (Validation Set)',
    save_path=str(IMAGES_DIR / 'fp_baseline_comparison.png')
)

## 4. Hyperparameter Tuning

Tune the top-performing baseline models using cross-validation on train+val combined (80% of data). The test set remains completely held out for final evaluation.

### 4.1 Logistic Regression Tuning

In [None]:
# Grid for Logistic Regression: regularisation strength, penalty type and class
# weighting; only the 'saga' solver is searched
lr_param_grid = dict(
    C=[0.01, 0.1, 1.0, 10.0],
    penalty=['l1', 'l2'],
    solver=['saga'],
    class_weight=['balanced', None],
)

lr_estimator = LogisticRegression(max_iter=2000, random_state=RANDOM_STATE)

# Grid search over the shared CV splitter, refit on the 'f2' metric
lr_search = create_search_object(
    search_type='grid',
    estimator=lr_estimator,
    param_grid=lr_param_grid,
    cv=cv,
    refit='f2',
)

# Run the search on train+val; the helper also returns two logging artifacts
lr_search, lr_log, lr_csv = tune_with_logging(
    lr_search,
    X_trainval,
    y_trainval,
    model_name='logistic_regression',
)

### 4.2 Random Forest Tuning

In [None]:
# Grid for Random Forest: forest size, tree depth, split/leaf minimums and the
# two balanced class-weight modes
rf_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    class_weight=['balanced', 'balanced_subsample'],
)

rf_estimator = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)

# Grid search over the shared CV splitter, refit on the 'f2' metric
rf_search = create_search_object(
    search_type='grid',
    estimator=rf_estimator,
    param_grid=rf_param_grid,
    cv=cv,
    refit='f2',
)

# Run the search on train+val; the helper also returns two logging artifacts
rf_search, rf_log, rf_csv = tune_with_logging(
    rf_search,
    X_trainval,
    y_trainval,
    model_name='random_forest',
)

### 4.3 HistGradientBoosting Tuning

In [None]:
# Grid for HistGradientBoosting; class_weight is fixed to 'balanced' for every
# candidate.
# NOTE(review): X_trainval may be a scipy sparse matrix (the stacking cell handles
# both cases) — confirm HistGradientBoostingClassifier accepts that input type in
# the installed scikit-learn version.
hgb_param_grid = dict(
    max_iter=[100, 200],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 5, None],
    min_samples_leaf=[5, 20],
    l2_regularization=[0.0, 0.1],
    class_weight=['balanced'],
)

hgb_estimator = HistGradientBoostingClassifier(random_state=RANDOM_STATE)

# Grid search over the shared CV splitter, refit on the 'f2' metric
hgb_search = create_search_object(
    search_type='grid',
    estimator=hgb_estimator,
    param_grid=hgb_param_grid,
    cv=cv,
    refit='f2',
)

# Run the search on train+val; the helper also returns two logging artifacts
hgb_search, hgb_log, hgb_csv = tune_with_logging(
    hgb_search,
    X_trainval,
    y_trainval,
    model_name='hist_gradient_boosting',
)

### 4.4 Compare Tuned Models (CV Performance)

Compare models based on their cross-validation F2 scores. Model selection is based on CV performance, not test set performance.

In [None]:
# Fitted search objects, keyed by the label used in tables and plots
tuned_models = {
    'LR_tuned': lr_search,
    'RF_tuned': rf_search,
    'HGB_tuned': hgb_search,
}

# Output metric name -> column of cv_results_ holding its mean CV score
cv_metric_columns = {
    'f2': 'mean_test_f2',
    'recall': 'mean_test_recall',
    'precision': 'mean_test_precision',
    'f1': 'mean_test_f1',
    'accuracy': 'mean_test_accuracy',
    'pr_auc': 'mean_test_average_precision',
}

# One row per model, read at the index of its best parameter combination
cv_comparison_data = []
for name, search in tuned_models.items():
    winner = search.best_index_
    fold_scores = search.cv_results_

    row = {'model_name': name}
    for metric_name, column in cv_metric_columns.items():
        row[metric_name] = fold_scores[column][winner]
    cv_comparison_data.append(row)

    f2_spread = fold_scores['std_test_f2'][winner]
    print(f"{name}: CV F2 = {row['f2']:.4f} (+/- {f2_spread:.4f}), Recall = {row['recall']:.4f}, Precision = {row['precision']:.4f}")

In [None]:
# Compare tuned models (F2 as primary metric).
# The figure is written to the configured IMAGES_DIR (project_root / 'images')
# instead of a path relative to the notebook's working directory, which is one
# level below project_root and may have no 'images' folder.
IMAGES_DIR.mkdir(parents=True, exist_ok=True)
tuned_comparison = compare_models(
    cv_comparison_data,
    metrics_to_display=['f2', 'recall', 'precision', 'f1', 'accuracy', 'pr_auc'],
    title='Tuned Model Comparison (CV Performance, Optimized for F2)',
    save_path=str(IMAGES_DIR / 'fp_tuned_comparison.png')
)

## 5. Overfitting Analysis

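Analyze the gap between CV performance and test performance to assess model generalization. This comparison is diagnostic only: the final model is still selected by CV F2 (Section 6), not by these test scores.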

In [None]:
# Score every tuned model on the test split and set it beside its CV scores
print("=" * 70)
print("OVERFITTING ANALYSIS: CV vs Test Performance")
print("=" * 70)

overfitting_data = []

for name, search in tuned_models.items():
    tuned_estimator = search.best_estimator_
    winner = search.best_index_
    fold_scores = search.cv_results_

    # Mean (and, for F2, std) CV scores of the best parameter combination
    cv_f2 = fold_scores['mean_test_f2'][winner]
    cv_f2_std = fold_scores['std_test_f2'][winner]
    cv_recall = fold_scores['mean_test_recall'][winner]
    cv_precision = fold_scores['mean_test_precision'][winner]

    # Test-split scores; recall and precision are computed by hand from the labels
    test_pred = tuned_estimator.predict(X_test)
    test_f2 = fbeta_score(y_test, test_pred, beta=2)
    test_recall = np.mean(test_pred[y_test == 1] == 1)
    if test_pred.sum() > 0:
        test_precision = np.mean(y_test[test_pred == 1] == 1)
    else:
        # No positive predictions: precision is reported as 0
        test_precision = 0

    # Negative gap means the model did worse on test than in CV
    f2_gap = test_f2 - cv_f2
    recall_gap = test_recall - cv_recall
    precision_gap = test_precision - cv_precision
    # A test F2 more than 0.05 below CV F2 is flagged
    verdict = '⚠️ OVERFITTING' if f2_gap < -0.05 else '✓ OK'

    print(f"\n{name}:")
    print(f"  CV F2:       {cv_f2:.4f} (+/- {cv_f2_std:.4f})")
    print(f"  Test F2:     {test_f2:.4f}")
    print(f"  Gap:         {f2_gap:+.4f} {verdict}")
    print(f"  CV Recall:   {cv_recall:.4f} | Test Recall:   {test_recall:.4f} | Gap: {recall_gap:+.4f}")
    print(f"  CV Precision:{cv_precision:.4f} | Test Precision:{test_precision:.4f} | Gap: {precision_gap:+.4f}")

    overfitting_data.append(dict(
        model=name,
        cv_f2=cv_f2,
        cv_f2_std=cv_f2_std,
        test_f2=test_f2,
        f2_gap=f2_gap,
        cv_recall=cv_recall,
        test_recall=test_recall,
        cv_precision=cv_precision,
        test_precision=test_precision,
    ))

print("\n" + "=" * 70)

In [None]:
# Visualize CV vs Test performance
overfitting_df = pd.DataFrame(overfitting_data)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: CV and test F2 side by side, with the CV std as an error bar
ax1 = axes[0]
x = np.arange(len(overfitting_df))
width = 0.35
ax1.bar(x - width/2, overfitting_df['cv_f2'], width, label='CV F2', color='steelblue')
ax1.bar(x + width/2, overfitting_df['test_f2'], width, label='Test F2', color='coral')
ax1.errorbar(x - width/2, overfitting_df['cv_f2'], yerr=overfitting_df['cv_f2_std'], fmt='none', color='black', capsize=3)
ax1.set_xlabel('Model')
ax1.set_ylabel('F2 Score')
ax1.set_title('CV vs Test F2 Score (Overfitting Check)')
ax1.set_xticks(x)
ax1.set_xticklabels(overfitting_df['model'])

# Keep the zoomed-in 0.9-1.0 view when every bar fits inside it; otherwise lower
# the floor so that no bar drops below the visible range and disappears
lowest_f2 = overfitting_df[['cv_f2', 'test_f2']].min().min()
y_floor = 0.9 if lowest_f2 > 0.9 else max(0.0, lowest_f2 - 0.05)
ax1.set_ylim([y_floor, 1.0])
ax1.axhline(y=0.95, color='gray', linestyle='--', alpha=0.5, label='Threshold')
# legend() must come after the reference line, or its label is left out
ax1.legend()

# Right panel: F2 gap per model, red when test F2 is more than 0.05 below CV F2
ax2 = axes[1]
colors = ['green' if gap >= -0.05 else 'red' for gap in overfitting_df['f2_gap']]
ax2.bar(overfitting_df['model'], overfitting_df['f2_gap'], color=colors)
ax2.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
ax2.axhline(y=-0.05, color='red', linestyle='--', alpha=0.5, label='Overfitting threshold')
ax2.set_xlabel('Model')
ax2.set_ylabel('F2 Gap (Test - CV)')
ax2.set_title('F2 Gap Analysis')
ax2.legend()

plt.tight_layout()
# Save under the configured IMAGES_DIR rather than a working-directory-relative path
IMAGES_DIR.mkdir(parents=True, exist_ok=True)
fig.savefig(IMAGES_DIR / 'fp_overfitting_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Final Model & Threshold Selection

Select the best model based on CV F2 score and evaluate on the held-out test set.

In [ ]:
# Select best model based on CV F2.
# `get_best_model` is neither imported nor defined in this notebook, so the call
# raised NameError on a fresh kernel. The winner is taken directly from the
# per-model CV summaries instead: each entry of `cv_comparison_data` is a dict
# with 'model_name', 'f2', 'recall', 'precision', ... and 'model_name' matches
# the keys of `tuned_models`. On a tie, the first model listed wins.
best_model_metrics = max(cv_comparison_data, key=lambda row: row['f2'])
best_model_name = best_model_metrics['model_name']
print(f"Selected model: {best_model_name}")
print(f"CV F2: {best_model_metrics['f2']:.4f} (primary metric)")
print(f"CV Recall: {best_model_metrics['recall']:.4f}")
print(f"CV Precision: {best_model_metrics['precision']:.4f}")

In [None]:
# Look up the search object of the selected model and take its refitted estimator
best_search = tuned_models[best_model_name]
best_model = best_search.best_estimator_

# Evaluate the selected model on the held-out test set. The plot is written to the
# configured IMAGES_DIR (project_root / 'images') rather than a path relative to
# the notebook's working directory.
IMAGES_DIR.mkdir(parents=True, exist_ok=True)
test_metrics = evaluate_model(
    best_model, X_test, y_test,
    model_name=best_model_name,
    dataset_name='Test',
    verbose=True,
    plot=True,
    save_path=str(IMAGES_DIR / 'fp_best_model_test.png')
)

# F2 is computed here and added to the helper's metrics under the 'f2' key
y_pred_test = best_model.predict(X_test)
test_metrics['f2'] = fbeta_score(y_test, y_pred_test, beta=2)
print(f"  F2 Score:  {test_metrics['f2']:.4f} (recall-weighted)")

### 6.1 Threshold Tuning

Adjust the decision threshold to optimize recall at the cost of precision.

In [None]:
# Analyze precision-recall trade-off at different thresholds.
#
# Thresholds must not be chosen on the test set, otherwise the test metrics
# reported for the chosen threshold are optimistically biased. The PR curve is
# therefore built from out-of-fold probabilities on train+val: cross_val_predict
# clones `best_model`, so each row is scored by a copy that was not trained on it
# and the fitted `best_model` itself is left untouched.
oof_proba = cross_val_predict(
    best_model, X_trainval, y_trainval,
    cv=cv, method='predict_proba'
)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_trainval, oof_proba)

# Test-set probabilities are kept for held-out evaluation of a chosen threshold
y_proba = best_model.predict_proba(X_test)[:, 1]

# Find thresholds for different target recall levels
target_recalls = [0.95, 0.97, 0.98, 0.99]

print("=" * 70)
print("THRESHOLD ANALYSIS: Recall vs Precision Trade-off")
print("(computed on out-of-fold train+val predictions)")
print("=" * 70)
print("\nTarget Recall | Threshold | Actual Recall | Precision | FPs Passed")
print("-" * 70)

for target in target_recalls:
    idx = np.where(recalls >= target)[0]
    if len(idx) > 0:
        # recalls falls as the threshold rises, so the last qualifying index is
        # the highest threshold that still reaches the target recall
        best_idx = idx[-1]
        # recalls has one more entry than thresholds; guard the final position
        threshold = thresholds[best_idx] if best_idx < len(thresholds) else 0.0
        actual_recall = recalls[best_idx]
        precision = precisions[best_idx]

        # Negatives that would still be passed on at this threshold
        y_pred_custom = (oof_proba >= threshold).astype(int)
        fp_count = ((y_pred_custom == 1) & (y_trainval == 0)).sum()
        total_fp = (y_trainval == 0).sum()

        print(f"    {target:.0%}      |   {threshold:.3f}   |    {actual_recall:.1%}      |   {precision:.1%}    |  {fp_count}/{total_fp}")

print("-" * 70)
print("\nLower threshold = Higher recall but more FPs pass to LLM")
print("=" * 70)

In [ ]:
# Plot precision-recall curve with threshold markers
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(recalls, precisions, 'b-', linewidth=2, label='Precision-Recall curve')

# Mark the highest threshold that still reaches each target recall
for target in [0.95, 0.98]:
    idx = np.where(recalls >= target)[0]
    if len(idx) > 0:
        best_idx = idx[-1]
        ax.scatter(recalls[best_idx], precisions[best_idx], s=100, zorder=5,
                  label=f'Recall >= {target:.0%} (threshold={thresholds[best_idx]:.2f})')

# Mark the curve point whose threshold is closest to the default 0.5
default_idx = np.argmin(np.abs(thresholds - 0.5))
ax.scatter(recalls[default_idx], precisions[default_idx], s=100, c='red', marker='x', zorder=5,
          label=f'Default (threshold=0.5)')

ax.set_xlabel('Recall', fontsize=12)
ax.set_ylabel('Precision', fontsize=12)
ax.set_title('Precision-Recall Trade-off: Choose Threshold to Maximize Recall', fontsize=14)
ax.legend(loc='lower left')
ax.grid(True, alpha=0.3)
# Zoom into the high-recall region of the curve
ax.set_xlim([0.8, 1.01])
ax.set_ylim([0.5, 1.01])

plt.tight_layout()
# Save under the configured IMAGES_DIR rather than a working-directory-relative path
IMAGES_DIR.mkdir(parents=True, exist_ok=True)
fig.savefig(IMAGES_DIR / 'fp_threshold_tradeoff.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Set optimal threshold for deployment (targeting 98% recall).
# `recalls` / `thresholds` come from the precision-recall curve computed in the
# threshold-analysis cell; the last index with recall >= target is the highest
# threshold that still meets the target. Falls back to 0.5 if none qualifies.
TARGET_RECALL = 0.98
idx = np.where(recalls >= TARGET_RECALL)[0]
OPTIMAL_THRESHOLD = thresholds[idx[-1]] if len(idx) > 0 else 0.5

print("=" * 60)
print("OPTIMAL THRESHOLD FOR DEPLOYMENT")
print("=" * 60)
print(f"\nTarget recall: {TARGET_RECALL:.0%}")
print(f"Optimal threshold: {OPTIMAL_THRESHOLD:.4f}")

# Evaluate with optimal threshold on the test-set probabilities
y_pred_optimal = (y_proba >= OPTIMAL_THRESHOLD).astype(int)
optimal_recall = (y_pred_optimal[y_test == 1] == 1).mean()
# Guard against an empty selection (mean of an empty array is NaN), matching the
# precision guard used in the overfitting analysis
if y_pred_optimal.sum() > 0:
    optimal_precision = (y_test[y_pred_optimal == 1] == 1).mean()
else:
    optimal_precision = 0.0
fp_passed = ((y_pred_optimal == 1) & (y_test == 0)).sum()
fn_missed = ((y_pred_optimal == 0) & (y_test == 1)).sum()

print(f"\nWith optimal threshold on test set:")
print(f"  Recall:    {optimal_recall:.4f} ({fn_missed} sportswear articles missed)")
print(f"  Precision: {optimal_precision:.4f} ({fp_passed} false positives passed to LLM)")
print("=" * 60)

## 7. Export for Deployment

Save the complete pipeline (transformer + classifier) for Docker API deployment.

In [None]:
# Chain the fitted transformer and the selected classifier into one estimator
deployment_steps = [
    ('features', transformer),
    ('classifier', best_model),
]
full_pipeline = Pipeline(deployment_steps)

# Persist the pipeline under MODELS_DIR
pipeline_path = MODELS_DIR / 'fp_classifier_pipeline.joblib'
joblib.dump(full_pipeline, pipeline_path)
print(f"Complete pipeline saved to {pipeline_path}")

# Deployment settings and headline scores, written next to the pipeline as JSON
classifier_config = dict(
    threshold=float(OPTIMAL_THRESHOLD),
    target_recall=TARGET_RECALL,
    transformer_method=transformer.method,
    classifier_type=type(best_model).__name__,
    cv_f2=float(best_model_metrics['f2']),
    test_f2=float(test_metrics['f2']),
    best_params=best_search.best_params_,
)

config_path = MODELS_DIR / 'fp_classifier_config.json'
with open(config_path, 'w') as config_file:
    json.dump(classifier_config, config_file, indent=2)
print(f"Configuration saved to {config_path}")

In [None]:
# Smoke-test the assembled pipeline on two hand-written sentences.
# NOTE(review): these raw strings go straight into the pipeline, whereas the
# training text was built with create_text_features / clean_text — confirm the
# serving path applies the same preprocessing.
print("Testing complete pipeline...")
test_texts = [
    "Nike announces new sustainability initiative to reduce carbon emissions",
    "Puma the wild cat was spotted in the mountains of Montana"
]

# Hard labels and positive-class probabilities
predictions = full_pipeline.predict(test_texts)
probabilities = full_pipeline.predict_proba(test_texts)[:, 1]

print("\nPipeline test results:")
for position, text in enumerate(test_texts):
    label = "Sportswear" if predictions[position] == 1 else "False Positive"
    print(f"  [{probabilities[position]:.4f}] {label}: {text[:60]}...")

In [ ]:
# Recap of the selected model, its CV and test scores, and the saved artifacts
banner = "=" * 60
print(banner)
print("FINAL RESULTS SUMMARY")
print(banner)
print(f"\nBest Model: {best_model_name}")
print(f"Feature Method: {transformer.method}")

# (label with its alignment padding, metric key) pairs, printed in order
cv_rows = (
    ("F2 Score:  ", 'f2'),
    ("Recall:    ", 'recall'),
    ("Precision: ", 'precision'),
)
test_rows = cv_rows + (
    ("F1 Score:  ", 'f1'),
    ("Accuracy:  ", 'accuracy'),
    ("PR-AUC:    ", 'pr_auc'),
)

print(f"\nCV Performance:")
for row_label, metric_key in cv_rows:
    print(f"  {row_label}{best_model_metrics[metric_key]:.4f}")

print(f"\nTest Set Performance:")
for row_label, metric_key in test_rows:
    print(f"  {row_label}{test_metrics[metric_key]:.4f}")

print(f"\nDeployment Threshold: {OPTIMAL_THRESHOLD:.4f}")
print(f"\nSaved Artifacts:")
for artifact_path in (pipeline_path, config_path):
    print(f"  - {artifact_path}")
print(banner)

## Next Steps

1. **Deploy Model**: Integrate `fp_classifier_pipeline.joblib` into Docker API service
2. **Monitor Performance**: Track F2/recall on new data to detect drift
3. **Retrain Periodically**: Update model as more labeled data becomes available
4. **Alternative FE Methods**: If performance degrades, explore sentence-transformers or doc2vec in fp1