# Production Model Training

This notebook trains the final Random Forest model on the complete dataset and saves the production artifacts:
- Random Forest model (best configuration from study)
- Imputer model for handling missing values
- Feature preprocessing pipeline

In [2]:
from pathlib import Path
import pickle
import optuna

import polars as pl
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

_ = pl.Config.set_tbl_cols(None)
_ = pl.Config.set_fmt_str_lengths(500)
_ = pl.Config.set_fmt_float("full")

In [3]:
# Setup paths
base_dir = Path('/Users/danlab/code/magenta-task/')
code_dir = base_dir / 'notebooks'
data_dir = code_dir / "data"
features_dir = data_dir / 'features'
train_dir = data_dir / 'train'
test_dir = data_dir / 'test'
artifacts_dir = data_dir / "models/artifacts"
db_dir = 'sqlite:///data/models/{}.db'

# Ensure artifacts directory exists
artifacts_dir.mkdir(parents=True, exist_ok=True)

print(f"Base directory: {base_dir}")
print(f"Artifacts directory: {artifacts_dir}")

Base directory: /Users/danlab/code/magenta-task
Artifacts directory: /Users/danlab/code/magenta-task/notebooks/data/models/artifacts


## Load Complete Dataset

In [5]:
cleaned_features_dataset = pl.read_parquet(features_dir / 'features_cleaned-v0.parquet')

print(f"Full dataset shape: {cleaned_features_dataset.shape}")

Full dataset shape: (100000, 83)


## Prepare Data for Training

In [6]:
# Prepare features and target
X_full = cleaned_features_dataset.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
y_full = cleaned_features_dataset.select('has_done_upselling')

## Load Random Forest Configuration

In [7]:
# Load Random Forest study (base F1 optimization)
print("Loading Random Forest study...")
rf_study = optuna.load_study(
    study_name="random_forest_optimization_basef1", 
    storage=db_dir.format('rf_study')
)

# Get best parameters
best_rf_params = rf_study.best_params
print(f"Best Random Forest parameters:")
for param, value in best_rf_params.items():
    print(f"  {param}: {value}")

print(f"Best trial value (F1): {rf_study.best_value}")

Loading Random Forest study...
Best Random Forest parameters:
  n_estimators: 864
  max_depth: 19
  min_samples_split: 19
  min_samples_leaf: 1
  min_weight_fraction_leaf: 0.015327799848051129
  max_features: sqrt
  max_samples: 0.2900931234940456
  max_leaf_nodes: 154
  class_weight: balanced
Best trial value (F1): 0.1669563556238734


## Train Final Production Model

In [8]:
# Convert to numpy for sklearn
X_np = X_full.to_numpy()
y_np = y_full.to_numpy().ravel()

In [9]:
# Create and train final Random Forest model
print("Training final Random Forest model on complete dataset...")

final_rf_model = RandomForestClassifier(
    **best_rf_params,
    random_state=42,  # Ensure reproducibility
    n_jobs=-1  # Use all available cores
)

# Train on full dataset
final_rf_model.fit(X_np, y_np)

print("Random Forest model training completed!")
print(f"Model parameters: {final_rf_model.get_params()}")

Training final Random Forest model on complete dataset...
Random Forest model training completed!
Model parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 19, 'max_features': 'sqrt', 'max_leaf_nodes': 154, 'max_samples': 0.2900931234940456, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 19, 'min_weight_fraction_leaf': 0.015327799848051129, 'monotonic_cst': None, 'n_estimators': 864, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}


## Feature Importance Analysis

In [10]:
feature_names = X_full.columns
feature_importance = final_rf_model.feature_importances_

In [13]:
# Get feature importance from final model
feature_names = X_full.columns
feature_importance = final_rf_model.feature_importances_

# Create importance DataFrame
importance_df = pl.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort('importance', descending=True)

print("Top 15 Most Important Features:")
print("=" * 50)
print(importance_df.head(15))

# Save feature importance
importance_path = artifacts_dir / 'feature_importance.csv'
importance_df.write_csv(importance_path)
print(f"\nFeature importance saved to: {importance_path}")

Top 15 Most Important Features:
shape: (15, 2)
┌────────────────────────┬──────────────────────┐
│ feature                ┆ importance           │
│ ---                    ┆ ---                  │
│ str                    ┆ f64                  │
╞════════════════════════╪══════════════════════╡
│ age                    ┆ 0.1459544648628623   │
│ available_gb           ┆ 0.10211186347688483  │
│ contract_lifetime_days ┆ 0.06337544629952309  │
│ gross_mrc              ┆ 0.04327485505187612  │
│ remaining_binding_days ┆ 0.03896570749604652  │
│ …                      ┆ …                    │
│ usage_std_gb           ┆ 0.013830440502246218 │
│ last_3_delta_1mo       ┆ 0.013781929854207292 │
│ last_1_delta_2mo       ┆ 0.013034944873259227 │
│ last_1_2mo_rolling_avg ┆ 0.012983024967219087 │
│ avg_monthly_usage_gb   ┆ 0.012603837542984537 │
└────────────────────────┴──────────────────────┘

Feature importance saved to: /Users/danlab/code/magenta-task/notebooks/data/models/artifacts/feature_i

## Save Production Artifacts

In [14]:
# Save only the trained Random Forest model
model_path = artifacts_dir / 'initial_model.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(final_rf_model, f)

print(f"Random Forest model saved to: {model_path}")


Random Forest model saved to: /Users/danlab/code/magenta-task/notebooks/data/models/artifacts/initial_model.pkl


## Model Loading Test

In [19]:
# Test loading the saved model
print("Testing model loading...")

# Load production model
with open(model_path, 'rb') as f:
    loaded_model = pickle.load(f)

# Test prediction on a small sample
test_sample = X_np[:5]
test_predictions = loaded_model.predict_proba(test_sample)[:, 1]
test_binary = (test_predictions > 0.5).astype(int)  # Default threshold 0.5

print("\nModel Loading Test Results:")
print(f"Loaded model type: {type(loaded_model)}")
print(f"Test predictions (probabilities): {test_predictions}")
print(f"Test predictions (binary): {test_binary}")

Testing model loading...

Model Loading Test Results:
Loaded model type: <class 'sklearn.ensemble._forest.RandomForestClassifier'>
Test predictions (probabilities): [0.51371808 0.54590508 0.48215524 0.51721858 0.3941271 ]
Test predictions (binary): [1 1 0 1 0]
