In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Build a reproducible synthetic customer table (swap in real data for production use).
np.random.seed(42)  # fixed seed so every run draws the same values
n_samples = 1000

# Keyword arguments are evaluated left to right, so the columns are drawn from
# the global NumPy RNG in exactly the order they are listed here.
data = dict(
    tenure=np.random.randint(1, 73, n_samples),  # integers 1..72 (upper bound is exclusive)
    monthly_mins=np.random.normal(400, 150, n_samples),
    monthly_sms=np.random.poisson(30, n_samples),
    data_usage_gb=np.random.gamma(5, 0.5, n_samples),
    complaints=np.random.randint(0, 6, n_samples),  # integers 0..5
    contract_type=np.random.choice(['monthly', 'annual', 'biennial'], n_samples),
    payment_method=np.random.choice(
        ['credit_card', 'bank_transfer', 'electronic_check', 'mailed_check'],
        n_samples,
    ),
    plan_type=np.random.choice(['basic', 'premium', 'deluxe'], n_samples),
    churn=np.random.binomial(1, 0.2, n_samples),  # Assume 20% churn rate
)

# One row per synthetic customer, one column per key of `data`.
df = pd.DataFrame(data=data)

# Data preprocessing
# Column groups: numeric columns are passed to the model unchanged,
# categorical columns are one-hot encoded.
numeric_features = ['tenure', 'monthly_mins', 'monthly_sms', 'data_usage_gb', 'complaints']
categorical_features = ['contract_type', 'payment_method', 'plan_type']

# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        # handle_unknown='ignore': a category that was not present when the
        # encoder was fitted is encoded as all zeros at transform/predict time
        # instead of raising, so a rare category missing from the training
        # fold cannot crash prediction.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Build model pipeline: preprocessing followed by a random forest, so that
# fit/predict on the pipeline apply the same column handling both times.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Split dataset: every column except 'churn' is a feature; 'churn' is the target.
X = df.drop(columns='churn')
y = df['churn']
# 80/20 train/test split with a fixed seed so the partition is reproducible.
# NOTE(review): the split is not stratified; with an imbalanced target,
# consider stratify=y (this would change the reported numbers).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model (fits the preprocessor and the classifier on the training fold only)
pipeline.fit(X_train, y_train)

# Evaluate model on the held-out fold
y_pred = pipeline.predict(X_test)
print("Classification Report:")
# zero_division=0: when a class is never predicted its precision is undefined;
# report it as 0 explicitly instead of emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))

print("Confusion Matrix:")
# labels=[0, 1] pins the matrix to 2x2 (rows = true class, columns = predicted
# class) even if one class happens to be absent from y_test and y_pred.
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))

# Get feature importances
# Pull the fitted steps out of the pipeline once, then work with them by name.
fitted_preprocessor = pipeline.named_steps['preprocessor']
fitted_encoder = fitted_preprocessor.named_transformers_['cat']
fitted_classifier = pipeline.named_steps['classifier']

# Column order after preprocessing: the passthrough numeric columns first,
# then the one-hot columns, matching the transformer order in the ColumnTransformer.
encoded_names = fitted_encoder.get_feature_names_out(categorical_features)
feature_names = numeric_features + list(encoded_names)

importances = fitted_classifier.feature_importances_

# Pair each feature with its importance and rank from most to least important.
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importance_df.head(10))

Classification Report:
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       167
           1       0.00      0.00      0.00        33

    accuracy                           0.83       200
   macro avg       0.42      0.50      0.46       200
weighted avg       0.70      0.83      0.76       200

Confusion Matrix:
[[167   0]
 [ 33   0]]

Feature Importances:
                            Feature  Importance
1                      monthly_mins    0.200306
3                     data_usage_gb    0.200022
0                            tenure    0.180193
2                       monthly_sms    0.150436
4                        complaints    0.082929
12                  plan_type_basic    0.020314
14                plan_type_premium    0.019862
13                 plan_type_deluxe    0.019319
10  payment_method_electronic_check    0.019101
8      payment_method_bank_transfer    0.018779


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
