
# EkoPower Churn Prediction

This notebook builds a churn prediction model for EkoPower SME customers, and mirrors a pragmatic selection process:

1. **Load & Clean Data** (`client_data.csv`, `price_data.csv`)  
2. **Feature Engineering** (dates → tenure, days to renewal; pricing aggregates)  
3. **Multicollinearity Check (VIF)**  
4. **Train/Test Split + Scaling**  
5. **Benchmark 3 Models**: RandomForest, LightGBM, XGBoost (time + classification report)  
6. **Threshold Sweep** for top-2 by recall  
7. **Choose model to optimize** (near-best recall, fastest training)  
8. **Optuna Tuning** (maximize recall for churners) + **MLflow logging**  


## 1) Imports

In [266]:
# Data handling
import pandas as pd
import numpy as np

# Ignore warnings (keeps output clean)
import warnings
warnings.filterwarnings('ignore')

# Timing how long things take
import time

# Plotting (for visuals later)
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning tools
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score


# Three powerful ML models
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Check for redundant features
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [267]:
# Load client info and pricing history
client_df = pd.read_csv("..\data\client_data.csv")
price_df = pd.read_csv("..\data\price_data.csv")

In [268]:
# Convert date columns to real dates
date_columns = ['date_activ', 'date_end', 'date_modif_prod', 'date_renewal']
for col in date_columns:
    client_df[col] = pd.to_datetime(client_df[col], errors='coerce')

In [135]:
client_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14606 entries, 0 to 14605
Data columns (total 25 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   channel_sales                   14606 non-null  object 
 1   cons_12m                        14606 non-null  float64
 2   cons_gas_12m                    14606 non-null  float64
 3   cons_last_month                 14606 non-null  float64
 4   forecast_cons_12m               14606 non-null  float64
 5   forecast_cons_year              14606 non-null  float64
 6   forecast_discount_energy        14606 non-null  float64
 7   forecast_meter_rent_12m         14606 non-null  float64
 8   forecast_price_energy_off_peak  14606 non-null  float64
 9   forecast_price_energy_peak      14606 non-null  float64
 10  forecast_price_pow_off_peak     14606 non-null  float64
 11  has_gas                         14606 non-null  object 
 12  imp_cons                        

In [227]:
client_df.isnull().sum()

id                                0
channel_sales                     0
cons_12m                          0
cons_gas_12m                      0
cons_last_month                   0
date_activ                        0
date_end                          0
date_modif_prod                   0
date_renewal                      0
forecast_cons_12m                 0
forecast_cons_year                0
forecast_discount_energy          0
forecast_meter_rent_12m           0
forecast_price_energy_off_peak    0
forecast_price_energy_peak        0
forecast_price_pow_off_peak       0
has_gas                           0
imp_cons                          0
margin_gross_pow_ele              0
margin_net_pow_ele                0
nb_prod_act                       0
net_margin                        0
num_years_antig                   0
origin_up                         0
pow_max                           0
churn                             0
dtype: int64

### Featuring Engineering

In [228]:
# What's today?
today = pd.Timestamp('today').normalize()

# How long has the client been with us? (in days)
client_df['tenure_days'] = (today - client_df['date_activ']).dt.days

# How many days until their contract renews? (negative = past due)
client_df['days_to_renewal'] = (client_df['date_renewal'] - today).dt.days

# Is their contract still active?
client_df['contract_active'] = (client_df['date_end'] > today).astype(int)

# Total contracted duration (in years, as integer)
client_df['contracted_tenure_years'] = (
    (client_df['date_end'] - client_df['date_activ']).dt.days / 365.25
).round().astype('Int64')  # Use nullable integer to handle NaT

In [229]:
price_vars = [
    'price_off_peak_var', 'price_peak_var', 'price_mid_peak_var',
    'price_off_peak_fix', 'price_peak_fix', 'price_mid_peak_fix'
]

price_agg_base = price_df.groupby('id')[price_vars].agg(['mean', 'std'])
price_agg_base.columns = ['_'.join(col) for col in price_agg_base.columns]
price_agg_base = price_agg_base.reset_index()

In [190]:
price_agg_base.head()

Unnamed: 0,id,price_off_peak_var_mean,price_off_peak_var_std,price_peak_var_mean,price_peak_var_std,price_mid_peak_var_mean,price_mid_peak_var_std,price_off_peak_fix_mean,price_off_peak_fix_std,price_peak_fix_mean,price_peak_fix_std,price_mid_peak_fix_mean,price_mid_peak_fix_std
0,0002203ffbb812588b632b9e628cc38d,0.124338,0.003976,0.103794,0.001989,0.07316,0.001368,40.701732,0.06341481,24.421038,0.038049,16.280694,0.025366
1,0004351ebdd665e6ee664792efc4fd13,0.146426,0.002197,0.0,0.0,0.0,0.0,44.38545,0.08753223,0.0,0.0,0.0,0.0
2,0010bcc39e42b3c2131ed2ce55246e3c,0.181558,0.026008,0.0,0.0,0.0,0.0,45.31971,0.772393,0.0,0.0,0.0,0.0
3,0010ee3855fdea87602a5b7aba8e42de,0.118757,0.005049,0.098292,0.00258,0.069032,0.000403,40.647427,0.08507958,24.388455,0.051048,16.258971,0.034032
4,00114d74e963e47177db89bc70108537,0.147926,0.002202,0.0,0.0,0.0,0.0,44.26693,5.908392e-07,0.0,0.0,0.0,0.0


In [230]:
# Merge into client data
df = pd.merge(client_df, price_agg_base, on='id', how='left')

In [231]:
print(df.columns.tolist())


['id', 'channel_sales', 'cons_12m', 'cons_gas_12m', 'cons_last_month', 'date_activ', 'date_end', 'date_modif_prod', 'date_renewal', 'forecast_cons_12m', 'forecast_cons_year', 'forecast_discount_energy', 'forecast_meter_rent_12m', 'forecast_price_energy_off_peak', 'forecast_price_energy_peak', 'forecast_price_pow_off_peak', 'has_gas', 'imp_cons', 'margin_gross_pow_ele', 'margin_net_pow_ele', 'nb_prod_act', 'net_margin', 'num_years_antig', 'origin_up', 'pow_max', 'churn', 'tenure_days', 'days_to_renewal', 'contract_active', 'contracted_tenure_years', 'price_off_peak_var_mean', 'price_off_peak_var_std', 'price_peak_var_mean', 'price_peak_var_std', 'price_mid_peak_var_mean', 'price_mid_peak_var_std', 'price_off_peak_fix_mean', 'price_off_peak_fix_std', 'price_peak_fix_mean', 'price_peak_fix_std', 'price_mid_peak_fix_mean', 'price_mid_peak_fix_std']


### Encoding

In [232]:
df['has_gas'] = df['has_gas'].map({'t': 1, 'f': 0})

categorical_columns = ['channel_sales', 'origin_up']

# Keep only those that actually exist in the dataset
categorical_columns = [col for col in categorical_columns if col in df.columns]
print("Categorical columns:", categorical_columns)

for col in categorical_columns:
    df[col] = df[col].astype('category')


Categorical columns: ['channel_sales', 'origin_up']


In [194]:
## Check how many unique values each column has

for col in categorical_columns:
    print(f"\nValue counts for {col}:")
    print(df[col].value_counts(dropna=False))


Value counts for channel_sales:
channel_sales
foosdfpfkusacimwkcsosbicdxkicaua    6754
MISSING                             3725
lmkebamcaaclubfxadlmueccxoimlema    1843
usilxuppasemubllopkaafesmlibmsdf    1375
ewpakwlliwisiwduibdlfmalxowmwpci     893
sddiedcslfslkckwlfkdpoeeailfpeds      11
epumfxlbckeskwekxbiuasklxalciiuu       3
fixdbufsefwooaasfcxdxadsiekoceaa       2
Name: count, dtype: int64

Value counts for origin_up:
origin_up
lxidpiddsbxsbosboudacockeimpuepw    7097
kamkkxfxxuwbdslkwifmmcsiusiuosws    4294
ldkssxwpmemidmecebumciepifcamkci    3148
MISSING                               64
usapbepcfoloekilkwsdiboslwaxobdp       2
ewxeelcelemmiwuafmddpobolfuxioce       1
Name: count, dtype: int64


We have 8 categories, so we will create 8 dummy variables from this column. However, as you can see the last 3 categories in the output above, show that they only have 11, 3 and 2 occurrences respectively. Considering that our dataset has about 14000 rows, this means that these dummy variables will be almost entirely 0 and so will not add much predictive power to the model at all (since they're almost entirely a constant value and provide very little).

For this reason, we will drop these 3 dummy variables.

Similar to `channel_sales` the last 3 categories in the output above show very low frequency, so we will remove these from the features after creating dummy variables.

In [233]:
df = pd.get_dummies(df, columns=['origin_up'], prefix='origin_up')
df = df.drop(columns=['origin_up_MISSING', 'origin_up_usapbepcfoloekilkwsdiboslwaxobdp', 'origin_up_ewxeelcelemmiwuafmddpobolfuxioce'])
df.head()

Unnamed: 0,id,channel_sales,cons_12m,cons_gas_12m,cons_last_month,date_activ,date_end,date_modif_prod,date_renewal,forecast_cons_12m,...,price_mid_peak_var_std,price_off_peak_fix_mean,price_off_peak_fix_std,price_peak_fix_mean,price_peak_fix_std,price_mid_peak_fix_mean,price_mid_peak_fix_std,origin_up_kamkkxfxxuwbdslkwifmmcsiusiuosws,origin_up_ldkssxwpmemidmecebumciepifcamkci,origin_up_lxidpiddsbxsbosboudacockeimpuepw
0,24011ae4ebbe3035111d65fa7c15bc57,foosdfpfkusacimwkcsosbicdxkicaua,0,54946,0,2013-06-15,2016-06-15,2015-11-01,2015-06-23,0.0,...,0.020983,40.942265,1.050136,22.35201,7.039226,14.90134,4.692817,False,False,True
1,d29c2c54acc38ff3c0614d0a653813dd,MISSING,4660,0,0,2009-08-21,2016-08-30,2009-08-21,2015-08-31,189.95,...,0.0,44.311375,0.080404,0.0,0.0,0.0,0.0,True,False,False
2,764c75f661154dac3a6c254cd082ea7d,foosdfpfkusacimwkcsosbicdxkicaua,544,0,0,2010-04-16,2016-04-16,2010-04-16,2015-04-17,47.96,...,0.0,44.38545,0.087532,0.0,0.0,0.0,0.0,True,False,False
3,bba03439a292a1e166f80264c16191cb,lmkebamcaaclubfxadlmueccxoimlema,1584,0,0,2010-03-30,2016-03-30,2010-03-30,2015-03-31,240.04,...,0.0,44.400265,0.080403,0.0,0.0,0.0,0.0,True,False,False
4,149d57cf92fc41cf94415803a877cb4b,MISSING,4425,0,526,2010-01-13,2016-03-07,2010-01-13,2015-03-09,445.75,...,0.001588,40.688156,0.073681,24.412893,0.044209,16.275263,0.029473,True,False,False


In [234]:
df = pd.get_dummies(df, columns=['channel_sales'], prefix='channel')
df = df.drop(columns=['channel_sddiedcslfslkckwlfkdpoeeailfpeds', 'channel_epumfxlbckeskwekxbiuasklxalciiuu', 'channel_fixdbufsefwooaasfcxdxadsiekoceaa'])
df.head()

Unnamed: 0,id,cons_12m,cons_gas_12m,cons_last_month,date_activ,date_end,date_modif_prod,date_renewal,forecast_cons_12m,forecast_cons_year,...,price_mid_peak_fix_mean,price_mid_peak_fix_std,origin_up_kamkkxfxxuwbdslkwifmmcsiusiuosws,origin_up_ldkssxwpmemidmecebumciepifcamkci,origin_up_lxidpiddsbxsbosboudacockeimpuepw,channel_MISSING,channel_ewpakwlliwisiwduibdlfmalxowmwpci,channel_foosdfpfkusacimwkcsosbicdxkicaua,channel_lmkebamcaaclubfxadlmueccxoimlema,channel_usilxuppasemubllopkaafesmlibmsdf
0,24011ae4ebbe3035111d65fa7c15bc57,0,54946,0,2013-06-15,2016-06-15,2015-11-01,2015-06-23,0.0,0,...,14.90134,4.692817,False,False,True,False,False,True,False,False
1,d29c2c54acc38ff3c0614d0a653813dd,4660,0,0,2009-08-21,2016-08-30,2009-08-21,2015-08-31,189.95,0,...,0.0,0.0,True,False,False,True,False,False,False,False
2,764c75f661154dac3a6c254cd082ea7d,544,0,0,2010-04-16,2016-04-16,2010-04-16,2015-04-17,47.96,0,...,0.0,0.0,True,False,False,False,False,True,False,False
3,bba03439a292a1e166f80264c16191cb,1584,0,0,2010-03-30,2016-03-30,2010-03-30,2015-03-31,240.04,0,...,0.0,0.0,True,False,False,False,False,False,True,False
4,149d57cf92fc41cf94415803a877cb4b,4425,0,526,2010-01-13,2016-03-07,2010-01-13,2015-03-09,445.75,526,...,16.275263,0.029473,True,False,False,True,False,False,False,False


### Churn

We have to ensure that churn is not float or boolean.

In [235]:
df['churn'] = pd.to_numeric(df['churn'], errors='coerce').fillna(0).astype('int64')

In [236]:
# Drop ID and raw date columns (they can cause data leakage or confusion)
cols_to_drop = ['id', 'date_activ', 'date_end', 'date_modif_prod', 'date_renewal']
cols_to_drop = [col for col in cols_to_drop if col in df.columns]
df.drop(columns=cols_to_drop, inplace=True)

In [237]:
# Fill missing numbers with the median (middle value) of each column
df = df.fillna(df.median(numeric_only=True))

In [238]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14606 entries, 0 to 14605
Data columns (total 43 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   cons_12m                                    14606 non-null  int64  
 1   cons_gas_12m                                14606 non-null  int64  
 2   cons_last_month                             14606 non-null  int64  
 3   forecast_cons_12m                           14606 non-null  float64
 4   forecast_cons_year                          14606 non-null  int64  
 5   forecast_discount_energy                    14606 non-null  float64
 6   forecast_meter_rent_12m                     14606 non-null  float64
 7   forecast_price_energy_off_peak              14606 non-null  float64
 8   forecast_price_energy_peak                  14606 non-null  float64
 9   forecast_price_pow_off_peak                 14606 non-null  float64
 10  has_gas   

In [239]:
df.isna().sum()

cons_12m                                      0
cons_gas_12m                                  0
cons_last_month                               0
forecast_cons_12m                             0
forecast_cons_year                            0
forecast_discount_energy                      0
forecast_meter_rent_12m                       0
forecast_price_energy_off_peak                0
forecast_price_energy_peak                    0
forecast_price_pow_off_peak                   0
has_gas                                       0
imp_cons                                      0
margin_gross_pow_ele                          0
margin_net_pow_ele                            0
nb_prod_act                                   0
net_margin                                    0
num_years_antig                               0
pow_max                                       0
churn                                         0
tenure_days                                   0
days_to_renewal                         

In [240]:
# X = everything except 'churn'
# y = the 'churn' column (what we want to predict)
X = df.drop('churn', axis=1)
y = df['churn']


In [241]:
# converts int64, bool, Int64 → float64
X = X.astype('float64')

### Multicollinearity (VIF)

In [252]:
print("\nComputing VIF (variance inflation factor)...")

X_vif = X.copy()

# Convert all columns to numeric, force to float64
X_vif = X_vif.apply(lambda col: pd.to_numeric(col, errors='coerce')).astype('float64')

# Fill any NaNs (critical — VIF can't handle NaN)
X_vif = X_vif.fillna(X_vif.median())

# Double-check: no inf or extreme values
X_vif = X_vif.replace([np.inf, -np.inf], np.nan)
X_vif = X_vif.fillna(X_vif.median())

# compute VIF
try:
    vif_data = pd.DataFrame({
        'feature': X_vif.columns,
        'VIF': [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]
    }).sort_values('VIF', ascending=False)
    
    print("\nTop 20 VIF scores (highest first):\n", vif_data.head(20))
    
except Exception as e:
    print("❌ VIF failed. Inspecting data types:")
    print(X_vif.dtypes.value_counts())
    print("\nSample of problematic columns:")
    for col in X_vif.columns:
        if not np.issubdtype(X_vif[col].dtype, np.number):
            print(f" - {col}: {X_vif[col].dtype}")
    raise e


Computing VIF (variance inflation factor)...

Top 20 VIF scores (highest first):
                                        feature           VIF
12                        margin_gross_pow_ele  14491.968408
13                          margin_net_pow_ele  14483.617145
18                                 tenure_days   9001.280113
19                             days_to_renewal   3149.531245
28                     price_off_peak_fix_mean   2167.665560
9                  forecast_price_pow_off_peak   1858.670995
22                     price_off_peak_var_mean   1470.628298
7               forecast_price_energy_off_peak   1108.080158
21                     contracted_tenure_years   1067.655100
39    channel_foosdfpfkusacimwkcsosbicdxkicaua    358.079568
26                     price_mid_peak_var_mean    266.148899
30                         price_peak_fix_mean    265.392933
24                         price_peak_var_mean    256.219775
16                             num_years_antig    225.527355
8 

### Train/Test Split + Scaling

In [254]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # Only transform test data

THRESHOLD = 0.3  # lower than 0.5 to boost recall (see next to choose the right value)

### Random Forest

In [255]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf = RandomForestClassifier(
    n_estimators=300,
    class_weight='balanced',   # handles imbalance for you
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

proba = rf.predict_proba(X_test)[:, 1]
y_pred = (proba >= THRESHOLD).astype(int)

print(classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.916     0.986     0.950      2638
           1      0.554     0.162     0.251       284

    accuracy                          0.906      2922
   macro avg      0.735     0.574     0.600      2922
weighted avg      0.881     0.906     0.882      2922



The Business Reality is that churn Is Naturally Imbalanced because churn is supposed to be rare.

Artificially balancing it changes the base rate — so predicted probabilities no longer reflect the true likelihood of churn.

For example, if you force a 50/50 balance, a predicted “0.7” churn probability doesn’t mean 70% chance anymore — it’s misleading.

The solution always is to: Train on imbalanced data, but adjust class weights, thresholds, or evaluation metrics.

In [256]:
from sklearn.metrics import precision_score, recall_score, f1_score

proba = rf.predict_proba(X_test)[:, 1]

print("Threshold tuning for RandomForest")

print(f"{'Thresh':<8}{'Prec_1':<8}{'Rec_1':<8}{'F1_1':<8}")
for thresh in [0.25, 0.30, 0.35, 0.40, 0.45, 0.50]:
    preds = (proba >= thresh).astype(int)
    prec = precision_score(y_test, preds, pos_label=1)
    rec = recall_score(y_test, preds, pos_label=1)
    f1 = f1_score(y_test, preds, pos_label=1)
    print(f"{thresh:<8}{prec:<8.3f}{rec:<8.3f}{f1:<8.3f}")

Threshold tuning for RandomForest
Thresh  Prec_1  Rec_1   F1_1    
0.25    0.456   0.236   0.311   
0.3     0.554   0.162   0.251   
0.35    0.673   0.123   0.208   
0.4     0.750   0.106   0.185   
0.45    0.867   0.092   0.166   
0.5     0.880   0.077   0.142   


#### XGBoost

In [270]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import time

# Calculate scale_pos_weight for imbalance
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

xgb = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss'
)

# Training timer
start_train = time.time()
xgb.fit(X_train, y_train)
train_time = time.time() - start_train
print(f"⏱ Training time: {train_time:.2f} seconds")

# Prediction timer
start_pred = time.time()
proba = xgb.predict_proba(X_test)[:, 1]
y_pred = (proba >= THRESHOLD).astype(int)
pred_time = time.time() - start_pred
print(f"⏱ Prediction time: {pred_time:.4f} seconds")

# Classification report
print(classification_report(y_test, y_pred, digits=3))

⏱ Training time: 1.88 seconds
⏱ Prediction time: 0.0190 seconds
              precision    recall  f1-score   support

           0      0.940     0.798     0.863      2638
           1      0.218     0.525     0.308       284

    accuracy                          0.771      2922
   macro avg      0.579     0.661     0.586      2922
weighted avg      0.870     0.771     0.809      2922



#### Hyperparameter tuning

In [262]:
import optuna

# Objective function for Optuna
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 400, 800),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 5),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 5),
        "random_state": 42,
        "n_jobs": -1,
        "scale_pos_weight": (y_train == 0).sum() / (y_train == 1).sum(),
        "eval_metric": "logloss"
    }
    
    model = XGBClassifier(**params)
    model.fit(X_train, y_train)
    proba = model.predict_proba(X_test)[:, 1]
    y_pred = (proba >= THRESHOLD).astype(int)  # Keep your tuned threshold
    return recall_score(y_test, y_pred, pos_label=1)  # Optimize recall for churners

# Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=35)

print("Best Params:", study.best_params)
print("Best Recall:", study.best_value)

[I 2025-10-16 21:06:02,376] A new study created in memory with name: no-name-9f1cbff8-54ff-459e-a145-5960b37c128f
[I 2025-10-16 21:06:04,173] Trial 0 finished with value: 0.6408450704225352 and parameters: {'n_estimators': 659, 'learning_rate': 0.04351398259527294, 'max_depth': 9, 'subsample': 0.8130760142898295, 'colsample_bytree': 0.6691436511975818, 'min_child_weight': 4, 'gamma': 4.541481992372517, 'reg_alpha': 4.799697496324945, 'reg_lambda': 1.9439545644515759}. Best is trial 0 with value: 0.6408450704225352.
[I 2025-10-16 21:06:06,437] Trial 1 finished with value: 0.36971830985915494 and parameters: {'n_estimators': 710, 'learning_rate': 0.18392757758654427, 'max_depth': 6, 'subsample': 0.8854254690958993, 'colsample_bytree': 0.8481948366866683, 'min_child_weight': 7, 'gamma': 0.18525553283064045, 'reg_alpha': 1.1491013849670573, 'reg_lambda': 3.4190969611959243}. Best is trial 0 with value: 0.6408450704225352.
[I 2025-10-16 21:06:07,763] Trial 2 finished with value: 0.869718309

Best Params: {'n_estimators': 469, 'learning_rate': 0.010748928426480944, 'max_depth': 3, 'subsample': 0.5327214483471154, 'colsample_bytree': 0.9882366435945533, 'min_child_weight': 10, 'gamma': 1.3715232509864093, 'reg_alpha': 4.918072470609218, 'reg_lambda': 4.998153679890343}
Best Recall: 0.971830985915493


In [260]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import time

# Calculate scale_pos_weight for imbalance
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

# Add the scale_pos_weight and fixed params to the best ones from Optuna
best_params = study.best_params
best_params.update({
    "random_state": 42,
    "n_jobs": -1,
    "scale_pos_weight": scale_pos_weight,
    "eval_metric": "logloss"
})

# Create model from best params
xgb = XGBClassifier(**best_params)

# Training timer
start_train = time.time()
xgb.fit(X_train, y_train)
train_time = time.time() - start_train
print(f"⏱ Training time: {train_time:.2f} seconds")

# Prediction timer
start_pred = time.time()
proba = xgb.predict_proba(X_test)[:, 1]
y_pred = (proba >= THRESHOLD).astype(int)
pred_time = time.time() - start_pred
print(f"⏱ Prediction time: {pred_time:.4f} seconds")

# Classification report
print(classification_report(y_test, y_pred, digits=3))

⏱ Training time: 0.73 seconds
⏱ Prediction time: 0.0092 seconds
              precision    recall  f1-score   support

           0      0.982     0.083     0.154      2638
           1      0.104     0.986     0.188       284

    accuracy                          0.171      2922
   macro avg      0.543     0.535     0.171      2922
weighted avg      0.897     0.171     0.157      2922



In [269]:
import mlflow
import mlflow.sklearn  # or mlflow.xgboost
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, roc_auc_score
import time
import os

# Force MLflow to always use the project root's mlruns folder
mlruns_path = os.path.join(os.getcwd(), "mlruns").replace("\\", "/")
mlflow.set_tracking_uri(f"file:///{mlruns_path}")

experiment_name = "EkoPower Churn - XGBoost"
exp = mlflow.get_experiment_by_name(experiment_name)
experiment_id = exp.experiment_id if exp else mlflow.create_experiment(experiment_name)


with mlflow.start_run(run_name="xgb_notebook", experiment_id=experiment_id):
    # Calculate scale_pos_weight
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

    # Best params from Optuna
    best_params = study.best_params
    best_params.update({
        "random_state": 42,
        "n_jobs": -1,
        "scale_pos_weight": scale_pos_weight,
        "eval_metric": "logloss"
    })

    # Log parameters
    mlflow.log_params(best_params)

    # Training timer
    start_train = time.time()
    xgb = XGBClassifier(**best_params)
    xgb.fit(X_train, y_train)
    train_time = time.time() - start_train
    mlflow.log_metric("train_time", train_time)

    # Prediction
    start_pred = time.time()
    proba = xgb.predict_proba(X_test)[:, 1]
    y_pred = (proba >= THRESHOLD).astype(int)
    pred_time = time.time() - start_pred
    mlflow.log_metric("pred_time", pred_time)

    # Metrics
    precision = precision_score(y_test, y_pred, pos_label=1)
    recall = recall_score(y_test, y_pred, pos_label=1)
    f1 = f1_score(y_test, y_pred, pos_label=1)
    auc = roc_auc_score(y_test, proba)

    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1", f1)
    mlflow.log_metric("roc_auc", auc)

    # Save model
    mlflow.xgboost.log_model(xgb, "model")

    print(classification_report(y_test, y_pred, digits=3))



              precision    recall  f1-score   support

           0      0.970     0.099     0.180      2638
           1      0.104     0.972     0.188       284

    accuracy                          0.184      2922
   macro avg      0.537     0.536     0.184      2922
weighted avg      0.886     0.184     0.181      2922

