In [1]:
#imports

import pandas as pd
from functools import reduce

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier 


In [61]:
#load data
client_date_cols = ['date_activ', 'date_end', 'date_modif_prod', 'date_renewal']
client_data = pd.read_csv("../data/raw/client_data.csv", parse_dates = client_date_cols)
price_data = pd.read_csv("../data/raw/price_data.csv", parse_dates = ['price_date'])

### Feature engineering

Add features of price differences between December and the preceding January for peak, mid-peak and off-peak prices. 

In [104]:
## off-peak
mon_price_by_com = price_data.groupby(['id', 'price_date']).agg({'price_off_peak_var': 'mean',
                                                                 'price_off_peak_fix': 'mean'}).reset_index()
jan_prices = mon_price_by_com.groupby('id').first().reset_index()
dec_prices = mon_price_by_com.groupby('id').last().reset_index()

jan_prices = jan_prices.rename(columns = {'price_off_peak_var': 'eng_jan', 'price_off_peak_fix': 'pow_jan'})
dec_prices = dec_prices.rename(columns = {'price_off_peak_var': 'eng_dec', 'price_off_peak_fix': 'pow_dec'})
diff_off = pd.merge(jan_prices, dec_prices.drop(columns = 'price_date'), on = 'id')
diff_off['offpeak_dec_jan_diff_eng'] = diff_off['eng_dec'] - diff_off['eng_jan']
diff_off['offpeak_dec_jan_diff_pow'] = diff_off['pow_dec'] - diff_off['pow_jan']
diff_off = diff_off[['id', 'offpeak_dec_jan_diff_eng', 'offpeak_dec_jan_diff_pow']]
diff_off.head()

Unnamed: 0,id,offpeak_dec_jan_diff_eng,offpeak_dec_jan_diff_pow
0,0002203ffbb812588b632b9e628cc38d,-0.006192,0.162916
1,0004351ebdd665e6ee664792efc4fd13,-0.004104,0.177779
2,0010bcc39e42b3c2131ed2ce55246e3c,0.050443,1.5
3,0010ee3855fdea87602a5b7aba8e42de,-0.010018,0.162916
4,00114d74e963e47177db89bc70108537,-0.003994,-1e-06


In [105]:
## mid

mon_price_by_com_mid = price_data.groupby(['id', 'price_date']).agg({'price_mid_peak_var': 'mean',
                                                                 'price_mid_peak_fix': 'mean'}).reset_index()

jan_prices = mon_price_by_com_mid.groupby('id').first().reset_index()
dec_prices = mon_price_by_com_mid.groupby('id').last().reset_index()

jan_prices = jan_prices.rename(columns = {'price_mid_peak_var': 'eng_jan', 'price_mid_peak_fix': 'pow_jan'})
dec_prices = dec_prices.rename(columns = {'price_mid_peak_var': 'eng_dec', 'price_mid_peak_fix': 'pow_dec'})
diff_mid = pd.merge(jan_prices, dec_prices.drop(columns = 'price_date'), on = 'id')
diff_mid['midpeak_dec_jan_diff_eng'] = diff_mid['eng_dec'] - diff_mid['eng_jan']
diff_mid['midpeak_dec_jan_diff_pow'] = diff_mid['pow_dec'] - diff_mid['pow_jan']
diff_mid = diff_mid[['id', 'midpeak_dec_jan_diff_eng', 'midpeak_dec_jan_diff_pow']]
diff_mid.head()

Unnamed: 0,id,midpeak_dec_jan_diff_eng,midpeak_dec_jan_diff_pow
0,0002203ffbb812588b632b9e628cc38d,0.003487,0.065166
1,0004351ebdd665e6ee664792efc4fd13,0.0,0.0
2,0010bcc39e42b3c2131ed2ce55246e3c,0.0,0.0
3,0010ee3855fdea87602a5b7aba8e42de,0.000763,0.065166
4,00114d74e963e47177db89bc70108537,0.0,0.0


In [106]:
## peak
mon_price_by_com_peak = price_data.groupby(['id', 'price_date']).agg({'price_peak_var': 'mean',
                                                                 'price_peak_fix': 'mean'}).reset_index()
jan_prices = mon_price_by_com_peak.groupby('id').first().reset_index()
dec_prices = mon_price_by_com_peak.groupby('id').last().reset_index()

jan_prices = jan_prices.rename(columns = {'price_peak_var': 'eng_jan', 'price_peak_fix': 'pow_jan'})
dec_prices = dec_prices.rename(columns = {'price_peak_var': 'eng_dec', 'price_peak_fix': 'pow_dec'})
diff_peak = pd.merge(jan_prices, dec_prices.drop(columns = 'price_date'), on = 'id')
diff_peak['peak_dec_jan_diff_eng'] = diff_peak['eng_dec'] - diff_peak['eng_jan']
diff_peak['peak_dec_jan_diff_pow'] = diff_peak['pow_dec'] - diff_peak['pow_jan']
diff_peak = diff_peak[['id', 'peak_dec_jan_diff_eng', 'peak_dec_jan_diff_pow']]
diff_peak.head()

Unnamed: 0,id,peak_dec_jan_diff_eng,peak_dec_jan_diff_pow
0,0002203ffbb812588b632b9e628cc38d,-0.002302,0.097749
1,0004351ebdd665e6ee664792efc4fd13,0.0,0.0
2,0010bcc39e42b3c2131ed2ce55246e3c,0.0,0.0
3,0010ee3855fdea87602a5b7aba8e42de,-0.00512,0.097749
4,00114d74e963e47177db89bc70108537,0.0,0.0


In [109]:
diff_dfs = [diff_off, diff_mid, diff_peak]
diff = reduce(lambda left, right: pd.merge(left, right, on='id'), diff_dfs)
diff.head()

Unnamed: 0,id,offpeak_dec_jan_diff_eng,offpeak_dec_jan_diff_pow,midpeak_dec_jan_diff_eng,midpeak_dec_jan_diff_pow,peak_dec_jan_diff_eng,peak_dec_jan_diff_pow
0,0002203ffbb812588b632b9e628cc38d,-0.006192,0.162916,0.003487,0.065166,-0.002302,0.097749
1,0004351ebdd665e6ee664792efc4fd13,-0.004104,0.177779,0.0,0.0,0.0,0.0
2,0010bcc39e42b3c2131ed2ce55246e3c,0.050443,1.5,0.0,0.0,0.0,0.0
3,0010ee3855fdea87602a5b7aba8e42de,-0.010018,0.162916,0.000763,0.065166,-0.00512,0.097749
4,00114d74e963e47177db89bc70108537,-0.003994,-1e-06,0.0,0.0,0.0,0.0


In [112]:
#merge data
join_data = pd.merge(diff, client_data, on = 'id')
join_data.head()

Unnamed: 0,id,offpeak_dec_jan_diff_eng,offpeak_dec_jan_diff_pow,midpeak_dec_jan_diff_eng,midpeak_dec_jan_diff_pow,peak_dec_jan_diff_eng,peak_dec_jan_diff_pow,channel_sales,cons_12m,cons_gas_12m,...,has_gas,imp_cons,margin_gross_pow_ele,margin_net_pow_ele,nb_prod_act,net_margin,num_years_antig,origin_up,pow_max,churn
0,0002203ffbb812588b632b9e628cc38d,-0.006192,0.162916,0.003487,0.065166,-0.002302,0.097749,foosdfpfkusacimwkcsosbicdxkicaua,22034,0,...,f,40.78,43.08,43.08,1,81.42,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,17.25,0
1,0004351ebdd665e6ee664792efc4fd13,-0.004104,0.177779,0.0,0.0,0.0,0.0,MISSING,4060,0,...,f,0.0,24.42,24.42,1,61.58,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.2,0
2,0010bcc39e42b3c2131ed2ce55246e3c,0.050443,1.5,0.0,0.0,0.0,0.0,usilxuppasemubllopkaafesmlibmsdf,7440,0,...,f,213.76,38.58,38.58,2,81.61,3,lxidpiddsbxsbosboudacockeimpuepw,13.856,0
3,00114d74e963e47177db89bc70108537,-0.003994,-1e-06,0.0,0.0,0.0,0.0,ewpakwlliwisiwduibdlfmalxowmwpci,11272,0,...,f,0.0,29.76,29.76,1,157.99,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.2,0
4,0013f326a839a2f6ad87a1859952d227,-0.006171,0.0,0.003371,0.0,-0.002351,0.0,foosdfpfkusacimwkcsosbicdxkicaua,267414,0,...,f,195.2,30.0,30.0,1,341.58,3,lxidpiddsbxsbosboudacockeimpuepw,20.0,0


### Modeling

In [113]:
#train-test split
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(join_data, test_size = 0.2, random_state = 42)
target = 'churn'
X_train, y_train = train_df.drop(columns = [target]), train_df[target]
X_test, y_test = test_df.drop(columns = [target]), train_df[target]

In [114]:
X_train.shape

(11684, 31)

In [115]:
X_test.shape

(2922, 31)

In [116]:
#define columns
join_cols = join_data.columns.tolist()
target = ['churn']

#exclude these features for now; date columns need feature enginnering; forecast columns introduce uncertainty
date_feats = [col for col in join_cols if col.startswith('date')]
forecast_feats = [col for col in join_cols if col.startswith('forecast')]

categorical_feats = ['channel_sales', 'origin_up' ]
binary_feats = ['has_gas']
drop_feats = ['id']
numerical_feats = list(set(join_cols) - 
                       set(date_feats) - 
                       set(categorical_feats) - 
                       set(binary_feats) - 
                       set(forecast_feats) - 
                       set(target)-
                       set(drop_feats))

#check numerical features
numerical_feats

['net_margin',
 'peak_dec_jan_diff_pow',
 'imp_cons',
 'margin_net_pow_ele',
 'cons_12m',
 'nb_prod_act',
 'offpeak_dec_jan_diff_eng',
 'num_years_antig',
 'cons_gas_12m',
 'offpeak_dec_jan_diff_pow',
 'pow_max',
 'margin_gross_pow_ele',
 'midpeak_dec_jan_diff_eng',
 'peak_dec_jan_diff_eng',
 'midpeak_dec_jan_diff_pow',
 'cons_last_month']

In [117]:
#encode

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)

preprocessor = make_column_transformer(
    (OneHotEncoder(drop ='if_binary'), binary_feats),
    (OneHotEncoder(handle_unknown = 'ignore', sparse = False), categorical_feats), 
    ("passthrough", numerical_feats)
)

In [118]:
#define scoring function

def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and standard deviation from cross validation.

    Parameters
    ----------
    model :
        model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)
    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []
    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))
    return pd.Series(data=out_col, index=mean_scores.index)

In [84]:
xgb = XGBClassifier(random_state = 42, verbosity = 0, use_label_encoder = False)
pipe_xgb = make_pipeline(preprocessor, xgb)

In [119]:
results = {}
scoring_metric = ['accuracy', 'precision', 'recall', 'f1']
results["xgboost"] = mean_std_cross_val_scores(pipe_xgb, X_train, y_train, return_train_score = True, scoring = scoring_metric)
pd.DataFrame(results).T

Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy,test_precision,train_precision,test_recall,train_recall,test_f1,train_f1
xgb,0.703 (+/- 0.013),0.009 (+/- 0.001),0.904 (+/- 0.004),0.963 (+/- 0.002),0.515 (+/- 0.109),0.999 (+/- 0.001),0.086 (+/- 0.020),0.618 (+/- 0.025),0.147 (+/- 0.032),0.764 (+/- 0.019)


In [122]:
clf = RandomForestClassifier(random_state = 42, max_features = 26) #max features determined after looking at feature importances 
pipe_rf = make_pipeline(preprocessor, clf)

In [123]:
results["random forest"] = mean_std_cross_val_scores(pipe_rf, X_train, y_train, return_train_score = True, scoring = scoring_metric)
pd.DataFrame(results).T

Unnamed: 0,fit_time,score_time,test_accuracy,train_accuracy,test_precision,train_precision,test_recall,train_recall,test_f1,train_f1
xgb,0.703 (+/- 0.013),0.009 (+/- 0.001),0.904 (+/- 0.004),0.963 (+/- 0.002),0.515 (+/- 0.109),0.999 (+/- 0.001),0.086 (+/- 0.020),0.618 (+/- 0.025),0.147 (+/- 0.032),0.764 (+/- 0.019)
random forest,4.981 (+/- 0.159),0.034 (+/- 0.000),0.907 (+/- 0.002),1.000 (+/- 0.000),0.664 (+/- 0.095),1.000 (+/- 0.000),0.065 (+/- 0.014),0.999 (+/- 0.001),0.118 (+/- 0.024),1.000 (+/- 0.001)
