In [85]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import chi2
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=20)

In [86]:
df = pd.read_csv('telko-churn-bersih.csv')

### Data Preprocessing

#### Encoding

In [87]:
df_encode = df.copy()
X = df_encode.drop('Churn', axis=1)
y = df_encode['Churn']

#### chi2 score

In [88]:
y = y.replace({'No':'no', 'Yes':'yes'})
y.value_counts()

no     5164
yes    1857
Name: Churn, dtype: int64

In [89]:
label_encoder = LabelEncoder()
for kolom in X.dtypes[X.dtypes=='object'].index:
    X[kolom] = label_encoder.fit_transform(X[kolom])
    print(kolom, ':', label_encoder.classes_)

gender : ['Female' 'Male']
SeniorCitizen : ['No' 'Yes']
Partner : ['No' 'Yes']
Dependents : ['No' 'Yes']
PhoneService : ['No' 'Yes']
MultipleLines : ['No' 'Yes']
InternetService : ['DSL' 'Fiber optic' 'No']
OnlineSecurity : ['No' 'Yes']
OnlineBackup : ['No' 'Yes']
DeviceProtection : ['No' 'Yes']
TechSupport : ['No' 'Yes']
StreamingTV : ['No' 'Yes']
StreamingMovies : ['No' 'Yes']
Contract : ['Month-to-month' 'One year' 'Two year']
PaperlessBilling : ['No' 'Yes']
PaymentMethod : ['Bank transfer (automatic)' 'Credit card (automatic)' 'Electronic check'
 'Mailed check']
Protection : ['No' 'Yes']


In [90]:
y.value_counts()

no     5164
yes    1857
Name: Churn, dtype: int64

In [91]:
chi_score = chi2(X,y)
p_val = pd.Series(chi_score[1], index=X.columns).sort_values(ascending=False)
fig = px.bar(x=p_val.index, y = p_val.values, width=800, height=400)
fig.update_xaxes(title = 'columns')
fig.update_yaxes(title = 'p-values')
fig.add_hrect(y0=0.05, y1=0.05, opacity = 0.5)
fig.show()

In [92]:
X = df_encode.drop(['Churn', 'PhoneService', 'gender'], axis=1)
y = df_encode['Churn']
y = y.replace({'No':'no', 'Yes':'yes'})

#### Encodeing dan Normalize

In [93]:
label_encoder = LabelEncoder()
for kolom in X.dtypes[X.dtypes=='object'].index:
    X[kolom] = label_encoder.fit_transform(X[kolom])
    print(kolom, ':', label_encoder.classes_)

SeniorCitizen : ['No' 'Yes']
Partner : ['No' 'Yes']
Dependents : ['No' 'Yes']
MultipleLines : ['No' 'Yes']
InternetService : ['DSL' 'Fiber optic' 'No']
OnlineSecurity : ['No' 'Yes']
OnlineBackup : ['No' 'Yes']
DeviceProtection : ['No' 'Yes']
TechSupport : ['No' 'Yes']
StreamingTV : ['No' 'Yes']
StreamingMovies : ['No' 'Yes']
Contract : ['Month-to-month' 'One year' 'Two year']
PaperlessBilling : ['No' 'Yes']
PaymentMethod : ['Bank transfer (automatic)' 'Credit card (automatic)' 'Electronic check'
 'Mailed check']
Protection : ['No' 'Yes']


In [94]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.0,1.0,0.0,0.013889,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.666667,0.115423,0.003437,0.142857,1.0
1,0.0,0.0,0.0,0.472222,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.5,0.0,1.000000,0.385075,0.217564,0.428571,1.0
2,0.0,0.0,0.0,0.027778,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.000000,0.354229,0.012453,0.428571,1.0
3,0.0,0.0,0.0,0.625000,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.5,0.0,0.000000,0.239303,0.211951,0.428571,1.0
4,0.0,0.0,0.0,0.027778,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.666667,0.521891,0.017462,0.142857,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7016,0.0,1.0,1.0,0.333333,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.5,1.0,1.000000,0.662189,0.229194,0.857143,1.0
7017,0.0,1.0,1.0,1.000000,1.0,0.5,0.0,1.0,1.0,0.0,1.0,1.0,0.5,1.0,0.333333,0.845274,0.847792,0.714286,1.0
7018,0.0,1.0,1.0,0.152778,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.666667,0.112935,0.039892,0.142857,1.0
7019,1.0,1.0,0.0,0.055556,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.000000,0.558706,0.035303,0.142857,0.0


#### Train Test Split

In [95]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2, stratify=y)

In [96]:
y_train.value_counts(True)*100

no     73.54226
yes    26.45774
Name: Churn, dtype: float64

In [97]:
y_test.value_counts(True)*100

no     73.57631
yes    26.42369
Name: Churn, dtype: float64

1. Buatlah model regresi logistik dan xgboost untuk data ini 
2. Lakukan validasi berdasarkan hasil kfold. apakah hasil setiap fold nya konsisten? 
3. Berdasarkan confusion matrix, berapa precision dan recall yang dihasilkan masingmasing model?
4. Model manakah yang dirasa lebih baik untuk memodelkan data ini?
- Model Logistik Regression

### Modelling

In [98]:
def cross_val(X, y, model, skf, is_xgb=False, threshold=0.5):
    cv = {'tn':[], 'fp':[], 'fn':[], 'tp':[], 
        'prec_yes':[], 'recall_yes':[], 'prec_no':[], 'recall_no':[], 
        'accuracy':[], 'f1_score':[]}
    for train, val in skf.split(X, y):
        X_cv_train, X_cv_val = X[train], X[val]
        y_cv_train, y_cv_val = y.iloc[train], y.iloc[val]
        model.fit(X_cv_train, y_cv_train)
        if is_xgb:
            y_cv_pred = np.where(model.predict_proba(X_cv_val)[:,1] >= threshold, 1, 0)
        else: 
            y_cv_pred = np.where(model.predict_proba(X_cv_val)[:,1] >= threshold, 'yes', 'no')
        cm_ravel = confusion_matrix(y_cv_val, y_cv_pred).ravel() 
        for i, cm_i in enumerate(['tn', 'fp', 'fn', 'tp']):
            cv[cm_i].append(cm_ravel[i])
        cr = classification_report(y_cv_val, y_cv_pred, output_dict=True)
        if is_xgb:
            label = ['1', '0']
        else: 
            label = ['yes','no']
        cv['prec_yes'].append(cr[label[0]]['precision'])
        cv['recall_yes'].append(cr[label[0]]['recall'])
        cv['prec_no'].append(cr[label[1]]['precision'])
        cv['recall_no'].append(cr[label[1]]['recall'])
        cv['accuracy'].append(cr['accuracy'])
        cv['f1_score'].append(cr[label[0]]['f1-score'])
    df_cv = pd.DataFrame(cv)
    df_mean = pd.DataFrame([['']*4 + list(df_cv.mean().values[4:])], columns=df_cv.columns, index=['mean'])
    df_std_dev = pd.DataFrame([['']*4 + list(df_cv.std().values[4:])], columns=df_cv.columns, index=['std.dev'])
    return pd.concat([df_cv, df_mean, df_std_dev])

def check_model(model, threshold=0.5, is_xgb = False):
    if is_xgb :
        model.fit(X_train, y_train.replace({'yes':1,'no':0}))
        label = np.array(['no', 'yes'], dtype='object')
    else:
        model.fit(X_train, y_train)
        label = model.classes_
    y_pred = np.where(model.predict_proba(X_test)[:,1]>=threshold, 'yes', 'no')
    fig = px.imshow(pd.DataFrame(confusion_matrix(y_test, y_pred), 
            columns='Predicted '+label, index = 'actual '+ label),
            text_auto=True, width=500, aspect='auto', color_continuous_scale=px.colors.sequential.Emrld)
    fig.show()
    print(classification_report(y_test, y_pred))

### Regresi Logistik

In [99]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(max_iter=200)

In [100]:
cross_val(X_train, y_train, log_reg, skf, threshold=0.5)

Unnamed: 0,tn,fp,fn,tp,prec_yes,recall_yes,prec_no,recall_no,accuracy,f1_score
0,878.0,90.0,162.0,187.0,0.67509,0.535817,0.844231,0.907025,0.808656,0.597444
1,880.0,88.0,166.0,182.0,0.674074,0.522989,0.8413,0.909091,0.806991,0.588997
2,861.0,107.0,163.0,185.0,0.633562,0.531609,0.84082,0.889463,0.794833,0.578125
3,873.0,95.0,153.0,195.0,0.672414,0.560345,0.850877,0.90186,0.81155,0.611285
mean,,,,,0.663785,0.53769,0.844307,0.90186,0.805507,0.593963
std.dev,,,,,0.020179,0.016019,0.004632,0.008806,0.007362,0.013996


In [101]:
check_model(log_reg, threshold=0.5)

              precision    recall  f1-score   support

          no       0.83      0.88      0.86      1292
         yes       0.60      0.49      0.54       464

    accuracy                           0.78      1756
   macro avg       0.72      0.69      0.70      1756
weighted avg       0.77      0.78      0.77      1756



### XGBoost

In [102]:
from xgboost import XGBClassifier
XGB = XGBClassifier(max_depth=6, alpha=0.1, gamma=0.01, n_estimators=100, random_state=21, seed=1)

In [103]:
cross_val(X_train, y_train.replace({'yes':1, 'no':0}), XGB, skf, is_xgb=True)

Unnamed: 0,tn,fp,fn,tp,prec_yes,recall_yes,prec_no,recall_no,accuracy,f1_score
0,864.0,104.0,165.0,184.0,0.638889,0.527221,0.83965,0.892562,0.795748,0.577708
1,864.0,104.0,163.0,185.0,0.640138,0.531609,0.841285,0.892562,0.797112,0.580848
2,845.0,123.0,167.0,181.0,0.595395,0.520115,0.83498,0.872934,0.779635,0.555215
3,853.0,115.0,157.0,191.0,0.624183,0.548851,0.844554,0.881198,0.793313,0.584098
mean,,,,,0.624651,0.531949,0.840118,0.884814,0.791452,0.574467
std.dev,,,,,0.020806,0.012223,0.003986,0.009562,0.008033,0.013097


In [104]:
check_model(XGB, threshold=0.25, is_xgb=True)

              precision    recall  f1-score   support

          no       0.88      0.74      0.80      1292
         yes       0.50      0.72      0.59       464

    accuracy                           0.73      1756
   macro avg       0.69      0.73      0.70      1756
weighted avg       0.78      0.73      0.75      1756



### Hyper Parameter Tuning

#### Regresi Logistik

In [105]:
param_grid = dict(
    solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    penalty = ['none', 'l1', 'l2', 'elasticnet'],
    C = [100, 10, 1.0, 0.1, 0.01]
)
score = ['precision', 'recall']
grid_search = GridSearchCV(LogisticRegression(), param_grid=param_grid, cv=skf)
grid_search.fit(X_train, y_train)


`penalty='none'`has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.


Setting penalty=None will ignore the C and l1_ratio parameters


`penalty='none'`has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.


Setting penalty=None will ignore the C and l1_ratio parameters


`penalty='none'`has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.


Setting penalty=None will ignore the C and l1_ratio parameters


`penalty='none'`has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.


Setting penalty=None will ignore the C and l1_ratio parameters


`penalty='none'`has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.


Setting penalty=None will ignore the C and l1_ratio parameters


`penalty='none'`has been deprecated in 1.2 and will be removed in 1.4

In [106]:
print("Tuned Hyperparameters :", grid_search.best_params_)
print("Accuracy :",grid_search.best_score_)

Tuned Hyperparameters : {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy : 0.8085465551024364


In [107]:
logreg = LogisticRegression(C = 0.1, penalty = 'l1', solver = 'liblinear')
logreg.fit(X_train,y_train)
y_pred = logreg.predict(X_test)
dfcm = pd.DataFrame(confusion_matrix(y_test, y_pred), 
                    columns='Predicted\n'+logreg.classes_,
                     index='Actual\n'+logreg.classes_)
px.imshow(dfcm, text_auto=True, aspect='auto', width=600, color_continuous_scale=px.colors.sequential.Emrld)

In [None]:
clasification

#### XGBoost

In [108]:
param_grid = {
    'n-estimator':[100,200,500],
    'learning_rate':[0.01,0.05,0.1],
    'booster':['gbtree', 'gblinear'],
    'gamma':[0,0.5,1],
    'reg_alpha':[0.5,1,5],
    'reg_lambda':[0.5,1,5],
    'base_score':[0.2,0.5,1]
}
grid_search2 = GridSearchCV(XGBClassifier(), param_grid, n_jobs=1, cv=skf)
y_train_xgb = y_train.replace({'yes':1, 'no':0})
grid_search2.fit(X_train, y_train_xgb)

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters: { "n-estimator" } are not used.

Parameters

In [None]:
print("Tuned Hyperparameters :", grid_search.best_params_)
print("Accuracy :",grid_search.best_score_)