In [1]:
from catboost import Pool, CatBoostClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df = df.drop(['customerID'], axis=1)

In [4]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
le = preprocessing.LabelEncoder()

cols = []
for col in df.columns:
    if df[col].dtype == 'object':
        cols.append(col)
df[cols] = df[cols].apply(le.fit_transform)
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,2505,0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1466,0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,157,1
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.3,1400,0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.7,925,1


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   int32  
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   int32  
 3   Dependents        7043 non-null   int32  
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   int32  
 6   MultipleLines     7043 non-null   int32  
 7   InternetService   7043 non-null   int32  
 8   OnlineSecurity    7043 non-null   int32  
 9   OnlineBackup      7043 non-null   int32  
 10  DeviceProtection  7043 non-null   int32  
 11  TechSupport       7043 non-null   int32  
 12  StreamingTV       7043 non-null   int32  
 13  StreamingMovies   7043 non-null   int32  
 14  Contract          7043 non-null   int32  
 15  PaperlessBilling  7043 non-null   int32  
 16  PaymentMethod     7043 non-null   int32  


In [7]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Churn'], axis=1), 
                                                    df['Churn'], test_size=0.30, 
                                                    random_state=42, stratify=df['Churn'])

In [8]:
X_train.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
5557,0,0,0,0,5,1,0,1,0,0,0,0,2,0,0,0,2,80.2,3286
2270,0,1,0,0,3,1,0,1,0,0,2,0,2,0,0,1,2,86.85,1830
6930,0,0,1,0,3,1,2,1,0,0,0,0,0,0,0,1,1,75.15,1776
2257,0,0,0,0,60,1,2,0,0,0,2,2,2,2,1,0,1,80.55,4087
898,0,0,0,0,12,1,0,1,2,0,0,2,2,2,0,1,0,98.9,232


# Testing Default Params

In [9]:
lgbm = lgb.LGBMClassifier()
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.88      0.86      1552
           1       0.61      0.49      0.54       561

    accuracy                           0.78      2113
   macro avg       0.72      0.69      0.70      2113
weighted avg       0.77      0.78      0.77      2113



In [10]:
cbr = CatBoostClassifier(logging_level='Silent')
cbr.fit(X_train, y_train)
у_pred = cbr.predict(X_test)
print(classification_report(y_test, у_pred))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1552
           1       0.64      0.50      0.56       561

    accuracy                           0.79      2113
   macro avg       0.73      0.70      0.71      2113
weighted avg       0.78      0.79      0.78      2113



In [11]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
у_pred = xgb.predict(X_test)
print(classification_report(y_test, y_pred))





              precision    recall  f1-score   support

           0       0.83      0.88      0.86      1552
           1       0.61      0.49      0.54       561

    accuracy                           0.78      2113
   macro avg       0.72      0.69      0.70      2113
weighted avg       0.77      0.78      0.77      2113



In [None]:
skl = GradientBoostingClassifier().fit(X_train, y_train)
у_pred = skl.predict(X_test)
print(classification_report(y_test, y_pred))

# Testing GridSearchCV

In [13]:
from sklearn.model_selection import GridSearchCV

def make_grid_search(model, grid, X=X_train, y=y_train):
    clf = GridSearchCV(model, grid, verbose = 1)
    clf.fit(X_train, y_train)
    у_pred = clf.predict(X_test)
    print(clf.best_params_)
    print(classification_report(y_test, у_pred))

In [18]:
grid = {'learning_rate': [0.03, 0.04, 0.045, 0.47, 0.403],
        'depth': [4, 6, 8, 10, 15],
        'l2_leaf_reg': [2, 3, 4, 5, 7, 10, 12],
        'iterations': [250, 300, 500]}
cbc = CatBoostClassifier(logging_level='Silent')
make_grid_search(cbc, grid, X=X_train, y=y_train)

Fitting 5 folds for each of 525 candidates, totalling 2625 fits


KeyboardInterrupt: 

In [15]:
grid = {'learning_rate': [0.03, 0.1],
        'max_depth': [-1, 4, 6, 8],
        'reg_lambda': [1, 3, 5, 7, 9]}
lgbm = lgb.LGBMClassifier()
make_grid_search(lgbm, grid, X=X_train, y=y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
{'learning_rate': 0.1, 'max_depth': 4, 'reg_lambda': 9}
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1552
           1       0.66      0.51      0.57       561

    accuracy                           0.80      2113
   macro avg       0.75      0.71      0.72      2113
weighted avg       0.79      0.80      0.79      2113



In [16]:
grid = {'learning_rate': [0.03, 0.1],
        'n_estimators': [100, 200, 500],
        'max_depth': [1, 4, 6, 8],
        'subsample':[0.2, 0.5, 0.7]}
skl = GradientBoostingClassifier()
make_grid_search(skl, grid, X=X_train, y=y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
{'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 200, 'subsample': 0.7}
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1552
           1       0.63      0.51      0.56       561

    accuracy                           0.79      2113
   macro avg       0.73      0.70      0.71      2113
weighted avg       0.78      0.79      0.78      2113



In [17]:
grid = {'learning_rate': [0.03, 0.1],
        'n_estimators': [100, 200, 500],
        'max_depth': [1, 4, 6, 8],
        'subsample':[0.2, 0.5, 0.7]}
xgb = XGBClassifier(verbosity=0)
make_grid_search(xgb, grid, X=X_train, y=y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits








































{'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 200, 'subsample': 0.7}
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1552
           1       0.65      0.52      0.57       561

    accuracy                           0.80      2113
   macro avg       0.74      0.71      0.72      2113
weighted avg       0.79      0.80      0.79      2113

