In [3]:
from catboost import Pool, CatBoostClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier

In [4]:
df = pd.read_csv('data.csv')

In [5]:
df = df.drop(['customerID'], axis=1)

In [6]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
le = preprocessing.LabelEncoder()

cols = []
for col in df.columns:
    if df[col].dtype == 'object':
        cols.append(col)
df[cols] = df[cols].apply(le.fit_transform)
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,2505,0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1466,0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,157,1
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.3,1400,0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.7,925,1


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   int32  
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   int32  
 3   Dependents        7043 non-null   int32  
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   int32  
 6   MultipleLines     7043 non-null   int32  
 7   InternetService   7043 non-null   int32  
 8   OnlineSecurity    7043 non-null   int32  
 9   OnlineBackup      7043 non-null   int32  
 10  DeviceProtection  7043 non-null   int32  
 11  TechSupport       7043 non-null   int32  
 12  StreamingTV       7043 non-null   int32  
 13  StreamingMovies   7043 non-null   int32  
 14  Contract          7043 non-null   int32  
 15  PaperlessBilling  7043 non-null   int32  
 16  PaymentMethod     7043 non-null   int32  


In [9]:
# Используем StandardScaler для нормализации данных.
scaler = StandardScaler()

In [10]:
df[['tenure', 'MonthlyCharges', 'TotalCharges']] = scaler.fit_transform(df[['tenure', 'MonthlyCharges', 'TotalCharges']])
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Churn'], axis=1), 
                                                    df['Churn'], test_size=0.30, 
                                                    random_state=42, stratify=df['Churn'])

score_all_models = {}

In [11]:
X_train

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
5557,0,0,0,0,-1.114563,1,0,1,0,0,0,0,2,0,0,0,2,0.513107,0.014935
2270,0,1,0,0,-1.196004,1,0,1,0,0,2,0,2,0,0,1,2,0.734126,-0.756023
6930,0,0,1,0,-1.196004,1,2,1,0,0,0,0,0,0,0,1,1,0.345265,-0.784616
2257,0,0,0,0,1.125057,1,2,0,0,0,2,2,2,2,1,0,1,0.524739,0.439068
898,0,0,0,0,-0.829521,1,0,1,2,0,0,2,2,2,0,1,0,1.134619,-1.602170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4250,1,0,0,0,1.247218,1,2,1,2,2,2,2,2,0,1,0,1,1.320740,1.061235
1488,1,0,0,0,-1.277445,1,0,0,2,0,0,0,0,0,0,0,3,-0.449074,0.552911
6303,0,0,1,0,1.572981,1,2,1,0,2,2,2,2,2,2,0,2,1.478611,1.384233
2710,0,0,1,0,-0.340876,1,0,2,1,1,1,1,1,1,1,0,1,-1.474403,0.431125


# Testing Default Params

In [12]:
lgbm = lgb.LGBMClassifier()
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test)
score = f1_score(y_test, y_pred)
print(score)
score_all_models['LGBM_default'] = score

0.5488647581441264


In [13]:
cbr = CatBoostClassifier(logging_level='Silent')
cbr.fit(X_train, y_train)
y_pred = cbr.predict(X_test)
score = f1_score(y_test, y_pred)
print(score)
score_all_models['CatBoost_default'] = score

0.5576730190571715


In [14]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
score = f1_score(y_test, y_pred)
print(score)
score_all_models['XGB_default'] = score

0.5363020329138432


In [15]:
skl = GradientBoostingClassifier().fit(X_train, y_train)
y_pred = skl.predict(X_test)
score = f1_score(y_test, y_pred)
print(score)
score_all_models['GradBoostSklearn_default'] = score

0.5448979591836735


# Testing GridSearchCV

In [16]:
from sklearn.model_selection import GridSearchCV

def estimate_grid_search(model, grid, scores, X=X_train, y=y_train, name=None):
    clf = GridSearchCV(model, grid, verbose = 1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"Best params: {clf.best_params_}")
    score = f1_score(y_test, y_pred)
    print(score)
    scores[name] = score

In [15]:
%%time
grid = {'learning_rate': [i for i in np.arange(0.01,0.05,0.01)],
        'depth': [i for i in range(2,11)],
        'iterations': [i for i in range(10,101,10)],
        'l2_leaf_reg': [3, 4, 5, 7, 10]
       }
cbc = CatBoostClassifier(logging_level='Silent')
estimate_grid_search(cbc, grid, X=X_train, y=y_train, name='CatBoost_tuned')

Fitting 5 folds for each of 1800 candidates, totalling 9000 fits
Best params: {'depth': 6, 'iterations': 90, 'l2_leaf_reg': 10, 'learning_rate': 0.04}
0.5679012345679012
CPU times: user 1h 17min 37s, sys: 19min 14s, total: 1h 36min 51s
Wall time: 16min 13s


In [17]:
%%time
grid = {'learning_rate': [i for i in np.arange(0.01,0.05,0.01)],
        'max_depth': [-1] + [i for i in range(2,11)],
        'reg_lambda': [1, 3, 5, 7, 9],
        'n_estimators': [i for i in range(10,101,10)]}
lgbm = lgb.LGBMClassifier()
estimate_grid_search(lgbm, grid, score_all_models, X=X_train, y=y_train, name='LGBM_tuned')

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits
Best params: {'learning_rate': 0.04, 'max_depth': 3, 'n_estimators': 100, 'reg_lambda': 9}
0.5515789473684211
CPU times: user 39min 13s, sys: 42min 28s, total: 1h 21min 42s
Wall time: 18min 30s


In [18]:
%%time
grid = {'learning_rate': [i for i in np.arange(0.01,0.05,0.01)],
        'n_estimators': [i for i in range(10,101,10)],
        'max_depth': [i for i in range(2,11)],
        'subsample':[0.2, 0.5, 0.7]}
skl = GradientBoostingClassifier()
estimate_grid_search(skl, grid, score_all_models, X=X_train, y=y_train, name='GradBoostSklearn_tuned')

Fitting 5 folds for each of 1080 candidates, totalling 5400 fits
Best params: {'learning_rate': 0.04, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.5}
0.5471502590673576
CPU times: user 20min 16s, sys: 0 ns, total: 20min 16s
Wall time: 20min 17s


In [17]:
%%time
grid = {'learning_rate': [i for i in np.arange(0.01,0.05,0.01)],
        'n_estimators': [i for i in range(10,101,10)],
        'max_depth': [i for i in range(2,11)],
        'subsample':[0.2, 0.5, 0.7]}
xgb = XGBClassifier(verbosity=0, use_label_encoder=False)
estimate_grid_search(xgb, grid, score_all_models, X=X_train, y=y_train, name='XGB_tuned')

Fitting 5 folds for each of 1080 candidates, totalling 5400 fits
Best params: {'learning_rate': 0.04, 'max_depth': 6, 'n_estimators': 70, 'subsample': 0.5}
0.5714285714285714
CPU times: total: 1h 25min 5s
Wall time: 6min 33s


### Вывод: 
Среди моделей с параметрами "по умолчанию" наивысший результат показал CatBoostClassifier. Наилучший резутьтат f1_score показал XGBClassifier с поиском параметров по сетке