In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
df = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')

In [3]:
print(f"Duplicated rows: {df.duplicated(keep=False).sum()}")
print(f"Duplicated rows without target: {df.drop(['Churn'], axis=1).duplicated(keep=False).sum()}")

Duplicated rows: 28
Duplicated rows without target: 41


In [589]:
df[df.drop('Churn', axis=1).duplicated(keep=False)].sort_values(by=[*df.columns]).index

Int64Index([ 838, 1636, 4811, 3463, 4294, 3831, 1647,  594, 1474, 3735, 3749,
            3769, 1217, 2586,  881, 1874, 2568, 3660, 3239, 1677, 4735, 2024,
            2991, 1236, 4530, 1040, 5066,    7, 1959, 4373, 4509,  329, 2036,
             409, 5041,  170, 1384, 1898, 2877, 1068, 4276],
           dtype='int64')

In [590]:
df.drop([ 838, 1636, 4811, 3463, 4294, 3831, 1647,  594, 1474, 3735, 3749,
            3769, 1217, 2586,  881, 1874, 2568, 3660, 3239, 1677, 4735, 2024,
            2991, 1236, 4530, 1040, 5066,    7, 1959, 4373, 4509,  329, 2036,
             409, 5041,  170, 1384, 1898, 2877, 1068, 4276], inplace=True)

In [591]:
df.reset_index(drop=True, inplace=True)

In [592]:
df.head()

Unnamed: 0,ClientPeriod,MonthlySpending,TotalSpent,Sex,IsSeniorCitizen,HasPartner,HasChild,HasPhoneService,HasMultiplePhoneNumbers,HasInternetService,HasOnlineSecurityService,HasOnlineBackup,HasDeviceProtection,HasTechSupportAccess,HasOnlineTV,HasMovieSubscription,HasContractPhone,IsBillingPaperless,PaymentMethod,Churn
0,55,19.5,1026.35,Male,0,Yes,Yes,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,No,Mailed check,0
1,72,25.85,1872.2,Male,0,Yes,No,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),0
2,1,75.9,75.9,Male,0,No,No,Yes,No,Fiber optic,No,No,No,Yes,No,No,Month-to-month,Yes,Electronic check,1
3,32,79.3,2570.0,Female,1,Yes,No,Yes,Yes,Fiber optic,No,No,Yes,No,No,No,Month-to-month,No,Mailed check,0
4,60,115.25,6758.45,Female,0,Yes,Yes,Yes,Yes,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,No,Credit card (automatic),0


In [593]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5241 entries, 0 to 5240
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ClientPeriod              5241 non-null   int64  
 1   MonthlySpending           5241 non-null   float64
 2   TotalSpent                5241 non-null   object 
 3   Sex                       5241 non-null   object 
 4   IsSeniorCitizen           5241 non-null   int64  
 5   HasPartner                5241 non-null   object 
 6   HasChild                  5241 non-null   object 
 7   HasPhoneService           5241 non-null   object 
 8   HasMultiplePhoneNumbers   5241 non-null   object 
 9   HasInternetService        5241 non-null   object 
 10  HasOnlineSecurityService  5241 non-null   object 
 11  HasOnlineBackup           5241 non-null   object 
 12  HasDeviceProtection       5241 non-null   object 
 13  HasTechSupportAccess      5241 non-null   object 
 14  HasOnlin

**Обработаем строчки, где записано No internet service**

In [594]:
df[df == 'No internet service'].any()

ClientPeriod                False
MonthlySpending             False
TotalSpent                  False
Sex                         False
IsSeniorCitizen             False
HasPartner                  False
HasChild                    False
HasPhoneService             False
HasMultiplePhoneNumbers     False
HasInternetService          False
HasOnlineSecurityService     True
HasOnlineBackup              True
HasDeviceProtection          True
HasTechSupportAccess         True
HasOnlineTV                  True
HasMovieSubscription         True
HasContractPhone            False
IsBillingPaperless          False
PaymentMethod               False
Churn                       False
dtype: bool

In [595]:
df_test[df_test == 'No internet service'].any()

ClientPeriod                False
MonthlySpending             False
TotalSpent                  False
Sex                         False
IsSeniorCitizen             False
HasPartner                  False
HasChild                    False
HasPhoneService             False
HasMultiplePhoneNumbers     False
HasInternetService          False
HasOnlineSecurityService     True
HasOnlineBackup              True
HasDeviceProtection          True
HasTechSupportAccess         True
HasOnlineTV                  True
HasMovieSubscription         True
HasContractPhone            False
IsBillingPaperless          False
PaymentMethod               False
dtype: bool

In [596]:
df[df.HasOnlineSecurityService == 'No internet service'].groupby(['HasInternetService']).HasOnlineSecurityService.value_counts()

HasInternetService  HasOnlineSecurityService
No                  No internet service         1118
Name: HasOnlineSecurityService, dtype: int64

In [597]:
df_test[df_test.HasOnlineSecurityService == 'No internet service'].groupby(['HasInternetService']).HasOnlineSecurityService.value_counts()

HasInternetService  HasOnlineSecurityService
No                  No internet service         385
Name: HasOnlineSecurityService, dtype: int64

In [598]:
df.loc[df.HasOnlineSecurityService == 'No internet service', 'HasOnlineSecurityService'] = 'No'
df_test.loc[df_test.HasOnlineSecurityService == 'No internet service', 'HasOnlineSecurityService'] = 'No'

In [599]:
df.loc[df.HasOnlineBackup == 'No internet service', 'HasOnlineBackup'] = 'No'
df_test.loc[df_test.HasOnlineBackup == 'No internet service', 'HasOnlineBackup'] = 'No'

In [600]:
df.loc[df.HasDeviceProtection == 'No internet service', 'HasDeviceProtection'] = 'No'
df_test.loc[df_test.HasDeviceProtection == 'No internet service', 'HasDeviceProtection'] = 'No'

In [601]:
df.loc[df.HasTechSupportAccess == 'No internet service', 'HasTechSupportAccess'] = 'No'
df_test.loc[df_test.HasTechSupportAccess == 'No internet service', 'HasTechSupportAccess'] = 'No'

In [602]:
df.loc[df.HasOnlineTV == 'No internet service', 'HasOnlineTV'] = 'No'
df_test.loc[df_test.HasOnlineTV == 'No internet service', 'HasOnlineTV'] = 'No'

In [603]:
df.loc[df.HasMovieSubscription == 'No internet service', 'HasMovieSubscription'] = 'No'
df_test.loc[df_test.HasMovieSubscription == 'No internet service', 'HasMovieSubscription'] = 'No'

**Обработаем строчки, где записано No phone service**

In [604]:
df[df == 'No phone service'].any()

ClientPeriod                False
MonthlySpending             False
TotalSpent                  False
Sex                         False
IsSeniorCitizen             False
HasPartner                  False
HasChild                    False
HasPhoneService             False
HasMultiplePhoneNumbers      True
HasInternetService          False
HasOnlineSecurityService    False
HasOnlineBackup             False
HasDeviceProtection         False
HasTechSupportAccess        False
HasOnlineTV                 False
HasMovieSubscription        False
HasContractPhone            False
IsBillingPaperless          False
PaymentMethod               False
Churn                       False
dtype: bool

In [605]:
df_test[df_test == 'No phone service'].any()

ClientPeriod                False
MonthlySpending             False
TotalSpent                  False
Sex                         False
IsSeniorCitizen             False
HasPartner                  False
HasChild                    False
HasPhoneService             False
HasMultiplePhoneNumbers      True
HasInternetService          False
HasOnlineSecurityService    False
HasOnlineBackup             False
HasDeviceProtection         False
HasTechSupportAccess        False
HasOnlineTV                 False
HasMovieSubscription        False
HasContractPhone            False
IsBillingPaperless          False
PaymentMethod               False
dtype: bool

In [606]:
df[df.HasMultiplePhoneNumbers == 'No phone service'].groupby(['HasPhoneService']).HasMultiplePhoneNumbers.value_counts()

HasPhoneService  HasMultiplePhoneNumbers
No               No phone service           521
Name: HasMultiplePhoneNumbers, dtype: int64

In [607]:
df_test[df_test.HasMultiplePhoneNumbers == 'No phone service'].groupby(['HasPhoneService']).HasMultiplePhoneNumbers.value_counts()

HasPhoneService  HasMultiplePhoneNumbers
No               No phone service           161
Name: HasMultiplePhoneNumbers, dtype: int64

In [608]:
df.loc[df.HasMultiplePhoneNumbers == 'No phone service', 'HasMultiplePhoneNumbers'] = 'No'
df_test.loc[df_test.HasMultiplePhoneNumbers == 'No phone service', 'HasMultiplePhoneNumbers'] = 'No'

**Обработаем пропуски**

In [609]:
lst = ['ClientPeriod', 'MonthlySpending', 'TotalSpent', 'Sex',
       'IsSeniorCitizen', 'HasPartner', 'HasChild', 'HasPhoneService',
       'HasMultiplePhoneNumbers', 'HasInternetService',
       'HasOnlineSecurityService', 'HasOnlineBackup', 'HasDeviceProtection',
       'HasTechSupportAccess', 'HasOnlineTV', 'HasMovieSubscription',
       'HasContractPhone', 'IsBillingPaperless', 'PaymentMethod']

In [610]:
df_test[lst[18]].value_counts()

Electronic check             579
Mailed check                 418
Bank transfer (automatic)    385
Credit card (automatic)      379
Name: PaymentMethod, dtype: int64

In [611]:
df[df.TotalSpent == ' ']

Unnamed: 0,ClientPeriod,MonthlySpending,TotalSpent,Sex,IsSeniorCitizen,HasPartner,HasChild,HasPhoneService,HasMultiplePhoneNumbers,HasInternetService,HasOnlineSecurityService,HasOnlineBackup,HasDeviceProtection,HasTechSupportAccess,HasOnlineTV,HasMovieSubscription,HasContractPhone,IsBillingPaperless,PaymentMethod,Churn
1040,0,25.75,,Male,0,Yes,Yes,Yes,Yes,No,No,No,No,No,No,No,Two year,No,Mailed check,0
1691,0,73.35,,Female,0,Yes,Yes,Yes,Yes,DSL,No,Yes,Yes,Yes,Yes,No,Two year,No,Mailed check,0
2522,0,19.7,,Male,0,Yes,Yes,Yes,No,No,No,No,No,No,No,No,One year,Yes,Mailed check,0
3053,0,80.85,,Female,0,Yes,Yes,Yes,No,DSL,Yes,Yes,Yes,No,Yes,Yes,Two year,No,Mailed check,0
3669,0,20.0,,Female,0,Yes,Yes,Yes,No,No,No,No,No,No,No,No,Two year,No,Mailed check,0
3970,0,61.9,,Male,0,No,Yes,Yes,Yes,DSL,Yes,Yes,No,Yes,No,No,Two year,Yes,Bank transfer (automatic),0
4292,0,25.35,,Male,0,Yes,Yes,Yes,Yes,No,No,No,No,No,No,No,Two year,No,Mailed check,0
4514,0,52.55,,Female,0,Yes,Yes,No,No,DSL,Yes,No,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),0
4561,0,56.05,,Female,0,Yes,Yes,No,No,DSL,Yes,Yes,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),0


In [612]:
df_test[df_test.TotalSpent == ' ']

Unnamed: 0,ClientPeriod,MonthlySpending,TotalSpent,Sex,IsSeniorCitizen,HasPartner,HasChild,HasPhoneService,HasMultiplePhoneNumbers,HasInternetService,HasOnlineSecurityService,HasOnlineBackup,HasDeviceProtection,HasTechSupportAccess,HasOnlineTV,HasMovieSubscription,HasContractPhone,IsBillingPaperless,PaymentMethod
177,0,19.85,,Male,0,Yes,Yes,Yes,No,No,No,No,No,No,No,No,Two year,No,Mailed check
767,0,20.25,,Male,0,No,Yes,Yes,No,No,No,No,No,No,No,No,Two year,No,Mailed check


In [613]:
df.loc[df.TotalSpent == ' ', 'TotalSpent'] = 0
df_test.loc[df_test.TotalSpent == ' ', 'TotalSpent'] = 0

**Подготовка данных**

In [614]:
y = df['Churn']

In [615]:
num_cols = [
    'ClientPeriod',
    'MonthlySpending',
    'TotalSpent']

cat_cols = [
    'Sex',
    'IsSeniorCitizen',
    'HasPartner',
    'HasChild',
    'HasPhoneService',
    'HasMultiplePhoneNumbers',
    'HasInternetService',
    'HasOnlineSecurityService',
    'HasOnlineBackup',
    'HasDeviceProtection',
    'HasTechSupportAccess',
    'HasOnlineTV',
    'HasMovieSubscription',
    'HasContractPhone',
    'IsBillingPaperless',
    'PaymentMethod']

In [616]:
df.drop(['Churn'], axis=1, inplace=True)

In [617]:
df = df.astype({'TotalSpent': np.float64})
df_test = df_test.astype({'TotalSpent': np.float64})

In [618]:
scaler = StandardScaler()
scaler_data = pd.DataFrame(scaler.fit_transform(df[num_cols]), columns=num_cols)
scaler_data_test = pd.DataFrame(scaler.fit_transform(df_test[num_cols]), columns=num_cols)

In [619]:
scaler_data_test

Unnamed: 0,ClientPeriod,MonthlySpending,TotalSpent
0,0.394782,-0.274027,0.057688
1,-0.133958,1.147644,0.330847
2,-0.947404,0.401602,-0.695456
3,1.411590,0.326160,1.155787
4,1.574279,-1.481107,-0.383600
...,...,...,...
1756,1.574279,1.370619,2.344622
1757,-0.784715,-1.455959,-0.877989
1758,-1.232110,0.911258,-0.922638
1759,-0.540681,0.832463,-0.281293


In [620]:
#ohe = OneHotEncoder()
#ohe_data = pd.DataFrame(ohe.fit_transform(df[cat_cols]).toarray())
label = LabelEncoder()
label_data = df[cat_cols].apply(label.fit_transform)
label_data_test = df_test[cat_cols].apply(label.fit_transform)

In [621]:
label_data_test

Unnamed: 0,Sex,IsSeniorCitizen,HasPartner,HasChild,HasPhoneService,HasMultiplePhoneNumbers,HasInternetService,HasOnlineSecurityService,HasOnlineBackup,HasDeviceProtection,HasTechSupportAccess,HasOnlineTV,HasMovieSubscription,HasContractPhone,IsBillingPaperless,PaymentMethod
0,1,0,1,0,1,1,0,1,0,0,0,0,0,1,0,1
1,1,1,1,0,1,1,1,0,0,1,0,1,1,0,1,2
2,1,0,0,0,1,1,1,0,0,0,0,0,0,0,1,2
3,0,0,1,1,1,1,0,1,1,0,1,1,0,2,0,1
4,0,0,1,1,1,0,2,0,0,0,0,0,0,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1756,0,0,1,0,1,1,1,0,1,1,0,1,1,2,1,0
1757,0,0,1,1,1,0,2,0,0,0,0,0,0,2,0,3
1758,1,0,1,0,1,1,1,0,0,1,0,1,0,0,1,2
1759,0,0,1,0,1,1,1,0,1,0,0,0,1,0,1,2


In [622]:
train_data = pd.concat([scaler_data, label_data], axis=1)
test_data = pd.concat([scaler_data_test, label_data_test], axis=1)

In [623]:
test_data

Unnamed: 0,ClientPeriod,MonthlySpending,TotalSpent,Sex,IsSeniorCitizen,HasPartner,HasChild,HasPhoneService,HasMultiplePhoneNumbers,HasInternetService,HasOnlineSecurityService,HasOnlineBackup,HasDeviceProtection,HasTechSupportAccess,HasOnlineTV,HasMovieSubscription,HasContractPhone,IsBillingPaperless,PaymentMethod
0,0.394782,-0.274027,0.057688,1,0,1,0,1,1,0,1,0,0,0,0,0,1,0,1
1,-0.133958,1.147644,0.330847,1,1,1,0,1,1,1,0,0,1,0,1,1,0,1,2
2,-0.947404,0.401602,-0.695456,1,0,0,0,1,1,1,0,0,0,0,0,0,0,1,2
3,1.411590,0.326160,1.155787,0,0,1,1,1,1,0,1,1,0,1,1,0,2,0,1
4,1.574279,-1.481107,-0.383600,0,0,1,1,1,0,2,0,0,0,0,0,0,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1756,1.574279,1.370619,2.344622,0,0,1,0,1,1,1,0,1,1,0,1,1,2,1,0
1757,-0.784715,-1.455959,-0.877989,0,0,1,1,1,0,2,0,0,0,0,0,0,2,0,3
1758,-1.232110,0.911258,-0.922638,1,0,1,0,1,1,1,0,0,1,0,1,0,0,1,2
1759,-0.540681,0.832463,-0.281293,0,0,1,0,1,1,1,0,1,0,0,0,1,0,1,2


In [624]:
train_data.columns = train_data.columns.astype(str)
test_data.columns = test_data.columns.astype(str)

In [625]:
X_train, X_test, y_train, y_test = train_test_split(train_data, y, test_size=0.2, stratify=y)

**LogRegression**

In [632]:
params = {'penalty': ['l1', 'l2', 'elasticnet'],
          'C': [100, 10, 1, 0.1, 0.01, 0.001],
          'fit_intercept': [True, False],
          'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
          'max_iter': [100, 500, 1000]}
clf = GridSearchCV(LogisticRegression(), params, cv=10, scoring='roc_auc', verbose=2, n_jobs=-1, refit=True)
clf.fit(train_data, y)

Fitting 10 folds for each of 648 candidates, totalling 6480 fits


3600 fits failed out of a total of 6480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
360 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\markc\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\markc\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\markc\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

------------------------

In [633]:
clf.best_params_

{'C': 10,
 'fit_intercept': False,
 'max_iter': 100,
 'penalty': 'l2',
 'solver': 'newton-cholesky'}

In [634]:
clf.best_score_

0.8436200462621585

In [635]:
log = LogisticRegression(C=10, fit_intercept=False, max_iter=100, penalty='l2', solver='newton-cholesky')
log.fit(X_train, y_train)

In [402]:
y_pred = log.predict(X_test)

In [403]:
roc_auc_score(y_test, y_pred)

0.7115697630403511

**LogRegressionCV**

In [636]:
params = {'penalty': ['l1', 'l2', 'elasticnet'],
          'scoring': ['roc_auc'],
          'fit_intercept': [True, False],
          'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
          'max_iter': [100, 500, 1000]}
clf = GridSearchCV(LogisticRegressionCV(), params, cv=10, verbose=2, scoring='roc_auc', n_jobs=-1, refit=True)
clf.fit(train_data, y)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits


600 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\markc\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\markc\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1762, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\markc\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

--------------------------

In [637]:
clf.best_params_

{'fit_intercept': False,
 'max_iter': 500,
 'penalty': 'l1',
 'scoring': 'roc_auc',
 'solver': 'saga'}

In [638]:
clf.best_score_

0.8436105263998904

In [639]:
logcv = LogisticRegressionCV(fit_intercept=False, max_iter=500, penalty='l1', scoring='roc_auc', solver='saga')
logcv.fit(X_train, y_train)

In [408]:
y_pred = logcv.predict(X_test)

In [409]:
roc_auc_score(y_test, y_pred)

0.7139592323415853

**GradientBoostingClassifier**

In [216]:
params = {'loss': ['log_loss', 'exponential'],
          'learning_rate': np.arange(0.1, 1.1, 0.1),
          "min_samples_split": range(2, 51),
          "min_samples_leaf": range(1, 51),
          'n_estimators': [100, 250, 500, 750, 1000]}
clf = GridSearchCV(GradientBoostingClassifier(), params, cv=10, scoring='roc_auc', verbose=2, n_jobs=-1, refit=True)
clf.fit(X_train, y_train)

Fitting 10 folds for each of 245000 candidates, totalling 2450000 fits


KeyboardInterrupt: 

In [None]:
clf.best_params_

In [None]:
clf.best_score_

In [55]:
grad = GradientBoostingClassifier(learning_rate=0.1, loss='log_loss', n_estimators=100)
grad.fit(X_train, y_train)

In [56]:
y_pred = grad.predict(X_test)

In [57]:
roc_auc_score(y_test, y_pred)

0.6919977746670973

**RandomForest**

In [640]:
params = {'criterion': ['gini', 'entropy', 'log_loss'],
          'min_samples_split': [10, 20],
          'min_samples_leaf': [10, 20],
          'n_estimators': [100, 500, 1000]}
clf = GridSearchCV(RandomForestClassifier(), params, cv=5, scoring='roc_auc', verbose=2, n_jobs=-1, refit=True)
clf.fit(train_data, y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [641]:
clf.best_params_

{'criterion': 'entropy',
 'min_samples_leaf': 20,
 'min_samples_split': 10,
 'n_estimators': 500}

In [642]:
clf.best_score_

0.8446239276743659

In [643]:
rnd = RandomForestClassifier(criterion='entropy', n_estimators=500, min_samples_leaf=20, min_samples_split=10)
rnd.fit(X_train, y_train)

In [414]:
y_pred = rnd.predict(X_test)

In [415]:
roc_auc_score(y_test, y_pred)

0.6876750700280113

**CatBoostClassifier**

In [644]:
from catboost import CatBoostClassifier, Pool

In [645]:
train_data_2 = pd.concat([df[num_cols], df[cat_cols]], axis=1)

In [646]:
train_data_2

Unnamed: 0,ClientPeriod,MonthlySpending,TotalSpent,Sex,IsSeniorCitizen,HasPartner,HasChild,HasPhoneService,HasMultiplePhoneNumbers,HasInternetService,HasOnlineSecurityService,HasOnlineBackup,HasDeviceProtection,HasTechSupportAccess,HasOnlineTV,HasMovieSubscription,HasContractPhone,IsBillingPaperless,PaymentMethod
0,55,19.50,1026.35,Male,0,Yes,Yes,Yes,No,No,No,No,No,No,No,No,One year,No,Mailed check
1,72,25.85,1872.20,Male,0,Yes,No,Yes,Yes,No,No,No,No,No,No,No,Two year,No,Credit card (automatic)
2,1,75.90,75.90,Male,0,No,No,Yes,No,Fiber optic,No,No,No,Yes,No,No,Month-to-month,Yes,Electronic check
3,32,79.30,2570.00,Female,1,Yes,No,Yes,Yes,Fiber optic,No,No,Yes,No,No,No,Month-to-month,No,Mailed check
4,60,115.25,6758.45,Female,0,Yes,Yes,Yes,Yes,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,No,Credit card (automatic)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5236,3,30.40,82.15,Male,0,No,No,No,No,DSL,No,No,No,Yes,No,No,Month-to-month,No,Electronic check
5237,50,44.45,2188.45,Male,0,Yes,No,No,No,DSL,Yes,No,No,Yes,Yes,No,One year,Yes,Bank transfer (automatic)
5238,1,55.05,55.05,Male,0,No,No,Yes,No,DSL,No,No,Yes,Yes,No,No,Month-to-month,No,Mailed check
5239,29,76.00,2215.25,Female,0,No,No,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Credit card (automatic)


In [536]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(train_data_2, y, test_size=0.1, stratify=y)

In [647]:
model = CatBoostClassifier(iterations=250,
                           l2_leaf_reg=10,
                           grow_policy='Lossguide',
                           learning_rate=0.05,
                           depth=4,
                           eval_metric="AUC:hints=skip_train~false",
                           metric_period=100,
                           min_data_in_leaf=100,
                           random_seed=0,
                           random_strength=5,
                           subsample=0.6,
                           cat_features=cat_cols)
model_2 = CatBoostClassifier(iterations=500,
                           l2_leaf_reg=12,
                           grow_policy='Lossguide',
                           learning_rate=0.04,
                           depth=4,
                           eval_metric="AUC:hints=skip_train~false",
                           metric_period=100,
                           min_data_in_leaf=100,
                           random_seed=0,
                           random_strength=5,
                           subsample=0.6,
                           cat_features=cat_cols)
model_3 = CatBoostClassifier(iterations=300,
                           l2_leaf_reg=10,
                           grow_policy='Lossguide',
                           learning_rate=0.09,
                           depth=4,
                           eval_metric="AUC:hints=skip_train~false",
                           metric_period=100,
                           min_data_in_leaf=100,
                           random_seed=0,
                           random_strength=5,
                           subsample=0.6,
                           cat_features=cat_cols)

In [648]:
model.fit(train_data, y, cat_features=cat_cols)

0:	learn: 0.7518630	total: 40.2ms	remaining: 10s
100:	learn: 0.8486600	total: 3.3s	remaining: 4.87s
200:	learn: 0.8563990	total: 6.78s	remaining: 1.65s
249:	learn: 0.8628204	total: 8.51s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1decf7f7760>

In [565]:
model_2.fit(train_data_2, y, cat_features=cat_cols)

0:	learn: 0.7518630	total: 53ms	remaining: 26.5s
100:	learn: 0.8482104	total: 4.3s	remaining: 17s
200:	learn: 0.8538858	total: 8.09s	remaining: 12s
300:	learn: 0.8626933	total: 11.3s	remaining: 7.47s
400:	learn: 0.8704537	total: 14.4s	remaining: 3.54s
499:	learn: 0.8755049	total: 17.9s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1dececa4cd0>

In [566]:
model_3.fit(train_data_2, y, cat_features=cat_cols)

0:	learn: 0.7518630	total: 46.4ms	remaining: 13.9s
100:	learn: 0.8563251	total: 4.22s	remaining: 8.31s
200:	learn: 0.8730877	total: 8.03s	remaining: 3.96s
299:	learn: 0.8832304	total: 11.7s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1decee37eb0>

In [649]:
model.best_score_

{'learn': {'Logloss': 0.39130713175687537, 'AUC': 0.8628204291437691}}

In [568]:
model_2.best_score_

{'learn': {'Logloss': 0.3758045504104031, 'AUC': 0.8755048772546821}}

In [569]:
model_3.best_score_

{'learn': {'Logloss': 0.36628762452449753, 'AUC': 0.8832303909924282}}

In [570]:
preds_class_1 = model.predict(X_test_2)
preds_class_2 = model_2.predict(X_test_2)
preds_class_3 = model_3.predict(X_test_2)

In [571]:
print(roc_auc_score(y_test_2, preds_class), roc_auc_score(y_test_2, preds_class_2), roc_auc_score(y_test_2, preds_class_3))

0.7382806593074247 0.7322130651746559 0.7445278239830637


In [552]:
test_data_2 = pd.concat([df_test[num_cols], df_test[cat_cols]], axis=1)

In [553]:
test_data_2

Unnamed: 0,ClientPeriod,MonthlySpending,TotalSpent,Sex,IsSeniorCitizen,HasPartner,HasChild,HasPhoneService,HasMultiplePhoneNumbers,HasInternetService,HasOnlineSecurityService,HasOnlineBackup,HasDeviceProtection,HasTechSupportAccess,HasOnlineTV,HasMovieSubscription,HasContractPhone,IsBillingPaperless,PaymentMethod
0,42,56.10,2386.85,Male,0,Yes,No,Yes,Yes,DSL,Yes,No,No,No,No,No,One year,No,Credit card (automatic)
1,29,98.50,3004.15,Male,1,Yes,No,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check
2,9,76.25,684.85,Male,0,No,No,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check
3,67,74.00,4868.40,Female,0,Yes,Yes,Yes,Yes,DSL,Yes,Yes,No,Yes,Yes,No,Two year,No,Credit card (automatic)
4,71,20.10,1389.60,Female,0,Yes,Yes,Yes,No,No,No,No,No,No,No,No,Two year,No,Bank transfer (automatic)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1756,71,105.15,7555.00,Female,0,Yes,No,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,Two year,Yes,Bank transfer (automatic)
1757,13,20.85,272.35,Female,0,Yes,Yes,Yes,No,No,No,No,No,No,No,No,Two year,No,Mailed check
1758,2,91.45,171.45,Male,0,Yes,No,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,No,Month-to-month,Yes,Electronic check
1759,19,89.10,1620.80,Female,0,Yes,No,Yes,Yes,Fiber optic,No,Yes,No,No,No,Yes,Month-to-month,Yes,Electronic check


In [650]:
meta = CatBoostClassifier(
    logging_level='Silent',
    eval_metric="AUC:hints=skip_train~false",
    metric_period=1000,
    random_seed=0,
    grow_policy="Depthwise",
    l2_leaf_reg=1,
    learning_rate=0.08,
    max_depth=10,
    min_data_in_leaf=10,
    n_estimators=10,
    random_strength=11,
    subsample=0.1)

In [651]:
stacking = StackingClassifier(
    estimators=[
        ("model", model),
        ("logcv", logcv),
        ("rnd", rnd)
    ],
    final_estimator=meta,
    n_jobs=-1
)
stacking.fit(train_data, y)

In [581]:
stack_pred = stacking.predict(X_test_2)

In [582]:
roc_auc_score(y_test_2, stack_pred)

0.7226485709965219

In [652]:
submission = pd.read_csv('./submission.csv')

In [653]:
submission

Unnamed: 0,Id,Churn
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5
...,...,...
1756,1756,0.5
1757,1757,0.5
1758,1758,0.5
1759,1759,0.5


In [654]:
submission['Churn'] = stacking.predict_proba(test_data)[:, 1]
submission.to_csv('./my_submission_stacking_2.csv', index=False)