In [79]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import warnings
from sklearn.metrics import roc_auc_score

warnings.filterwarnings('ignore')

train = pd.read_csv('train.csv')

Ссылка на контест:
https://www.kaggle.com/c/advanced-dls-spring-2021/

### Обработка данных

In [80]:
col = ['Sex',
       'IsSeniorCitizen', 'HasPartner', 'HasChild', 'HasPhoneService',
       'HasMultiplePhoneNumbers', 'HasInternetService',
       'HasOnlineSecurityService', 'HasOnlineBackup', 'HasDeviceProtection',
       'HasTechSupportAccess', 'HasOnlineTV', 'HasMovieSubscription',
       'HasContractPhone', 'IsBillingPaperless', 'PaymentMethod']

In [81]:
# Отчищаем TotalSpent от пустых значений
train.TotalSpent.replace(' ', '0', inplace=True)

In [82]:
# Определяем тестовые и тренировочные данные и правим их с помощью one-hot кодирования
X = pd.get_dummies(train, columns=col).drop(columns='Churn')
y = train.Churn

In [83]:
# Переводим числовые данные в float
X.ClientPeriod = X.ClientPeriod.apply(lambda x: float(x))
X.MonthlySpending = X.MonthlySpending.apply(lambda x: float(x))
X.TotalSpent = X.TotalSpent.apply(lambda x: float(x))

In [84]:
# Нормируем числовые признаки (первые 3 колонки)
X.ClientPeriod = (X.ClientPeriod - X.ClientPeriod.min()) / (X.ClientPeriod.max() - X.ClientPeriod.min())
X.MonthlySpending = (X.MonthlySpending - X.MonthlySpending.min()) / (X.MonthlySpending.max() - X.MonthlySpending.min())
X.TotalSpent = (X.TotalSpent - X.TotalSpent.min()) / (X.TotalSpent.max() - X.TotalSpent.min())

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Создание лучшей модели

In [30]:
# Создание модели случайного леса 
clf = RandomForestClassifier()

params = {'n_estimators':range(5, 15, 2),
          'max_depth':range(5, 25, 5), 
          'min_samples_leaf':range(5, 15, 2), 
          'min_samples_split':range(0, 5)}


rf = GridSearchCV(clf, params, n_jobs=-1)

In [31]:
rf.fit(X_train, y_train)

In [226]:
rf.best_params_

{'max_depth': 10,
 'min_samples_leaf': 9,
 'min_samples_split': 4,
 'n_estimators': 11}

In [86]:
clf = RandomForestClassifier(max_depth=10, min_samples_leaf=9, min_samples_split=7, n_estimators=10)
clf.fit(X_train, y_train)
y_predictions = clf.predict(X_test)

In [87]:
clf.score(X_test, y_test)

0.7776726584673604

In [88]:
roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

0.8160180456405094

### Пробуем логистическую регрессию

In [40]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline
from catboost import CatBoostClassifier

In [242]:
clf = LogisticRegressionCV(scoring='roc_auc')

params = {'penalty': ['l1', 'l2'], 
          'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga']}

cv = GridSearchCV(clf, 
                  param_grid=params,
                  scoring='roc_auc',
                  n_jobs=-1)

cv.fit(X_train, y_train)

GridSearchCV(estimator=LogisticRegressionCV(scoring='roc_auc'), n_jobs=-1,
             param_grid={'penalty': ['l1', 'l2'],
                         'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag',
                                    'saga']},
             scoring='roc_auc')

In [41]:
# Используем лучшую модель

clf = LogisticRegressionCV(scoring='roc_auc', penalty='l2', solver='saga')
clf.fit(X_train, y_train)

LogisticRegressionCV(scoring='roc_auc', solver='newton-cg')

In [42]:
y_predictions = clf.predict(X_test)

In [43]:
clf.score(X_test, y_test)

0.8267793980476723

In [44]:
roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

0.8267793980476723

### Catboost

In [96]:
X = train.drop(columns='Churn')
y = train.Churn

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

catb_clf = CatBoostClassifier()
params = {'iterations':range(500, 1000, 100),
          'depth':range(0, 10, 2)}

cv_catb_clf = GridSearchCV(catb_clf, 
                           param_grid=params,
                           scoring='roc_auc',
                           n_jobs=-1)

In [97]:
cv_catb_clf.fit(X_train, y_train, cat_features=col)

Learning rate set to 0.02339
0:	learn: 0.6817747	total: 144ms	remaining: 1m 55s
1:	learn: 0.6719308	total: 150ms	remaining: 59.7s
2:	learn: 0.6626070	total: 155ms	remaining: 41.1s
3:	learn: 0.6529411	total: 159ms	remaining: 31.6s
4:	learn: 0.6435522	total: 165ms	remaining: 26.2s
5:	learn: 0.6344714	total: 170ms	remaining: 22.5s
6:	learn: 0.6272110	total: 175ms	remaining: 19.8s
7:	learn: 0.6199135	total: 180ms	remaining: 17.9s
8:	learn: 0.6123089	total: 186ms	remaining: 16.3s
9:	learn: 0.6049571	total: 192ms	remaining: 15.1s
10:	learn: 0.5977214	total: 197ms	remaining: 14.1s
11:	learn: 0.5908358	total: 202ms	remaining: 13.3s
12:	learn: 0.5855471	total: 208ms	remaining: 12.6s
13:	learn: 0.5794476	total: 213ms	remaining: 12s
14:	learn: 0.5739165	total: 219ms	remaining: 11.4s
15:	learn: 0.5684369	total: 224ms	remaining: 11s
16:	learn: 0.5641962	total: 229ms	remaining: 10.6s
17:	learn: 0.5604096	total: 234ms	remaining: 10.2s
18:	learn: 0.5554119	total: 239ms	remaining: 9.83s
19:	learn: 0.55

187:	learn: 0.4090621	total: 1.09s	remaining: 3.56s
188:	learn: 0.4089819	total: 1.1s	remaining: 3.56s
189:	learn: 0.4089095	total: 1.1s	remaining: 3.55s
190:	learn: 0.4089000	total: 1.11s	remaining: 3.54s
191:	learn: 0.4088071	total: 1.12s	remaining: 3.54s
192:	learn: 0.4087396	total: 1.12s	remaining: 3.53s
193:	learn: 0.4085533	total: 1.13s	remaining: 3.52s
194:	learn: 0.4084500	total: 1.14s	remaining: 3.52s
195:	learn: 0.4083785	total: 1.14s	remaining: 3.51s
196:	learn: 0.4081860	total: 1.15s	remaining: 3.51s
197:	learn: 0.4080944	total: 1.15s	remaining: 3.5s
198:	learn: 0.4080227	total: 1.16s	remaining: 3.49s
199:	learn: 0.4079611	total: 1.16s	remaining: 3.49s
200:	learn: 0.4078935	total: 1.17s	remaining: 3.48s
201:	learn: 0.4077526	total: 1.17s	remaining: 3.48s
202:	learn: 0.4076815	total: 1.18s	remaining: 3.47s
203:	learn: 0.4075799	total: 1.18s	remaining: 3.46s
204:	learn: 0.4075161	total: 1.19s	remaining: 3.45s
205:	learn: 0.4074536	total: 1.19s	remaining: 3.44s
206:	learn: 0.4

363:	learn: 0.4006365	total: 2.01s	remaining: 2.41s
364:	learn: 0.4006309	total: 2.02s	remaining: 2.41s
365:	learn: 0.4006078	total: 2.02s	remaining: 2.4s
366:	learn: 0.4005592	total: 2.03s	remaining: 2.39s
367:	learn: 0.4004894	total: 2.03s	remaining: 2.39s
368:	learn: 0.4004757	total: 2.04s	remaining: 2.38s
369:	learn: 0.4004602	total: 2.04s	remaining: 2.38s
370:	learn: 0.4004354	total: 2.05s	remaining: 2.37s
371:	learn: 0.4004039	total: 2.06s	remaining: 2.37s
372:	learn: 0.4003845	total: 2.06s	remaining: 2.36s
373:	learn: 0.4003621	total: 2.07s	remaining: 2.35s
374:	learn: 0.4003593	total: 2.07s	remaining: 2.35s
375:	learn: 0.4003405	total: 2.08s	remaining: 2.34s
376:	learn: 0.4003312	total: 2.08s	remaining: 2.33s
377:	learn: 0.4003172	total: 2.09s	remaining: 2.33s
378:	learn: 0.4002565	total: 2.09s	remaining: 2.32s
379:	learn: 0.4002471	total: 2.1s	remaining: 2.32s
380:	learn: 0.4002450	total: 2.1s	remaining: 2.31s
381:	learn: 0.4001788	total: 2.11s	remaining: 2.31s
382:	learn: 0.4

537:	learn: 0.3956286	total: 2.92s	remaining: 1.42s
538:	learn: 0.3956220	total: 2.92s	remaining: 1.42s
539:	learn: 0.3955930	total: 2.93s	remaining: 1.41s
540:	learn: 0.3955855	total: 2.94s	remaining: 1.41s
541:	learn: 0.3955717	total: 2.94s	remaining: 1.4s
542:	learn: 0.3955654	total: 2.94s	remaining: 1.39s
543:	learn: 0.3955560	total: 2.95s	remaining: 1.39s
544:	learn: 0.3955524	total: 2.96s	remaining: 1.38s
545:	learn: 0.3955458	total: 2.96s	remaining: 1.38s
546:	learn: 0.3954995	total: 2.96s	remaining: 1.37s
547:	learn: 0.3954933	total: 2.97s	remaining: 1.36s
548:	learn: 0.3954874	total: 2.98s	remaining: 1.36s
549:	learn: 0.3954852	total: 2.98s	remaining: 1.35s
550:	learn: 0.3954470	total: 2.98s	remaining: 1.35s
551:	learn: 0.3953844	total: 2.99s	remaining: 1.34s
552:	learn: 0.3953715	total: 3s	remaining: 1.34s
553:	learn: 0.3953657	total: 3s	remaining: 1.33s
554:	learn: 0.3953557	total: 3s	remaining: 1.33s
555:	learn: 0.3953093	total: 3.01s	remaining: 1.32s
556:	learn: 0.3952651	

717:	learn: 0.3925258	total: 3.84s	remaining: 439ms
718:	learn: 0.3925222	total: 3.85s	remaining: 433ms
719:	learn: 0.3924915	total: 3.85s	remaining: 428ms
720:	learn: 0.3924891	total: 3.86s	remaining: 423ms
721:	learn: 0.3924849	total: 3.86s	remaining: 417ms
722:	learn: 0.3924561	total: 3.87s	remaining: 412ms
723:	learn: 0.3924537	total: 3.87s	remaining: 407ms
724:	learn: 0.3924355	total: 3.88s	remaining: 401ms
725:	learn: 0.3924331	total: 3.88s	remaining: 396ms
726:	learn: 0.3924053	total: 3.89s	remaining: 390ms
727:	learn: 0.3924047	total: 3.89s	remaining: 385ms
728:	learn: 0.3923779	total: 3.9s	remaining: 380ms
729:	learn: 0.3923740	total: 3.9s	remaining: 374ms
730:	learn: 0.3923386	total: 3.91s	remaining: 369ms
731:	learn: 0.3923302	total: 3.91s	remaining: 363ms
732:	learn: 0.3923290	total: 3.92s	remaining: 358ms
733:	learn: 0.3923218	total: 3.92s	remaining: 353ms
734:	learn: 0.3922995	total: 3.93s	remaining: 347ms
735:	learn: 0.3922989	total: 3.93s	remaining: 342ms
736:	learn: 0.

GridSearchCV(estimator=<catboost.core.CatBoostClassifier object at 0x0000026BE1CECE80>,
             n_jobs=-1,
             param_grid={'depth': range(0, 10, 2),
                         'iterations': range(500, 1000, 100)},
             scoring='roc_auc')

In [98]:
cv_catb_clf.best_params_ # {'depth': 2, 'iterations': 800}

{'depth': 2, 'iterations': 800}

In [99]:
cv_catb_clf.score(X_test, y_test)

0.8291366906474821

In [100]:
roc_auc_score(y_test, cv_catb_clf.predict_proba(X_test)[:, 1])

0.8291366906474821

### Submit

In [109]:
test = pd.read_csv('test.csv')
X = pd.get_dummies(test, columns=col)

In [110]:
# Отчищаем TotalSpent от пустых значений
X.TotalSpent.replace(' ', '0', inplace=True)
# Переводим числовые данные в float
X.ClientPeriod = X.ClientPeriod.apply(lambda x: float(x))
X.MonthlySpending = X.MonthlySpending.apply(lambda x: float(x))
X.TotalSpent = X.TotalSpent.apply(lambda x: float(x))
# Нормируем числовые признаки (первые 3 колонки)
X.ClientPeriod = (X.ClientPeriod - X.ClientPeriod.min()) / (X.ClientPeriod.max() - X.ClientPeriod.min())
X.MonthlySpending = (X.MonthlySpending - X.MonthlySpending.min()) / (X.MonthlySpending.max() - X.MonthlySpending.min())
X.TotalSpent = (X.TotalSpent - X.TotalSpent.min()) / (X.TotalSpent.max() - X.TotalSpent.min())

In [111]:
y_test_predicted = cv_catb_clf.predict_proba(test)[:, 1]

In [112]:
output = pd.DataFrame({'Id': test.index, 'Churn': y_test_predicted})
output.to_csv('submission.csv', index=False)

In [94]:
sub = pd.read_csv('submission.csv')