In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_curve, confusion_matrix

In [2]:
data = pd.read_csv('churn_data.csv')
data.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [3]:
data['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data, data['Exited'], random_state=41)

In [5]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [6]:
categorical_columns = ['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']
continuous_columns = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [7]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col)),
                ('standard', StandardScaler())
            ])
    final_transformers.append((cont_col, cont_transformer))

In [8]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

### Задание 1

#### Бустинг

In [9]:
model_gbc = GradientBoostingClassifier(random_state=41)

In [10]:
pipeline = Pipeline([('features', feats), ('model', model_gbc)])
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [11]:
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.0206422 , 0.11632881, 0.0554799 , 0.35001087, 0.03719505,
       0.8359586 , 0.92413238, 0.12827656, 0.11757951, 0.02270388])

In [12]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
ix_gbc = np.argmax(fscore)
results_gbc = (thresholds[ix_gbc], fscore[ix_gbc], precision[ix_gbc], recall[ix_gbc])
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (results_gbc))

Best Threshold=0.324760, F-Score=0.632, Precision=0.644, Recall=0.620


#### Логистическая регрессия

In [13]:
model_lr = LogisticRegression(solver='lbfgs', random_state=41)

In [14]:
pipeline = Pipeline([('features', feats), ('model', model_lr)])
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [15]:
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.04315347, 0.23444506, 0.06441417, 0.44576147, 0.09283974,
       0.6284325 , 0.36115611, 0.23748497, 0.1849426 , 0.16899636])

In [16]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
ix_lr = np.argmax(fscore)
results_lr = (thresholds[ix_lr], fscore[ix_lr], precision[ix_lr], recall[ix_lr])
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (results_lr))

Best Threshold=0.272151, F-Score=0.515, Precision=0.465, Recall=0.577


### Задание 2

In [17]:
results = pd.DataFrame([results_gbc, results_lr], columns=['Best Threshold', 'F-Score', 'Precision', 'Recall'])

results['models'] = ['GradientBoostingClassifier', 'LogisticRegression']
results = results.set_index('models')

In [18]:
results.sort_values(by='F-Score', axis=0, ascending=False)

Unnamed: 0_level_0,Best Threshold,F-Score,Precision,Recall
models,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GradientBoostingClassifier,0.32476,0.631579,0.643725,0.619883
LogisticRegression,0.272151,0.515231,0.465409,0.576998


#### Видим, что у бустинга все метрики выше. У логистической регрессии Recall сильно выше Precision. Recall означает, что данную долю клиентов можно попытаться удержать (бустинг: ~64%, логистическая регрессия: ~46.5%), предложив специальные условия, чтобы они не ушли; при этом Precision показывает, что из всех клиентов, которые уходят и получат специальные предложения, обратная доля клиентов и так удовлетворена обсуживанием и не нуждается в удержании (бустинг: ~38%, логистическая регрессия: ~42.3%). Precision у логистической регрессии довольно низкий, соответственно, модель делает много неверных Positive классификаций. Основываясь на этих данных, лучше выбрать бустинг, так как мы хотим понять, сколько человек действительно может уйти, не предлагая специальные условия тем, кому это будет не нужно, что невыгодно.

### Задание 3

In [19]:
# GradientBoostingClassifier

cnf_matrix_gbc = confusion_matrix(y_test, preds>thresholds[ix_gbc])
print(cnf_matrix_gbc)

[[1659  328]
 [ 224  289]]


In [20]:
print(f'TN = {cnf_matrix_gbc[0][0]}, FP = {cnf_matrix_gbc[0][1]}\nFN = {cnf_matrix_gbc[1][0]}, TP = {cnf_matrix_gbc[1][1]}')

TN = 1659, FP = 328
FN = 224, TP = 289


In [21]:
# LogisticRegression

cnf_matrix_lr = confusion_matrix(y_test, preds>thresholds[ix_lr])
print(cnf_matrix_lr)

[[1647  340]
 [ 218  295]]


In [22]:
print(f'TN = {cnf_matrix_lr[0][0]}, FP = {cnf_matrix_lr[0][1]}\nFN = {cnf_matrix_lr[1][0]}, TP = {cnf_matrix_lr[1][1]}')

TN = 1647, FP = 340
FN = 218, TP = 295


In [23]:
TP_gbc = cnf_matrix_gbc[1, 1]
FP_gbc = cnf_matrix_gbc[0, 1]
TP_lr = cnf_matrix_lr[1, 1]
FP_lr = cnf_matrix_lr[0, 1]

#### Считаем по формуле: Profit = (TP * 2) - (TP + FP)

In [24]:
# GradientBoostingClassifier

profit_gbc = (TP_gbc * 2) - (TP_gbc + FP_gbc)
profit_gbc

-39

In [25]:
# LogisticRegression

profit_lr = (TP_lr * 2) - (TP_lr + FP_lr)
profit_lr

-45

#### Видим, что убытки в любом случае, но если использовать бустинг, то их меньше.