In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/churn-data/churn_data.csv


In [2]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_curve, confusion_matrix

In [3]:
data = pd.read_csv('../input/churn-data/churn_data.csv')
data.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
data['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [5]:
X_train, X_test, y_train, y_test = train_test_split(data, data['Exited'], random_state = 50)

In [6]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [7]:
categorical_columns = ['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']
continuous_columns = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [8]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col)),
                ('standard', StandardScaler())
            ])
    final_transformers.append((cont_col, cont_transformer))

In [9]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

### Задание №1

#### Boosting

In [10]:
model_gbc = GradientBoostingClassifier(random_state=50)

In [11]:
pipeline = Pipeline([('features', feats), ('model', model_gbc)])
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [12]:
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.54667745, 0.03412696, 0.30911914, 0.21279085, 0.07268106,
       0.08764192, 0.01234402, 0.07340383, 0.94748746, 0.0531532 ])

In [13]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
ix_gbc = np.argmax(fscore)
results_gbc = (thresholds[ix_gbc], fscore[ix_gbc], precision[ix_gbc], recall[ix_gbc])
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (results_gbc))

Best Threshold=0.312227, F-Score=0.601, Precision=0.585, Recall=0.617


#### Logistic Regression

In [14]:
model_lr = LogisticRegression(solver='lbfgs', random_state=50)

In [15]:
pipeline = Pipeline([('features', feats), ('model', model_lr)])
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

In [16]:
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.65021462, 0.10354659, 0.22985545, 0.11526717, 0.19874392,
       0.13662433, 0.02553101, 0.2783181 , 0.86177097, 0.07070902])

In [17]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
ix_lr = np.argmax(fscore)
results_lr = (thresholds[ix_lr], fscore[ix_lr], precision[ix_lr], recall[ix_lr])
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (results_lr))

Best Threshold=0.340264, F-Score=0.476, Precision=0.475, Recall=0.477


### Задание №2

In [18]:
results = pd.DataFrame([results_gbc, results_lr], columns=['Best Threshold', 'F-Score', 'Precision', 'Recall'])

results['models'] = ['GradientBoostingClassifier', 'LogisticRegression']
results = results.set_index('models')

In [19]:
results.sort_values(by='F-Score', axis=0, ascending=False)

Unnamed: 0_level_0,Best Threshold,F-Score,Precision,Recall
models,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GradientBoostingClassifier,0.312227,0.600583,0.585227,0.616766
LogisticRegression,0.340264,0.476096,0.475149,0.477046


**В результате мы видим что бустинг показывает более высокий метки. У логистической регрессии можно заметить что Recall значительно превышает Precision. 
Как следствие можно сделать следующий вывод: Recall в данном случае говорит о том что данную долю клиентов можно попытаться удержать (бустинг ~58,5%, логистическая регрессия ~47,5%) предложив например специальные условия для удержания последних.
При этом Precision показывает, что для всех клиентов, которые уходят и получают при этом спецпредложения, обратная доля клиентов удовлетворена услугой и не нуждается в удержании (бустинг ~38,3%, логистическая регрессия ~52,3%) как следствие дополнительные финансовые вливания не являются необходимыми. 
Precision в случае использования логистической регрессии достаточно низкий, а значит модель делает Positive классификаций. Исходя из полученных данных можно сделать следующий вывод, что лучше использовать бустинг, так как наша задача - как можно более точно определить количество клиентов которые нас покинут. Оптимизируя финансовые затраты на привлечение клиентов которые всё равно так уйдут.**

### Задание №3

In [20]:
# GradientBoostingClassifier

cnf_matrix_gbc = confusion_matrix(y_test, preds>thresholds[ix_gbc])
print(cnf_matrix_gbc)

[[1557  442]
 [ 218  283]]


In [21]:
print(f'TN = {cnf_matrix_gbc[0][0]}, FP = {cnf_matrix_gbc[0][1]}\nFN = {cnf_matrix_gbc[1][0]}, TP = {cnf_matrix_gbc[1][1]}')

TN = 1557, FP = 442
FN = 218, TP = 283


In [22]:
# LogisticRegression

cnf_matrix_lr = confusion_matrix(y_test, preds>thresholds[ix_lr])
print(cnf_matrix_lr)

[[1735  264]
 [ 263  238]]


In [23]:
print(f'TN = {cnf_matrix_lr[0][0]}, FP = {cnf_matrix_lr[0][1]}\nFN = {cnf_matrix_lr[1][0]}, TP = {cnf_matrix_lr[1][1]}')

TN = 1735, FP = 264
FN = 263, TP = 238


In [24]:
TP_gbc = cnf_matrix_gbc[1, 1]
FP_gbc = cnf_matrix_gbc[0, 1]
TP_lr = cnf_matrix_lr[1, 1]
FP_lr = cnf_matrix_lr[0, 1]

**Произведём расчёт по формуле: Profit=(TP*2)-(TP+FP)**

In [25]:
# GradientBoostingClassifier

profit_gbc = (TP_gbc * 2) - (TP_gbc + FP_gbc)
profit_gbc

-159

In [26]:
# LogisticRegression

profit_lr = (TP_lr * 2) - (TP_lr + FP_lr)
profit_lr

-26