In [44]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline,FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_recall_curve, confusion_matrix
import itertools

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
df = pd.read_csv("churn_data.csv")
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


Посмотрим на распределение классов:

In [3]:
df['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

Не самое плохое распределение (1 к 4)

Построим модель. Сразу же будем работать с использованием sklearn pipeline

In [4]:
#разделим данные на train/test и Удалим признак CustomerId,Exited
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Exited','CustomerId','Surname'],axis=1), df['Exited'], random_state=0)

In [5]:
X_train

Unnamed: 0,RowNumber,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
2967,2968,579,Germany,Female,39,5,117833.30,3,0,0,5831.00
700,701,750,France,Female,32,5,0.00,2,1,0,95611.47
3481,3482,729,Spain,Female,34,9,53299.96,2,1,1,42855.97
1621,1622,689,Spain,Male,38,5,75075.14,1,1,1,8651.92
800,801,605,France,Male,52,7,0.00,2,1,1,173952.50
...,...,...,...,...,...,...,...,...,...,...,...
9225,9226,594,Germany,Female,32,4,120074.97,2,1,1,162961.79
4859,4860,794,Spain,Female,22,4,114440.24,1,1,1,107753.07
3264,3265,738,France,Male,35,5,161274.05,2,1,0,181429.87
9845,9846,590,Spain,Female,38,9,0.00,2,1,1,148750.16


- Категориальные признаки закодируем с помощью OneHotEncoding
- Вещественные оставим пока как есть

In [6]:
#соберем наш простой pipeline, но нам понадобится написать класс для выбора нужного поля
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    

class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

Зададим списки признаков

In [7]:
categorical_columns = ['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']
continuous_columns = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary','RowNumber']

Создадим под каждый признак трансформер и объединим их в список.

In [8]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col)),
                ('standard', StandardScaler())
            ])
    
    final_transformers.append((cont_col, cont_transformer))

Объединим все это в единый пайплайн

In [11]:
feats = FeatureUnion(final_transformers)
feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(X_train)

array([[ 0.        ,  1.        ,  0.        , ...,  2.53503394,
        -1.64080994, -0.70217614],
       [ 1.        ,  0.        ,  0.        , ...,  0.80424154,
        -0.07927152, -1.48572191],
       [ 0.        ,  0.        ,  1.        , ...,  0.80424154,
        -0.99684012, -0.52452174],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.80424154,
         1.4133552 , -0.59952369],
       [ 0.        ,  0.        ,  1.        , ...,  0.80424154,
         0.84496184,  1.67507477],
       [ 0.        ,  1.        ,  0.        , ..., -0.92655087,
         0.32500428, -0.78339945]])

Теперь у нас есть пайплайн, который готовит признаки для моделирования.

Подготовим модели градиентного бустинга и логистической регрессии

In [38]:
pipelines = {
    "GradientBoostingClassifier": Pipeline([('features', feats),
                                            ('classifier', GradientBoostingClassifier(n_estimators=100, max_depth=1, random_state=0))]),
    'LogisticRegression':Pipeline([('features', feats),
                                   ('classifier', LogisticRegression(max_iter = 100, random_state=0))]),
    'RandomForestClassifier':Pipeline([('features', feats),
                                       ('classifier', RandomForestClassifier(random_state=42))])
}

Обучим модели

In [15]:
for pipeline in pipelines:
    pipelines[pipeline].fit(X_train, y_train)    

In [16]:
# прогнозы для тестовой выборки
preds_models = {}
for pipeline in pipelines:
    preds_models[pipeline] = pipelines[pipeline].fit(X_train, y_train).predict_proba(X_test)[:, 1] 

Перейдем от вероятностей к меткам классов подобрав порог, после которого будем считать, что объект можно отнести к классу 1 (если вероятность больше порога - размечаем объект как класс 1, если нет - класс 0).

В качестве м етрики выберем F1

In [18]:
def threshold(y_test, preds,b=1):
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    fscore = ((1+b**2) * precision * recall) / (b**2*precision + recall)
    ix = np.argmax(fscore)
    return {"threshold":thresholds[ix], 'f-Score':fscore[ix], 'precision':precision[ix], 'recall':recall[ix]}    

In [19]:
metrics_models = {}
for pred in preds_models:
    metrics_models[pred] = threshold(y_test, preds_models[pred])

Выведем таблицу с метриками и отсортируем по ['f-Score']

In [20]:
models = pd.DataFrame(metrics_models).T
models = models.sort_values(by=['f-Score'], ascending=False)
models

Unnamed: 0,threshold,f-Score,precision,recall
RandomForestClassifier,0.34,0.635338,0.609009,0.664047
GradientBoostingClassifier,0.3721,0.620908,0.671233,0.577603
LogisticRegression,0.273899,0.506689,0.441048,0.595285


Произведем оценку экономической эффективности если 1 доллар на удержание, 2 доллара - с каждого правильно классифицированного (True Positive)

In [21]:
def economic_metric(y_test,pred,threshold):
    cnf_matrix = confusion_matrix(y_test, pred > threshold)
    FP = cnf_matrix[0][1]
    TP = cnf_matrix[1][1]
    return (2-1)*TP-FP   

In [23]:
economic = []
for model_name in models['threshold'].keys():
    economic.append(economic_metric(y_test,preds_models[model_name],models.loc[model_name,'threshold']))

In [24]:
models['economic'] = economic

Выведем результат в таблице

In [25]:
models

Unnamed: 0,threshold,f-Score,precision,recall,economic
RandomForestClassifier,0.34,0.635338,0.609009,0.664047,127
GradientBoostingClassifier,0.3721,0.620908,0.671233,0.577603,149
LogisticRegression,0.273899,0.506689,0.441048,0.595285,-82


Проведем подбор гиперпараметров полученных моделей

In [39]:
params = {
    'RandomForestClassifier':{
        'classifier__max_features':[0.3, 0.5, 0.7],
        'classifier__min_samples_leaf':[1, 2, 3],
        'classifier__class_weight':[{0:1, 1:4}, 'balanced_subsample']
    },
    'LogisticRegression':{
        'classifier__penalty' : ['l1', 'l2'],
        'classifier__C' : np.logspace(-4, 4, 20)
    },
    "GradientBoostingClassifier":{
        'classifier__min_samples_split':range(200,1001,200),
        'classifier__max_depth':[1, 4, 7],
        'classifier__learning_rate':[0.1, 0.2, 0.3]
    }
}

In [41]:
%%time
new_param = {}
for pipeline_name in pipelines:
    print(pipelines[pipeline_name])
    grid = GridSearchCV(pipelines[pipeline_name],param_grid=params[pipeline_name],cv=6,refit=False)
    search = grid.fit(X_train, y_train)
    new_param[pipeline_name] = search.best_params_

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

120 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Dchudov\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Dchudov\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Dchudov\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Dchudov\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Geography',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Geography')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Geography'))])),
                                                ('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='Gender'))])),
                                                ('Tenure',
                                                 Pipeline(steps=[('selector',
           

Обучим модели с новыми параметрами

In [42]:
new_param

{'GradientBoostingClassifier': {'classifier__learning_rate': 0.1,
  'classifier__max_depth': 4,
  'classifier__min_samples_split': 200},
 'LogisticRegression': {'classifier__C': 0.03359818286283781,
  'classifier__penalty': 'l2'},
 'RandomForestClassifier': {'classifier__class_weight': 'balanced_subsample',
  'classifier__max_features': 0.3,
  'classifier__min_samples_leaf': 2}}

In [43]:
pipelines = {
    "GradientBoostingClassifier": Pipeline([('features', feats),
                                            ('classifier', GradientBoostingClassifier(learning_rate=0.1, max_depth=4, min_samples_split=200, random_state=0))]),
    'LogisticRegression':Pipeline([('features', feats),
                                   ('classifier', LogisticRegression(max_iter = 100, C=0.0335,penalty='l2', random_state=0))]),
    'RandomForestClassifier':Pipeline([('features', feats),
                                       ('classifier', RandomForestClassifier(max_features=0.3,min_samples_leaf=2,random_state=42))])
}

In [45]:
for pipeline in pipelines:
    pipelines[pipeline].fit(X_train, y_train)    

In [46]:
# прогнозы для тестовой выборки
preds_models = {}
for pipeline in pipelines:
    preds_models[pipeline] = pipelines[pipeline].fit(X_train, y_train).predict_proba(X_test)[:, 1] 

В качестве метрики выберем F1

In [47]:
def threshold(y_test, preds,b=1):
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    fscore = ((1+b**2) * precision * recall) / (b**2*precision + recall)
    ix = np.argmax(fscore)
    return {"threshold":thresholds[ix], 'f-Score':fscore[ix], 'precision':precision[ix], 'recall':recall[ix]}    

In [48]:
metrics_models = {}
for pred in preds_models:
    metrics_models[pred] = threshold(y_test, preds_models[pred])

Выведем таблицу с метриками и отсортируем по ['f-Score']

In [49]:
models = pd.DataFrame(metrics_models).T
models = models.sort_values(by=['f-Score'], ascending=False)
models

Unnamed: 0,threshold,f-Score,precision,recall
RandomForestClassifier,0.378061,0.647773,0.668058,0.628684
GradientBoostingClassifier,0.37264,0.638806,0.647177,0.630648
LogisticRegression,0.244862,0.507213,0.413366,0.656189


Произведем оценку экономической эффективности если 1 доллар на удержание, 2 доллара - с каждого правильно классифицированного (True Positive)

In [50]:
def economic_metric(y_test,pred,threshold):
    cnf_matrix = confusion_matrix(y_test, pred > threshold)
    FP = cnf_matrix[0][1]
    TP = cnf_matrix[1][1]
    return (2-1)*TP-FP   

In [51]:
economic = []
for model_name in models['threshold'].keys():
    economic.append(economic_metric(y_test,preds_models[model_name],models.loc[model_name,'threshold']))

In [52]:
models['economic'] = economic

Выведем результат в таблице

In [53]:
models

Unnamed: 0,threshold,f-Score,precision,recall,economic
RandomForestClassifier,0.378061,0.647773,0.668058,0.628684,160
GradientBoostingClassifier,0.37264,0.638806,0.647177,0.630648,145
LogisticRegression,0.244862,0.507213,0.413366,0.656189,-141


### Применим метрику  F-мера, подбирая гиперпараметр b, таким образом, что бы прибыль была максимальной

In [64]:
fvScore = {}
for pred in preds_models:
    res_threshold = {"threshold":0,"res":0,"b":0}
    b_arange = np.arange(0.1,1.1,0.1)
    for b in b_arange:
        metric = threshold(y_test, preds_models[pred],b=b)
        ix = np.argmax(metric['f-Score'])    
        cnf_matrix = confusion_matrix(y_test, preds_models[pred] > metric['threshold'])    
        FP = cnf_matrix[0][1]
        TP = cnf_matrix[1][1]
        res = economic_metric(y_test,preds_models[pred],metric['threshold'])       
        if res_threshold["res"] < res :       
            res_threshold["res"] = res
            res_threshold["threshold"] = metric['threshold']
            res_threshold["b"] = b 
            fvScore[pred] = res_threshold

In [65]:
fvScore

{'GradientBoostingClassifier': {'threshold': 0.5557712771728794,
  'res': 181,
  'b': 0.6},
 'LogisticRegression': {'threshold': 0.5136722673944952,
  'res': 35,
  'b': 0.30000000000000004},
 'RandomForestClassifier': {'threshold': 0.5799428088251617,
  'res': 178,
  'b': 0.6}}

По результатам видно, каким должен быть threshold и b, чтобы прибыль была максимальной.