1. Для нашего пайплайна (Case1) поэкспериментировать с разными моделями: 1 - бустинг, 2 - логистическая регрессия (не забудьте здесь добавить в cont_transformer стандартизацию - нормирование вещественных признаков)

In [12]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix
import itertools
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("churn_data.csv")
df = df.drop(columns=['CustomerId'])
df.head(3)

Unnamed: 0,RowNumber,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [3]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Exited']), df['Exited'], random_state=0)

In [4]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column


    def fit(self, X, y=None):
        return self


    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key


    def fit(self, X, y=None):
        return self


    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []


    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self


    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [5]:
categorical_columns = ['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']
continuous_columns = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [6]:
final_transformers = []
for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col)),
                ('scaler', StandardScaler())
            ])
    final_transformers.append((cont_col, cont_transformer))

In [7]:
feats = FeatureUnion(final_transformers)
feature_processing = Pipeline([('feats', feats)])

2. Отобрать лучшую модель по метрикам (кстати, какая по вашему мнению здесь наиболее подходящая DS-метрика)

In [None]:
pipeline_gb = Pipeline([
    ('features',feats),
    ('classifier', GradientBoostingClassifier(random_state = 42)),
])
pipeline_gb.fit(X_train, y_train)
preds = pipeline_gb.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
roc_auc = roc_auc_score(y_test, preds)
df_metrics = {
    'model': type(pipeline_gb['classifier']).__name__,
    'thresh': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc}
df_metrics = pd.DataFrame.from_dict(df_metrics, orient='index').T
df_metrics

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,GradientBoostingClassifier,0.408508,0.646121,0.703704,0.59725,0.875746


In [None]:
pipeline_lr = Pipeline([
    ('features',feats),
    ('classifier', LogisticRegression(random_state = 42)),
])
pipeline_lr.fit(X_train, y_train)
preds = pipeline_lr.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
roc_auc = roc_auc_score(y_test, preds)
df_metrics_lr = {
    'model': type(pipeline_lr['classifier']).__name__,
    'thresh': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc}
df_metrics_lr = pd.DataFrame.from_dict(df_metrics_lr, orient='index').T
df_metrics = pd.concat([df_metrics, df_metrics_lr], axis = 0).reset_index(drop=True)

In [None]:
df_metrics.sort_values('F-Score')

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
1,LogisticRegression,0.289522,0.5097,0.4624,0.56778,0.772077
0,GradientBoostingClassifier,0.408508,0.646121,0.703704,0.59725,0.875746


3. Для отобранной модели (на отложенной выборке) сделать оценку экономической эффективности при тех же вводных, как в вопросе 2 (1 доллар на привлечение, 2 доллара - с каждого правильно классифицированного (True Positive) удержанного). (подсказка) нужно посчитать FP/TP/FN/TN для выбранного оптимального порога вероятности и посчитать выручку и траты. 

In [13]:
preds = pipeline_gb.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
cnf_matrix = confusion_matrix(y_test, preds > thresholds[ix])

In [14]:
TN = cnf_matrix[0][0]
FP = cnf_matrix[0][1]
FN = cnf_matrix[1][0]
TP = cnf_matrix[1][1]
retain_sum = (FP + TP) * 1
income = TP * 2
income - retain_sum

175

4. (опционально) Провести подбор гиперпараметров лучшей модели по итогам 2-3

In [15]:
params = {
    'classifier__max_features': [0.3, 0.5, 0.7],
    'classifier__min_samples_leaf': [1, 15, 30, 50],
    'classifier__n_estimators': [50, 100, 150, 300]
}

In [16]:
%%time
grid = GridSearchCV(pipeline_gb,
                    param_grid=params,
                    cv=5,
                    scoring='recall',
                    refit=False)

search = grid.fit(X_train, y_train)
search.best_params_

CPU times: user 2min 12s, sys: 22.4 ms, total: 2min 12s
Wall time: 2min 12s


{'classifier__max_features': 0.7,
 'classifier__min_samples_leaf': 1,
 'classifier__n_estimators': 300}

In [17]:
final_pipline_gb = Pipeline([
    ('features', feats),
    ('classifier', GradientBoostingClassifier(n_estimators=300,
                                              min_samples_leaf=1,
                                              max_features=0.7,
                                              random_state=42)),
])
final_pipline_gb.fit(X_train, y_train)
preds = final_pipline_gb.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')                                                                        

Best Threshold=0.4333908349794679, F-Score=0.645, Precision=0.700, Recall=0.597


5. (опционально) Еще раз провести оценку экономической эффективности

In [18]:
preds = final_pipline_gb.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
ix = np.argmax(fscore)
cnf_matrix = confusion_matrix(y_test, preds > thresholds[ix])

In [19]:
TN = cnf_matrix[0][0]
FP = cnf_matrix[0][1]
FN = cnf_matrix[1][0]
TP = cnf_matrix[1][1]
retain_sum = (FP + TP) * 1
income = TP * 2
income - retain_sum

173