In [15]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import fbeta_score, precision_recall_curve, confusion_matrix

In [16]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]
    
def threshold_by_fbeta(y_test: pd.Series, y_pred: list, *, beta: int = 1) -> tuple:
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
    fbeta = ((1 + beta**2) * precision * recall) / (beta**2 * precision + recall)
    index = np.argmax(fbeta)
    return thresholds[index], fbeta[index]

In [17]:
#!wget 'https://drive.google.com/uc?export=download&id=1yIIxDfW7Wfq-wPlbsa0dFrSlD3r-Ai91' -O churn_data.csv

In [18]:
df = pd.read_csv("churn_data.csv")
X_train, X_test, y_train, y_test = train_test_split(df, df['Exited'], random_state=0)

### Задание 1

Для нашего пайплайна (Case1) поэкспериментировать с разными моделями:
- бустинг
- логистическая регрессия

#### Решение

In [19]:
cat_features = ['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']
cont_features = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']
final_transformers = []

for cat_feat in cat_features:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_feat)),
                ('ohe', OHEEncoder(key=cat_feat))
            ])
    final_transformers.append((cat_feat, cat_transformer))
    
for cont_feat in cont_features:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_feat)),
                ('scaler', MinMaxScaler())
            ])
    final_transformers.append((cont_feat, cont_transformer))

feats = FeatureUnion(final_transformers)

classifiers = [
    GradientBoostingClassifier(random_state=42),
    LogisticRegression(random_state=42)
]

pipelines = []
metrics = {'threshold': [], 'fbeta': []}



In [20]:
for clf in classifiers:
    pipeline = Pipeline([
        ('features', feats),
        ('classifier', clf),
    ])
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict_proba(X_test)[:, 1]
    threshold, fbeta_score = threshold_by_fbeta(y_test, y_pred, beta=2)
    
    metrics['threshold'].append(threshold)
    metrics['fbeta'].append(fbeta_score)
    pipelines.append(pipeline)

### Задание 2

Отобрать лучшую модель по метрикам (какая по вашему мнению здесь наиболее подходящая ML-метрика)

#### Решение

In [21]:
metrics_total = pd.DataFrame(metrics, index=[clf.__class__.__name__ for clf in classifiers])
metrics_total.sort_values('fbeta', ascending=False)

Unnamed: 0,threshold,fbeta
GradientBoostingClassifier,0.154589,0.717822
LogisticRegression,0.135475,0.636624


Наиболее подходящим оказался градиентный бустинг

### Задание 3

Для отобранной модели (на отложенной выборке) сделать оценку экономической эффективности при тех же вводных, как в вопросе 2:
- 1 доллар на удержание
- 2 доллара - с каждого правильно классифицированного (True Positive)

#### Решение

In [24]:
y_pred_best = pipelines[0].predict_proba(X_test)[:, 1]
_, _, thresholds = precision_recall_curve(y_test, y_pred)

def econ_calc(y_pred_probas, thresholds):
    profits = []
    expenses = []
    for thrld in thresholds:
        preds = (y_pred_best >= thrld).astype(bool)
        conf = confusion_matrix(y_test, preds)
        TP = conf[1][1]
        FP = conf[0][1]
        profits.append(TP * 2)
        expenses.append(FP * -1)

    results = pd.DataFrame({'threshold': thresholds, 'profit': profits, 'expense': expenses})
    results['total'] = results['profit'] + results['expense']
    return results


results = econ_calc(y_pred_best, thresholds)

In [25]:
results.sort_values('total', ascending=False)

Unnamed: 0,threshold,profit,expense,total
2178,0.407250,608,-128,480
2175,0.406141,608,-128,480
2179,0.407268,608,-128,480
2180,0.407311,608,-128,480
2177,0.406888,608,-128,480
...,...,...,...,...
4,0.014457,1014,-1960,-946
3,0.014286,1014,-1963,-949
2,0.013231,1016,-1983,-967
1,0.011778,1018,-1989,-971


Наиболее экономически выгодно значение 0.407250 с выгодой 480 долларов

### Задание 4

*Провести подбор гиперпараметров лучшей модели по итогам 2-3

#### Решение

### Задание 5

*Еще раз провести оценку экономической эффективности

#### Решение