Из алгоритмов машинного обучения я решил выбрать: логистическую регресию, случайный лес и градиентный бустинг.

In [1]:
import numpy as np
import pandas as pd
from sklearn import model_selection, metrics
from sklearn import pipeline as pl
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
import category_encoders as ce
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

Считаю данные.

In [2]:
churn_data = pd.read_csv('orange_small_churn_data.txt')
churn_data['churn_labels'] = pd.read_csv('orange_small_churn_labels.txt', header=None)
churn_data['churn_labels'] = churn_data['churn_labels'].apply(lambda x: 0 if x == -1 else 1)

churn_data.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230,churn_labels
0,,,,,,3052.0,,,,,...,vr93T2a,LM8l689qOp,,,fKCe,02N6s8f,xwM2aC7IdeMC0,,,0
1,,,,,,1813.0,7.0,,,,...,6hQ9lNX,LM8l689qOp,,ELof,xb3V,RAYp,55YFVY9,mj86,,0
2,,,,,,1953.0,7.0,,,,...,catzS2D,LM8l689qOp,,,FSa2,ZI9m,ib5G6X1eUxUn6,mj86,,0
3,,,,,,1533.0,7.0,,,,...,e4lqvY0,LM8l689qOp,,,xb3V,RAYp,F2FyR07IdsN7I,,,1
4,,,,,,686.0,7.0,,,,...,MAz3HNj,LM8l689qOp,,,WqMG,RAYp,F2FyR07IdsN7I,,,0


In [3]:
churn_data['churn_labels'].value_counts()

0    37024
1     2976
Name: churn_labels, dtype: int64

Отделю от данных часть на которой в последствии проверим алгоритм на переобучение.

In [4]:
data, holdout_data = model_selection.train_test_split(churn_data.to_numpy(), test_size=0.2, random_state=0,
                                                         stratify=churn_data['churn_labels'].values)
labels = data[:,-1].astype(np.int32)
holdout_labels = holdout_data[:,-1].astype(np.int32)

In [5]:
churn_holdout = pd.DataFrame(holdout_data)
churn_holdout.to_csv('holdout_data.csv',index=False)

churn_holdout_test = pd.read_csv('holdout_data.csv')
churn_holdout_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,221,222,223,224,225,226,227,228,229,230
0,,,,0.0,,,,,,,...,76DJixu,LM8l689qOp,,,szEZ,RAYp,F2FyR07IdsN7I,,,0
1,,,,,,1134.0,7.0,,,,...,dLSJu87,LM8l689qOp,,,Xa3G,6fzt,F2FcTt7IdMT_v,,,0
2,,,,,,3059.0,21.0,,,,...,C7Jqqb8,LM8l689qOp,,ELof,Aoh3,RAYp,F2FyR07IdsN7I,am7c,,0
3,,,,,,945.0,14.0,,,,...,catzS2D,M_8D,,ELof,Xa3G,ZI9m,ib5G6X1eUxUn6,am7c,,0
4,,,,,,749.0,7.0,,,,...,eIi2qo0,LM8l689qOp,,,PM2D,RAYp,F2FyR07IdsN7I,,,0


Отброшу признаки, значения которых на всех объектах nan, признаки с неуникальными значениями, и признаки линейнозависящие от других признаков. Далее разобью признаки на числовые и категориальные.

In [9]:
indices = np.arange(230)
indices = indices[~(churn_data.iloc[:,:-1].isnull().all() + churn_data.iloc[:,:-1].nunique() == 1)]
line_dependent = np.array([65,155,90,147,127,104,221,213])
indices = np.setdiff1d(indices, line_dependent)

In [10]:
def split_indices_of_feature(indices):
    """ Split indices on indices of numerical features and indices of categorial features.
    
        Keyword arguments:
            indices -- 1-d array
        Returns:
            numeric indices -- 1-d array, consisting of indexes of numerical features.
            categoric indices -- 1-d array, consisting of indexes of categorical features.
    """
    return (indices[indices < 190], indices[indices>=190])

In [11]:
num_indices, cat_indices = split_indices_of_feature(indices)

Построю pipeline для логистической регрессии.

In [12]:
log_reg = pl.Pipeline(steps=[
    ('feature_processing', pl.FeatureUnion(transformer_list=[
        #numeric
        ('numeric_variable_processing',pl.Pipeline(steps=[
            ('selecting', FunctionTransformer(lambda data: data[:,num_indices],validate=False)),
            ('imputing_nan_values', SimpleImputer(missing_values=np.nan,strategy='median',fill_value=0)),
            ('converting', FunctionTransformer(lambda data: data.astype(np.float64),validate=False)),
            ('scaling', MinMaxScaler())
        ])),
        #categorial
        ('categorial_variable_processing',pl.Pipeline(steps=[
            ('selecting', FunctionTransformer(lambda data: data[:,cat_indices],validate=False)),
            ('imputing_nan_values', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='na')),
            ('encoding', ce.CatBoostEncoder())
        ]))
    ])),
    ('classifier', LogisticRegression(solver='lbfgs',max_iter=1000, class_weight='balanced'))
])

Разобью данные 10 раз на 5 фолдов с сохранением баланса классов, посчитаю roc_auc, по roc кривой подберу оптимальный порог и вычислю все метрики на каждом фолде. Выведу среднее метрик по всем фолдам.

In [13]:
def get_cross_val_scores(estimator,data,labels):
    """ Evaluate the following merics: auc, precission, recall and f1 by Stratified 5-Folds cross-validation

        Keyword arguments:
            estimator -- estimator object implementing ‘fit’

            data -- array-like of shape (n_samples, n_features)
            The data to fit.

            labels -- array-like of shape(n_samples)
            The target variable to try to predict.
        Returns:
            auc_scores -- 1-d array of shape (50,) 
            Array of auc scores of the estimator for each run of the cross validation.
            f_scores -- 1-d array of shape (50,) 
            Array of f1 scores of the estimator for each run of the cross validation.
            pr_scores -- 1-d array of shape (50,) 
            Array of precission scores of the estimator for each run of the cross validation.
            rec_scores -- 1-d array of shape (50,) 
            Array of recall scores of the estimator for each run of the cross validation.
    """
    auc_scores = []
    thresholds = []
    pr_scores = []
    rec_scores = []
    f_scores = []
    for i in range(10):
        skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
        for train_indices, test_indices in skf.split(data, labels):
            estimator.fit(data[train_indices,:], labels[train_indices])
            auc_scores.append(metrics.roc_auc_score(labels[test_indices], 
                                        estimator.predict_proba(data[test_indices,:])[:,1]))

            fpr, tpr, thr = metrics.roc_curve(labels[test_indices], 
                                        estimator.predict_proba(data[test_indices,:])[:,1])
            norms = np.array([(np.linalg.norm([0,1] - np.array([f, t]))) for t, f in zip(tpr,fpr)])
            thresholds.append(thr[np.argmin(norms)])

            f_scores.append(metrics.f1_score(labels[test_indices],
                                   [0 if p <thr[np.argmin(norms)] else 1 
                                    for p in estimator.predict_proba(data[test_indices,:])[:,1]]))
            pr_scores.append(metrics.precision_score(labels[test_indices],
                                   [0 if p <thr[np.argmin(norms)] else 1 
                                    for p in estimator.predict_proba(data[test_indices,:])[:,1]]))
            rec_scores.append(metrics.recall_score(labels[test_indices],
                                   [0 if p <thr[np.argmin(norms)] else 1 
                                    for p in estimator.predict_proba(data[test_indices,:])[:,1]]))
    return (auc_scores, f_scores, pr_scores, rec_scores)

In [14]:
auc_scores, f_scores, pr_scores, rec_scores = get_cross_val_scores(log_reg,data,labels)
print('auc_scores = %0.4f, f_scores = %0.4f, pr_scores = %0.4f, rec_scores = %0.4f' %
     (np.mean(auc_scores), np.mean(f_scores), np.mean(pr_scores), np.mean(rec_scores)))

auc_scores = 0.6845, f_scores = 0.2057, pr_scores = 0.1226, rec_scores = 0.6416


Построю pipeline для случайного леса и выведу среднее метрик на кроссвалидации.

In [15]:
rand_forest = pl.Pipeline(steps=[
    ('feature_processing', pl.FeatureUnion(transformer_list=[
        #numeric
        ('numeric_variable_processing',pl.Pipeline(steps=[
            ('selecting', FunctionTransformer(lambda data: data[:,num_indices],validate=False)),
            ('imputing_nan_values', SimpleImputer(missing_values=np.nan,strategy='mean',fill_value=0)),
            ('converting', FunctionTransformer(lambda data: data.astype(np.float64),validate=False))
        ])),
        #categorial
        ('categorial_variable_processing',pl.Pipeline(steps=[
            ('selecting', FunctionTransformer(lambda data: data[:,cat_indices],validate=False)),
            ('imputing_nan_values', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='na')),
            ('encoding', ce.CatBoostEncoder())
        ]))
    ])),
    ('classifier', RandomForestClassifier(n_estimators=100, class_weight='balanced_subsample'))
])

In [19]:
%%time
auc_scores, f_scores, pr_scores, rec_scores = get_cross_val_scores(rand_forest,data,labels)
print('auc_scores = %0.4f, f_scores = %0.4f, pr_scores = %0.4f, rec_scores = %0.4f' %
     (np.mean(auc_scores), np.mean(f_scores), np.mean(pr_scores), np.mean(rec_scores)))

auc_scores = 0.6905, f_scores = 0.2132, pr_scores = 0.1284, rec_scores = 0.6323
Wall time: 26min 34s


Построю pipeline для градиентого бустинга и выведу среднее метрик.

In [17]:
xgb_clf = pl.Pipeline(steps=[
    ('feature_processing', pl.FeatureUnion(transformer_list=[
        #numeric
        ('numeric_variable_processing',pl.Pipeline(steps=[
            ('selecting', FunctionTransformer(lambda data: data[:,num_indices],validate=False)),
            ('imputing_nan_values', SimpleImputer(missing_values=np.nan,strategy='mean',fill_value=0)),
            ('converting', FunctionTransformer(lambda data: data.astype(np.float64),validate=False))
        ])),
        #categorial
        ('categorial_variable_processing',pl.Pipeline(steps=[
            ('selecting', FunctionTransformer(lambda data: data[:,cat_indices],validate=False)),
            ('imputing_nan_values', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='na')),
            ('encoding', ce.CountEncoder(min_group_size=0.1))
        ]))
    ])),
    ('classifier', XGBClassifier(scale_pos_weight=(len(labels)-sum(labels))/sum(labels),n_jobs=4))
])

In [20]:
auc_scores, f_scores, pr_scores, rec_scores = get_cross_val_scores(xgb_clf,data,labels)
print('auc_scores = %0.4f, f_scores = %0.4f, pr_scores = %0.4f, rec_scores = %0.4f' %
     (np.mean(auc_scores), np.mean(f_scores), np.mean(pr_scores), np.mean(rec_scores)))

auc_scores = 0.7376, f_scores = 0.2416, pr_scores = 0.1475, rec_scores = 0.6716


Лучшие показатели у классификатора в основе которого алгоритм градиентного бустинга.