In [1]:
import random
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.metrics import roc_auc_score
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor # работает хуже
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

<h2>Preprocessing the data</h2>

In [2]:
def preprocessing(df):
    """
    Feature preprocessing for the churn data.

    Takes a pandas.DataFrame with the raw columns and returns a new
    pandas.DataFrame. The frame passed in is NOT modified, so the cell that
    calls this function can be re-run safely.

    Steps:
      1. drop identifier / location text columns;
      2. encode the binary categorical columns as 1/0;
      3. add the engineered features 'money' and 'dist';
      4. bucket 'Tenure Months' into named groups;
      5. one-hot encode the remaining categorical columns;
      6. drop the redundant "No internet service" / "No phone service"
         indicator columns produced by step 5.
    """
    # Work on a copy so the caller's frame is never mutated.
    df = df.copy()

    # Drop features that are not needed / uninformative / hurt model quality.
    cols = ['CustomerID', 'Count',
            'Country', 'State',
            'Lat Long', 'City']
    df = df.drop(columns=cols)

    # Binary categorical features -> 1/0.
    binary_maps = {
        'Gender': {'Female': 1, 'Male': 0},
        'Senior Citizen': {'Yes': 1, 'No': 0},
        'Partner': {'Yes': 1, 'No': 0},
        'Dependents': {'Yes': 1, 'No': 0},
        'Phone Service': {'Yes': 1, 'No': 0},
        'Paperless Billing': {'Yes': 1, 'No': 0},
    }
    for col, mapping in binary_maps.items():
        # Assign the result back instead of calling an in-place method on the
        # column selection: the chained in-place form is deprecated in pandas
        # and does not update the frame under copy-on-write.
        # Values outside the mapping are kept unchanged, as `replace` did.
        df[col] = df[col].map(lambda value, mapping=mapping: mapping.get(value, value))

    # Feature generation attempt.
    # 'money': total amount the customer has paid over the whole tenure.
    df['money'] = df['Monthly Charges'] * df['Tenure Months']
    # 'dist': distance from an abstract centre (lat=0, lon=0), a crude way
    # to take location into account.
    df['dist'] = np.sqrt(df['Latitude'] ** 2 + df['Longitude'] ** 2)
    df = df.drop(columns=['Latitude', 'Longitude'])

    # Split the number of tenure months into groups.
    def tenure_bucket(months):
        if months <= 10:
            return 'tenure_low'
        elif 10 < months < 20:
            return 'tenure_norm1'
        elif 20 <= months < 45:
            return 'tenure_norm2'
        elif 45 <= months < 60:
            return 'tenure_norm3'
        else:
            return 'tenure_high'

    df['Tenure Months'] = df['Tenure Months'].apply(tenure_bucket)

    # One-hot encoding of the categorical features.
    # 'Paperless Billing' is intentionally left in this list (it was already
    # encoded as 1/0 above) so the produced feature set stays the same.
    cols_ohe = ['Internet Service', 'Contract',
                'Payment Method', 'Online Security',
                'Online Backup', 'Tech Support',
                'Streaming TV', 'Streaming Movies',
                'Device Protection', 'Multiple Lines',
                'Tenure Months', 'Paperless Billing']

    df = pd.get_dummies(data=df, columns=cols_ohe)

    # One-hot encoding yields many correlated, uninformative columns; drop them.
    cols_to_drop = ['Online Security_No internet service',
                    'Online Backup_No internet service',
                    'Tech Support_No internet service',
                    'Streaming TV_No internet service',
                    'Streaming Movies_No internet service',
                    'Device Protection_No internet service',
                    'Multiple Lines_No phone service']

    # A dummy column only exists when its category occurs in the input, so a
    # missing one simply means there is nothing to drop.
    df = df.drop(columns=cols_to_drop, errors='ignore')

    return df

## Model

In [3]:
# Load the training data and run the shared feature preprocessing.
df = preprocessing(pd.read_csv('kaggle_train_churn.csv'))

# Remove the columns listed below, then drop exact duplicate rows.
question_cols = ['Churn Score', 'CLTV', 'Churn Reason']
df = df.drop(columns=question_cols).drop_duplicates()

# Every column except the target is a feature. Move the target to the last
# position and rename it to 'Churn_Value'.
large_cols = [name for name in df.columns if name != 'Churn Value']
df = df[large_cols + ['Churn Value']].rename(columns={'Churn Value': 'Churn_Value'})

# Scale the features to [0, 1].
# NOTE(review): the scaler is fitted on all rows, i.e. before the
# train/validation split performed in the next cell.
scaler = MinMaxScaler()
df[large_cols] = scaler.fit_transform(df[large_cols])

In [4]:
target = 'Churn_Value'
X = df.drop(target, axis=1)
y = df[target]

# Tuned hyper-parameters for XGBoost and the random forest.
# Obvious parameters such as 'max_depth' and 'n_estimators' were tuned with
# GridSearch, the remaining ones with optuna (essentially a randomized search).

param_x = {'alpha': 8,
           'colsample_bytree': 0.5,
           'eta': 0.5,
           'gamma': 119,
           'max_delta_step': 9,
           'max_depth': 6,
           'min_child_weight': 2,
           'n_estimators': 967,
           'refresh_leaf': 1,
           'scale_pos_weight': 77,
           'subsample': 1.0,
           'random_state': 42,
           'use_label_encoder': False}

params = {'bootstrap': True,  # a real bool: the string 'True' is rejected by scikit-learn's parameter validation
          'criterion': 'entropy',
          'max_depth': 3625,
          'max_features': 'sqrt',
          'max_leaf_nodes': 128,
          'min_samples_leaf': 5,
          'min_samples_split': 8,
          'n_estimators': 441,
          'random_state': 0}

model_1 = RandomForestClassifier(**params)
model_x = XGBClassifier(**param_x)
model_lgbm = LGBMClassifier(max_depth=7, n_estimators=70)

# Single-step pipelines: fit parameters are routed with the "classifier__" prefix.
model_2 = Pipeline(steps=[("classifier", model_x)])
model_3 = Pipeline(steps=[("classifier", model_lgbm)])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The random forest is trained on plain arrays (no feature names).
model_1.fit(X_train.values, y_train.values)

# The hold-out split doubles as the early-stopping set for XGBoost.
# NOTE(review): newer xgboost releases expect eval_metric /
# early_stopping_rounds on the estimator constructor rather than in fit() -
# confirm against the installed version.
model_2.fit(X_train, y_train,
            classifier__eval_set=[(X_test, y_test)],
            classifier__eval_metric=["auc"],
            classifier__verbose=100,
            classifier__early_stopping_rounds=100)

model_3.fit(X_train, y_train,
            classifier__eval_set=[(X_test, y_test)],
            classifier__eval_metric=["auc"])

preds_1 = model_1.predict_proba(X_test.values)
preds_2 = model_2.predict_proba(X_test)
preds_3 = model_3.predict_proba(X_test)

# Hold-out ROC AUC of the random forest (displayed as the cell output).
roc_auc_score(y_test, preds_1[:, 1])

[0]	validation_0-auc:0.74848
[100]	validation_0-auc:0.85014
[178]	validation_0-auc:0.84787
[1]	valid_0's auc: 0.837406	valid_0's binary_logloss: 0.550292
[2]	valid_0's auc: 0.83935	valid_0's binary_logloss: 0.5277
[3]	valid_0's auc: 0.841711	valid_0's binary_logloss: 0.508793
[4]	valid_0's auc: 0.843665	valid_0's binary_logloss: 0.49383
[5]	valid_0's auc: 0.845363	valid_0's binary_logloss: 0.481146
[6]	valid_0's auc: 0.849152	valid_0's binary_logloss: 0.469871
[7]	valid_0's auc: 0.848237	valid_0's binary_logloss: 0.461252
[8]	valid_0's auc: 0.849533	valid_0's binary_logloss: 0.453353
[9]	valid_0's auc: 0.84973	valid_0's binary_logloss: 0.447009
[10]	valid_0's auc: 0.850394	valid_0's binary_logloss: 0.441622
[11]	valid_0's auc: 0.851054	valid_0's binary_logloss: 0.436574
[12]	valid_0's auc: 0.852076	valid_0's binary_logloss: 0.431921
[13]	valid_0's auc: 0.852156	valid_0's binary_logloss: 0.42846
[14]	valid_0's auc: 0.851704	valid_0's binary_logloss: 0.42567
[15]	valid_0's auc: 0.851629	

0.8540188639621624

In [5]:
# Load the competition test set and apply the same preprocessing and the
# scaler fitted on the training data.
df_test = pd.read_csv('kaggle_test_churn.csv')
df_test = preprocessing(df_test)
df_test = scaler.transform(df_test)  # from here on df_test is a numpy array

# Class probabilities from each of the three fitted models.
preds1 = np.array(model_1.predict_proba(df_test))
preds2 = np.array(model_2.predict_proba(df_test))
preds3 = np.array(model_3.predict_proba(df_test))

# Blend the models; the weights were chosen by hand.
preds = preds1*0.4 + preds2*0.3 + preds3*0.3

# Ids are 0..n-1, derived from the number of predictions instead of a
# hard-coded row count, so the cell works for a test file of any size.
test_submission_4 = pd.DataFrame(data={'Id': list(range(len(preds))),
                                       'Predicted': preds[:, 1]})

In [8]:
#compression_opts = dict(method='zip',
#                        archive_name='final4.csv')  
#test_submission_4.to_csv('final4.zip', index=False,
#          compression=compression_opts)  

In [9]:
# Alternative hyper-parameter sets that were tried (they performed worse).
# NOTE: this re-binds `param_x`, which is also defined in the training cell
# above; re-running that cell after this one uses the dict below.
param_x = {'alpha': 8,
           'colsample_bytree': 0.5,
           'eta': 0.5,
           'gamma': 119,
           'max_delta_step': 9,
           'max_depth': 6,
           'min_child_weight': 2,
           'n_estimators': 967,
           'refresh_leaf': 1,
           'scale_pos_weight': 77,
           'subsample': 1.0,
           'random_state': 42}

params_x = {'alpha': 10,
            'colsample_bytree': 0.5,
            'eta': 0.1,
            'gamma': 76,
            'max_delta_step': 2,
            'max_depth': 13,
            'min_child_weight': 16,
            'n_estimators': 602,
            'refresh_leaf': 0,
            'scale_pos_weight': 34,
            'subsample': 1.0}

# Random-forest variant.
param = {'bootstrap': False,  # a real bool: the string 'False' is truthy and would mean the opposite
         'criterion': 'entropy',
         'max_depth': 4153,
         'max_features': 'sqrt',  # same meaning as the removed 'auto' alias for classifiers
         'max_leaf_nodes': 1410,
         'min_samples_leaf': 8,
         'min_samples_split': 4,
         'n_estimators': 88,
         'random_state': 2}
