In [1]:
import re
from typing import List

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import optuna
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

tqdm.pandas()

In [2]:
# XGB
# fillna числовых колонок как средние значения по соотв колонке,
# TENURE & REGION OneHotEncoded 
# StScaler on whole dataset 
# target endocding by region and tenure

#import data
train = pd.read_csv('./data/Train_folds.zip')
test=  pd.read_csv('./data/Test.zip')
submission = pd.read_csv('./data/SampleSubmission.csv')

cat_cols = [
    'REGION',
    'TENURE',
    'TOP_PACK'
]

num_cols = [
    'MONTANT',
    'FREQUENCE_RECH',
    'REVENUE',
    'ARPU_SEGMENT',
    'FREQUENCE',
    'DATA_VOLUME',
    'ON_NET', 
    'ORANGE',
    'TIGO',
    'ZONE1',
    'ZONE2',
    'REGULARITY',
    'FREQ_TOP_PACK',
]

target = 'CHURN'

mapping = {
    'D 3-6 month': 1,
    'E 6-9 month': 2,
    'F 9-12 month': 3,
    'G 12-15 month': 4,
    'H 15-18 month': 5,
    'I 18-21 month': 6,
    'J 21-24 month': 7,
    'K > 24 month': 8,
    'OTHER': 9
}

train['TOP_PACK'] = train['TOP_PACK'].fillna('OTHER')
test['TOP_PACK'] = test['TOP_PACK'].fillna('OTHER')

train['TENURE'] = train['TENURE'].fillna('OTHER')
test['TENURE'] = test['TENURE'].fillna('OTHER')
train['TENURE'] = train['TENURE'].map(mapping)
test['TENURE'] = test['TENURE'].map(mapping)

train['REGION'] = train['REGION'].fillna('OTHER')
test['REGION'] = test['REGION'].fillna('OTHER')

for nc in tqdm(num_cols):
    mean = train[nc].mean()
    train[nc] = train[nc].fillna(mean)
    test[nc] = test[nc].fillna(mean)
    
train.shape, test.shape

  0%|          | 0/13 [00:00<?, ?it/s]

((2154048, 20), (380127, 18))

In [3]:
churn_by_tenure = pd.read_csv('./data/agg_by_tenure_churn.csv')
churn_by_tenure = churn_by_tenure.append(pd.DataFrame({'TENURE': [9], 'CHURN_mean': 0, 'CHURN_median': 0}))

train = pd.merge(train, churn_by_tenure[['TENURE', 'CHURN_mean']], left_on='TENURE', right_on='TENURE', how='left')
train = train.rename({'CHURN_mean': 'MEAN_CHURN_BY_TENURE'}, axis='columns')

test = pd.merge(test, churn_by_tenure[['TENURE', 'CHURN_mean']], left_on='TENURE', right_on='TENURE', how='left')
test = test.rename({'CHURN_mean': 'MEAN_CHURN_BY_TENURE'}, axis='columns')

train.shape, test.shape

((2154048, 21), (380127, 19))

In [4]:
churn_by_region = pd.read_csv('./data/agg_by_region_churn.csv')

vc = train[train['REGION'] == 'OTHER']['CHURN'].value_counts()
churn_by_region_mean = vc[1]/(vc[0]+vc[1])
churn_by_region = churn_by_region.append(pd.DataFrame({'REGION': ['OTHER'], 'CHURN_mean': churn_by_region_mean, 'CHURN_median': 0}))

train = pd.merge(train, churn_by_region[['REGION', 'CHURN_mean']], left_on='REGION', right_on='REGION', how='left')
train = train.rename({'CHURN_mean': 'MEAN_CHURN_BY_REGION'}, axis='columns')

test = pd.merge(test, churn_by_region[['REGION', 'CHURN_mean']], left_on='REGION', right_on='REGION', how='left')
test = test.rename({'CHURN_mean': 'MEAN_CHURN_BY_REGION'}, axis='columns')

train.shape, test.shape

((2154048, 22), (380127, 20))

In [5]:
# churn_by_top_pack = train[['TOP_PACK', 'CHURN']].groupby('TOP_PACK').agg({'CHURN': ['mean', 'median']})
# churn_by_top_pack.columns = ['_'.join(col).strip() for col in churn_by_top_pack.columns.values]
# churn_by_top_pack_mean = np.mean(train[train['TOP_PACK'] == 'OTHER']['CHURN'])
# churn_by_top_pack = churn_by_top_pack.reset_index()

# d = {
#     'TOP_PACK': ['OTHER'],
#     'CHURN_mean': [churn_by_top_pack_mean],
#     'CHURN_median': [0]
# }

# for tp in test['TOP_PACK'].unique():
#     if tp not in churn_by_top_pack.index:
#         d['TOP_PACK'].append(tp)
#         d['CHURN_mean'].append(churn_by_top_pack_mean)
#         d['CHURN_median'].append(0)
    
# churn_by_top_pack = churn_by_top_pack.append(pd.DataFrame(d))
# churn_by_top_pack.index = range(len(churn_by_top_pack))

# train = pd.merge(train, churn_by_top_pack[['TOP_PACK', 'CHURN_mean']], left_on='TOP_PACK', right_on='TOP_PACK', how='left')
# train = train.rename({'CHURN_mean': 'MEAN_CHURN_BY_TOP_PACK'}, axis='columns')

# test = pd.merge(test, churn_by_top_pack[['TOP_PACK', 'CHURN_mean']], left_on='TOP_PACK', right_on='TOP_PACK', how='left')
# test = test.rename({'CHURN_mean': 'MEAN_CHURN_BY_TOP_PACK'}, axis='columns')

# train.shape, test.shape

In [6]:
# train['TOP_PACK'] = train['TOP_PACK'].fillna('OTHER')
# test['TOP_PACK'] = test['TOP_PACK'].fillna('OTHER')

churn_by_top_pack = train[['TOP_PACK', 'CHURN']].groupby('TOP_PACK').agg({'CHURN': ['mean', 'median']})
churn_by_top_pack.columns = ['_'.join(col).strip() for col in churn_by_top_pack.columns.values]
churn_by_top_pack_mean = np.mean(train[train['TOP_PACK'] == 'OTHER']['CHURN'])
churn_by_top_pack = churn_by_top_pack.reset_index()

d = {
    'TOP_PACK': [],
    'CHURN_mean': [],
    'CHURN_median': []
}

for tp in test['TOP_PACK'].unique():
    if tp not in churn_by_top_pack['TOP_PACK'].unique():
        d['TOP_PACK'].append(tp)
        d['CHURN_mean'].append(churn_by_top_pack_mean)
        d['CHURN_median'].append(0)
    
churn_by_top_pack = churn_by_top_pack.append(pd.DataFrame(d))

train = pd.merge(train, churn_by_top_pack[['TOP_PACK', 'CHURN_mean']], left_on='TOP_PACK', right_on='TOP_PACK', how='left')
train = train.rename({'CHURN_mean': 'MEAN_CHURN_BY_TOP_PACK'}, axis='columns')

test = pd.merge(test, churn_by_top_pack[['TOP_PACK', 'CHURN_mean']], left_on='TOP_PACK', right_on='TOP_PACK', how='left')
test = test.rename({'CHURN_mean': 'MEAN_CHURN_BY_TOP_PACK'}, axis='columns')

train.shape, test.shape

((2154048, 23), (380127, 21))

In [7]:
train.head()

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,...,ZONE2,MRG,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN,kfold,MEAN_CHURN_BY_TENURE,MEAN_CHURN_BY_REGION,MEAN_CHURN_BY_TOP_PACK
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,FATICK,8,4250.0,15.0,4251.0,1417.0,17.0,4.0,388.0,...,2.0,NO,54,On net 200F=Unlimited _call24H,8.0,0,1,0.017714,0.014196,0.018976
1,00000cb4a5d760de88fecb38e2f71b7bec52e834,OTHER,6,5532.116998,11.52912,5510.810334,1836.942894,13.978141,3366.450167,277.68914,...,7.553309,NO,4,OTHER,9.272461,1,2,0.024106,0.447987,0.391899
2,00001654a9d9f96303d9969d0a4a851714a4bb57,OTHER,8,3600.0,2.0,1020.0,340.0,2.0,3366.450167,90.0,...,7.553309,NO,17,On-net 1000F=10MilF;10d,1.0,0,1,0.017714,0.447987,0.014089
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,DAKAR,8,13500.0,15.0,13502.0,4501.0,18.0,43804.0,41.0,...,7.553309,NO,62,"Data:1000F=5GB,7d",11.0,0,1,0.017714,0.019235,0.101716
4,000028d9e13a595abe061f9b58f3d76ab907850f,DAKAR,8,1000.0,1.0,985.0,328.0,1.0,3366.450167,39.0,...,7.553309,NO,11,Mixt 250F=Unlimited_call24H,2.0,0,4,0.017714,0.019235,0.042141


In [8]:
useful_cols = [
    'REGION',
    'TENURE',
    # 'MRG',  # constant
    'TOP_PACK',  # wtf column
    'MONTANT',
    'FREQUENCE_RECH',
    'REVENUE',
    'ARPU_SEGMENT',
    'FREQUENCE',
    'DATA_VOLUME',
    'ON_NET', 
    'ORANGE',
    'TIGO',
    'ZONE1',
    'ZONE2',
    'REGULARITY',
    'FREQ_TOP_PACK',
    'MEAN_CHURN_BY_TENURE',
    'MEAN_CHURN_BY_REGION',
    'MEAN_CHURN_BY_TOP_PACK'
]

for cat_col in cat_cols:
    encoder = OneHotEncoder(handle_unknown='ignore')
    unique_values = train[cat_col].unique()

    one_hot_encoded_cols = [f'{cat_col}_{i}' for i in range(len(unique_values))]
    
    ohe_df = pd.DataFrame(encoder.fit_transform(train[[cat_col]]).toarray(), columns=one_hot_encoded_cols)
    ohe_df.index = train.index
    train = train.drop(cat_col, axis=1)
    train = pd.concat([train, ohe_df], axis=1)        
    print(f'[{cat_col}] xtrain transformed')

    ohe_df = pd.DataFrame(encoder.transform(test[[cat_col]]).toarray(), columns=one_hot_encoded_cols)
    ohe_df.index = test.index
    test = test.drop(cat_col, axis=1)
    test = pd.concat([test, ohe_df], axis=1)
    print(f'[{cat_col}] xtest transformed')
    
    useful_cols += one_hot_encoded_cols
    useful_cols.remove(cat_col)
    
scaler = StandardScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

[REGION] xtrain transformed
[REGION] xtest transformed
[TENURE] xtrain transformed
[TENURE] xtest transformed
[TOP_PACK] xtrain transformed
[TOP_PACK] xtest transformed


In [9]:
poly = PolynomialFeatures(degree=3, interaction_only=True, include_bias=False)
train_poly = poly.fit_transform(train[num_cols])
test_poly = poly.fit_transform(test[num_cols])

poly_columns = [f'poly_{x.replace(" ", "__")}' for x in poly.get_feature_names(num_cols)] # [f"poly_{i}" for i in range(train_poly.shape[1])]
df_poly = pd.DataFrame(train_poly, columns=poly_columns, dtype=np.float32)
df_test_poly = pd.DataFrame(test_poly, columns=poly_columns, dtype=np.float32)

train = pd.concat([train, df_poly], axis=1)
test = pd.concat([test, df_test_poly], axis=1)

useful_cols += poly_columns

train.head()



Unnamed: 0,user_id,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,...,poly_TIGO__ZONE1__ZONE2,poly_TIGO__ZONE1__REGULARITY,poly_TIGO__ZONE1__FREQ_TOP_PACK,poly_TIGO__ZONE2__REGULARITY,poly_TIGO__ZONE2__FREQ_TOP_PACK,poly_TIGO__REGULARITY__FREQ_TOP_PACK,poly_ZONE1__ZONE2__REGULARITY,poly_ZONE1__ZONE2__FREQ_TOP_PACK,poly_ZONE1__REGULARITY__FREQ_TOP_PACK,poly_ZONE2__REGULARITY__FREQ_TOP_PACK
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,-0.2238504,0.3246513,-0.2152853,-0.2152889,0.252579,-0.3546942,0.1586511,-0.3153654,-0.5490724,...,-0.2241625,0.396786,-0.04631197,0.420789,-0.04911356,0.08693501,0.4754966,-0.0554989,0.09823759,0.1041803
1,00000cb4a5d760de88fecb38e2f71b7bec52e834,4.763779e-16,4.984585e-16,1.554209e-16,3.496977e-16,0.0,-4.7969860000000006e-17,8.175326e-17,9.068654000000001e-17,1.764598e-16,...,-0.0,-2.9261220000000003e-32,0.0,4.006599e-32,-0.0,-3.612546e-32,3.490163e-32,-0.0,-3.146902e-32,4.308902e-32
2,00001654a9d9f96303d9969d0a4a851714a4bb57,-0.3373367,-0.8913132,-0.7674215,-0.7674261,-1.00118,-4.7969860000000006e-17,-0.269938,-0.3153654,-0.4000654,...,1.294332e-32,3.0469520000000005e-17,5.4348530000000004e-17,-4.172045e-17,-7.441683e-17,-0.1751826,1.6030000000000002e-32,2.859275e-32,6.730939e-17,-9.216352000000001e-17
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,1.391147,0.3246513,1.365591,1.365761,0.336163,4.265629,-0.3404107,0.04199848,-0.5242379,...,1.696068e-32,-1.22781e-16,-1.48723e-17,1.681181e-16,2.036393e-17,-0.1474176,-4.929486e-32,-5.971025e-33,4.322517e-17,-5.918616000000001e-17
4,000028d9e13a595abe061f9b58f3d76ab907850f,-0.7912821,-0.9848489,-0.7734026,-0.773578,-1.084764,-4.7969860000000006e-17,-0.3432872,-0.4557583,1.764598e-16,...,-0.0,-2.074179e-32,-2.107411e-32,2.840073e-32,2.885577e-32,1.048379e-16,2.473999e-32,2.513637e-32,9.132468e-17,-1.250465e-16


In [10]:
sum(train.memory_usage())/1024/1024

6138.129638671875

In [11]:
def optimize_floats(df: pd.DataFrame) -> pd.DataFrame:
    floats = df.select_dtypes(include=['float64']).columns.tolist()
    df[floats] = df[floats].apply(pd.to_numeric, downcast='float')
    return df


def optimize_ints(df: pd.DataFrame) -> pd.DataFrame:
    ints = df.select_dtypes(include=['int64']).columns.tolist()
    df[ints] = df[ints].apply(pd.to_numeric, downcast='integer')
    return df


def optimize_objects(df: pd.DataFrame, datetime_features: List[str]) -> pd.DataFrame:
    for col in df.select_dtypes(include=['object']):
        if col not in datetime_features:
            num_unique_values = len(df[col].unique())
            num_total_values = len(df[col])
            if float(num_unique_values) / num_total_values < 0.5:
                df[col] = df[col].astype('category')
        else:
            df[col] = pd.to_datetime(df[col])
    return df



def optimize(df: pd.DataFrame, datetime_features: List[str] = []):
    return optimize_floats(optimize_ints(optimize_objects(df, datetime_features)))

train = optimize(train, [])

In [12]:
sum(train.memory_usage())/1024/1024

4615.922901153564

In [15]:
train.to_csv('./data/train.full.csv', index=None)

In [13]:
final_test_predictions = []
final_valid_predictions = {}

scores = []

for fold in tqdm(range(5), 'folds'):
    xtrain = train[train['kfold'] != fold][useful_cols]
    ytrain = train[train['kfold'] != fold][target]
    
    xvalid = train[train['kfold'] == fold][useful_cols]
    yvalid = train[train['kfold'] == fold][target]
    
    valid_ids = train[train['kfold'] == fold]['user_id'].values.tolist()
    
    xtest = test[useful_cols]

    model = XGBClassifier(
        n_estimators=7000,
        n_jobs=-1,
        random_state=42,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor="gpu_predictor",
#         **{
#             'learning_rate': 0.021655316351235455,
#             'reg_lambda': 1.0883078718317323e-07,
#             'reg_alpha': 0.00015120241798978777,
#             'subsample': 0.7179552032665535,
#             'colsample_bytree': 0.7408152702492675,
#             'max_depth': 7
#         }
        **{
            'learning_rate': 0.014461849398074727,
            'reg_lambda': 0.08185850904776007,
            'reg_alpha': 0.0001173486815850512,
            'subsample': 0.7675905290878289,
            'colsample_bytree': 0.2708299922996371,
            'max_depth': 7
        }
    )    
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
        
    preds_valid = model.predict_proba(xvalid)[:, 1]
    test_preds = model.predict_proba(xtest)[:, 1]
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    score = roc_auc_score(yvalid, preds_valid)
    scores.append(score)
    print(fold, score)    

print(np.mean(scores), np.std(scores))

final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_1"]
final_valid_predictions.to_csv("./data/train_pred_1.csv", index=False)

sample_submission = pd.read_csv('./data/SampleSubmission.csv')
sample_submission['CHURN'] = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_1"]
sample_submission.to_csv("./data/test_pred_1.csv", index=False)

# final_predictions = []
# scores = []

# for fold in tqdm(range(5), 'folds'):
#     xtrain = train[train['kfold'] != fold][useful_cols]
#     ytrain = train[train['kfold'] != fold][target]
    
#     xvalid = train[train['kfold'] == fold][useful_cols]
#     yvalid = train[train['kfold'] == fold][target]
    
#     xtest = test[useful_cols]

#     model = XGBClassifier(
#         n_estimators=7000,
#         n_jobs=-1,
#         random_state=42,
#         tree_method='gpu_hist',
#         gpu_id=0,
#         predictor="gpu_predictor",
# #         **{'learning_rate': 0.02981286840846979,
# #            'reg_lambda': 2.1119486166373553e-06,
# #            'reg_alpha': 0.09652271602187434,
# #            'subsample': 0.2972622031653025,
# #            'colsample_bytree': 0.3291720075373176,
# #            'max_depth': 2}
# #         **{'learning_rate': 0.03359830446697092,
# #            'reg_lambda': 0.0013493600461741606,
# #            'reg_alpha': 0.0002728448162129134,
# #            'subsample': 0.13373120583933554,
# #            'colsample_bytree': 0.1386996438938067,
# #            'max_depth': 7},
#         **{
#             'learning_rate': 0.021655316351235455,
#             'reg_lambda': 1.0883078718317323e-07,
#             'reg_alpha': 0.00015120241798978777,
#             'subsample': 0.7179552032665535,
#             'colsample_bytree': 0.7408152702492675,
#             'max_depth': 7
#         }
#     )    
#     model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    
#     preds_valid = model.predict_proba(xvalid)[:, 1]
#     test_preds = model.predict_proba(xtest)[:, 1]
#     final_predictions.append(test_preds)
#     score = roc_auc_score(yvalid, preds_valid)
#     scores.append(score)
#     print(fold, score)

# print(np.mean(scores), np.std(scores))


# 0.9314604358446612 0.000506497423655064

folds:   0%|          | 0/5 [00:00<?, ?it/s]



XGBoostError: bad allocation

In [11]:
# xtrain = train[train['kfold'] != 1][useful_cols]
# print(len(xtrain.columns), len(set(xtrain.columns)))
# xtrain.columns.to_series()[np.isinf(xtrain).any()]

557 557


Series([], dtype: object)

In [14]:
# xtrain[np.isinf(xtrain['poly_MONTANT__FREQUENCE_RECH__ZONE1'])][['MONTANT', 'FREQUENCE_RECH', 'ZONE1', 'poly_MONTANT__FREQUENCE_RECH__ZONE1']]


xtrain[np.isinf(xtrain['poly_MONTANT__REVENUE__ARPU_SEGMENT'])][['MONTANT', 'REVENUE', 'ARPU_SEGMENT', 'poly_MONTANT__REVENUE__ARPU_SEGMENT']]

Unnamed: 0,MONTANT,REVENUE,ARPU_SEGMENT,poly_MONTANT__REVENUE__ARPU_SEGMENT
1557266,44.34136,44.522786,44.522692,inf
2031710,39.190826,45.572203,45.572112,inf


In [27]:
# train[train['MEAN_CHURN_BY_TOP_PACK'].isna()][['MEAN_CHURN_BY_TOP_PACK', 'CHURN']]
train[[col for col in train.columns if not col.startswith('poly') and not col.startswith('TOP_PACK_') and not col.startswith('REGION_') and not col.startswith('TENURE_')]]

Unnamed: 0,user_id,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,MRG,REGULARITY,FREQ_TOP_PACK,CHURN,kfold,MEAN_CHURN_BY_TENURE,MEAN_CHURN_BY_REGION,MEAN_CHURN_BY_TOP_PACK
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,-2.238504e-01,3.246513e-01,-2.152853e-01,-2.152889e-01,0.252579,-3.546942e-01,1.586511e-01,-3.153654e-01,-5.490724e-01,-6.204583e-01,-6.579922e-01,NO,1.164700,-1.359412e-01,0,1,0.017714,0.014196,
1,00000cb4a5d760de88fecb38e2f71b7bec52e834,4.763779e-16,4.984585e-16,1.554209e-16,3.496977e-16,0.000000,-4.796986e-17,8.175326e-17,9.068654e-17,1.764598e-16,1.537148e-16,-2.104743e-16,NO,-1.078775,1.897739e-16,1,2,0.024106,0.447987,0.391899
2,00001654a9d9f96303d9969d0a4a851714a4bb57,-3.373367e-01,-8.913132e-01,-7.674215e-01,-7.674261e-01,-1.001180,-4.796986e-17,-2.699380e-01,-3.153654e-01,-4.000654e-01,1.537148e-16,-2.104743e-16,NO,-0.495472,-8.837738e-01,0,1,0.017714,0.447987,
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,1.391147e+00,3.246513e-01,1.365591e+00,1.365761e+00,0.336163,4.265629e+00,-3.404107e-01,4.199848e-02,-5.242379e-01,1.537148e-16,-2.104743e-16,NO,1.523656,1.845585e-01,0,1,0.017714,0.019235,
4,000028d9e13a595abe061f9b58f3d76ab907850f,-7.912821e-01,-9.848489e-01,-7.734026e-01,-7.735780e-01,-1.084764,-4.796986e-17,-3.432872e-01,-4.557583e-01,1.764598e-16,1.537148e-16,-2.104743e-16,NO,-0.764689,-7.769406e-01,0,4,0.017714,0.019235,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2154043,ffffe85215ddc71a84f95af0afb0deeea90e6967,4.763779e-16,4.984585e-16,1.554209e-16,3.496977e-16,0.000000,-4.796986e-17,8.175326e-17,9.068654e-17,1.764598e-16,1.537148e-16,-2.104743e-16,NO,-0.989036,1.897739e-16,0,2,0.017714,0.447987,0.391899
2154044,ffffeaaa9289cdba0ac000f0ab4b48f4aa74ed15,9.914917e-02,3.246513e-01,4.941878e-02,4.924485e-02,0.085411,-2.896089e-01,-3.619840e-01,-3.536544e-01,4.194734e-01,1.537148e-16,-2.104743e-16,NO,1.209569,-2.910794e-02,0,3,0.017714,0.016301,
2154045,fffff172fda1b4bb38a95385951908bb92379809,4.763779e-16,4.984585e-16,1.554209e-16,3.496977e-16,0.000000,-4.796986e-17,8.175326e-17,9.068654e-17,1.764598e-16,1.537148e-16,-2.104743e-16,NO,-1.213384,1.897739e-16,1,2,0.017714,0.447987,0.391899
2154046,fffff5911296937a37f09a37a549da2e0dad6dbb,7.800672e-01,-4.949161e-02,2.749897e-01,2.748162e-01,-0.081757,-4.796986e-17,-3.993777e-01,2.844954e-01,-2.510583e-01,1.537148e-16,-2.104743e-16,NO,-0.001907,2.913918e-01,0,4,0.017714,0.016301,


In [8]:
sample_submission.sample(7)

Unnamed: 0,id,pred_1
213209,8f97c91960e43ea41b61dbc2f82ad5db59bc639c,0.000372
24987,1095e0aaa58458d46b99fdf3b431a32fb63ec6ec,0.84448
368242,f7e411c2832c4d28fd5d9699f64f1d185232d71f,0.063214
92387,3e3b06f39df9f3fb0114c431cf4bbf1e2a08750a,0.024811
364914,f596fee72b098b7aa5f8f685e68a4f3b14e4e674,0.679964
202769,887b927dbd48c21a3d34ea92d775721de292904b,0.001774
210002,8d5fb407a19eede47ddda7ed8e0d06b9fbfe9e44,0.003534


In [14]:
preds = np.mean(np.column_stack(final_predictions), axis=1)

submission = pd.read_csv('./data/SampleSubmission.csv')
submission.CHURN = preds
submission.to_csv('./data/submission-xgb-proba-poly-features.csv', index=False)

In [14]:
final_test_predictions = []
final_valid_predictions = {}

scores = []

for fold in tqdm(range(5), 'folds'):
    xtrain = train[train['kfold'] != fold][useful_cols]
    ytrain = train[train['kfold'] != fold][target]
    
    xvalid = train[train['kfold'] == fold][useful_cols]
    yvalid = train[train['kfold'] == fold][target]

    valid_ids = train[train['kfold'] == fold]['user_id'].values.tolist()

    xtest = test[useful_cols]

    lgb_model = LGBMClassifier(
        n_estimators=7000,
        n_jobs=-1,
        random_state=42,
#         **{
#             'learning_rate': 0.03881855209002591,
#             'reg_lambda': 0.009591673857338072,
#             'reg_alpha': 0.5065599259874649,
#             'subsample': 0.4016863186957058,
#             'colsample_bytree': 0.9360889506340332,
#             'max_depth': 4
#         }
        **{
            'learning_rate': 0.029253877255476443,
            'reg_lambda': 16.09426889606859,
            'reg_alpha': 0.014354120473120952,
            'subsample': 0.43289663848783977,
            'colsample_bytree': 0.5268279718406376,
            'max_depth': 6}
    )    
    lgb_model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
        
    preds_valid = lgb_model.predict_proba(xvalid)[:, 1]
    test_preds = lgb_model.predict_proba(xtest)[:, 1]
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    score = roc_auc_score(yvalid, preds_valid)
    scores.append(score)
    print(fold, score)    

print(np.mean(scores), np.std(scores))

final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_2"]
final_valid_predictions.to_csv("./data/train_pred_2.csv", index=False)

sample_submission = pd.read_csv('./data/SampleSubmission.csv')
sample_submission['CHURN'] = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_2"]
sample_submission.to_csv("./data/test_pred_2.csv", index=False)

sample_submission.sample(7)

folds:   0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.251598
Early stopping, best iteration is:
[1123]	valid_0's binary_logloss: 0.251593
0 0.9314600527915347
Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.250533
Early stopping, best iteration is:
[907]	valid_0's binary_logloss: 0.250528
1 0.9320697780001863
Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.251623
Early stopping, best iteration is:
[1069]	valid_0's binary_logloss: 0.251622
2 0.9311173605227702
Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.250705
Early stopping, best iteration is:
[1104]	valid_0's binary_logloss: 0.250697
3 0.9317311842369478
Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.252328
Early stopping, best iteration is:
[1306]	valid_0's binary_logloss: 0.252318
4 0.93064809

Unnamed: 0,id,pred_2
7356,04c88c7588834b50fe8639f5159085f7dd78be48,0.788534
125943,54c23db79930ae0b41007780104f45c9f893c25a,0.000159
148204,63c8766c4c9a001097e8ffdede28459f74baf030,0.122185
142579,5ff52fd8fa6d81aa1de768bfb0cce25915903c90,0.000541
297126,c817ebca634476348d025999046ac7390b528b09,0.480275
51583,2293084adca3e7059b075a6b711d80fc0f4d1add,0.221469
262231,b08100163c13cab0cffc77e1ac6fdd249b1d0e2b,0.012529


In [15]:
final_test_predictions = []
final_valid_predictions = {}

scores = []

for fold in tqdm(range(5), 'folds'):
    xtrain = train[train['kfold'] != fold][useful_cols]
    ytrain = train[train['kfold'] != fold][target]
    
    xvalid = train[train['kfold'] == fold][useful_cols]
    yvalid = train[train['kfold'] == fold][target]

    valid_ids = train[train['kfold'] == fold]['user_id'].values.tolist()

    xtest = test[useful_cols]

    cb_model = CatBoostClassifier(
        n_estimators=1000,
        random_state=42,
        **{
            'objective': 'CrossEntropy',
            'colsample_bylevel': 0.054208119366927966,
            'depth': 12,
            'boosting_type': 'Ordered',
            'bootstrap_type': 'Bernoulli',
            'subsample': 0.9494580379034286
        }
    )
    cb_model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
        
    preds_valid = cb_model.predict_proba(xvalid)[:, 1]
    test_preds = cb_model.predict_proba(xtest)[:, 1]
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    score = roc_auc_score(yvalid, preds_valid)
    scores.append(score)
    print(fold, score)    

print(np.mean(scores), np.std(scores))

final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_3"]
final_valid_predictions.to_csv("./data/train_pred_3.csv", index=False)

sample_submission = pd.read_csv('./data/SampleSubmission.csv')
sample_submission['CHURN'] = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_3"]
sample_submission.to_csv("./data/test_pred_3.csv", index=False)

sample_submission.sample(7)

folds:   0%|          | 0/5 [00:00<?, ?it/s]

0:	learn: 0.6413141	test: 0.6413556	best: 0.6413556 (0)	total: 5.53s	remaining: 1h 32m 3s
999:	learn: 0.2470295	test: 0.2518194	best: 0.2518079 (819)	total: 1h 25m 38s	remaining: 0us

bestTest = 0.2518079163
bestIteration = 819

Shrink model to first 820 iterations.
0 0.9313567227677886
0:	learn: 0.6497885	test: 0.6497224	best: 0.6497224 (0)	total: 5.8s	remaining: 1h 36m 38s
999:	learn: 0.2473597	test: 0.2507459	best: 0.2507292 (763)	total: 1h 27m 8s	remaining: 0us

bestTest = 0.2507291903
bestIteration = 763

Shrink model to first 764 iterations.
1 0.9319608828638268
0:	learn: 0.6414522	test: 0.6414625	best: 0.6414625 (0)	total: 5.6s	remaining: 1h 33m 18s
999:	learn: 0.2470701	test: 0.2517396	best: 0.2517373 (945)	total: 1h 28m 15s	remaining: 0us

bestTest = 0.2517372827
bestIteration = 945

Shrink model to first 946 iterations.
2 0.931060832487163
0:	learn: 0.6425699	test: 0.6425916	best: 0.6425916 (0)	total: 5.19s	remaining: 1h 26m 23s
999:	learn: 0.2474026	test: 0.2508440	best: 0.2

Unnamed: 0,id,pred_3
132639,593e3d29f23b3274c6162349c5930b3aab6025e7,0.112868
14979,09dcefee315d30d89bb5a58425a4e181a9423fe4,0.225612
355317,ef36435c34f8511dc56447b46bf3caeef811a7af,0.01721
204181,89737acc186e4aeba72800b120000449071a05bc,0.000841
228710,9a2e2a3c322ef0f3014b270ac7d4d6072eb62fbc,0.052753
42600,1c7926e4f0a4dbaa48673887fc61e2cf44589d61,0.006899
160268,6bfffe7c80ee539ae339ea50f0dc3844117857f7,0.479539


In [16]:
final_test_predictions = []
final_valid_predictions = {}

scores = []

# del scgb_model

for fold in tqdm(range(5), 'folds'):
    xtrain = train[train['kfold'] != fold][useful_cols]
    ytrain = train[train['kfold'] != fold][target]
    
    xvalid = train[train['kfold'] == fold][useful_cols]
    yvalid = train[train['kfold'] == fold][target]

    valid_ids = train[train['kfold'] == fold]['user_id'].values.tolist()

    xtest = test[useful_cols]

    scgb_model = GradientBoostingClassifier(
        n_estimators=100,
        random_state=42, 
        verbose=1,
        max_features=0.1
#         **{
#             'objective': 'CrossEntropy',
#             'colsample_bylevel': 0.054208119366927966,
#             'depth': 12,
#             'boosting_type': 'Ordered',
#             'bootstrap_type': 'Bernoulli',
#             'subsample': 0.9494580379034286
#         }
    )
    scgb_model.fit(xtrain, ytrain)
        
    preds_valid = scgb_model.predict_proba(xvalid)[:, 1]
    test_preds = scgb_model.predict_proba(xtest)[:, 1]
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    score = roc_auc_score(yvalid, preds_valid)
    scores.append(score)
    print(fold, score)    

print(np.mean(scores), np.std(scores))

final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_4"]
final_valid_predictions.to_csv("./data/train_pred_4.csv", index=False)

sample_submission = pd.read_csv('./data/SampleSubmission.csv')
sample_submission['CHURN'] = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_4"]
sample_submission.to_csv("./data/test_pred_4.csv", index=False)

sample_submission.sample(7)

folds:   0%|          | 0/5 [00:00<?, ?it/s]

      Iter       Train Loss   Remaining Time 
         1           0.9149           27.23m
         2           0.8623           26.75m
         3           0.8289           26.15m
         4           0.7961           25.52m
         5           0.7584           24.83m
         6           0.7301           24.55m
         7           0.7116           24.48m
         8           0.6894           24.23m
         9           0.6754           23.98m
        10           0.6647           23.78m
        20           0.5677           21.48m
        30           0.5341           18.84m
        40           0.5219           16.23m
        50           0.5168           13.54m
        60           0.5136           10.85m
        70           0.5116            8.18m
        80           0.5106            5.48m
        90           0.5097            2.75m
       100           0.5091            0.00s
0 0.9298740831358883
      Iter       Train Loss   Remaining Time 
         1           0.8999     

Unnamed: 0,id,pred_4
122505,5270beb4c61914745c297cb72654420618a56ee1,0.045594
60777,28bc58bd7df7e05bffb4cb157dcca70c88d4cfe5,0.004816
25059,10a41a3ccae8ace4439bbce317f66f8711a93121,0.011132
202399,88378dd443b0ffc4f55eb59ffbdb96c48c9c1801,0.780955
139453,5de2b6a0889ca2b213257b990a44dcdd6db4d6de,0.011695
365023,f5ac491fa17cb5123d24b22129b9cd244917f8db,0.003853
153694,6791dba024eb4d8bf5f32daaabdd63b72cbeb024,0.780955


In [34]:
final_test_predictions = []
final_valid_predictions = {}

scores = []

for fold in tqdm(range(5), 'folds'):
    xtrain = train[train['kfold'] != fold][useful_cols]
    ytrain = train[train['kfold'] != fold][target]
    
    xvalid = train[train['kfold'] == fold][useful_cols]
    yvalid = train[train['kfold'] == fold][target]

    valid_ids = train[train['kfold'] == fold]['user_id'].values.tolist()

    xtest = test[useful_cols]

    rf_model = RandomForestClassifier(
        random_state=42,
        n_jobs=-1,
        verbose=1,
        **{
            'max_depth': 15,
            'max_features': 'auto',
            'class_weight': 'balanced_subsample'
        }
    )
    rf_model.fit(xtrain, ytrain)

    preds_valid = rf_model.predict_proba(xvalid)[:, 1]
    test_preds = rf_model.predict_proba(xtest)[:, 1]
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    score = roc_auc_score(yvalid, preds_valid)
    scores.append(score)
    print(fold, score)    

print(np.mean(scores), np.std(scores))

final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_5"]
final_valid_predictions.to_csv("./data/train_pred_5.csv", index=False)

sample_submission = pd.read_csv('./data/SampleSubmission.csv')
sample_submission['CHURN'] = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_5"]
sample_submission.to_csv("./data/test_pred_5.csv", index=False)

sample_submission.sample(7)

folds:   0%|          | 0/5 [00:00<?, ?it/s]

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  8.7min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.3s finished


0 0.9289267479994578


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  8.5min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.3s finished


1 0.9293812707007716


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  8.6min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.2s finished


2 0.9284704100973353


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  8.6min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.2s finished


3 0.928865915213773


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  8.5min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.2s finished


4 0.9277601361461523
0.9286808960314981 0.0005435628869343008


Unnamed: 0,id,pred_5
102110,44b7b952ba817a8bacbbd60fa60064e2396edf03,0.008579
252752,aa2f4a1cad48e2a8ca9abc0296b03f4f6e2fe20f,0.022509
353517,edfa3dde665995d9b8c488efe19398152d076770,0.001831
246933,a642c2576a67445aed490d9ee893de0d6e470889,0.005839
25671,1107a272c3c3bc6ae6111bca6721cc85f4732f87,0.639161
17048,0b3a1d566a94441a37e37e72a2b4275584243fb2,0.008164
292067,c4bc91cbef0f2a6f274e8005a33618aab640f95d,0.003425


In [4]:
df = train.copy() # pd.read_csv('./data/Train_folds.zip')
df_test = test.copy() # pd.read_csv('./data/Test.zip')
sample_submission = pd.read_csv('./data/SampleSubmission.csv')

df1 = pd.read_csv("./data/train_pred_1.csv")
df2 = pd.read_csv("./data/train_pred_2.csv")
df3 = pd.read_csv("./data/train_pred_3.csv")
df4 = pd.read_csv("./data/train_pred_4.csv")
df5 = pd.read_csv("./data/train_pred_5.csv")
df6 = pd.read_csv("./data/train_pred_6.csv")

df_test1 = pd.read_csv("./data/test_pred_1.csv")
df_test2 = pd.read_csv("./data/test_pred_2.csv")
df_test3 = pd.read_csv("./data/test_pred_3.csv")
df_test4 = pd.read_csv("./data/test_pred_4.csv")
df_test5 = pd.read_csv("./data/test_pred_5.csv")
df_test6 = pd.read_csv("./data/test_pred_6.csv")

df = df.merge(df1, left_on='user_id', right_on="id", how="left")
df = df.merge(df2, left_on='user_id', right_on="id", how="left")
df = df.merge(df3, left_on='user_id', right_on="id", how="left")
df = df.merge(df4, left_on='user_id', right_on="id", how="left")
df = df.merge(df5, left_on='user_id', right_on="id", how="left")
df = df.merge(df6, left_on='user_id', right_on="id", how="left")

df_test = df_test.merge(df_test1, left_on='user_id', right_on="id", how="left")
df_test = df_test.merge(df_test2, left_on='user_id', right_on="id", how="left")
df_test = df_test.merge(df_test3, left_on='user_id', right_on="id", how="left")
df_test = df_test.merge(df_test4, left_on='user_id', right_on="id", how="left")
df_test = df_test.merge(df_test5, left_on='user_id', right_on="id", how="left")
df_test = df_test.merge(df_test6, left_on='user_id', right_on="id", how="left")

df.head()

  return merge(


Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,...,id_y,pred_2,id_x,pred_3,id_y.1,pred_4,id_x.1,pred_5,id_y.2,pred_6
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,FATICK,8,4250.0,15.0,4251.0,1417.0,17.0,4.0,388.0,...,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,0.000734,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,0.000458,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,0.004463,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,0.008806,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,0.00021
1,00000cb4a5d760de88fecb38e2f71b7bec52e834,OTHER,6,5532.116998,11.52912,5510.810334,1836.942894,13.978141,3366.450167,277.68914,...,00000cb4a5d760de88fecb38e2f71b7bec52e834,0.620179,00000cb4a5d760de88fecb38e2f71b7bec52e834,0.627454,00000cb4a5d760de88fecb38e2f71b7bec52e834,0.607547,00000cb4a5d760de88fecb38e2f71b7bec52e834,0.879773,00000cb4a5d760de88fecb38e2f71b7bec52e834,0.632568
2,00001654a9d9f96303d9969d0a4a851714a4bb57,OTHER,8,3600.0,2.0,1020.0,340.0,2.0,3366.450167,90.0,...,00001654a9d9f96303d9969d0a4a851714a4bb57,0.148125,00001654a9d9f96303d9969d0a4a851714a4bb57,0.136944,00001654a9d9f96303d9969d0a4a851714a4bb57,0.132337,00001654a9d9f96303d9969d0a4a851714a4bb57,0.30599,00001654a9d9f96303d9969d0a4a851714a4bb57,0.146315
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,DAKAR,8,13500.0,15.0,13502.0,4501.0,18.0,43804.0,41.0,...,00001dd6fa45f7ba044bd5d84937be464ce78ac2,0.000508,00001dd6fa45f7ba044bd5d84937be464ce78ac2,0.000498,00001dd6fa45f7ba044bd5d84937be464ce78ac2,0.004372,00001dd6fa45f7ba044bd5d84937be464ce78ac2,0.005401,00001dd6fa45f7ba044bd5d84937be464ce78ac2,0.001037
4,000028d9e13a595abe061f9b58f3d76ab907850f,DAKAR,8,1000.0,1.0,985.0,328.0,1.0,3366.450167,39.0,...,000028d9e13a595abe061f9b58f3d76ab907850f,0.025337,000028d9e13a595abe061f9b58f3d76ab907850f,0.021932,000028d9e13a595abe061f9b58f3d76ab907850f,0.021137,000028d9e13a595abe061f9b58f3d76ab907850f,0.116135,000028d9e13a595abe061f9b58f3d76ab907850f,0.02321


In [5]:
df_test.head()

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,...,id_y,pred_2,id_x,pred_3,id_y.1,pred_4,id_x.1,pred_5,id_y.2,pred_6
0,00001dbe00e56fc4b1c1b65dda63de2a5ece55f9,THIES,8,5000.0,5.0,5000.0,1667.0,5.0,3366.450167,378.0,...,00001dbe00e56fc4b1c1b65dda63de2a5ece55f9,0.001635,00001dbe00e56fc4b1c1b65dda63de2a5ece55f9,0.001021,00001dbe00e56fc4b1c1b65dda63de2a5ece55f9,0.003405,00001dbe00e56fc4b1c1b65dda63de2a5ece55f9,0.011723,00001dbe00e56fc4b1c1b65dda63de2a5ece55f9,0.001199
1,000055d41c8a62052dd426592e8a4a3342bf565d,OTHER,6,300.0,2.0,326.0,109.0,3.0,397.0,277.68914,...,000055d41c8a62052dd426592e8a4a3342bf565d,0.072572,000055d41c8a62052dd426592e8a4a3342bf565d,0.079101,000055d41c8a62052dd426592e8a4a3342bf565d,0.052962,000055d41c8a62052dd426592e8a4a3342bf565d,0.169566,000055d41c8a62052dd426592e8a4a3342bf565d,0.06925
2,000081dd3245e6869a4a9c574c7050e7bb84c2c8,DAKAR,8,3300.0,25.0,3400.0,1133.0,26.0,7150.0,0.0,...,000081dd3245e6869a4a9c574c7050e7bb84c2c8,0.000761,000081dd3245e6869a4a9c574c7050e7bb84c2c8,0.000418,000081dd3245e6869a4a9c574c7050e7bb84c2c8,0.004775,000081dd3245e6869a4a9c574c7050e7bb84c2c8,0.0205,000081dd3245e6869a4a9c574c7050e7bb84c2c8,0.000566
3,0000b76d2145d9445d9ff6b65c9ebc4196c89337,OTHER,8,5532.116998,11.52912,5510.810334,1836.942894,13.978141,3366.450167,277.68914,...,0000b76d2145d9445d9ff6b65c9ebc4196c89337,0.395094,0000b76d2145d9445d9ff6b65c9ebc4196c89337,0.394372,0000b76d2145d9445d9ff6b65c9ebc4196c89337,0.410321,0000b76d2145d9445d9ff6b65c9ebc4196c89337,0.770143,0000b76d2145d9445d9ff6b65c9ebc4196c89337,0.403103
4,0000bae5480628cf8fe51ad84bcb39772fc79224,OTHER,8,5532.116998,11.52912,5510.810334,1836.942894,13.978141,3366.450167,277.68914,...,0000bae5480628cf8fe51ad84bcb39772fc79224,0.375482,0000bae5480628cf8fe51ad84bcb39772fc79224,0.374414,0000bae5480628cf8fe51ad84bcb39772fc79224,0.39553,0000bae5480628cf8fe51ad84bcb39772fc79224,0.765398,0000bae5480628cf8fe51ad84bcb39772fc79224,0.384336


In [7]:
df[["pred_1", "pred_2", "pred_3", "pred_4", "pred_5", "pred_6", 'CHURN']]

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,CHURN
0,0.000192,0.000734,0.000458,0.004463,0.008806,0.000210,0
1,0.869597,0.620179,0.627454,0.607547,0.879773,0.632568,1
2,0.282249,0.148125,0.136944,0.132337,0.305990,0.146315,0
3,0.000246,0.000508,0.000498,0.004372,0.005401,0.001037,0
4,0.104483,0.025337,0.021932,0.021137,0.116135,0.023210,0
...,...,...,...,...,...,...,...
2154043,0.785616,0.478703,0.476531,0.473810,0.822525,0.470364,0
2154044,0.001238,0.000685,0.000579,0.003716,0.009888,0.000580,0
2154045,0.937631,0.788797,0.789958,0.782949,0.899320,0.782412,1
2154046,0.007637,0.004660,0.009403,0.009744,0.041266,0.004415,0


In [27]:
sorted(dict(zip(lgb_model.feature_name_, lgb_model.feature_importances_)).items(), key=lambda x: -x[1])

[('REGULARITY', 820),
 ('MEAN_CHURN_BY_REGION', 521),
 ('MEAN_CHURN_BY_TOP_PACK', 441),
 ('poly_REGULARITY', 399),
 ('poly_DATA_VOLUME__REGULARITY', 345),
 ('DATA_VOLUME', 297),
 ('poly_ON_NET__REGULARITY', 295),
 ('MEAN_CHURN_BY_TENURE', 285),
 ('poly_ZONE2__REGULARITY', 233),
 ('poly_DATA_VOLUME__ON_NET__REGULARITY', 215),
 ('poly_DATA_VOLUME__ON_NET__ZONE1', 211),
 ('poly_DATA_VOLUME__ORANGE', 209),
 ('poly_DATA_VOLUME__ZONE2__REGULARITY', 205),
 ('poly_FREQUENCE__REGULARITY', 195),
 ('ON_NET', 194),
 ('poly_ON_NET__ORANGE__REGULARITY', 190),
 ('poly_ON_NET__TIGO__FREQ_TOP_PACK', 190),
 ('poly_ON_NET__ORANGE__TIGO', 189),
 ('poly_ORANGE__REGULARITY', 187),
 ('poly_ORANGE__REGULARITY__FREQ_TOP_PACK', 183),
 ('poly_ON_NET__ORANGE', 182),
 ('poly_ON_NET__TIGO', 181),
 ('poly_ZONE1__REGULARITY', 180),
 ('poly_ON_NET__REGULARITY__FREQ_TOP_PACK', 179),
 ('poly_DATA_VOLUME__ORANGE__REGULARITY', 176),
 ('poly_DATA_VOLUME__ZONE1__REGULARITY', 176),
 ('poly_ORANGE__TIGO__REGULARITY', 176),
 (

In [8]:
useful_features = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6']
df_test = df_test[useful_features]

final_predictions = []
scores = []

for fold in range(5):
    xtrain =  df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain['CHURN']
    yvalid = xvalid['CHURN']
    
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]
    
    model = LogisticRegression()
#     model = SGDClassifier(random_state=42, loss='modified_huber')
    model.fit(xtrain, ytrain)
    
    preds_valid = model.predict_proba(xvalid)[:, 1]
    test_preds = model.predict_proba(xtest)[:, 1]
    final_predictions.append(test_preds)
    score = roc_auc_score(yvalid, preds_valid)
    print(fold, score)
    scores.append(score)

print(np.mean(scores), np.std(scores))

# 0 0.9315283221655729
# 1 0.9322252323181413
# 2 0.9313247129395837
# 3 0.9318919085786139
# 4 0.9307662596698618
# 0.9315472871343549 0.0004976497673210968
# 0.9303098065651516 0.0005268336328890778
# 0.9301957664933731 0.0004690483101817313

0 0.9303435598903239
1 0.9308821147690043
2 0.9299188150996149
3 0.930353629843727
4 0.9294754849669176
0.9301947209139175 0.00047187705320450883


In [9]:
sample_submission = pd.read_csv('./data/SampleSubmission.csv')
sample_submission['CHURN'] = np.mean(np.column_stack(final_predictions), axis=1)
sample_submission.to_csv("./data/submission-blending-7-predict-proba-logreg-poly-with-randforest-balanced-and-scgbd-and-nn.csv", index=False)

sample_submission.sample(7)

Unnamed: 0,user_id,CHURN
155861,69054c7bfacc336af62808b6e493808be9c266a6,0.012138
292285,c4e0af1fb37c59c4474d9aebc4924969b94e1726,0.288428
117994,4f63eeab1f2c43f5747d5068463215b283fb931b,0.803452
92615,3e5d5ec3d3ab1ef78af208d7abc4277b65bcd2a5,0.012211
127972,561fbcb2498f4d012c49d87cc83f3298867ecc08,0.710496
8518,0589d5d54d9e1c8b642a0db9a01d17fd499dd74d,0.028536
154758,6846e0ae95ef02513514ce0cd6ded6114cecdae2,0.045035


In [9]:
df_test.to_csv('./data/test_stack.csv', index=None)
df.to_csv('./data/train_stack.csv', index=None)

In [31]:
df[useful_features].corrwith(df['CHURN'])

pred_1    0.681863
pred_2    0.681743
pred_3    0.681602
dtype: float64

In [13]:
import optuna

def run(trial):
    fold = 0
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)

    ytrain = xtrain['CHURN']
    yvalid = xvalid['CHURN']

    xtrain = xtrain[useful_cols]
    xvalid = xvalid[useful_cols]

    model = XGBClassifier(
        random_state=42,
        n_estimators=7000,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor="gpu_predictor",
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict_proba(xvalid)[:, 1]
    score = roc_auc_score(yvalid, preds_valid)
    return score

max_study = optuna.create_study(direction="maximize")
max_study.optimize(run, n_trials=10)
max_study.best_params

[32m[I 2021-09-17 08:48:15,928][0m A new study created in memory with name: no-name-d85b481e-ec2d-41a6-a53f-bf28125d09e1[0m


[0]	validation_0-logloss:0.68397
[1000]	validation_0-logloss:0.25175
[2000]	validation_0-logloss:0.25168
[2139]	validation_0-logloss:0.25168


[32m[I 2021-09-17 09:02:44,018][0m Trial 0 finished with value: 0.9314279260034156 and parameters: {'learning_rate': 0.013819612553795343, 'reg_lambda': 8.238107969664665e-07, 'reg_alpha': 2.9949105141104306, 'subsample': 0.26887197204914526, 'colsample_bytree': 0.6837796393182599, 'max_depth': 5}. Best is trial 0 with value: 0.9314279260034156.[0m


[0]	validation_0-logloss:0.62708
[595]	validation_0-logloss:0.25197


[32m[I 2021-09-17 09:06:32,578][0m Trial 1 finished with value: 0.9313139284633065 and parameters: {'learning_rate': 0.10452430533054569, 'reg_lambda': 0.0002638035777604412, 'reg_alpha': 3.42070538419488e-08, 'subsample': 0.6629989749924334, 'colsample_bytree': 0.37015213344628106, 'max_depth': 4}. Best is trial 0 with value: 0.9314279260034156.[0m


[0]	validation_0-logloss:0.66560
[587]	validation_0-logloss:0.25177


[32m[I 2021-09-17 09:12:04,408][0m Trial 2 finished with value: 0.9314845792838289 and parameters: {'learning_rate': 0.04190780140159328, 'reg_lambda': 2.7960994511206544e-07, 'reg_alpha': 5.196451506336081, 'subsample': 0.664064472000207, 'colsample_bytree': 0.6690510631891529, 'max_depth': 7}. Best is trial 2 with value: 0.9314845792838289.[0m


[0]	validation_0-logloss:0.67681
[1000]	validation_0-logloss:0.25224
[2000]	validation_0-logloss:0.25201
[3000]	validation_0-logloss:0.25193
[4000]	validation_0-logloss:0.25191
[4409]	validation_0-logloss:0.25190


[32m[I 2021-09-17 09:33:53,596][0m Trial 3 finished with value: 0.9312934582174985 and parameters: {'learning_rate': 0.026955594073180784, 'reg_lambda': 7.565149170946107, 'reg_alpha': 0.0016211720701268238, 'subsample': 0.37700374467215536, 'colsample_bytree': 0.25073557241975997, 'max_depth': 3}. Best is trial 2 with value: 0.9314845792838289.[0m


[0]	validation_0-logloss:0.59568
[519]	validation_0-logloss:0.25191


[32m[I 2021-09-17 09:37:35,522][0m Trial 4 finished with value: 0.931297057550825 and parameters: {'learning_rate': 0.1588485629882323, 'reg_lambda': 9.118618548483643e-08, 'reg_alpha': 15.798990078448904, 'subsample': 0.5083139429779769, 'colsample_bytree': 0.28556659904030074, 'max_depth': 4}. Best is trial 2 with value: 0.9314845792838289.[0m


[0]	validation_0-logloss:0.68134
[1000]	validation_0-logloss:0.25517
[2000]	validation_0-logloss:0.25460
[3000]	validation_0-logloss:0.25438
[4000]	validation_0-logloss:0.25424
[5000]	validation_0-logloss:0.25415
[6000]	validation_0-logloss:0.25409
[6999]	validation_0-logloss:0.25403


[32m[I 2021-09-17 10:05:46,551][0m Trial 5 finished with value: 0.9305181500389313 and parameters: {'learning_rate': 0.020906154456973868, 'reg_lambda': 77.86449852145682, 'reg_alpha': 3.314989165458843, 'subsample': 0.6634836204368635, 'colsample_bytree': 0.5927333777916868, 'max_depth': 1}. Best is trial 2 with value: 0.9314845792838289.[0m


[0]	validation_0-logloss:0.55556
[406]	validation_0-logloss:0.25263


[32m[I 2021-09-17 10:08:31,174][0m Trial 6 finished with value: 0.9311290437976708 and parameters: {'learning_rate': 0.23338235982941874, 'reg_lambda': 0.00853096431301829, 'reg_alpha': 5.2446891952302296e-08, 'subsample': 0.5719716839427655, 'colsample_bytree': 0.46502365909241583, 'max_depth': 4}. Best is trial 2 with value: 0.9314845792838289.[0m


[0]	validation_0-logloss:0.68350
[1000]	validation_0-logloss:0.25153
[1350]	validation_0-logloss:0.25156


[32m[I 2021-09-17 10:20:24,377][0m Trial 7 finished with value: 0.931501206040066 and parameters: {'learning_rate': 0.014461849398074727, 'reg_lambda': 0.08185850904776007, 'reg_alpha': 0.0001173486815850512, 'subsample': 0.7675905290878289, 'colsample_bytree': 0.2708299922996371, 'max_depth': 7}. Best is trial 7 with value: 0.931501206040066.[0m


[0]	validation_0-logloss:0.59614
[382]	validation_0-logloss:0.25252


[32m[I 2021-09-17 10:23:28,068][0m Trial 8 finished with value: 0.9313693084656419 and parameters: {'learning_rate': 0.1568646267537341, 'reg_lambda': 0.04418673161365856, 'reg_alpha': 7.463166011071637e-08, 'subsample': 0.9776135605922085, 'colsample_bytree': 0.6551332534982146, 'max_depth': 6}. Best is trial 7 with value: 0.931501206040066.[0m


[0]	validation_0-logloss:0.67659
[1000]	validation_0-logloss:0.25279
[2000]	validation_0-logloss:0.25235
[3000]	validation_0-logloss:0.25216
[4000]	validation_0-logloss:0.25207
[5000]	validation_0-logloss:0.25201
[6000]	validation_0-logloss:0.25197
[6999]	validation_0-logloss:0.25194


[32m[I 2021-09-17 10:56:59,374][0m Trial 9 finished with value: 0.9312814141933533 and parameters: {'learning_rate': 0.02611429212330279, 'reg_lambda': 7.740193767169773e-08, 'reg_alpha': 9.603949062949768e-06, 'subsample': 0.909329218400358, 'colsample_bytree': 0.32533920482730616, 'max_depth': 2}. Best is trial 7 with value: 0.931501206040066.[0m


{'learning_rate': 0.014461849398074727,
 'reg_lambda': 0.08185850904776007,
 'reg_alpha': 0.0001173486815850512,
 'subsample': 0.7675905290878289,
 'colsample_bytree': 0.2708299922996371,
 'max_depth': 7}

In [27]:
import optuna

def run(trial):
    fold = 0
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)

    ytrain = xtrain['CHURN']
    yvalid = xvalid['CHURN']

    xtrain = xtrain[useful_cols]
    xvalid = xvalid[useful_cols]

    model = LGBMClassifier(
        random_state=42,
        n_estimators=7000,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict_proba(xvalid)[:, 1]
    score = roc_auc_score(yvalid, preds_valid)
    return score

lgb_study = optuna.create_study(direction="maximize")
lgb_study.optimize(run, n_trials=10)
lgb_study.best_params

[32m[I 2021-09-17 05:25:57,446][0m A new study created in memory with name: no-name-66770598-610f-4ccb-b9fa-9be952104e81[0m


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.251598
Early stopping, best iteration is:
[1123]	valid_0's binary_logloss: 0.251593


[32m[I 2021-09-17 05:33:51,958][0m Trial 0 finished with value: 0.9314600527915347 and parameters: {'learning_rate': 0.029253877255476443, 'reg_lambda': 16.09426889606859, 'reg_alpha': 0.014354120473120952, 'subsample': 0.43289663848783977, 'colsample_bytree': 0.5268279718406376, 'max_depth': 6}. Best is trial 0 with value: 0.9314600527915347.[0m


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.252491
[2000]	valid_0's binary_logloss: 0.252202
[3000]	valid_0's binary_logloss: 0.252093
[4000]	valid_0's binary_logloss: 0.252019
[5000]	valid_0's binary_logloss: 0.251987
[6000]	valid_0's binary_logloss: 0.251953
[7000]	valid_0's binary_logloss: 0.251946
Did not meet early stopping. Best iteration is:
[6832]	valid_0's binary_logloss: 0.25194


[32m[I 2021-09-17 05:44:27,503][0m Trial 1 finished with value: 0.931275846351409 and parameters: {'learning_rate': 0.052102739958502405, 'reg_lambda': 2.064403037052816e-06, 'reg_alpha': 13.32964018036091, 'subsample': 0.7876767148834206, 'colsample_bytree': 0.1434760512987632, 'max_depth': 2}. Best is trial 0 with value: 0.9314600527915347.[0m


Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[672]	valid_0's binary_logloss: 0.251604


[32m[I 2021-09-17 05:47:51,918][0m Trial 2 finished with value: 0.9314570498337816 and parameters: {'learning_rate': 0.03969545724998376, 'reg_lambda': 0.20064902882483726, 'reg_alpha': 2.7950123598327548e-05, 'subsample': 0.9553194299676719, 'colsample_bytree': 0.2527763298987378, 'max_depth': 6}. Best is trial 0 with value: 0.9314600527915347.[0m


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.251722
Early stopping, best iteration is:
[1212]	valid_0's binary_logloss: 0.251717


[32m[I 2021-09-17 05:55:24,481][0m Trial 3 finished with value: 0.9313921758914712 and parameters: {'learning_rate': 0.03997837683293074, 'reg_lambda': 0.0009479166919384778, 'reg_alpha': 0.0001972240387767668, 'subsample': 0.4012843829544507, 'colsample_bytree': 0.6657857925222584, 'max_depth': 4}. Best is trial 0 with value: 0.9314600527915347.[0m


Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[163]	valid_0's binary_logloss: 0.251708


[32m[I 2021-09-17 05:59:08,825][0m Trial 4 finished with value: 0.93139905026303 and parameters: {'learning_rate': 0.13634165486720964, 'reg_lambda': 17.372499824524326, 'reg_alpha': 1.6919695085951059, 'subsample': 0.2619815317643066, 'colsample_bytree': 0.7595539318306528, 'max_depth': 7}. Best is trial 0 with value: 0.9314600527915347.[0m


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.25312
[2000]	valid_0's binary_logloss: 0.252584
[3000]	valid_0's binary_logloss: 0.252386
[4000]	valid_0's binary_logloss: 0.252266
[5000]	valid_0's binary_logloss: 0.252187
[6000]	valid_0's binary_logloss: 0.252133
[7000]	valid_0's binary_logloss: 0.252097
Did not meet early stopping. Best iteration is:
[6993]	valid_0's binary_logloss: 0.252097


[32m[I 2021-09-17 06:12:50,056][0m Trial 5 finished with value: 0.9312065189787386 and parameters: {'learning_rate': 0.018886342577719966, 'reg_lambda': 3.460697663432346e-08, 'reg_alpha': 1.1043233999249435e-06, 'subsample': 0.9294011511438846, 'colsample_bytree': 0.2882018963204575, 'max_depth': 2}. Best is trial 0 with value: 0.9314600527915347.[0m


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.252351
[2000]	valid_0's binary_logloss: 0.251994
[3000]	valid_0's binary_logloss: 0.251864
[4000]	valid_0's binary_logloss: 0.251791
[5000]	valid_0's binary_logloss: 0.251758
[6000]	valid_0's binary_logloss: 0.251745
Early stopping, best iteration is:
[5788]	valid_0's binary_logloss: 0.251744


[32m[I 2021-09-17 06:32:05,049][0m Trial 6 finished with value: 0.9313767574272847 and parameters: {'learning_rate': 0.016369898118974643, 'reg_lambda': 0.37249844793334447, 'reg_alpha': 4.2723683773861226e-05, 'subsample': 0.6757142698228279, 'colsample_bytree': 0.48844441104124403, 'max_depth': 3}. Best is trial 0 with value: 0.9314600527915347.[0m


Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[229]	valid_0's binary_logloss: 0.251682


[32m[I 2021-09-17 06:36:52,098][0m Trial 7 finished with value: 0.9314131087129021 and parameters: {'learning_rate': 0.12390417864338456, 'reg_lambda': 9.9374230921288e-05, 'reg_alpha': 5.7536170113519685, 'subsample': 0.44321371828479605, 'colsample_bytree': 0.9393652698808845, 'max_depth': 5}. Best is trial 0 with value: 0.9314600527915347.[0m


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.25488
[2000]	valid_0's binary_logloss: 0.254461
[3000]	valid_0's binary_logloss: 0.254282
[4000]	valid_0's binary_logloss: 0.254182
[5000]	valid_0's binary_logloss: 0.254109
[6000]	valid_0's binary_logloss: 0.254052
[7000]	valid_0's binary_logloss: 0.25401
Did not meet early stopping. Best iteration is:
[7000]	valid_0's binary_logloss: 0.25401


[32m[I 2021-09-17 06:57:07,449][0m Trial 8 finished with value: 0.9305310477543884 and parameters: {'learning_rate': 0.028644806959834304, 'reg_lambda': 1.2059356758857591e-05, 'reg_alpha': 0.07135766755254154, 'subsample': 0.8953026610476505, 'colsample_bytree': 0.6400658534745259, 'max_depth': 1}. Best is trial 0 with value: 0.9314600527915347.[0m


Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.252383
[2000]	valid_0's binary_logloss: 0.252032
[3000]	valid_0's binary_logloss: 0.251895
[4000]	valid_0's binary_logloss: 0.251833
[5000]	valid_0's binary_logloss: 0.251794
[6000]	valid_0's binary_logloss: 0.251784
[7000]	valid_0's binary_logloss: 0.251771
Did not meet early stopping. Best iteration is:
[7000]	valid_0's binary_logloss: 0.251771


[32m[I 2021-09-17 07:16:35,260][0m Trial 9 finished with value: 0.931365700047678 and parameters: {'learning_rate': 0.016098139450670325, 'reg_lambda': 0.0009521331758966834, 'reg_alpha': 3.073372177215249e-06, 'subsample': 0.43467529291256535, 'colsample_bytree': 0.4074526410751158, 'max_depth': 3}. Best is trial 0 with value: 0.9314600527915347.[0m


{'learning_rate': 0.029253877255476443,
 'reg_lambda': 16.09426889606859,
 'reg_alpha': 0.014354120473120952,
 'subsample': 0.43289663848783977,
 'colsample_bytree': 0.5268279718406376,
 'max_depth': 6}

In [14]:
import optuna

def run_cb(trial):
    fold = 0
    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
#         "used_ram_limit": "3gb",
    }
    
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)

    ytrain = xtrain['CHURN']
    yvalid = xvalid['CHURN']

    xtrain = xtrain[useful_cols]
    xvalid = xvalid[useful_cols]

    cb_model = CatBoostClassifier(**param)
    
    cb_model.fit(xtrain, ytrain, early_stopping_rounds=100, eval_set=[(xvalid, yvalid)], verbose=1000)
    
    preds_valid = cb_model.predict_proba(xvalid)[:, 1]
    score = roc_auc_score(yvalid, preds_valid)
    return score

cb_study = optuna.create_study(direction="maximize")
cb_study.optimize(run_cb, n_trials=100, timeout=600)

print("Number of finished trials: {}".format(len(cb_study.trials)))

print("Best trial:")
trial = cb_study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
    
cb_study.best_params

[32m[I 2021-09-17 14:37:49,878][0m A new study created in memory with name: no-name-fa2aa7b9-367d-407d-9412-7e8ea0147125[0m


Learning rate set to 0.199658
0:	learn: 0.5232490	test: 0.5233013	best: 0.5233013 (0)	total: 161ms	remaining: 2m 41s
999:	learn: 0.2513513	test: 0.2524707	best: 0.2524695 (992)	total: 2m 18s	remaining: 0us

bestTest = 0.2524694958
bestIteration = 992

Shrink model to first 993 iterations.


[32m[I 2021-09-17 14:40:26,340][0m Trial 0 finished with value: 0.9309947928435258 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.033347223522640496, 'depth': 3, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.22591244188174475}. Best is trial 0 with value: 0.9309947928435258.[0m


0:	learn: 0.6412610	test: 0.6413268	best: 0.6413268 (0)	total: 4.59s	remaining: 1h 16m 30s
999:	learn: 0.2471274	test: 0.2518213	best: 0.2518079 (929)	total: 1h 25m 42s	remaining: 0us

bestTest = 0.2518079151
bestIteration = 929

Shrink model to first 930 iterations.


[32m[I 2021-09-17 16:06:28,168][0m Trial 1 finished with value: 0.9313586960392259 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.054208119366927966, 'depth': 12, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.9494580379034286}. Best is trial 1 with value: 0.9313586960392259.[0m


Number of finished trials: 2
Best trial:
  Value: 0.9313586960392259
  Params: 
    objective: CrossEntropy
    colsample_bylevel: 0.054208119366927966
    depth: 12
    boosting_type: Ordered
    bootstrap_type: Bernoulli
    subsample: 0.9494580379034286


{'objective': 'CrossEntropy',
 'colsample_bylevel': 0.054208119366927966,
 'depth': 12,
 'boosting_type': 'Ordered',
 'bootstrap_type': 'Bernoulli',
 'subsample': 0.9494580379034286}

In [32]:
import optuna

def run_rf(trial: optuna.Trial):
    fold = 0
    params = {
        'max_depth': trial.suggest_int('rf_max_depth', 2, 32, log=True),
        'max_features': trial.suggest_categorical('rf_max_features', ["auto", "sqrt", "log2"]),
        'class_weight': trial.suggest_categorical('rf_class_weight', ['balanced', 'balanced_subsample', None])
    }
    

    xtrain = train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)

    ytrain = xtrain['CHURN']
    yvalid = xvalid['CHURN']

    xtrain = xtrain[useful_cols]
    xvalid = xvalid[useful_cols]

    rf_model = RandomForestClassifier(
        n_estimators=100,
        n_jobs=-1,
        random_state=42,
        verbose=1,
        **params)
    
    rf_model.fit(xtrain, ytrain)
    
    preds_valid = rf_model.predict_proba(xvalid)[:, 1]
    score = roc_auc_score(yvalid, preds_valid)
    return score

rf_study = optuna.create_study(direction="maximize")
rf_study.optimize(run_rf, n_trials=100, timeout=600)

print("Number of finished trials: {}".format(len(rf_study.trials)))

print("Best trial:")
trial = rf_study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
    
rf_study.best_params

[32m[I 2021-09-18 21:35:17,788][0m A new study created in memory with name: no-name-58fbb78c-6066-4439-a36c-02ecbed6c38b[0m
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   27.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.2min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.6s finished
[32m[I 2021-09-18 21:36:42,148][0m Trial 0 finished with value: 0.9058284139202586 and parameters: {'rf_max_depth': 4, 'rf_max_features': 'log2', 'rf_class_weight': 'balanced'}. Best is trial 0 with value: 0.9058284139202586.[0m
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.6min finished
[P

Number of finished trials: 4
Best trial:
  Value: 0.9289267479994578
  Params: 
    rf_max_depth: 15
    rf_max_features: auto
    rf_class_weight: balanced_subsample


{'rf_max_depth': 15,
 'rf_max_features': 'auto',
 'rf_class_weight': 'balanced_subsample'}

In [23]:
# vc = train['CHURN'].value_counts()
# vc[0], vc[1]*4.33

(1750062, 1749259.3800000001)