In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from lightgbm import LGBMClassifier
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
import re

tqdm.pandas()

In [2]:
#import data
df_train = pd.read_csv('./data/Train_folds.zip')
df_test=  pd.read_csv('./data/Test.zip')
submission = pd.read_csv('./data/SampleSubmission.csv')

In [3]:
train = df_train.copy()
test = df_test.copy()

cat_cols = [
    'REGION',
#    'TENURE',
#     'TOP_PACK'
]

num_cols = [
    'MONTANT',
    'FREQUENCE_RECH',
    'REVENUE',
    'ARPU_SEGMENT',
    'FREQUENCE',
    'DATA_VOLUME',
    'ON_NET', 
    'ORANGE',
    'TIGO',
    'ZONE1',
    'ZONE2',
    'REGULARITY',
    'FREQ_TOP_PACK',
]

target = 'CHURN'

mapping = {
    'D 3-6 month': 1,
    'E 6-9 month': 2,
    'F 9-12 month': 3,
    'G 12-15 month': 4,
    'H 15-18 month': 5,
    'I 18-21 month': 6,
    'J 21-24 month': 7,
    'K > 24 month': 8,
}

mapping = {
    'D 3-6 month': 1,
    'E 6-9 month': 1,
    'F 9-12 month': 2,
    'G 12-15 month': 2,
    'H 15-18 month': 3,
    'I 18-21 month': 3,
    'J 21-24 month': 4,
    'K > 24 month': 4,
}

train['TENURE'] = train['TENURE'].map(mapping) 
test['TENURE'] = test['TENURE'].map(mapping)

train['REGION'] = train['REGION'].fillna('OTHER')
test['REGION'] = test['REGION'].fillna('OTHER')

train['TOP_PACK'] = train['TOP_PACK'].fillna('OTHER')
test['TOP_PACK'] = test['TOP_PACK'].fillna('OTHER')

agg_by_tenure = pd.read_csv('./data/agg_by_tenure.csv')
agg_by_tenure_dict = {x['TENURE']: x for x in agg_by_tenure.to_dict('records')}

agg_by_region = pd.read_csv('./data/agg_by_region.csv')
agg_by_region_dict = {x['REGION']: x for x in agg_by_region.to_dict('records')}


def by_region(x: pd.Series, col: str, how: str) -> float:
    return agg_by_region_dict[x['REGION']][f'{col}_{how}'] if np.isnan(x[f'{col}']) else x[f'{col}']

def by_tenure(x: pd.Series, col: str, how: str) -> float:
    return agg_by_tenure_dict[x['TENURE']][f'{col}_{how}'] if np.isnan(x[f'{col}']) else x[f'{col}']

train_merged_with_tenure = pd.merge(train, agg_by_tenure, left_on='TENURE', right_on='TENURE', how='left')
test_merged_with_tenure = pd.merge(test, agg_by_tenure, left_on='TENURE', right_on='TENURE', how='left')

train_merged_with_region = pd.merge(train, agg_by_region, left_on='REGION', right_on='REGION', how='left')
test_merged_with_region = pd.merge(test, agg_by_region, left_on='REGION', right_on='REGION', how='left')

for col in tqdm(num_cols):
    col_mean = train[col].mean()
    # train[col] = train[col].fillna(col_mean)
    # test[col] = test[col].fillna(col_mean)
    # train[col] = train.apply(lambda x: by_tenure(x, col, 'mean'), axis=1) 
    # test[col] = test.apply(lambda x: by_tenure(x, col, 'mean'), axis=1)
    train[col] = np.where(train[col].isnull(), train_merged_with_tenure[f'{col}_mean'], train[col])
    test[col] = np.where(test[col].isnull(), test_merged_with_tenure[f'{col}_mean'], test[col])

for col in tqdm(['DATA_VOLUME', 'ON_NET']): # судя по весам фич только они влияют
#     train[f'{col}_FNAN_REGION_mean'] = train.apply(lambda x: by_region(x, col, 'mean'), axis=1)
#     test[f'{col}_FNAN_REGION_mean'] = test.apply(lambda x: by_region(x, col, 'mean'), axis=1)
    train[f'{col}_FNAN_REGION_mean'] = np.where(train[col].isnull(), train_merged_with_region[f'{col}_mean'], train[col])
    test[f'{col}_FNAN_REGION_mean'] = np.where(test[col].isnull(), test_merged_with_region[f'{col}_mean'], test[col])

train['IS_UNLIMITED'] = train['TOP_PACK'].apply(lambda x: 1 if 'unlimited' in x.lower() else 0)
test['IS_UNLIMITED'] = test['TOP_PACK'].apply(lambda x: 1 if 'unlimited' in x.lower() else 0)

for tp in tqdm(['OTHER', 'Data:50F=30MB_24H', 'Data:3000F=10GB,30d', 'Data:500F=2GB,24H', 'Data:300F=100MB,2d', 'Data:1000F=5GB,7d', 'Data:150F=SPPackage1,24H'], desc='top pack'):
    col_name = re.sub('[:=,]', '_', tp)
    train[f'IS_TOP_PACK_{col_name}'] = train['TOP_PACK'].apply(lambda x: 1 if x == tp else 0)
    test[f'IS_TOP_PACK_{col_name}'] = test['TOP_PACK'].apply(lambda x: 1 if x == tp else 0)

train['IS_ALL_NET'] = train['TOP_PACK'].apply(lambda x: 1 if 'allnet' in x.replace('-', '').lower() else 0)
test['IS_ALL_NET'] = test['TOP_PACK'].apply(lambda x: 1 if 'allnet' in x.replace('-', '').lower() else 0)


# target encoding
te_region = train.groupby('REGION').agg({'CHURN': 'mean'}).reset_index()
te_region.columns = ['REGION', 'TARGET_ENC_REGION']

te_tenure = train.groupby('TENURE').agg({'CHURN': 'mean'}).reset_index()
te_tenure.columns = ['TENURE', 'TARGET_ENC_TENURE']

train = pd.merge(train, 
                 te_region,
                 left_on='REGION',
                 right_on='REGION',
                 how='left',
                 suffixes=('_x', '_TARGET_ENC_REGION'))
test =  pd.merge(test, 
                 te_region,
                 left_on='REGION',
                 right_on='REGION',
                 how='left',
                 suffixes=('_x', '_TARGET_ENC_REGION'))

train = pd.merge(train, 
                 te_tenure,
                 left_on='TENURE',
                 right_on='TENURE',
                 how='left',
                 suffixes=('_x', '_TARGET_ENC_TENURE'))
test =  pd.merge(test, 
                 te_tenure,
                 left_on='TENURE',
                 right_on='TENURE',
                 how='left',
                 suffixes=('_x', '_TARGET_ENC_TENURE'))


# не нужон - по весам фич
#     train[f'{col}_FNAN_REGION_median'] = train.apply(lambda x: by_region(x, col, 'median'), axis=1)
#     test[f'{col}_FNAN_REGION_median'] = test.apply(lambda x: by_region(x, col, 'median'), axis=1)
    
#     train[f'{col}_FNAN_TENURE_mean'] = train.apply(lambda x: by_tenure(x, col, 'mean'), axis=1)
#     test[f'{col}_FNAN_TENURE_mean'] = test.apply(lambda x: by_tenure(x, col, 'mean'), axis=1)

#     train[f'{col}_FNAN_TENURE_median'] = train.apply(lambda x: by_tenure(x, col, 'median'), axis=1)
#     test[f'{col}_FNAN_TENURE_median'] = test.apply(lambda x: by_tenure(x, col, 'median'), axis=1)

train

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

top pack:   0%|          | 0/7 [00:00<?, ?it/s]

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,...,IS_TOP_PACK_OTHER,IS_TOP_PACK_Data_50F_30MB_24H,IS_TOP_PACK_Data_3000F_10GB_30d,IS_TOP_PACK_Data_500F_2GB_24H,IS_TOP_PACK_Data_300F_100MB_2d,IS_TOP_PACK_Data_1000F_5GB_7d,IS_TOP_PACK_Data_150F_SPPackage1_24H,IS_ALL_NET,TARGET_ENC_REGION,TARGET_ENC_TENURE
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,FATICK,4,4250.000000,15.000000,4251.000000,1417.00000,17.000000,4.000000,388.000000,...,0,0,0,0,0,0,0,0,0.014196,0.183531
1,00000cb4a5d760de88fecb38e2f71b7bec52e834,OTHER,3,5067.795106,11.478503,5012.078888,1670.69839,14.031420,3705.837220,267.946292,...,1,0,0,0,0,0,0,0,0.447987,0.270341
2,00001654a9d9f96303d9969d0a4a851714a4bb57,OTHER,4,3600.000000,2.000000,1020.000000,340.00000,2.000000,3317.913239,90.000000,...,0,0,0,0,0,0,0,0,0.447987,0.183531
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,DAKAR,4,13500.000000,15.000000,13502.000000,4501.00000,18.000000,43804.000000,41.000000,...,0,0,0,0,0,1,0,0,0.019235,0.183531
4,000028d9e13a595abe061f9b58f3d76ab907850f,DAKAR,4,1000.000000,1.000000,985.000000,328.00000,1.000000,3317.913239,39.000000,...,0,0,0,0,0,0,0,0,0.019235,0.183531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2154043,ffffe85215ddc71a84f95af0afb0deeea90e6967,OTHER,4,5551.330713,11.535439,5531.026497,1843.68163,13.982138,3317.913239,278.320853,...,1,0,0,0,0,0,0,0,0.447987,0.183531
2154044,ffffeaaa9289cdba0ac000f0ab4b48f4aa74ed15,THIES,4,6100.000000,15.000000,5800.000000,1933.00000,15.000000,621.000000,26.000000,...,0,0,0,0,0,0,0,0,0.016301,0.183531
2154045,fffff172fda1b4bb38a95385951908bb92379809,OTHER,4,5551.330713,11.535439,5531.026497,1843.68163,13.982138,3317.913239,278.320853,...,1,0,0,0,0,0,0,0,0.447987,0.183531
2154046,fffff5911296937a37f09a37a549da2e0dad6dbb,THIES,4,10000.000000,11.000000,7120.000000,2373.00000,13.000000,3317.913239,0.000000,...,0,0,0,0,0,0,0,1,0.016301,0.183531


In [15]:
train.columns

Index(['user_id', 'REGION', 'TENURE', 'MONTANT', 'FREQUENCE_RECH', 'REVENUE',
       'ARPU_SEGMENT', 'FREQUENCE', 'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO',
       'ZONE1', 'ZONE2', 'MRG', 'REGULARITY', 'TOP_PACK', 'FREQ_TOP_PACK',
       'CHURN', 'kfold', 'DATA_VOLUME_FNAN_REGION_mean',
       'ON_NET_FNAN_REGION_mean', 'IS_UNLIMITED', 'IS_TOP_PACK_OTHER',
       'IS_TOP_PACK_Data_50F_30MB_24H', 'IS_TOP_PACK_Data_3000F_10GB_30d',
       'IS_TOP_PACK_Data_500F_2GB_24H', 'IS_TOP_PACK_Data_300F_100MB_2d',
       'IS_TOP_PACK_Data_1000F_5GB_7d', 'IS_TOP_PACK_Data_150F_SPPackage1_24H',
       'IS_ALL_NET', 'TARGET_ENC_REGION', 'TARGET_ENC_TENURE'],
      dtype='object')

In [17]:
# train_copy = train.copy()
# test_copy = test.copy()

# useful_cols = [col for col in train_copy.columns if col not in set(['user_id', 
#                                                                     'MRG',
#                                                                     'TOP_PACK', 
#                                                                     'CHURN',
#                                                                     'kfold'])]

# final_predictions = []
# scores = []

# minmax_scaler_cols = ['DATA_VOLUME', 'ON_NET']
# scaler = MinMaxScaler()
# train_copy[minmax_scaler_cols] = scaler.fit_transform(train_copy[minmax_scaler_cols])
# test_copy[minmax_scaler_cols] = scaler.transform(test_copy[minmax_scaler_cols])

# standard_scaler_cols = [col for col in train_copy.columns if col not in set(['user_id', 'MRG', 'TOP_PACK', 'REGION', 'DATA_VOLUME', 'ON_NET', 'kfold', 'CHURN'])]
# scaler = StandardScaler()
# train_copy[standard_scaler_cols] = scaler.fit_transform(train_copy[standard_scaler_cols])
# test_copy[standard_scaler_cols] = scaler.transform(test_copy[standard_scaler_cols])

# # poly features
# numerical_cols = [
#     'DATA_VOLUME',
#     'ON_NET',
# # 'MONTANT',
# # 'FREQUENCE_RECH',
# # 'REVENUE',
# # 'ARPU_SEGMENT',
# # 'FREQUENCE',
#  'ORANGE',
#  'TIGO',
# # 'ZONE1',
# # 'ZONE2',
# # 'REGULARITY',
# ]
# poly = PolynomialFeatures(degree=3, interaction_only=True, include_bias=False)
# train_poly = poly.fit_transform(train_copy[numerical_cols])
# test_poly = poly.fit_transform(test_copy[numerical_cols])

# poly_columns = [f"poly_{i}" for i in range(train_poly.shape[1])]
# df_poly = pd.DataFrame(train_poly, columns=poly_columns)
# df_test_poly = pd.DataFrame(test_poly, columns=poly_columns)

# train_copy = pd.concat([train_copy, df_poly], axis=1)
# test_copy = pd.concat([test_copy, df_test_poly], axis=1)

# useful_cols += poly_columns

# for cat_col in cat_cols:
#     encoder = OneHotEncoder(handle_unknown='ignore')
#     unique_values = train_copy[cat_col].unique()

#     one_hot_encoded_cols = [f'{cat_col}_{i}' for i in range(len(unique_values))]
    
#     ohe_df = pd.DataFrame(encoder.fit_transform(train_copy[[cat_col]]).toarray(), columns=one_hot_encoded_cols)
#     ohe_df.index = train_copy.index
#     train_copy = train_copy.drop(cat_col, axis=1)
#     train_copy = pd.concat([train_copy, ohe_df], axis=1)        
#     print(f'[{cat_col}] xtrain transformed')

#     ohe_df = pd.DataFrame(encoder.transform(test_copy[[cat_col]]).toarray(), columns=one_hot_encoded_cols)
#     ohe_df.index = test_copy.index
#     test_copy = test_copy.drop(cat_col, axis=1)
#     test_copy = pd.concat([test_copy, ohe_df], axis=1)
#     print(f'[{cat_col}] xtest transformed')
    
#     useful_cols += one_hot_encoded_cols
#     useful_cols.remove(cat_col)

final_predictions = []
scores = []

target = 'CHURN'

for fold in tqdm(range(5), 'folds'):
    xtrain = train_copy[train_copy['kfold'] != fold][useful_cols]
    ytrain = train_copy[train_copy['kfold'] != fold][target]
    
    xvalid = train_copy[train['kfold'] == fold][useful_cols]
    yvalid = train_copy[train['kfold'] == fold][target]

    xtest = test_copy[useful_cols]

#     model = LGBMClassifier(
#         n_estimators=1000,
#         random_state=42,
#     )
    model = LGBMClassifier(
        n_estimators=7000,
        random_state=42,
        **{
            'learning_rate': 0.023262668329845724,
            'reg_lambda': 8.946573985262771e-05,
            'reg_alpha': 8.609876549670105e-06,
            'subsample': 0.3971516543340211,
            'colsample_bytree': 0.32107183361408465,
            'max_depth': 1
        }
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=False)
    
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    score = roc_auc_score(yvalid, preds_valid)
    scores.append(score)
    print(fold, score)

print(np.mean(scores), np.std(scores))

# 0.7696186855892555 0.00043098014634677365 fillna через REGION
# 0.7694106138854746 0.000588962476667447 fillna через TENURE
# 0.7701952731313458 0.0006247026357018513 fillna через mean по колонке
# 0.769901864294696 0.000610047154013374 fillna через mean по колонке & ohe for region
# 0.769901864294696 0.000610047154013374 fillna через mean по колонке & ohe for region + StandardScaler for Tenure (same as prev)
# 0.770010208315598 0.0005090535431930928 fillna через mean по колонке & ohe for region & tenure [private: 0.853185019447002]
# 0.770008209799155 0.0005120012284348264 fillna через mean по колонке & ohe for region & tenure for whole train and test datasets & StScaler on whole ds
# 0.7701761776716778 0.0007820978999800708 fillna через mean по колонке & ohe for region & tenure for whole train and test datasets & StScaler on whole ds + target encoding by reg & ten [private 0.862163874472749]
# 0.7690741534134118 0.0008824035170153271 fillna через mean по колонке & ohe for region & tenure for whole train and test datasets & StScaler on whole ds + target encoding by reg & ten + изменил фолды (!= и ==)
# 0.7690825678568933 0.0008789684650258934  fillna через mean по колонке & ohe for region & tenure for whole train and test datasets & StScaler on whole ds + target encoding by reg & ten + изменил фолды (!= и ==)
# 0.7698245780943753 0.000724931679586648 lgb - fillna через mean по колонке & ohe for region & tenure for whole train and test datasets & StScaler on whole ds + target encoding by reg & ten + изменил фолды (!= и ==)
# 0.7694104784482177 0.0013440865822247955 lgb - fillna через mean по колонке & ohe for region & tenure for whole train and test datasets & StScaler on whole ds + target encoding by reg & ten + изменил фолды (!= и ==) + optuna params [0.760814445393041]
# 0.7696374297117283 0.0010216807303939667 lgb - fillna через mean по колонке & ohe for region & tenure for whole train and test datasets & StScaler on whole ds + target encoding by reg & ten + изменил фолды (!= и ==) + num cols log1

# 0.768700101552386 0.0010140633965792358 lgb + ohe for region + w/o tenure
# 0.7913442391232364 0.0009072160914931118 lgb + ohe for region + w/o tenure + region fillna other
# 0.7915846676604728 0.000801936503179641 lgb + ohe for region + 3 mon tenure + region fillna other
# 0.7915156853124097 0.000992492975077104 lgb + ohe for region + 12 mon tenure + region fillna other
# 0.7917672724064511 0.0008976053140583513 lgb + ohe for region + 6 mon tenure + region fillna other
# 0.7917653651343787 0.0007855762351865283 lgb + ohe for region + 6 mon tenure + region fillna other + fillna num
# 0.7918252728749173 0.0008695062597926666 lgb + ohe for region + 6 mon tenure + region fillna other + fillna num + StSc whole ds
# 0.7914845860192667 0.0010262092086456543 lgb + ohe for region + 6 mon tenure + region fillna other + fillna num + MinMaxSc whole ds
# 0.7919582623977296 0.0008726366301281235 lgb + ohe for region + 6 mon tenure + region fillna other + fillna num + MinMaxSc(DATA_VOLUME,ON_NET) + StSc(other)
# 0.7913209118545288 0.0010748640472510074 lgb + ohe for region + 6 mon tenure + region fillna other + fillna num by mean region + MinMaxSc(DATA_VOLUME,ON_NET) + StSc(other) + DATA_VOLUME_FNAN_REGION_mean & ON_NET_FNAN_REGION_mean
# 0.792256907136914 0.0010776815408567177  lgb + ohe for region + 6 mon tenure + region fillna other + fillna num by mean tenure + MinMaxSc(DATA_VOLUME,ON_NET) + StSc(other) [0.795227047440307]
# 0.7916985286519639 0.0009710673470586245 lgb + ohe for region + 6 mon tenure + region fillna other + fillna num by mean tenure + MinMaxSc(DATA_VOLUME,ON_NET) + StSc(other) + TOP_PACK (scaled)
# 0.7915685818170114 0.0008091780308036663 lgb + ohe for region + 6 mon tenure + region fillna other + fillna num by mean tenure + MinMaxSc(DATA_VOLUME,ON_NET) + StSc(other) + TOP_PACK (ohe)
# 0.7915685818170114 0.0008091780308036663 (ok) lgb + ohe for region + 6 mon tenure + region fillna other + fillna num by mean tenure + MinMaxSc(DATA_VOLUME,ON_NET) + StSc(other) + TOP_PACK (fillna other + ohe)
# 0.7916914872303236 0.0007342967833541975 lgb + ohe for region + 6 mon tenure + region fillna other + fillna num by mean tenure + MinMaxSc(DATA_VOLUME,ON_NET) + StSc(other) + TOP_PACK (fillna other + ohe) + IS_UNLIMITED
# 0.7919668892976947 0.0009393373192820406 lgb + ohe for region + 6 mon tenure + region fillna other + fillna num by mean tenure + MinMaxSc(DATA_VOLUME,ON_NET) + StSc(other) + IS_UNLIMITED
# 0.7916230116088253 0.0009519019905584034 lgb + ohe for region + 6 mon tenure + region fillna other + fillna num by mean tenure + MinMaxSc(DATA_VOLUME,ON_NET) + StSc(other) + IS_UNLIMITED + IS_TOP_PACK_NAN
# 0.791875399477451 0.0009425233968772518 lgb + ohe for region + 6 mon tenure + region fillna other + fillna num by mean tenure + MinMaxSc(DATA_VOLUME,ON_NET) + StSc(other) + IS_UNLIMITED + IS_TOP_PACK_NAN + is_all_net
# [86890349-b4ba-4f56-932b-d9445ff4d4f5] 0.791692177101185 0.0009334681681677956 lgb + ohe for region + 6 mon tenure + region fillna other + fillna num by mean tenure + MinMaxSc(DATA_VOLUME,ON_NET) + StSc(other) + IS_UNLIMITED + IS_TOP_PACK_NAN + is_all_net + top of top_pack
# [b624b4d3-3845-4d17-8648-694d2c7de821] 0.7919930343268513 0.0008727219454338667 lgb + ohe for region + 6 mon tenure + region fillna other + fillna num by mean tenure + MinMaxSc(DATA_VOLUME,ON_NET) + StSc(other) + IS_UNLIMITED + IS_TOP_PACK_NAN + is_all_net + top of top_pack + te by reg & ten [0.795441938635621]
# [94863b76-a609-4faa-9294-141455a761bc] 0.7922070116912214 0.0009209575881898701  lgb + ohe for region + 6 mon tenure + region fillna other + fillna num by mean tenure + MinMaxSc(DATA_VOLUME,ON_NET) + StSc(other) + IS_UNLIMITED + IS_TOP_PACK_NAN + is_all_net + top of top_pack + te by reg & ten + poly 'DATA_VOLUME','ON_NET','ORANGE', 'TIGO', [0.795539929288825]
# [79fc566c-55c8-4a59-9a7e-69e3c6789bed] 0.7973784391700554 0.0010272158066749583  lgb + ohe for region + 6 mon tenure + region fillna other + fillna num by mean tenure + MinMaxSc(DATA_VOLUME,ON_NET) + StSc(other) + IS_UNLIMITED + IS_TOP_PACK_NAN + is_all_net + top of top_pack + te by reg & ten + poly 'DATA_VOLUME','ON_NET','ORANGE', 'TIGO' + optuna []

folds:   0%|          | 0/5 [00:00<?, ?it/s]

0 0.7979141115063451
1 0.7980816317132172
2 0.7966558287337555
3 0.7985057742891991
4 0.7957348496077602
0.7973784391700554 0.0010272158066749583


In [18]:
preds = np.mean(np.column_stack(final_predictions), axis=1)

submission = pd.read_csv('./data/SampleSubmission.csv')
submission.CHURN = preds
# submission.to_csv("./data/submission-lgb-5-folds-1000-est-42-rs-TENURE-4-REGION-fillna-Other-REG-OHE-StScaler&MinMixSc-fillna-by-mean-Tenure.csv", index=False)
submission.to_csv('./data/submission-94863b76-a609-4faa-9294-141455a761bc.csv', index=False) 

In [None]:
sorted(dict(zip(model.feature_name_, model.feature_importances_)).items(), key=lambda x: -x[1] )

In [None]:
from IPython.display import display, Markdown
from collections import defaultdict

# display(Markdown('**_some_ markdown** and an [internal reference](use/format/markdown)!'))

top_pack_values = train['TOP_PACK'].value_counts().index # train['TOP_PACK'].unique()
percents = defaultdict(list)
for tp in tqdm(top_pack_values):
    tmp = train[train['TOP_PACK'] == tp][['TOP_PACK', 'CHURN']]
    total = len(tmp)
    display(Markdown(f'**{tp} ({total})**'))
    vc = tmp.value_counts()
    try:
        percents[tp].append(total)
        try:
            percents[tp].append(vc[0])
        except:
            percents[tp].append(0)
        
        try:
            percents[tp].append(vc[1])
        except:
            percents[tp].append(0)
        
        percents[tp].append(percents[tp][-1]/percents[tp][-2] if percents[tp][-2] != 0 else 0)
        display(Markdown(f'0: {vc[0]}\t1: {vc[1]}\t({vc[1]/vc[0]*100}%)'))
    except Exception as e:
        print(e)
        print(tmp)

In [None]:
# train_copy = train.copy()
# test_copy = test.copy()

# useful_cols = [col for col in train_copy.columns if col not in set(['user_id', 
#                                                                     'MRG',
#                                                                     'TOP_PACK', 
#                                                                     'CHURN',
#                                                                     'kfold'])]

# final_predictions = []
# scores = []

# minmax_scaler_cols = ['DATA_VOLUME', 'ON_NET']
# scaler = MinMaxScaler()
# train_copy[minmax_scaler_cols] = scaler.fit_transform(train_copy[minmax_scaler_cols])
# test_copy[minmax_scaler_cols] = scaler.transform(test_copy[minmax_scaler_cols])

# standard_scaler_cols = [col for col in train_copy.columns if col not in set(['user_id', 'MRG', 'TOP_PACK', 'REGION', 'DATA_VOLUME', 'ON_NET', 'kfold', 'CHURN'])]
# scaler = StandardScaler()
# train_copy[standard_scaler_cols] = scaler.fit_transform(train_copy[standard_scaler_cols])
# test_copy[standard_scaler_cols] = scaler.transform(test_copy[standard_scaler_cols])

# # poly features
# numerical_cols = [
#     'DATA_VOLUME',
#     'ON_NET',
# # 'MONTANT',
# # 'FREQUENCE_RECH',
# # 'REVENUE',
# # 'ARPU_SEGMENT',
# # 'FREQUENCE',
#  'ORANGE',
#  'TIGO',
# # 'ZONE1',
# # 'ZONE2',
# # 'REGULARITY',
# ]
# poly = PolynomialFeatures(degree=3, interaction_only=True, include_bias=False)
# train_poly = poly.fit_transform(train_copy[numerical_cols])
# test_poly = poly.fit_transform(test_copy[numerical_cols])

# poly_columns = [f"poly_{i}" for i in range(train_poly.shape[1])]
# df_poly = pd.DataFrame(train_poly, columns=poly_columns)
# df_test_poly = pd.DataFrame(test_poly, columns=poly_columns)

# train_copy = pd.concat([train_copy, df_poly], axis=1)
# test_copy = pd.concat([test_copy, df_test_poly], axis=1)

# useful_cols += poly_columns

# for cat_col in cat_cols:
#     encoder = OneHotEncoder(handle_unknown='ignore')
#     unique_values = train_copy[cat_col].unique()

#     one_hot_encoded_cols = [f'{cat_col}_{i}' for i in range(len(unique_values))]
    
#     ohe_df = pd.DataFrame(encoder.fit_transform(train_copy[[cat_col]]).toarray(), columns=one_hot_encoded_cols)
#     ohe_df.index = train_copy.index
#     train_copy = train_copy.drop(cat_col, axis=1)
#     train_copy = pd.concat([train_copy, ohe_df], axis=1)        
#     print(f'[{cat_col}] xtrain transformed')

#     ohe_df = pd.DataFrame(encoder.transform(test_copy[[cat_col]]).toarray(), columns=one_hot_encoded_cols)
#     ohe_df.index = test_copy.index
#     test_copy = test_copy.drop(cat_col, axis=1)
#     test_copy = pd.concat([test_copy, ohe_df], axis=1)
#     print(f'[{cat_col}] xtest transformed')
    
#     useful_cols += one_hot_encoded_cols
#     useful_cols.remove(cat_col)

# final_predictions = []
# scores = []

# target = 'CHURN'

# for fold in tqdm(range(5), 'folds'):
#     xtrain = train_copy[train_copy['kfold'] != fold][useful_cols]
#     ytrain = train_copy[train_copy['kfold'] != fold][target]
    
#     xvalid = train_copy[train['kfold'] == fold][useful_cols]
#     yvalid = train_copy[train['kfold'] == fold][target]

#     xtest = test_copy[useful_cols]

#     model = LGBMClassifier(
#         n_estimators=1000,
#         random_state=42,
#     )
#     model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=False)
    
#     preds_valid = model.predict(xvalid)
#     test_preds = model.predict(xtest)
#     final_predictions.append(test_preds)
#     score = roc_auc_score(yvalid, preds_valid)
#     scores.append(score)
    
def run(trial):
    fold = 0
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    xtrain = train_copy[train_copy.kfold != fold][useful_cols]
    xvalid = train_copy[train_copy.kfold == fold][useful_cols]

    ytrain = train_copy[train_copy.kfold != fold]['CHURN']
    yvalid = train_copy[train_copy.kfold == fold]['CHURN']

    model = LGBMClassifier(
        random_state=42,
        n_estimators=7000,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)
    score = roc_auc_score(yvalid, preds_valid)
    return score

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(run, n_trials=10)

In [None]:
study.best_params

In [23]:
from xgboost import XGBClassifier

def run_xgb_optuna(trial):
    fold = 0
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    xtrain = train_copy[train_copy.kfold != fold][useful_cols]
    xvalid = train_copy[train_copy.kfold == fold][useful_cols]

    ytrain = train_copy[train_copy.kfold != fold]['CHURN']
    yvalid = train_copy[train_copy.kfold == fold]['CHURN']

    model = XGBClassifier(
        random_state=42,
        n_estimators=7000,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)
    score = roc_auc_score(yvalid, preds_valid)
    return score

In [27]:
xgb_study = optuna.create_study(direction="minimize")
xgb_study.optimize(run_xgb_optuna, n_trials=10)
print(xgb_study.best_params)

[32m[I 2021-09-11 01:27:10,203][0m A new study created in memory with name: no-name-665d20d9-61dc-460b-b848-db457a3a50f4[0m


[0]	validation_0-logloss:0.68149
[1000]	validation_0-logloss:0.25379
[2000]	validation_0-logloss:0.25319
[3000]	validation_0-logloss:0.25299
[4000]	validation_0-logloss:0.25289
[5000]	validation_0-logloss:0.25277
[6000]	validation_0-logloss:0.25271
[6999]	validation_0-logloss:0.25265


[32m[I 2021-09-11 03:01:33,334][0m Trial 0 finished with value: 0.7954320714993549 and parameters: {'learning_rate': 0.018907594739771442, 'reg_lambda': 0.08026291453520452, 'reg_alpha': 4.836923798243186e-08, 'subsample': 0.3938610342201946, 'colsample_bytree': 0.5068260136551423, 'max_depth': 2}. Best is trial 0 with value: 0.7954320714993549.[0m


[0]	validation_0-logloss:0.58525
[354]	validation_0-logloss:0.25377


[32m[I 2021-09-11 03:13:47,418][0m Trial 1 finished with value: 0.7941735109926839 and parameters: {'learning_rate': 0.17664658289943563, 'reg_lambda': 0.0004752770266892279, 'reg_alpha': 0.057120664597860804, 'subsample': 0.1751141830068263, 'colsample_bytree': 0.9339586497539522, 'max_depth': 5}. Best is trial 1 with value: 0.7941735109926839.[0m


[0]	validation_0-logloss:0.65560
[675]	validation_0-logloss:0.25177


[32m[I 2021-09-11 03:47:16,069][0m Trial 2 finished with value: 0.7927349963264996 and parameters: {'learning_rate': 0.05746047770798967, 'reg_lambda': 0.9292007699948006, 'reg_alpha': 3.164644611685049e-05, 'subsample': 0.7202855688350371, 'colsample_bytree': 0.902286297275291, 'max_depth': 6}. Best is trial 2 with value: 0.7927349963264996.[0m


[0]	validation_0-logloss:0.68712
[1000]	validation_0-logloss:0.25731
[2000]	validation_0-logloss:0.25600
[3000]	validation_0-logloss:0.25566
[4000]	validation_0-logloss:0.25551
[5000]	validation_0-logloss:0.25541
[6000]	validation_0-logloss:0.25537
[6999]	validation_0-logloss:0.25534


[32m[I 2021-09-11 05:00:24,304][0m Trial 3 finished with value: 0.7983046424621432 and parameters: {'learning_rate': 0.010041245886806825, 'reg_lambda': 5.016153632857239, 'reg_alpha': 0.033617106775221055, 'subsample': 0.17184616927494392, 'colsample_bytree': 0.7555828926003856, 'max_depth': 1}. Best is trial 2 with value: 0.7927349963264996.[0m


[0]	validation_0-logloss:0.60036
[502]	validation_0-logloss:0.25233


[32m[I 2021-09-11 05:18:04,196][0m Trial 4 finished with value: 0.7936248700025921 and parameters: {'learning_rate': 0.1500612420693415, 'reg_lambda': 0.00019797127807751045, 'reg_alpha': 0.0002144005512273368, 'subsample': 0.6433310356635589, 'colsample_bytree': 0.7375072273334858, 'max_depth': 5}. Best is trial 2 with value: 0.7927349963264996.[0m


[0]	validation_0-logloss:0.67750
[1000]	validation_0-logloss:0.25242
[2000]	validation_0-logloss:0.25209
[3000]	validation_0-logloss:0.25203
[3067]	validation_0-logloss:0.25203


[32m[I 2021-09-11 06:18:09,835][0m Trial 5 finished with value: 0.7939694871305556 and parameters: {'learning_rate': 0.024781603552755917, 'reg_lambda': 3.3064961948530582, 'reg_alpha': 2.6114243493137353e-05, 'subsample': 0.47137682114260615, 'colsample_bytree': 0.18761539248810677, 'max_depth': 5}. Best is trial 2 with value: 0.7927349963264996.[0m


[0]	validation_0-logloss:0.61438
[607]	validation_0-logloss:0.25216


[32m[I 2021-09-11 06:39:16,995][0m Trial 6 finished with value: 0.7931673728683969 and parameters: {'learning_rate': 0.1258680215182785, 'reg_lambda': 0.008220970922931697, 'reg_alpha': 7.158816638550568e-08, 'subsample': 0.8226281967433343, 'colsample_bytree': 0.6920058027399136, 'max_depth': 5}. Best is trial 2 with value: 0.7927349963264996.[0m


[0]	validation_0-logloss:0.68379
[1000]	validation_0-logloss:0.25457
[2000]	validation_0-logloss:0.25341
[3000]	validation_0-logloss:0.25309
[4000]	validation_0-logloss:0.25293
[5000]	validation_0-logloss:0.25278
[6000]	validation_0-logloss:0.25272
[6999]	validation_0-logloss:0.25266


[32m[I 2021-09-11 08:00:22,070][0m Trial 7 finished with value: 0.7950448515675372 and parameters: {'learning_rate': 0.015410942593942921, 'reg_lambda': 0.1749725455845635, 'reg_alpha': 60.7302175039379, 'subsample': 0.45616195291634365, 'colsample_bytree': 0.3813325494194799, 'max_depth': 2}. Best is trial 2 with value: 0.7927349963264996.[0m


[0]	validation_0-logloss:0.65328
[751]	validation_0-logloss:0.25183


[32m[I 2021-09-11 08:24:43,836][0m Trial 8 finished with value: 0.7923984926440721 and parameters: {'learning_rate': 0.06492027716998955, 'reg_lambda': 9.574177772304593e-05, 'reg_alpha': 0.394796743340963, 'subsample': 0.7396657363246338, 'colsample_bytree': 0.6147305790208815, 'max_depth': 6}. Best is trial 8 with value: 0.7923984926440721.[0m


[0]	validation_0-logloss:0.67260
[1000]	validation_0-logloss:0.25206
[2000]	validation_0-logloss:0.25193
[2985]	validation_0-logloss:0.25193


[32m[I 2021-09-11 09:31:08,580][0m Trial 9 finished with value: 0.7942637781369735 and parameters: {'learning_rate': 0.03132064404372426, 'reg_lambda': 1.1176152679148211e-06, 'reg_alpha': 0.0003264218372696942, 'subsample': 0.5930600752785693, 'colsample_bytree': 0.9085543387896339, 'max_depth': 4}. Best is trial 8 with value: 0.7923984926440721.[0m


{'learning_rate': 0.06492027716998955, 'reg_lambda': 9.574177772304593e-05, 'reg_alpha': 0.394796743340963, 'subsample': 0.7396657363246338, 'colsample_bytree': 0.6147305790208815, 'max_depth': 6}


In [28]:
# xgb_params = {
#     'learning_rate': 0.012077288295042267,
#     'reg_lambda': 0.0006873465542426026,
#     'reg_alpha': 1.3212946403386152e-06,
#     'subsample': 0.7581724871412163,
#     'colsample_bytree': 0.609065197494544,
#     'max_depth': 2
# }
xgb_params = {'learning_rate': 0.06492027716998955, 'reg_lambda': 9.574177772304593e-05, 'reg_alpha': 0.394796743340963, 'subsample': 0.7396657363246338, 'colsample_bytree': 0.6147305790208815, 'max_depth': 6}

final_predictions = []
scores = []

target = 'CHURN'

for fold in tqdm(range(5), 'folds'):
    xtrain = train_copy[train_copy['kfold'] != fold][useful_cols]
    ytrain = train_copy[train_copy['kfold'] != fold][target]
    
    xvalid = train_copy[train['kfold'] == fold][useful_cols]
    yvalid = train_copy[train['kfold'] == fold][target]

    xtest = test_copy[useful_cols]

#     model = LGBMClassifier(
#         n_estimators=1000,
#         random_state=42,
#     )
    xgb_model = XGBClassifier(
        n_estimators=7000,
        random_state=42,
        **xgb_params
    )
    xgb_model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=False)
    
    preds_valid = xgb_model.predict(xvalid)
    test_preds = xgb_model.predict(xtest)
    final_predictions.append(test_preds)
    score = roc_auc_score(yvalid, preds_valid)
    scores.append(score)
    print(fold, score)

print(np.mean(scores), np.std(scores))

# [6e843db7-46d0-4d55-9b7f-225d5b022227] 0.7920062707849765 0.000711474747631675 - xgb - new ds - optuna minimize

folds:   0%|          | 0/5 [00:00<?, ?it/s]



0 0.7923984926440721




1 0.7921119240890732




2 0.7917168672681814




3 0.7929651662720968




4 0.7908389036514595
0.7920062707849765 0.000711474747631675


In [29]:
preds = np.mean(np.column_stack(final_predictions), axis=1)

submission = pd.read_csv('./data/SampleSubmission.csv')
submission.CHURN = preds
# [befcaa18-90a9-4a54-a901-642c6bf6fe5c] xgb - new ds - optuna
submission.to_csv('./data/submission-xgb-6e843db7-46d0-4d55-9b7f-225d5b022227.csv', index=False) 