In [16]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from lightgbm import LGBMClassifier
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler

tqdm.pandas()

In [24]:
#import data
df_train = pd.read_csv('./data/Train_folds.zip')
df_test=  pd.read_csv('./data/Test.zip')
submission = pd.read_csv('./data/SampleSubmission.csv')

In [29]:
train = df_train.copy()
test = df_test.copy()

cat_cols = [
    # 'user_id',
    'REGION',
    'TENURE',
    # 'MRG',  # constant
    # 'TOP_PACK',  # wtf column
]

num_cols = [
    'MONTANT',
    'FREQUENCE_RECH',
    'REVENUE',
    'ARPU_SEGMENT',
    'FREQUENCE',
    'DATA_VOLUME',
    'ON_NET', 
    'ORANGE',
    'TIGO',
    'ZONE1',
    'ZONE2',
    'REGULARITY',
    'FREQ_TOP_PACK',
]

target = 'CHURN'

mapping = {
    'D 3-6 month': 1,
    'E 6-9 month': 2,
    'F 9-12 month': 3,
    'G 12-15 month': 4,
    'H 15-18 month': 5,
    'I 18-21 month': 6,
    'J 21-24 month': 7,
    'K > 24 month': 8,
}

train['TENURE'] = train['TENURE'].map(mapping) 
test['TENURE'] = test['TENURE'].map(mapping)

train['REGION'] = train['REGION'].fillna('DAKAR')
test['REGION'] = test['REGION'].fillna('DAKAR')

for col in tqdm(num_cols):
    col_mean = train[col].mean()
    train[col] = train[col].fillna(col_mean)
    test[col] = test[col].fillna(col_mean)
    train[col] = np.log1p(train[col])
    test[col] = np.log1p(test[col])

  0%|          | 0/13 [00:00<?, ?it/s]

In [30]:
churn_by_tenure = pd.read_csv('./data/agg_by_tenure_churn.csv')
d = churn_by_tenure[['TENURE', 'CHURN_mean']].to_dict('list')
d = dict(zip(d['TENURE'], d[f'CHURN_mean']))
train['TENURE_CHURN_mean'] = train.progress_apply(lambda x: d[x['TENURE']], axis=1)
test['TENURE_CHURN_mean'] = test.progress_apply(lambda x: d[x['TENURE']], axis=1)

churn_by_region = pd.read_csv('./data/agg_by_region_churn.csv')
d = churn_by_region[['REGION', 'CHURN_mean']].to_dict('list')
d = dict(zip(d['REGION'], d[f'CHURN_mean']))
train['REGION_CHURN_mean'] = train.progress_apply(lambda x: d[x['REGION']], axis=1)
test['REGION_CHURN_mean'] = test.progress_apply(lambda x: d[x['REGION']], axis=1)

  0%|          | 0/2154048 [00:00<?, ?it/s]

  0%|          | 0/380127 [00:00<?, ?it/s]

  0%|          | 0/2154048 [00:00<?, ?it/s]

  0%|          | 0/380127 [00:00<?, ?it/s]

In [31]:
useful_cols = [
    'REGION',
    'TENURE',
    # 'MRG',  # constant
    # 'TOP_PACK',  # wtf column
    'MONTANT',
    'FREQUENCE_RECH',
    'REVENUE',
    'ARPU_SEGMENT',
    'FREQUENCE',
    'DATA_VOLUME',
    'ON_NET', 
    'ORANGE',
    'TIGO',
    'ZONE1',
    'ZONE2',
    'REGULARITY',
    'FREQ_TOP_PACK',
    'TENURE_CHURN_mean',
    'REGION_CHURN_mean'
]

final_predictions = []
scores = []

for cat_col in cat_cols:
    encoder = OneHotEncoder(handle_unknown='ignore')
    unique_values = train[cat_col].unique()

    one_hot_encoded_cols = [f'{cat_col}_{i}' for i in range(len(unique_values))]
    
    ohe_df = pd.DataFrame(encoder.fit_transform(train[[cat_col]]).toarray(), columns=one_hot_encoded_cols)
    ohe_df.index = train.index
    train = train.drop(cat_col, axis=1)
    train = pd.concat([train, ohe_df], axis=1)        
    print(f'[{cat_col}] xtrain transformed')

    ohe_df = pd.DataFrame(encoder.transform(test[[cat_col]]).toarray(), columns=one_hot_encoded_cols)
    ohe_df.index = test.index
    test = test.drop(cat_col, axis=1)
    test = pd.concat([test, ohe_df], axis=1)
    print(f'[{cat_col}] xtest transformed')
    
    useful_cols += one_hot_encoded_cols
    useful_cols.remove(cat_col)
    
# scaler = StandardScaler()
# train[num_cols] = scaler.fit_transform(train[num_cols])
# test[num_cols] = scaler.transform(test[num_cols])

final_predictions = []
scores = []

target = 'CHURN'

for fold in tqdm(range(5), 'folds'):
    xtrain = train[train['kfold'] != fold][useful_cols]
    ytrain = train[train['kfold'] != fold][target]
    
    xvalid = train[train['kfold'] == fold][useful_cols]
    yvalid = train[train['kfold'] == fold][target]
    
    xtest = test[useful_cols]

    model = LGBMClassifier(
        n_estimators=1000,
        random_state=42, 
#         **{
#             'learning_rate': 0.057192504267984716,
#             'reg_lambda': 0.2020682138073545,
#             'reg_alpha': 9.291468887449219e-07,
#             'subsample': 0.8824763845186918,
#             'colsample_bytree': 0.7382762785360262,
#             'max_depth': 6
#         }
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)])
    
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    score = roc_auc_score(yvalid, preds_valid)
    scores.append(score)
    print(fold, score)    

print(np.mean(scores), np.std(scores))

# 0.7696186855892555 0.00043098014634677365 fillna через REGION
# 0.7694106138854746 0.000588962476667447 fillna через TENURE
# 0.7701952731313458 0.0006247026357018513 fillna через mean по колонке
# 0.769901864294696 0.000610047154013374 fillna через mean по колонке & ohe for region
# 0.769901864294696 0.000610047154013374 fillna через mean по колонке & ohe for region + StandardScaler for Tenure (same as prev)
# 0.770010208315598 0.0005090535431930928 fillna через mean по колонке & ohe for region & tenure [private: 0.853185019447002]
# 0.770008209799155 0.0005120012284348264 fillna через mean по колонке & ohe for region & tenure for whole train and test datasets & StScaler on whole ds
# 0.7701761776716778 0.0007820978999800708 fillna через mean по колонке & ohe for region & tenure for whole train and test datasets & StScaler on whole ds + target encoding by reg & ten [private 0.862163874472749]
# 0.7690741534134118 0.0008824035170153271 fillna через mean по колонке & ohe for region & tenure for whole train and test datasets & StScaler on whole ds + target encoding by reg & ten + изменил фолды (!= и ==)
# 0.7690825678568933 0.0008789684650258934  fillna через mean по колонке & ohe for region & tenure for whole train and test datasets & StScaler on whole ds + target encoding by reg & ten + изменил фолды (!= и ==)
# 0.7698245780943753 0.000724931679586648 lgb - fillna через mean по колонке & ohe for region & tenure for whole train and test datasets & StScaler on whole ds + target encoding by reg & ten + изменил фолды (!= и ==)
# 0.7694104784482177 0.0013440865822247955 lgb - fillna через mean по колонке & ohe for region & tenure for whole train and test datasets & StScaler on whole ds + target encoding by reg & ten + изменил фолды (!= и ==) + optuna params [0.760814445393041]
# 0.7696374297117283 0.0010216807303939667 lgb - fillna через mean по колонке & ohe for region & tenure for whole train and test datasets & StScaler on whole ds + target encoding by reg & ten + изменил фолды (!= и ==) + num cols log1

[REGION] xtrain transformed
[REGION] xtest transformed
[TENURE] xtrain transformed
[TENURE] xtest transformed


folds:   0%|          | 0/5 [00:00<?, ?it/s]

[1]	valid_0's binary_logloss: 0.443644
Training until validation scores don't improve for 300 rounds
[2]	valid_0's binary_logloss: 0.414895
[3]	valid_0's binary_logloss: 0.39266
[4]	valid_0's binary_logloss: 0.374827
[5]	valid_0's binary_logloss: 0.360231
[6]	valid_0's binary_logloss: 0.34812
[7]	valid_0's binary_logloss: 0.337935
[8]	valid_0's binary_logloss: 0.329322
[9]	valid_0's binary_logloss: 0.321979
[10]	valid_0's binary_logloss: 0.315719
[11]	valid_0's binary_logloss: 0.310342
[12]	valid_0's binary_logloss: 0.305697
[13]	valid_0's binary_logloss: 0.30169
[14]	valid_0's binary_logloss: 0.298214
[15]	valid_0's binary_logloss: 0.295218
[16]	valid_0's binary_logloss: 0.292608
[17]	valid_0's binary_logloss: 0.290303
[18]	valid_0's binary_logloss: 0.288299
[19]	valid_0's binary_logloss: 0.286557
[20]	valid_0's binary_logloss: 0.28503
[21]	valid_0's binary_logloss: 0.283703
[22]	valid_0's binary_logloss: 0.282533
[23]	valid_0's binary_logloss: 0.281512
[24]	valid_0's binary_logloss: 

[203]	valid_0's binary_logloss: 0.278696
[204]	valid_0's binary_logloss: 0.277976
[205]	valid_0's binary_logloss: 0.276641
[206]	valid_0's binary_logloss: 0.276325
[207]	valid_0's binary_logloss: 0.276164
[208]	valid_0's binary_logloss: 0.276197
[209]	valid_0's binary_logloss: 0.276194
[210]	valid_0's binary_logloss: 0.276192
[211]	valid_0's binary_logloss: 0.276189
[212]	valid_0's binary_logloss: 0.276748
[213]	valid_0's binary_logloss: 0.276426
[214]	valid_0's binary_logloss: 0.27635
[215]	valid_0's binary_logloss: 0.276345
[216]	valid_0's binary_logloss: 0.27634
[217]	valid_0's binary_logloss: 0.276338
[218]	valid_0's binary_logloss: 0.276338
[219]	valid_0's binary_logloss: 0.276333
[220]	valid_0's binary_logloss: 0.276328
[221]	valid_0's binary_logloss: 0.276326
[222]	valid_0's binary_logloss: 0.276324
[223]	valid_0's binary_logloss: 0.276323
[224]	valid_0's binary_logloss: 0.276323
[225]	valid_0's binary_logloss: 0.276323
[226]	valid_0's binary_logloss: 0.278803
[227]	valid_0's bi

[405]	valid_0's binary_logloss: 0.285019
[406]	valid_0's binary_logloss: 0.285011
[407]	valid_0's binary_logloss: 0.28632
[408]	valid_0's binary_logloss: 0.284043
[409]	valid_0's binary_logloss: 0.284044
[410]	valid_0's binary_logloss: 0.284664
[411]	valid_0's binary_logloss: 0.283832
[412]	valid_0's binary_logloss: 0.285879
[413]	valid_0's binary_logloss: 0.284393
[414]	valid_0's binary_logloss: 0.28616
Early stopping, best iteration is:
[114]	valid_0's binary_logloss: 0.273436
0 0.7697445403580361
[1]	valid_0's binary_logloss: 0.443328
Training until validation scores don't improve for 300 rounds
[2]	valid_0's binary_logloss: 0.414598
[3]	valid_0's binary_logloss: 0.392349
[4]	valid_0's binary_logloss: 0.374502
[5]	valid_0's binary_logloss: 0.359872
[6]	valid_0's binary_logloss: 0.34774
[7]	valid_0's binary_logloss: 0.337525
[8]	valid_0's binary_logloss: 0.328914
[9]	valid_0's binary_logloss: 0.32154
[10]	valid_0's binary_logloss: 0.315237
[11]	valid_0's binary_logloss: 0.309852
[12]

[191]	valid_0's binary_logloss: 0.273522
[192]	valid_0's binary_logloss: 0.273516
[193]	valid_0's binary_logloss: 0.273488
[194]	valid_0's binary_logloss: 0.273486
[195]	valid_0's binary_logloss: 0.273578
[196]	valid_0's binary_logloss: 0.274951
[197]	valid_0's binary_logloss: 0.273947
[198]	valid_0's binary_logloss: 0.273938
[199]	valid_0's binary_logloss: 0.274219
[200]	valid_0's binary_logloss: 0.274187
[201]	valid_0's binary_logloss: 0.276368
[202]	valid_0's binary_logloss: 0.274789
[203]	valid_0's binary_logloss: 0.275131
[204]	valid_0's binary_logloss: 0.275033
[205]	valid_0's binary_logloss: 0.275038
[206]	valid_0's binary_logloss: 0.274851
[207]	valid_0's binary_logloss: 0.274848
[208]	valid_0's binary_logloss: 0.274842
[209]	valid_0's binary_logloss: 0.276116
[210]	valid_0's binary_logloss: 0.276203
[211]	valid_0's binary_logloss: 0.275931
[212]	valid_0's binary_logloss: 0.27639
[213]	valid_0's binary_logloss: 0.275764
[214]	valid_0's binary_logloss: 0.275686
[215]	valid_0's b

[394]	valid_0's binary_logloss: 0.280127
[395]	valid_0's binary_logloss: 0.280566
[396]	valid_0's binary_logloss: 0.280479
[397]	valid_0's binary_logloss: 0.280394
[398]	valid_0's binary_logloss: 0.280536
[399]	valid_0's binary_logloss: 0.281757
[400]	valid_0's binary_logloss: 0.280696
[401]	valid_0's binary_logloss: 0.280702
[402]	valid_0's binary_logloss: 0.2807
[403]	valid_0's binary_logloss: 0.280698
[404]	valid_0's binary_logloss: 0.280699
[405]	valid_0's binary_logloss: 0.280701
[406]	valid_0's binary_logloss: 0.281417
[407]	valid_0's binary_logloss: 0.280696
[408]	valid_0's binary_logloss: 0.280697
[409]	valid_0's binary_logloss: 0.280693
[410]	valid_0's binary_logloss: 0.280694
[411]	valid_0's binary_logloss: 0.280695
[412]	valid_0's binary_logloss: 0.280697
[413]	valid_0's binary_logloss: 0.280698
[414]	valid_0's binary_logloss: 0.280701
[415]	valid_0's binary_logloss: 0.280701
Early stopping, best iteration is:
[115]	valid_0's binary_logloss: 0.272474
1 0.7695969085275665
[1]

[5]	valid_0's binary_logloss: 0.359288
[6]	valid_0's binary_logloss: 0.347185
[7]	valid_0's binary_logloss: 0.337024
[8]	valid_0's binary_logloss: 0.328417
[9]	valid_0's binary_logloss: 0.321101
[10]	valid_0's binary_logloss: 0.314838
[11]	valid_0's binary_logloss: 0.309479
[12]	valid_0's binary_logloss: 0.304846
[13]	valid_0's binary_logloss: 0.300835
[14]	valid_0's binary_logloss: 0.297378
[15]	valid_0's binary_logloss: 0.29434
[16]	valid_0's binary_logloss: 0.291714
[17]	valid_0's binary_logloss: 0.28941
[18]	valid_0's binary_logloss: 0.287417
[19]	valid_0's binary_logloss: 0.28567
[20]	valid_0's binary_logloss: 0.284153
[21]	valid_0's binary_logloss: 0.282819
[22]	valid_0's binary_logloss: 0.281663
[23]	valid_0's binary_logloss: 0.28065
[24]	valid_0's binary_logloss: 0.279764
[25]	valid_0's binary_logloss: 0.278975
[26]	valid_0's binary_logloss: 0.278294
[27]	valid_0's binary_logloss: 0.277674
[28]	valid_0's binary_logloss: 0.27713
[29]	valid_0's binary_logloss: 0.27665
[30]	valid_

[209]	valid_0's binary_logloss: 0.275211
[210]	valid_0's binary_logloss: 0.274517
[211]	valid_0's binary_logloss: 0.274504
[212]	valid_0's binary_logloss: 0.274499
[213]	valid_0's binary_logloss: 0.274493
[214]	valid_0's binary_logloss: 0.274483
[215]	valid_0's binary_logloss: 0.274486
[216]	valid_0's binary_logloss: 0.274869
[217]	valid_0's binary_logloss: 0.275396
[218]	valid_0's binary_logloss: 0.274918
[219]	valid_0's binary_logloss: 0.275143
[220]	valid_0's binary_logloss: 0.275028
[221]	valid_0's binary_logloss: 0.275751
[222]	valid_0's binary_logloss: 0.274992
[223]	valid_0's binary_logloss: 0.275847
[224]	valid_0's binary_logloss: 0.276318
[225]	valid_0's binary_logloss: 0.275507
[226]	valid_0's binary_logloss: 0.275981
[227]	valid_0's binary_logloss: 0.27945
[228]	valid_0's binary_logloss: 0.278713
[229]	valid_0's binary_logloss: 0.275896
[230]	valid_0's binary_logloss: 0.276269
[231]	valid_0's binary_logloss: 0.277473
[232]	valid_0's binary_logloss: 0.27891
[233]	valid_0's bi

[410]	valid_0's binary_logloss: 0.286274
[411]	valid_0's binary_logloss: 0.281641
[412]	valid_0's binary_logloss: 0.310599
[413]	valid_0's binary_logloss: 0.285408
[414]	valid_0's binary_logloss: 0.283242
[415]	valid_0's binary_logloss: 0.283208
Early stopping, best iteration is:
[115]	valid_0's binary_logloss: 0.272479
3 0.771453746813668
[1]	valid_0's binary_logloss: 0.442622
Training until validation scores don't improve for 300 rounds
[2]	valid_0's binary_logloss: 0.414165
[3]	valid_0's binary_logloss: 0.392166
[4]	valid_0's binary_logloss: 0.374501
[5]	valid_0's binary_logloss: 0.360047
[6]	valid_0's binary_logloss: 0.348073
[7]	valid_0's binary_logloss: 0.338034
[8]	valid_0's binary_logloss: 0.32954
[9]	valid_0's binary_logloss: 0.322322
[10]	valid_0's binary_logloss: 0.316161
[11]	valid_0's binary_logloss: 0.310885
[12]	valid_0's binary_logloss: 0.306342
[13]	valid_0's binary_logloss: 0.302419
[14]	valid_0's binary_logloss: 0.299036
[15]	valid_0's binary_logloss: 0.29607
[16]	va

[197]	valid_0's binary_logloss: 0.276839
[198]	valid_0's binary_logloss: 0.276839
[199]	valid_0's binary_logloss: 0.276835
[200]	valid_0's binary_logloss: 0.276829
[201]	valid_0's binary_logloss: 0.27683
[202]	valid_0's binary_logloss: 0.276833
[203]	valid_0's binary_logloss: 0.27683
[204]	valid_0's binary_logloss: 0.276829
[205]	valid_0's binary_logloss: 0.276824
[206]	valid_0's binary_logloss: 0.276826
[207]	valid_0's binary_logloss: 0.276817
[208]	valid_0's binary_logloss: 0.276814
[209]	valid_0's binary_logloss: 0.27681
[210]	valid_0's binary_logloss: 0.276809
[211]	valid_0's binary_logloss: 0.276797
[212]	valid_0's binary_logloss: 0.276792
[213]	valid_0's binary_logloss: 0.276791
[214]	valid_0's binary_logloss: 0.27727
[215]	valid_0's binary_logloss: 0.278491
[216]	valid_0's binary_logloss: 0.277569
[217]	valid_0's binary_logloss: 0.280232
[218]	valid_0's binary_logloss: 0.284822
[219]	valid_0's binary_logloss: 0.278733
[220]	valid_0's binary_logloss: 0.277716
[221]	valid_0's bina

In [22]:
preds = np.mean(np.column_stack(final_predictions), axis=1)

submission = pd.read_csv('./data/SampleSubmission.csv')
submission.CHURN = preds
submission.to_csv("./data/submission-lgb-5-folds-1000-est-42-rs-TENURE-mapped-REGION-fillna-DAKAR-REG-TEN-OHE-StScaler-num-whole-dataset-fillna-by-mean-TarEncRegTen-invert-folds-optuna.csv", index=False)