In [1]:
import re
from typing import List

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import optuna
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

tqdm.pandas()

In [2]:
# XGB
# fillna числовых колонок как средние значения по соотв колонке,
# TENURE & REGION OneHotEncoded 
# StScaler on whole dataset 
# target endocding by region and tenure

#import data
train = pd.read_csv('./data/Train_folds.zip')
# train = train[train['kfold'].isin([0, 1, 2])]

test =  pd.read_csv('./data/Test.zip')
submission = pd.read_csv('./data/SampleSubmission.csv')

cat_cols = [
    'REGION',
    'TENURE',
    'TOP_PACK'
]

num_cols = [
    'MONTANT',
    'FREQUENCE_RECH',
    'REVENUE',
    'ARPU_SEGMENT',
    'FREQUENCE',
    'DATA_VOLUME',
    'ON_NET', 
    'ORANGE',
    'TIGO',
    'ZONE1',
    'ZONE2',
    'REGULARITY',
    'FREQ_TOP_PACK',
]

target = 'CHURN'

mapping = {
    'D 3-6 month': 1,
    'E 6-9 month': 2,
    'F 9-12 month': 3,
    'G 12-15 month': 4,
    'H 15-18 month': 5,
    'I 18-21 month': 6,
    'J 21-24 month': 7,
    'K > 24 month': 8,
    'OTHER': 9
}

train['TOP_PACK'] = train['TOP_PACK'].fillna('OTHER')
test['TOP_PACK'] = test['TOP_PACK'].fillna('OTHER')

train['TENURE'] = train['TENURE'].fillna('OTHER')
test['TENURE'] = test['TENURE'].fillna('OTHER')
train['TENURE'] = train['TENURE'].map(mapping)
test['TENURE'] = test['TENURE'].map(mapping)

train['REGION'] = train['REGION'].fillna('OTHER')
test['REGION'] = test['REGION'].fillna('OTHER')

for nc in tqdm(num_cols):
    mean = train[nc].mean()
    train[nc] = train[nc].fillna(mean)
    test[nc] = test[nc].fillna(mean)
    
train.shape, test.shape

  0%|          | 0/13 [00:00<?, ?it/s]

((2154048, 20), (380127, 18))

In [3]:
churn_by_tenure = pd.read_csv('./data/agg_by_tenure_churn.csv')
churn_by_tenure = churn_by_tenure.append(pd.DataFrame({'TENURE': [9], 'CHURN_mean': 0, 'CHURN_median': 0}))

train = pd.merge(train, churn_by_tenure[['TENURE', 'CHURN_mean']], left_on='TENURE', right_on='TENURE', how='left')
train = train.rename({'CHURN_mean': 'MEAN_CHURN_BY_TENURE'}, axis='columns')

test = pd.merge(test, churn_by_tenure[['TENURE', 'CHURN_mean']], left_on='TENURE', right_on='TENURE', how='left')
test = test.rename({'CHURN_mean': 'MEAN_CHURN_BY_TENURE'}, axis='columns')

train.shape, test.shape

((2154048, 21), (380127, 19))

In [4]:
churn_by_region = pd.read_csv('./data/agg_by_region_churn.csv')

vc = train[train['REGION'] == 'OTHER']['CHURN'].value_counts()
churn_by_region_mean = vc[1]/(vc[0]+vc[1])
churn_by_region = churn_by_region.append(pd.DataFrame({'REGION': ['OTHER'], 'CHURN_mean': churn_by_region_mean, 'CHURN_median': 0}))

train = pd.merge(train, churn_by_region[['REGION', 'CHURN_mean']], left_on='REGION', right_on='REGION', how='left')
train = train.rename({'CHURN_mean': 'MEAN_CHURN_BY_REGION'}, axis='columns')

test = pd.merge(test, churn_by_region[['REGION', 'CHURN_mean']], left_on='REGION', right_on='REGION', how='left')
test = test.rename({'CHURN_mean': 'MEAN_CHURN_BY_REGION'}, axis='columns')

train.shape, test.shape

((2154048, 22), (380127, 20))

In [5]:
churn_by_top_pack = train[['TOP_PACK', 'CHURN']].groupby('TOP_PACK').agg({'CHURN': ['mean', 'median']})
churn_by_top_pack.columns = ['_'.join(col).strip() for col in churn_by_top_pack.columns.values]
churn_by_top_pack_mean = np.mean(train[train['TOP_PACK'] == 'OTHER']['CHURN'])
churn_by_top_pack = churn_by_top_pack.reset_index()

d = {
    'TOP_PACK': [],
    'CHURN_mean': [],
    'CHURN_median': []
}

for tp in test['TOP_PACK'].unique():
    if tp not in churn_by_top_pack['TOP_PACK'].unique():
        d['TOP_PACK'].append(tp)
        d['CHURN_mean'].append(churn_by_top_pack_mean)
        d['CHURN_median'].append(0)
    
churn_by_top_pack = churn_by_top_pack.append(pd.DataFrame(d))

train = pd.merge(train, churn_by_top_pack[['TOP_PACK', 'CHURN_mean']], left_on='TOP_PACK', right_on='TOP_PACK', how='left')
train = train.rename({'CHURN_mean': 'MEAN_CHURN_BY_TOP_PACK'}, axis='columns')

test = pd.merge(test, churn_by_top_pack[['TOP_PACK', 'CHURN_mean']], left_on='TOP_PACK', right_on='TOP_PACK', how='left')
test = test.rename({'CHURN_mean': 'MEAN_CHURN_BY_TOP_PACK'}, axis='columns')

train.shape, test.shape

((2154048, 23), (380127, 21))

In [6]:
train.head()

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,...,ZONE2,MRG,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN,kfold,MEAN_CHURN_BY_TENURE,MEAN_CHURN_BY_REGION,MEAN_CHURN_BY_TOP_PACK
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,FATICK,8,4250.0,15.0,4251.0,1417.0,17.0,4.0,388.0,...,2.0,NO,54,On net 200F=Unlimited _call24H,8.0,0,1,0.017714,0.014196,0.018976
1,00000cb4a5d760de88fecb38e2f71b7bec52e834,OTHER,6,5532.116998,11.52912,5510.810334,1836.942894,13.978141,3366.450167,277.68914,...,7.553309,NO,4,OTHER,9.272461,1,2,0.024106,0.447987,0.391899
2,00001654a9d9f96303d9969d0a4a851714a4bb57,OTHER,8,3600.0,2.0,1020.0,340.0,2.0,3366.450167,90.0,...,7.553309,NO,17,On-net 1000F=10MilF;10d,1.0,0,1,0.017714,0.447987,0.014089
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,DAKAR,8,13500.0,15.0,13502.0,4501.0,18.0,43804.0,41.0,...,7.553309,NO,62,"Data:1000F=5GB,7d",11.0,0,1,0.017714,0.019235,0.101716
4,000028d9e13a595abe061f9b58f3d76ab907850f,DAKAR,8,1000.0,1.0,985.0,328.0,1.0,3366.450167,39.0,...,7.553309,NO,11,Mixt 250F=Unlimited_call24H,2.0,0,4,0.017714,0.019235,0.042141


In [7]:
useful_cols = [
    'REGION',
    'TENURE',
    # 'MRG',  # constant
    'TOP_PACK',  # wtf column
    'MONTANT',
    'FREQUENCE_RECH',
    'REVENUE',
    'ARPU_SEGMENT',
    'FREQUENCE',
    'DATA_VOLUME',
    'ON_NET', 
    'ORANGE',
    'TIGO',
    'ZONE1',
    'ZONE2',
    'REGULARITY',
    'FREQ_TOP_PACK',
    'MEAN_CHURN_BY_TENURE',
    'MEAN_CHURN_BY_REGION',
    'MEAN_CHURN_BY_TOP_PACK'
]

for cat_col in cat_cols:
    encoder = OneHotEncoder(handle_unknown='ignore')
    unique_values = train[cat_col].unique()

    one_hot_encoded_cols = [f'{cat_col}_{i}' for i in range(len(unique_values))]
    
    ohe_df = pd.DataFrame(encoder.fit_transform(train[[cat_col]]).toarray(), columns=one_hot_encoded_cols)
    ohe_df.index = train.index
    train = train.drop(cat_col, axis=1)
    train = pd.concat([train, ohe_df], axis=1)        
    print(f'[{cat_col}] xtrain transformed')

    ohe_df = pd.DataFrame(encoder.transform(test[[cat_col]]).toarray(), columns=one_hot_encoded_cols)
    ohe_df.index = test.index
    test = test.drop(cat_col, axis=1)
    test = pd.concat([test, ohe_df], axis=1)
    print(f'[{cat_col}] xtest transformed')
    
    useful_cols += one_hot_encoded_cols
    useful_cols.remove(cat_col)
    
scaler = StandardScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

[REGION] xtrain transformed
[REGION] xtest transformed
[TENURE] xtrain transformed
[TENURE] xtest transformed
[TOP_PACK] xtrain transformed
[TOP_PACK] xtest transformed


In [8]:
poly = PolynomialFeatures(degree=3, interaction_only=True, include_bias=False)
train_poly = poly.fit_transform(train[num_cols])
test_poly = poly.fit_transform(test[num_cols])

poly_columns = [f'poly_{x.replace(" ", "__")}' for x in poly.get_feature_names(num_cols)] # [f"poly_{i}" for i in range(train_poly.shape[1])]
df_poly = pd.DataFrame(train_poly, columns=poly_columns, dtype=np.float32)
df_test_poly = pd.DataFrame(test_poly, columns=poly_columns, dtype=np.float32)

train = pd.concat([train, df_poly], axis=1)
test = pd.concat([test, df_test_poly], axis=1)

useful_cols += poly_columns

train.head()

Unnamed: 0,user_id,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,...,poly_TIGO__ZONE1__ZONE2,poly_TIGO__ZONE1__REGULARITY,poly_TIGO__ZONE1__FREQ_TOP_PACK,poly_TIGO__ZONE2__REGULARITY,poly_TIGO__ZONE2__FREQ_TOP_PACK,poly_TIGO__REGULARITY__FREQ_TOP_PACK,poly_ZONE1__ZONE2__REGULARITY,poly_ZONE1__ZONE2__FREQ_TOP_PACK,poly_ZONE1__REGULARITY__FREQ_TOP_PACK,poly_ZONE2__REGULARITY__FREQ_TOP_PACK
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,-0.2238504,0.3246513,-0.2152853,-0.2152889,0.252579,-0.3546942,0.1586511,-0.3153654,-0.5490724,...,-0.2241625,0.396786,-0.04631197,0.420789,-0.04911356,0.08693501,0.4754966,-0.0554989,0.09823759,0.1041803
1,00000cb4a5d760de88fecb38e2f71b7bec52e834,4.763779e-16,4.984585e-16,1.554209e-16,3.496977e-16,0.0,-4.7969860000000006e-17,8.175326e-17,9.068654000000001e-17,1.764598e-16,...,-0.0,-2.9261220000000003e-32,0.0,4.006599e-32,-0.0,-3.612546e-32,3.490163e-32,-0.0,-3.146902e-32,4.308902e-32
2,00001654a9d9f96303d9969d0a4a851714a4bb57,-0.3373367,-0.8913132,-0.7674215,-0.7674261,-1.00118,-4.7969860000000006e-17,-0.269938,-0.3153654,-0.4000654,...,1.294332e-32,3.0469520000000005e-17,5.4348530000000004e-17,-4.172045e-17,-7.441683e-17,-0.1751826,1.6030000000000002e-32,2.859275e-32,6.730939e-17,-9.216352000000001e-17
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,1.391147,0.3246513,1.365591,1.365761,0.336163,4.265629,-0.3404107,0.04199848,-0.5242379,...,1.696068e-32,-1.22781e-16,-1.48723e-17,1.681181e-16,2.036393e-17,-0.1474176,-4.929486e-32,-5.971025e-33,4.322517e-17,-5.918616000000001e-17
4,000028d9e13a595abe061f9b58f3d76ab907850f,-0.7912821,-0.9848489,-0.7734026,-0.773578,-1.084764,-4.7969860000000006e-17,-0.3432872,-0.4557583,1.764598e-16,...,-0.0,-2.074179e-32,-2.107411e-32,2.840073e-32,2.885577e-32,1.048379e-16,2.473999e-32,2.513637e-32,9.132468e-17,-1.250465e-16


In [15]:
original_columns = [x for x in train.columns if not x.startswith(('poly', 'MEAN_', 'REGION_', 'TENURE_', 'TOP_PACK_'))]
train[original_columns]

Unnamed: 0,user_id,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,MRG,REGULARITY,FREQ_TOP_PACK,CHURN,kfold
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,-2.238504e-01,3.246513e-01,-2.152853e-01,-2.152889e-01,0.252579,-3.546942e-01,1.586511e-01,-3.153654e-01,-5.490724e-01,-6.204583e-01,-6.579922e-01,NO,1.164700,-1.359412e-01,0,1
1,00000cb4a5d760de88fecb38e2f71b7bec52e834,4.763779e-16,4.984585e-16,1.554209e-16,3.496977e-16,0.000000,-4.796986e-17,8.175326e-17,9.068654e-17,1.764598e-16,1.537148e-16,-2.104743e-16,NO,-1.078775,1.897739e-16,1,2
2,00001654a9d9f96303d9969d0a4a851714a4bb57,-3.373367e-01,-8.913132e-01,-7.674215e-01,-7.674261e-01,-1.001180,-4.796986e-17,-2.699380e-01,-3.153654e-01,-4.000654e-01,1.537148e-16,-2.104743e-16,NO,-0.495472,-8.837738e-01,0,1
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,1.391147e+00,3.246513e-01,1.365591e+00,1.365761e+00,0.336163,4.265629e+00,-3.404107e-01,4.199848e-02,-5.242379e-01,1.537148e-16,-2.104743e-16,NO,1.523656,1.845585e-01,0,1
4,000028d9e13a595abe061f9b58f3d76ab907850f,-7.912821e-01,-9.848489e-01,-7.734026e-01,-7.735780e-01,-1.084764,-4.796986e-17,-3.432872e-01,-4.557583e-01,1.764598e-16,1.537148e-16,-2.104743e-16,NO,-0.764689,-7.769406e-01,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2154043,ffffe85215ddc71a84f95af0afb0deeea90e6967,4.763779e-16,4.984585e-16,1.554209e-16,3.496977e-16,0.000000,-4.796986e-17,8.175326e-17,9.068654e-17,1.764598e-16,1.537148e-16,-2.104743e-16,NO,-0.989036,1.897739e-16,0,2
2154044,ffffeaaa9289cdba0ac000f0ab4b48f4aa74ed15,9.914917e-02,3.246513e-01,4.941878e-02,4.924485e-02,0.085411,-2.896089e-01,-3.619840e-01,-3.536544e-01,4.194734e-01,1.537148e-16,-2.104743e-16,NO,1.209569,-2.910794e-02,0,3
2154045,fffff172fda1b4bb38a95385951908bb92379809,4.763779e-16,4.984585e-16,1.554209e-16,3.496977e-16,0.000000,-4.796986e-17,8.175326e-17,9.068654e-17,1.764598e-16,1.537148e-16,-2.104743e-16,NO,-1.213384,1.897739e-16,1,2
2154046,fffff5911296937a37f09a37a549da2e0dad6dbb,7.800672e-01,-4.949161e-02,2.749897e-01,2.748162e-01,-0.081757,-4.796986e-17,-3.993777e-01,2.844954e-01,-2.510583e-01,1.537148e-16,-2.104743e-16,NO,-0.001907,2.913918e-01,0,4


In [47]:
for col in [x for x in original_columns if x != 'user_id' and x != 'MRG']:
    print(f'{col}:', train[[col]].corrwith(train['CHURN']).iloc[-1])

MONTANT: -0.04725325016612715
FREQUENCE_RECH: -0.05552728780708329
REVENUE: -0.05380789684838681
ARPU_SEGMENT: -0.053807976689509644
FREQUENCE: -0.06573370680557317
DATA_VOLUME: -0.015397683867390988
ON_NET: -0.026601387005991874
ORANGE: -0.02446127513667626
TIGO: -0.009995508562641881
ZONE1: 0.00144668078351635
ZONE2: 0.0004906335298515414
REGULARITY: -0.47999140986851524
FREQ_TOP_PACK: -0.03262768810648328
CHURN: 1.0
kfold: -0.0008295540518998162


In [87]:
# from sklearn.preprocessing import MinMaxScaler

# def corr(df: pd.DataFrame) -> None:
#     for col in [x for x in list(df) if x != 'CHURN']:
#         print(f'{col}:', df[[col]].corrwith(df['CHURN']).iloc[-1])


# tmp = pd.DataFrame({
#     'CHURN':  train['CHURN'],
#     'REG': train['REGULARITY'],
# })
# tmp['REG_LOG'] = tmp['REG'].apply(np.log)
# tmp['REG_SQRT'] = tmp['REG'].apply(np.sqrt)
# tmp['REG_ABS'] = tmp['REG'].apply(np.abs)
# tmp['REG_POW2'] = tmp['REG'].apply(lambda x: x ** 2)

# tmp['REG_ABS_LOG'] = tmp['REG_ABS'].apply(np.log)
# tmp['REG_ABS_SQRT'] = tmp['REG_ABS'].apply(np.sqrt)

# corr(tmp)

# scaler = MinMaxScaler() 
# arr_scaled = scaler.fit_transform([tmp['REG_ABS_SQRT']])
maximum = tmp['REG_ABS_SQRT'].max()
minumum = tmp['REG_ABS_SQRT'].min()
pd.DataFrame({'c': tmp['REG_ABS_SQRT'].apply(lambda x: (x - minumum) / (maximum - minumum))}).corrwith(tmp['CHURN'])

c    0.171667
dtype: float64

In [9]:
def optimize_floats(df: pd.DataFrame) -> pd.DataFrame:
    floats = df.select_dtypes(include=['float64']).columns.tolist()
    df[floats] = df[floats].apply(pd.to_numeric, downcast='float')
    return df


def optimize_ints(df: pd.DataFrame) -> pd.DataFrame:
    ints = df.select_dtypes(include=['int64']).columns.tolist()
    df[ints] = df[ints].apply(pd.to_numeric, downcast='integer')
    return df


def optimize_objects(df: pd.DataFrame, datetime_features: List[str]) -> pd.DataFrame:
    for col in df.select_dtypes(include=['object']):
        if col not in datetime_features:
            num_unique_values = len(df[col].unique())
            num_total_values = len(df[col])
            if float(num_unique_values) / num_total_values < 0.5:
                df[col] = df[col].astype('category')
        else:
            df[col] = pd.to_datetime(df[col])
    return df



def optimize(df: pd.DataFrame, datetime_features: List[str] = []):
    return optimize_floats(optimize_ints(optimize_objects(df, datetime_features)))

train = optimize(train, [])

In [10]:
sum(train.memory_usage())/1024/1024

4615.922901153564

In [16]:
def fit_predict(xtrain: pd.DataFrame,
                ytrain: pd.DataFrame,
                xvalid: pd.DataFrame = None,
                yvalid: pd.DataFrame = None,
                valid_ids: list[str] = None):
    xtest = test[useful_cols]

    model = LGBMClassifier(
        n_estimators=7000,
        n_jobs=-1,
        random_state=42,
#         xgb
#         **{
#             'learning_rate': 0.014461849398074727,
#             'reg_lambda': 0.08185850904776007,
#             'reg_alpha': 0.0001173486815850512,
#             'subsample': 0.7675905290878289,
#             'colsample_bytree': 0.2708299922996371,
#             'max_depth': 7
#         }
#         lbg
        **{
            'learning_rate': 0.029253877255476443,
            'reg_lambda': 16.09426889606859,
            'reg_alpha': 0.014354120473120952,
            'subsample': 0.43289663848783977,
            'colsample_bytree': 0.5268279718406376,
            'max_depth': 6
        }
    )
    
    if xvalid is not None:
        model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
        preds_valid = model.predict_proba(xvalid)[:, 1]
        test_preds = model.predict_proba(xtest)[:, 1]
        score = roc_auc_score(yvalid, preds_valid)
        print(fold, score)

        return model, test_preds, dict(zip(valid_ids, preds_valid)), score
    
    model.fit(xtrain, ytrain, verbose=1000)
    return model, None, None, None

In [11]:
final_test_predictions = []
final_valid_predictions = {}

scores = []

for fold in tqdm(range(2), 'folds'):
    xtrain = train[train['kfold'] != fold][useful_cols]
    ytrain = train[train['kfold'] != fold][target]
    
    xvalid = train[train['kfold'] == fold][useful_cols]
    yvalid = train[train['kfold'] == fold][target]
    
    valid_ids = train[train['kfold'] == fold]['user_id'].values.tolist()
    
#     xtest = test[useful_cols]

#     model = LGBMClassifier(
#         n_estimators=7000,
#         n_jobs=-1,
#         random_state=42,
#         tree_method='gpu_hist',
#         gpu_id=0,
#         predictor="gpu_predictor",
#         **{
#             'learning_rate': 0.014461849398074727,
#             'reg_lambda': 0.08185850904776007,
#             'reg_alpha': 0.0001173486815850512,
#             'subsample': 0.7675905290878289,
#             'colsample_bytree': 0.2708299922996371,
#             'max_depth': 7
#         }
#     )
#     model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
        
#     preds_valid = model.predict_proba(xvalid)[:, 1]
#     test_preds = model.predict_proba(xtest)[:, 1]
#     final_test_predictions.append(test_preds)
#     final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
#     score = roc_auc_score(yvalid, preds_valid)
    
    model, test_preds, val_preds, score = fit_predict(xtrain, ytrain, xvalid, yvalid, valid_ids)
    
    del xtrain
    del ytrain
    del xvalid
    del yvalid
    del valid_ids
    del model
    
    scores.append(score)
    print(fold, score)    

print(np.mean(scores), np.std(scores))

folds:   0%|          | 0/2 [00:00<?, ?it/s]

Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.251698
[2000]	valid_0's binary_logloss: 0.251602
Early stopping, best iteration is:
[2163]	valid_0's binary_logloss: 0.251591
0 0.9314657431614233
0 0.9314657431614233
Training until validation scores don't improve for 300 rounds
[1000]	valid_0's binary_logloss: 0.250535
[2000]	valid_0's binary_logloss: 0.250481
Early stopping, best iteration is:
[1892]	valid_0's binary_logloss: 0.25048
1 0.9320915934937067
1 0.9320915934937067
0.9317786683275651 0.00031292516614173094


In [17]:
final_test_predictions = []
final_valid_predictions = {}

scores = []

model, test_preds, val_preds, score = fit_predict(train[useful_cols], train[target])
preds = model.predict_proba(test[useful_cols])
preds

array([[9.98869057e-01, 1.13094347e-03],
       [9.26784623e-01, 7.32153775e-02],
       [9.99657005e-01, 3.42994743e-04],
       ...,
       [7.47533280e-01, 2.52466720e-01],
       [8.70642384e-01, 1.29357616e-01],
       [9.88476651e-01, 1.15233488e-02]])

In [18]:
sample_submission = pd.read_csv('./data/SampleSubmission.csv')
sample_submission['CHURN'] = preds[:, 1]
sample_submission.to_csv("./data/2021-10-29-01-03-lgb-full-dataset.csv", index=False)

In [15]:
def fit_predict(xtrain: pd.DataFrame, ytrain: pd.DataFrame, xvalid: pd.DataFrame, yvalid: pd.DataFrame, valid_ids: list[str]):
    xtest = test[useful_cols]

    model = XGBClassifier(
        n_estimators=7000,
        n_jobs=-1,
        random_state=42,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor="gpu_predictor",
#         xgb
#         **{
#             'learning_rate': 0.014461849398074727,
#             'reg_lambda': 0.08185850904776007,
#             'reg_alpha': 0.0001173486815850512,
#             'subsample': 0.7675905290878289,
#             'colsample_bytree': 0.2708299922996371,
#             'max_depth': 7
#         }
#         lbg
        **{
            'learning_rate': 0.029253877255476443,
            'reg_lambda': 16.09426889606859,
            'reg_alpha': 0.014354120473120952,
            'subsample': 0.43289663848783977,
            'colsample_bytree': 0.5268279718406376,
            'max_depth': 6
        }
    )    
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
        
    preds_valid = model.predict_proba(xvalid)[:, 1]
    test_preds = model.predict_proba(xtest)[:, 1]
    score = roc_auc_score(yvalid, preds_valid)
    print(fold, score)
    
    return test_preds, dict(zip(valid_ids, preds_valid)), score

final_test_predictions = []
final_valid_predictions = {}

scores = []

folds = train['kfold'].unique()
for fold in tqdm([0,1,2], 'folds'):
    xtrain = train[train['kfold'] != fold][useful_cols]
    ytrain = train[train['kfold'] != fold][target]
    
    xvalid = train[train['kfold'] == fold][useful_cols]
    yvalid = train[train['kfold'] == fold][target]
    
    valid_ids = train[train['kfold'] == fold]['user_id'].values.tolist()
    
    test_preds, val_preds, score = fit_predict(xtrain, ytrain, xvalid, yvalid, valid_ids)
    
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(val_preds)
    scores.append(score)

print(np.mean(scores), np.std(scores))

folds:   0%|          | 0/3 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [39]:
sample_submission = pd.read_csv('./data/SampleSubmission.csv')
sample_submission['CHURN'] = np.mean(np.column_stack(final_test_predictions), axis=1)
# sample_submission.columns = ["id", "pred_1"]
sample_submission.to_csv("./data/2021-10-27-xgb.csv", index=False)

In [40]:
sample_submission

Unnamed: 0,user_id,CHURN
0,00001dbe00e56fc4b1c1b65dda63de2a5ece55f9,0.001402
1,000055d41c8a62052dd426592e8a4a3342bf565d,0.077221
2,000081dd3245e6869a4a9c574c7050e7bb84c2c8,0.001036
3,0000b76d2145d9445d9ff6b65c9ebc4196c89337,0.403356
4,0000bae5480628cf8fe51ad84bcb39772fc79224,0.379176
...,...,...
380122,fffe7e03c7eede2ad0a728ee516c4d342dd16107,0.000921
380123,fffec230e6a1aa51ab37d0051ece42de611e71c6,0.789961
380124,ffff0dcc1ab9812bf205b6d76e9d084053cd96f5,0.258537
380125,ffff91ea6a09a0c8ea42bc6ae33df4b5e06283dc,0.124059


In [None]:
val = train[train['kfold'] == 1]
train = train[train['kfold'] == 0]

print(train.shape, val.shape)

In [18]:
train_copy = train.copy()
test_copy = test.copy()
val_copy = val.copy()

minmax_scaler_cols = ['DATA_VOLUME', 'ON_NET']
scaler = MinMaxScaler()
train_copy[minmax_scaler_cols] = scaler.fit_transform(train_copy[minmax_scaler_cols])
test_copy[minmax_scaler_cols] = scaler.transform(test_copy[minmax_scaler_cols])

standard_scaler_cols = [col for col in train_copy.columns if col not in set(['user_id', 'MRG', 'TOP_PACK', 'REGION', 'DATA_VOLUME', 'ON_NET', 'kfold', 'CHURN'])]
scaler = StandardScaler()
train_copy[standard_scaler_cols] = scaler.fit_transform(train_copy[standard_scaler_cols])
test_copy[standard_scaler_cols] = scaler.transform(test_copy[standard_scaler_cols])

# poly features
numerical_cols = [
    'DATA_VOLUME',
    'ON_NET',
# 'MONTANT',
# 'FREQUENCE_RECH',
# 'REVENUE',
# 'ARPU_SEGMENT',
# 'FREQUENCE',
 'ORANGE',
 'TIGO',
# 'ZONE1',
# 'ZONE2',
# 'REGULARITY',
]
poly = PolynomialFeatures(degree=3, interaction_only=True, include_bias=False)
train_poly = poly.fit_transform(train_copy[numerical_cols])
test_poly = poly.fit_transform(test_copy[numerical_cols])

poly_columns = [f"poly_{i}" for i in range(train_poly.shape[1])]
df_poly = pd.DataFrame(train_poly, columns=poly_columns)
df_test_poly = pd.DataFrame(test_poly, columns=poly_columns)

train_copy = pd.concat([train_copy, df_poly], axis=1)
test_copy = pd.concat([test_copy, df_test_poly], axis=1)

useful_cols += poly_columns

for cat_col in cat_cols:
    encoder = OneHotEncoder(handle_unknown='ignore')
    unique_values = train_copy[cat_col].unique()

    one_hot_encoded_cols = [f'{cat_col}_{i}' for i in range(len(unique_values))]
    
    ohe_df = pd.DataFrame(encoder.fit_transform(train_copy[[cat_col]]).toarray(), columns=one_hot_encoded_cols)
    ohe_df.index = train_copy.index
    train_copy = train_copy.drop(cat_col, axis=1)
    train_copy = pd.concat([train_copy, ohe_df], axis=1)        
    print(f'[{cat_col}] xtrain transformed')

    ohe_df = pd.DataFrame(encoder.transform(test_copy[[cat_col]]).toarray(), columns=one_hot_encoded_cols)
    ohe_df.index = test_copy.index
    test_copy = test_copy.drop(cat_col, axis=1)
    test_copy = pd.concat([test_copy, ohe_df], axis=1)
    print(f'[{cat_col}] xtest transformed')
    
    useful_cols += one_hot_encoded_cols
    useful_cols.remove(cat_col)

NameError: name 'test_copy' is not defined

(430810, 33) (430810, 33)


In [15]:
useful_cols = [col for col in train.columns if col not in set(['user_id', 
                                                               'MRG',
                                                               'TOP_PACK', 
                                                               'CHURN',
                                                               'kfold'])]

lgb = LGBMClassifier(random_state=42)
lgb.fit(train[useful_cols], train[target])

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in the following fields: REGION

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,MRG,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN,kfold
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,FATICK,8,8.354910,2.772589,8.355145,7.257003,2.890372,1.609438,5.963579,3.850148,0.693147,0.693147,1.098612,NO,4.007333,On net 200F=Unlimited _call24H,2.197225,0,1
2,00001654a9d9f96303d9969d0a4a851714a4bb57,DAKAR,8,8.188967,1.098612,6.928538,5.831882,1.098612,8.131405,4.510860,3.850148,2.079442,2.200821,2.150355,NO,2.890372,On-net 1000F=10MilF;10d,0.693147,0,1
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,DAKAR,8,9.510519,2.772589,9.510667,8.412277,2.944439,10.687503,3.737670,4.634729,1.098612,2.200821,2.150355,NO,4.143135,"Data:1000F=5GB,7d",2.484907,0,1
7,0000313946b6849745963442c6e572d47cd24ced,DAKAR,8,8.853808,2.833213,8.885994,7.787797,3.135494,7.379008,4.356709,3.401197,4.615121,2.200821,2.150355,NO,4.025352,All-net 500F=2000F;5d,2.197225,0,1
11,00005b7c61f811e4eef1a05903a4b700afd23e46,KAOLACK,8,9.445492,3.332205,9.381854,8.283494,3.610918,9.042395,6.068426,4.204693,1.945910,2.200821,2.150355,NO,4.143135,On net 200F=Unlimited _call24H,2.484907,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2154032,ffff56138e6bf8e553514dfb97ee7cbe0f6cc609,DAKAR,8,8.618473,2.527874,8.614417,7.516171,2.706459,8.131405,5.631071,4.569530,3.181399,2.200821,2.150355,NO,0.693147,,2.330454,0,0
2154035,ffff6e41acb8a069e888c4e8fbd9779f1e0bde73,SAINT-LOUIS,8,8.756368,2.772589,8.695172,7.596894,3.332205,6.003887,5.598422,4.997212,3.181399,1.945910,2.150355,NO,4.110874,Mixt 250F=Unlimited_call24H,2.484907,0,1
2154036,ffff8da611b1f7591fae91245f93a6dcf276056a,SAINT-LOUIS,8,8.699681,2.564949,8.922658,7.824446,2.772589,8.131405,4.624973,5.181784,3.295837,2.200821,0.000000,NO,4.060443,MIXT:500F= 2500F on net _2500F off net;2d,2.197225,0,0
2154038,ffffb2b8b63959b8a374e2a2ccaf2b9e521879ad,DAKAR,8,6.908755,1.098612,6.908755,5.811141,1.098612,0.000000,1.098612,2.564949,1.386294,2.200821,2.150355,NO,2.564949,All-net 500F=2000F;5d,1.098612,0,1
