In [88]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from scipy.stats import mode

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler, OneHotEncoder

import warnings
warnings.filterwarnings("ignore")

In [102]:
#import data
train = pd.read_csv('./data/Train_folds.csv')
test=  pd.read_csv('./data/Test.csv')
submission = pd.read_csv('./data/SampleSubmission.csv')
columns = pd.read_csv('./data/VariableDefinitions.csv')

In [80]:
columns

Unnamed: 0,Variable Definitions,Unnamed: 1,Unnamed: 2
0,,French,English
1,,Le dataset churn comprend 19 variables dont 15...,The churn dataset includes 19 variables includ...
2,user_id,,
3,REGION,la localité de chaque client,the location of each client
4,TENURE,la durée dans le reseau,duration in the network
5,MONTANT,montant de recharge,top-up amount
6,FREQUENCE_RECH,nombre de fois que le client a fait une recharge,number of times the customer refilled
7,REVENUE,revenu mensuel de chaque client,monthly income of each client
8,ARPU_SEGMENT,revenu sur 90 jours/3,income over 90 days / 3
9,FREQUENCE,nombre de fois que client à fait un revenu,number of times the client has made an income


In [75]:
cat_cols = [
    # 'user_id',
    'REGION',
    # 'TENURE',
    # 'MRG',  # constant
    # 'TOP_PACK',  # wtf column
]

num_cols = [
    'MONTANT',
    'FREQUENCE_RECH',
    'REVENUE',
    'ARPU_SEGMENT',
    'FREQUENCE',
    'DATA_VOLUME',
    'ON_NET', 
    'ORANGE',
    'TIGO',
    'ZONE1',
    'ZONE2',
    'REGULARITY',
    'FREQ_TOP_PACK',
]

target = 'CHURN'

useful_cols = [
    'REGION',
    'TENURE',
    # 'MRG',  # constant
    # 'TOP_PACK',  # wtf column
    'MONTANT',
    'FREQUENCE_RECH',
    'REVENUE',
    'ARPU_SEGMENT',
    'FREQUENCE',
    'DATA_VOLUME',
    'ON_NET', 
    'ORANGE',
    'TIGO',
    'ZONE1',
    'ZONE2',
    'REGULARITY',
    'FREQ_TOP_PACK',
]

for n_col in tqdm(num_cols, 'prepare n_cols'):
    train[n_col] = train[n_col].fillna(train[n_col].mean())

mapping = {
    'D 3-6 month': 1,
    'E 6-9 month': 2,
    'F 9-12 month': 3,
    'G 12-15 month': 4,
    'H 15-18 month': 5,
    'I 18-21 month': 6,
    'J 21-24 month': 7,
    'K > 24 month': 8,
}
train['TENURE'] = train['TENURE'].map(mapping) 
test['TENURE'] = test['TENURE'].map(mapping)

train['REGION'] = train['REGION'].fillna('DAKAR')
test['REGION'] = test['REGION'].fillna('DAKAR')
    
final_predictions = []
scores = []

for fold in tqdm(range(5), 'folds'):
    xtrain = train[train['kfold'] == fold][useful_cols]
    ytrain = train[train['kfold'] == fold][target]
    
    xvalid = train[train['kfold'] != fold][useful_cols]
    yvalid = train[train['kfold'] != fold][target]
    
    xtest = test[useful_cols]
    
    scaler = StandardScaler()
    xtrain[num_cols] = scaler.fit_transform(xtrain[num_cols])
    xvalid[num_cols] = scaler.transform(xvalid[num_cols])
    xtest[num_cols] = scaler.transform(xtest[num_cols])
    
    encoder = OrdinalEncoder()
    xtrain[cat_cols] = encoder.fit_transform(xtrain[cat_cols])
    xvalid[cat_cols] = encoder.transform(xvalid[cat_cols])
    xtest[cat_cols] = encoder.transform(xtest[cat_cols])
    
    model = XGBClassifier(
        n_estimators=1000,
        n_jobs=-1,
        random_state=42,
        use_label_encoder=False
    )
    
    model.fit(xtrain, ytrain, eval_metric=roc_auc_score, verbose=1000)
    
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    score = roc_auc_score(yvalid, preds_valid)
    scores.append(score)
    print(fold, score)    

print(np.mean(scores), np.std(scores))

prepare n_cols:   0%|          | 0/13 [00:00<?, ?it/s]

folds:   0%|          | 0/5 [00:00<?, ?it/s]

0 0.7694541655272316
1 0.7692049182510425
2 0.7693446538437609
3 0.7696309418519532
4 0.7682226112305172


In [76]:
preds = np.mean(np.column_stack(final_predictions), axis=1)

In [77]:
submission.CHURN = preds
submission.to_csv("./data/submission-xgb-5-folds-1000-est-42-rs-TENURE-mapped-REGION-fillna-DAKAR-OrdEncoder-StScaler.csv", index=False)

In [108]:
cp = train.copy()
cp['TENURE'] = cp['TENURE'].map(mapping)
agg = cp[~train['REGION'].isna()].groupby('REGION').agg({
    'TENURE': lambda x: mode(x)[0],
})
agg

Unnamed: 0_level_0,TENURE
REGION,Unnamed: 1_level_1
DAKAR,8
DIOURBEL,8
FATICK,8
KAFFRINE,8
KAOLACK,8
KEDOUGOU,8
KOLDA,8
LOUGA,8
MATAM,8
SAINT-LOUIS,8


In [109]:
cp[~cp['REGION'].isna()].groupby('TENURE').agg({'REGION': lambda x: mode(x)[0], 'MONTANT': ['mean', 'median']})

Unnamed: 0_level_0,REGION,MONTANT,MONTANT
Unnamed: 0_level_1,<lambda>,mean,median
TENURE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,DAKAR,2028.181818,1200.0
2,DAKAR,6372.978912,3000.0
3,DAKAR,5618.158295,3450.0
4,DAKAR,5594.414352,3300.0
5,DAKAR,5305.156368,3200.0
6,DAKAR,5788.95404,3500.0
7,DAKAR,6058.653333,3750.0
8,DAKAR,6106.900675,3650.0
