In [1]:
import tensorflow as tf
import keras

import re
from typing import List

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import optuna
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

tqdm.pandas()

In [2]:
# Notebook header (translated from the original notes), originally labelled "XGB".
# NOTE(review): the model fitted at the end of this notebook is a Keras network,
# so the "XGB" label looks stale - confirm.
# - NaNs in numeric columns are replaced with the mean of the same column
# - TENURE & REGION are one-hot encoded
# - StandardScaler is applied to the numeric columns
# - target encoding by region and tenure

# Load the raw data.
train = pd.read_csv('./data/Train_folds.zip')
test = pd.read_csv('./data/Test.zip')
submission = pd.read_csv('./data/SampleSubmission.csv')

cat_cols = ['REGION', 'TENURE', 'TOP_PACK']

num_cols = [
    'MONTANT', 'FREQUENCE_RECH', 'REVENUE', 'ARPU_SEGMENT', 'FREQUENCE',
    'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO', 'ZONE1', 'ZONE2',
    'REGULARITY', 'FREQ_TOP_PACK',
]

target = 'CHURN'

# Ordinal code (1..9) for every TENURE label; 'OTHER' is the placeholder
# written into missing TENURE values below.
mapping = {
    label: code
    for code, label in enumerate(
        [
            'D 3-6 month',
            'E 6-9 month',
            'F 9-12 month',
            'G 12-15 month',
            'H 15-18 month',
            'I 18-21 month',
            'J 21-24 month',
            'K > 24 month',
            'OTHER',
        ],
        start=1,
    )
}

# Missing categorical values become the explicit 'OTHER' level in both frames;
# TENURE is additionally replaced by its ordinal code.
for frame in (train, test):
    frame['TOP_PACK'] = frame['TOP_PACK'].fillna('OTHER')
    frame['TENURE'] = frame['TENURE'].fillna('OTHER').map(mapping)
    frame['REGION'] = frame['REGION'].fillna('OTHER')

# Numeric gaps are filled with the train mean in both train and test.
for nc in tqdm(num_cols):
    mean = train[nc].mean()
    for frame in (train, test):
        frame[nc] = frame[nc].fillna(mean)

train.shape, test.shape

  0%|          | 0/13 [00:00<?, ?it/s]

((2154048, 20), (380127, 18))

In [3]:
# Mean churn per TENURE code, read from a pre-computed aggregate file.
churn_by_tenure = pd.read_csv('./data/agg_by_tenure_churn.csv')

# Add a row for TENURE code 9 (the 'OTHER' / missing bucket) with zero churn.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
churn_by_tenure = pd.concat([
    churn_by_tenure,
    pd.DataFrame({'TENURE': [9], 'CHURN_mean': [0], 'CHURN_median': [0]}),
])

# Two-column lookup table carrying the final feature name.
tenure_lookup = churn_by_tenure[['TENURE', 'CHURN_mean']].rename(
    columns={'CHURN_mean': 'MEAN_CHURN_BY_TENURE'}
)

# validate='many_to_one' raises if a TENURE code occurs twice in the lookup,
# which would otherwise silently duplicate rows of train/test.
train = train.merge(tenure_lookup, on='TENURE', how='left', validate='many_to_one')
test = test.merge(tenure_lookup, on='TENURE', how='left', validate='many_to_one')

train.shape, test.shape

((2154048, 21), (380127, 19))

In [4]:
# Mean churn per REGION, read from a pre-computed aggregate file.
churn_by_region = pd.read_csv('./data/agg_by_region_churn.csv')

# Churn rate of the train rows whose REGION was missing ('OTHER').
# CHURN is 0/1, so the mean equals count(1) / (count(0) + count(1)) but does
# not raise KeyError when one of the two classes is absent from the bucket.
churn_by_region_mean = train.loc[train['REGION'] == 'OTHER', 'CHURN'].mean()

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
churn_by_region = pd.concat([
    churn_by_region,
    pd.DataFrame({
        'REGION': ['OTHER'],
        'CHURN_mean': [churn_by_region_mean],
        'CHURN_median': [0],
    }),
])

# Two-column lookup table carrying the final feature name.
region_lookup = churn_by_region[['REGION', 'CHURN_mean']].rename(
    columns={'CHURN_mean': 'MEAN_CHURN_BY_REGION'}
)

# validate='many_to_one' raises if a REGION occurs twice in the lookup,
# which would otherwise silently duplicate rows of train/test.
train = train.merge(region_lookup, on='REGION', how='left', validate='many_to_one')
test = test.merge(region_lookup, on='REGION', how='left', validate='many_to_one')

train.shape, test.shape

((2154048, 22), (380127, 20))

In [5]:
# Mean / median churn per TOP_PACK, computed on the train frame.
# NOTE(review): the statistics use every train row, including the rows later
# held out as validation folds, so the CV score may be optimistic - confirm
# whether per-fold encoding is wanted.
churn_by_top_pack = (
    train.groupby('TOP_PACK')['CHURN']
    .agg(['mean', 'median'])
    .add_prefix('CHURN_')   # -> CHURN_mean, CHURN_median
    .reset_index()
)

# Fallback value for packs that appear only in test: the churn rate of the
# 'OTHER' (missing TOP_PACK) bucket in train.
churn_by_top_pack_mean = train.loc[train['TOP_PACK'] == 'OTHER', 'CHURN'].mean()

# Collect the known packs once instead of recomputing .unique() per iteration.
known_packs = set(churn_by_top_pack['TOP_PACK'])
unseen_packs = [tp for tp in test['TOP_PACK'].unique() if tp not in known_packs]

# Only extend the lookup when there is something to add; concatenating an
# empty frame can change column dtypes. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
if unseen_packs:
    churn_by_top_pack = pd.concat([
        churn_by_top_pack,
        pd.DataFrame({
            'TOP_PACK': unseen_packs,
            'CHURN_mean': [churn_by_top_pack_mean] * len(unseen_packs),
            'CHURN_median': [0] * len(unseen_packs),
        }),
    ])

# Two-column lookup table carrying the final feature name.
top_pack_lookup = churn_by_top_pack[['TOP_PACK', 'CHURN_mean']].rename(
    columns={'CHURN_mean': 'MEAN_CHURN_BY_TOP_PACK'}
)

train = train.merge(top_pack_lookup, on='TOP_PACK', how='left', validate='many_to_one')
test = test.merge(top_pack_lookup, on='TOP_PACK', how='left', validate='many_to_one')

train.shape, test.shape

((2154048, 23), (380127, 21))

In [6]:
# Feature list fed to the model: the categorical columns (replaced by their
# one-hot columns in the loop below), the numeric columns and the three
# mean-churn features. 'MRG' is left out (marked "constant" by the author).
useful_cols = [
    *cat_cols,
    *num_cols,
    'MEAN_CHURN_BY_TENURE',
    'MEAN_CHURN_BY_REGION',
    'MEAN_CHURN_BY_TOP_PACK',
]

for cat_col in cat_cols:
    # A fresh encoder per column, fitted on train only; categories that occur
    # only in test are encoded as all-zero rows (handle_unknown='ignore').
    encoder = OneHotEncoder(handle_unknown='ignore')
    unique_values = train[cat_col].unique()

    # One output column per distinct train value, named <column>_<position>.
    one_hot_encoded_cols = [f'{cat_col}_{i}' for i in range(len(unique_values))]

    ohe_df = pd.DataFrame(
        encoder.fit_transform(train[[cat_col]]).toarray(),
        columns=one_hot_encoded_cols,
        index=train.index,
    )
    train = pd.concat([train.drop(cat_col, axis=1), ohe_df], axis=1)
    print(f'[{cat_col}] xtrain transformed')

    ohe_df = pd.DataFrame(
        encoder.transform(test[[cat_col]]).toarray(),
        columns=one_hot_encoded_cols,
        index=test.index,
    )
    test = pd.concat([test.drop(cat_col, axis=1), ohe_df], axis=1)
    print(f'[{cat_col}] xtest transformed')

    # Swap the raw categorical column for its one-hot columns in the feature list.
    useful_cols.remove(cat_col)
    useful_cols.extend(one_hot_encoded_cols)

# Standardise the numeric columns with statistics learned from train only.
scaler = StandardScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

[REGION] xtrain transformed
[REGION] xtest transformed
[TENURE] xtrain transformed
[TENURE] xtest transformed
[TOP_PACK] xtrain transformed
[TOP_PACK] xtest transformed


In [7]:
# Interaction-only polynomial features (up to 3-way products) of the numeric columns.
poly = PolynomialFeatures(degree=3, interaction_only=True, include_bias=False)
train_poly = poly.fit_transform(train[num_cols])
# Re-use the transformer fitted on train; the test frame must never be fitted on.
test_poly = poly.transform(test[num_cols])

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back for older installations.
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(num_cols)
else:
    poly_feature_names = poly.get_feature_names(num_cols)

# "A B C" -> "poly_A__B__C"
poly_columns = [f'poly_{name.replace(" ", "__")}' for name in poly_feature_names]

# Carry the source index so the column-wise concat below aligns row for row
# even if train/test do not have a default RangeIndex.
df_poly = pd.DataFrame(train_poly, columns=poly_columns, index=train.index, dtype=np.float32)
df_test_poly = pd.DataFrame(test_poly, columns=poly_columns, index=test.index, dtype=np.float32)

train = pd.concat([train, df_poly], axis=1)
test = pd.concat([test, df_test_poly], axis=1)

useful_cols += poly_columns

train.head()

Unnamed: 0,user_id,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,...,poly_TIGO__ZONE1__ZONE2,poly_TIGO__ZONE1__REGULARITY,poly_TIGO__ZONE1__FREQ_TOP_PACK,poly_TIGO__ZONE2__REGULARITY,poly_TIGO__ZONE2__FREQ_TOP_PACK,poly_TIGO__REGULARITY__FREQ_TOP_PACK,poly_ZONE1__ZONE2__REGULARITY,poly_ZONE1__ZONE2__FREQ_TOP_PACK,poly_ZONE1__REGULARITY__FREQ_TOP_PACK,poly_ZONE2__REGULARITY__FREQ_TOP_PACK
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,-0.2238504,0.3246513,-0.2152853,-0.2152889,0.252579,-0.3546942,0.1586511,-0.3153654,-0.5490724,...,-0.2241625,0.396786,-0.04631197,0.420789,-0.04911356,0.08693501,0.4754966,-0.0554989,0.09823759,0.1041803
1,00000cb4a5d760de88fecb38e2f71b7bec52e834,4.763779e-16,4.984585e-16,1.554209e-16,3.496977e-16,0.0,-4.7969860000000006e-17,8.175326e-17,9.068654000000001e-17,1.764598e-16,...,-0.0,-2.9261220000000003e-32,0.0,4.006599e-32,-0.0,-3.612546e-32,3.490163e-32,-0.0,-3.146902e-32,4.308902e-32
2,00001654a9d9f96303d9969d0a4a851714a4bb57,-0.3373367,-0.8913132,-0.7674215,-0.7674261,-1.00118,-4.7969860000000006e-17,-0.269938,-0.3153654,-0.4000654,...,1.294332e-32,3.0469520000000005e-17,5.4348530000000004e-17,-4.172045e-17,-7.441683e-17,-0.1751826,1.6030000000000002e-32,2.859275e-32,6.730939e-17,-9.216352000000001e-17
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,1.391147,0.3246513,1.365591,1.365761,0.336163,4.265629,-0.3404107,0.04199848,-0.5242379,...,1.696068e-32,-1.22781e-16,-1.48723e-17,1.681181e-16,2.036393e-17,-0.1474176,-4.929486e-32,-5.971025e-33,4.322517e-17,-5.918616000000001e-17
4,000028d9e13a595abe061f9b58f3d76ab907850f,-0.7912821,-0.9848489,-0.7734026,-0.773578,-1.084764,-4.7969860000000006e-17,-0.3432872,-0.4557583,1.764598e-16,...,-0.0,-2.074179e-32,-2.107411e-32,2.840073e-32,2.885577e-32,1.048379e-16,2.473999e-32,2.513637e-32,9.132468e-17,-1.250465e-16


In [8]:
# Size of `train` in MiB before dtype downcasting (memory_usage() default, i.e. not deep).
train.memory_usage().sum() / 1024 / 1024

6138.129638671875

In [9]:
def optimize_floats(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast every float64 column of ``df`` to the smallest float dtype.

    The frame is modified in place and also returned, so calls can be nested.
    """
    float_cols = list(df.select_dtypes(include=['float64']).columns)
    df[float_cols] = df[float_cols].apply(
        lambda column: pd.to_numeric(column, downcast='float')
    )
    return df


def optimize_ints(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast every int64 column of ``df`` to the smallest integer dtype.

    The frame is modified in place and also returned, so calls can be nested.
    """
    int_cols = list(df.select_dtypes(include=['int64']).columns)
    df[int_cols] = df[int_cols].apply(
        lambda column: pd.to_numeric(column, downcast='integer')
    )
    return df


def optimize_objects(df: pd.DataFrame, datetime_features: List[str]) -> pd.DataFrame:
    for col in df.select_dtypes(include=['object']):
        if col not in datetime_features:
            num_unique_values = len(df[col].unique())
            num_total_values = len(df[col])
            if float(num_unique_values) / num_total_values < 0.5:
                df[col] = df[col].astype('category')
        else:
            df[col] = pd.to_datetime(df[col])
    return df



def optimize(df: pd.DataFrame, datetime_features: List[str] = None):
    """Run the object, integer and float optimisers on ``df``, in that order.

    Args:
        df: frame to optimise; the helpers modify it in place.
        datetime_features: names of object columns to parse as datetimes.
            ``None`` (the default) means no column is parsed; a ``None``
            sentinel is used instead of a shared mutable ``[]`` default.

    Returns:
        The optimised frame.
    """
    if datetime_features is None:
        datetime_features = []
    return optimize_floats(optimize_ints(optimize_objects(df, datetime_features)))

# Only the train frame is downcast here; test keeps its original dtypes.
train = optimize(train, [])

In [10]:
# Size of `train` in MiB after dtype downcasting (memory_usage() default, i.e. not deep).
train.memory_usage().sum() / 1024 / 1024

4615.922901153564

In [11]:
# Small fully connected binary classifier: 12 -> 8 -> 1 (sigmoid).
# The input width is taken from the feature list instead of a hard-coded 557,
# so the network stays valid when the number of engineered columns changes.
# Note: the cross-validation cell builds a fresh model of the same shape per fold.
model = keras.Sequential()
model.add(keras.layers.Dense(12, input_dim=len(useful_cols), activation='relu'))
model.add(keras.layers.Dense(8, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))
model

<keras.engine.sequential.Sequential at 0x24c09dc48b0>

In [12]:
# Binary cross-entropy loss with the Adam optimiser; ROC AUC is tracked as 'auc'.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[tf.keras.metrics.AUC(name='auc')],
)

In [13]:
# 5-fold cross-validation: one freshly initialised network per fold, scored on
# the held-out fold and used to predict the test frame. Out-of-fold and
# fold-averaged test predictions are written to ./data as "pred_6".
final_test_predictions = []
final_valid_predictions = {}

scores = []

xtest = test[useful_cols]
for fold in tqdm(range(5), 'folds'):
    # Build the fold mask once and slice with .loc; train[mask][cols] would
    # first copy every column of the selected rows before picking the features.
    is_valid = train['kfold'] == fold

    xtrain = train.loc[~is_valid, useful_cols]
    ytrain = train.loc[~is_valid, target]

    xvalid = train.loc[is_valid, useful_cols]
    yvalid = train.loc[is_valid, target]

    valid_ids = train.loc[is_valid, 'user_id'].values.tolist()

    # Input width follows the data instead of a hard-coded 557, so the cell
    # keeps working when the engineered feature set changes.
    model = keras.Sequential()
    model.add(keras.layers.Dense(12, input_dim=xtrain.shape[1], activation='relu'))
    model.add(keras.layers.Dense(8, activation='relu'))
    model.add(keras.layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.AUC(name='auc')])
    # NOTE(review): validation_split=0.3 holds out part of xtrain for monitoring
    # only, so the network is fitted on 70% of the training folds although
    # (xvalid, yvalid) is available - confirm this is intended.
    model.fit(xtrain, ytrain, epochs=30, batch_size=1000, validation_split=0.3)

    # predict() returns shape (n, 1); flatten so each id maps to a scalar score.
    preds_valid = model.predict(xvalid).ravel()
    test_preds = model.predict(xtest).ravel()
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    score = roc_auc_score(yvalid, preds_valid)
    scores.append(score)
    print(fold, score)

    # Release the per-fold objects before the next iteration; the frames are large.
    del is_valid
    del xtrain
    del ytrain
    del xvalid
    del yvalid
    del model
    del preds_valid
    del test_preds

print(np.mean(scores), np.std(scores))

# Out-of-fold predictions for the train rows, keyed by user id.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_6"]
final_valid_predictions.to_csv("./data/train_pred_6.csv", index=False)

# Test prediction = mean of the five fold models.
# Assumes SampleSubmission.csv has exactly two columns (id, CHURN) in test row order.
sample_submission = pd.read_csv('./data/SampleSubmission.csv')
sample_submission['CHURN'] = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_6"]
sample_submission.to_csv("./data/test_pred_6.csv", index=False)

sample_submission.sample(7)

folds:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
0 0.9303084979352504
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
1 0.930962512676331
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Ep

Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
4 0.9292271775653724
0.9301588339978837 0.0005735803826214468


Unnamed: 0,id,pred_6
361300,f33058fa7798a2d4aa8c668a3c93a53a322b0506,0.104868
74985,32547bec048edf0bcd821192933de5f10dba1478,0.000963
269155,b540ff6bd868e7bb325be34d076e381e390bd122,0.273381
145880,623c1461cd543e3d3a99486a582388371b1b016a,0.321615
324868,dabcce5bbed52025ed376ed48daa4e3e085e76fe,0.514576
240151,a1b596ff4250f9d26da9a926623dbaecfc95d4ff,0.000302
92694,3e6d421c32d55fe54aeb63fa4df4994067e67a46,0.000317
