# In this notebook we want to predict if a user will quit the expresso network or not

First let's charge the datasets

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
data_path='data'
train=pd.read_csv('%s/Train.csv'%data_path)
test=pd.read_csv('%s/Test.csv'%data_path)
variable_definitions=pd.read_csv('%s/VariableDefinitions.csv'%data_path)

train.shape

(2154048, 19)

In [29]:
variable_definitions

Unnamed: 0,Variable Definitions,Unnamed: 1,Unnamed: 2
0,,French,English
1,,Le dataset churn comprend 19 variables dont 15...,The churn dataset includes 19 variables includ...
2,user_id,,
3,REGION,la localité de chaque client,the location of each client
4,TENURE,la durée dans le reseau,duration in the network
5,MONTANT,montant de recharge,top-up amount
6,FREQUENCE_RECH,nombre de fois que le client a fait une recharge,number of times the customer refilled
7,REVENUE,revenu mensuel de chaque client,monthly income of each client
8,ARPU_SEGMENT,revenu sur 90 jours/3,income over 90 days / 3
9,FREQUENCE,nombre de fois que client à fait un revenu,number of times the client has made an income


In [30]:
train["MRG"].unique()


array(['NO'], dtype=object)

MRG have a constant value donc let's drop in the dataset

In [31]:
train=train.drop("MRG",axis=1)
test=test.drop("MRG",axis=1)


In [32]:
train.head(5)

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,FATICK,K > 24 month,4250.0,15.0,4251.0,1417.0,17.0,4.0,388.0,46.0,1.0,1.0,2.0,54,On net 200F=Unlimited _call24H,8.0,0
1,00000cb4a5d760de88fecb38e2f71b7bec52e834,,I 18-21 month,,,,,,,,,,,,4,,,1
2,00001654a9d9f96303d9969d0a4a851714a4bb57,,K > 24 month,3600.0,2.0,1020.0,340.0,2.0,,90.0,46.0,7.0,,,17,On-net 1000F=10MilF;10d,1.0,0
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,DAKAR,K > 24 month,13500.0,15.0,13502.0,4501.0,18.0,43804.0,41.0,102.0,2.0,,,62,"Data:1000F=5GB,7d",11.0,0
4,000028d9e13a595abe061f9b58f3d76ab907850f,DAKAR,K > 24 month,1000.0,1.0,985.0,328.0,1.0,,39.0,24.0,,,,11,Mixt 250F=Unlimited_call24H,2.0,0


checking the number of null value of each feature

In [33]:
train.isnull().sum(axis=0).sort_values()

user_id                 0
REGULARITY              0
CHURN                   0
TENURE                  0
REVENUE            726048
ARPU_SEGMENT       726048
FREQUENCE          726048
MONTANT            756739
FREQUENCE_RECH     756739
ON_NET             786675
REGION             849299
ORANGE             895248
FREQ_TOP_PACK      902594
TOP_PACK           902594
DATA_VOLUME       1060433
TIGO              1290016
ZONE1             1984327
ZONE2             2017224
dtype: int64

## Feature generation


In [34]:
train["FREQUENCE_BY_FREQUENCE_RECH"]=train["FREQUENCE"]/train["FREQUENCE_RECH"]
test["FREQUENCE_BY_FREQUENCE_RECH"]=test["FREQUENCE"]/test["FREQUENCE_RECH"]

In [35]:
train.head(5)

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN,FREQUENCE_BY_FREQUENCE_RECH
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,FATICK,K > 24 month,4250.0,15.0,4251.0,1417.0,17.0,4.0,388.0,46.0,1.0,1.0,2.0,54,On net 200F=Unlimited _call24H,8.0,0,1.133333
1,00000cb4a5d760de88fecb38e2f71b7bec52e834,,I 18-21 month,,,,,,,,,,,,4,,,1,
2,00001654a9d9f96303d9969d0a4a851714a4bb57,,K > 24 month,3600.0,2.0,1020.0,340.0,2.0,,90.0,46.0,7.0,,,17,On-net 1000F=10MilF;10d,1.0,0,1.0
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,DAKAR,K > 24 month,13500.0,15.0,13502.0,4501.0,18.0,43804.0,41.0,102.0,2.0,,,62,"Data:1000F=5GB,7d",11.0,0,1.2
4,000028d9e13a595abe061f9b58f3d76ab907850f,DAKAR,K > 24 month,1000.0,1.0,985.0,328.0,1.0,,39.0,24.0,,,,11,Mixt 250F=Unlimited_call24H,2.0,0,1.0


In [36]:
train["TOP_PACK"].unique()

array(['On net 200F=Unlimited _call24H', nan, 'On-net 1000F=10MilF;10d',
       'Data:1000F=5GB,7d', 'Mixt 250F=Unlimited_call24H',
       'MIXT:500F= 2500F on net _2500F off net;2d',
       'All-net 500F=2000F;5d', 'On-net 500F_FNF;3d',
       'Data: 100 F=40MB,24H', 'MIXT: 200mnoff net _unl on net _5Go;30d',
       'Jokko_Daily', 'Data: 200 F=100MB,24H', 'Data:490F=1GB,7d',
       'Twter_U2opia_Daily', 'On-net 500=4000,10d', 'Data:1000F=2GB,30d',
       'IVR Echat_Daily_50F', 'Pilot_Youth4_490',
       'All-net 500F =2000F_AllNet_Unlimited', 'Twter_U2opia_Weekly',
       'Data:200F=Unlimited,24H', 'On-net 200F=60mn;1d',
       'All-net 600F= 3000F ;5d', 'Pilot_Youth1_290',
       'All-net 1000F=(3000F On+3000F Off);5d', 'VAS(IVR_Radio_Daily)',
       'Data:3000F=10GB,30d', 'All-net 1000=5000;5d',
       'Twter_U2opia_Monthly', 'MIXT: 390F=04HOn-net_400SMS_400 Mo;4h\t',
       'FNF2 ( JAPPANTE)', 'Yewouleen_PKG', 'Data:150F=SPPackage1,24H',
       'WIFI_Family_2MBPS', 'Data:500F=2GB,2

In [37]:
test.head()

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,REGULARITY,TOP_PACK,FREQ_TOP_PACK,FREQUENCE_BY_FREQUENCE_RECH
0,00001dbe00e56fc4b1c1b65dda63de2a5ece55f9,THIES,K > 24 month,5000.0,5.0,5000.0,1667.0,5.0,,378.0,11.0,5.0,,,42,On-net 1000F=10MilF;10d,5.0,1.0
1,000055d41c8a62052dd426592e8a4a3342bf565d,,I 18-21 month,300.0,2.0,326.0,109.0,3.0,397.0,,0.0,,,,41,"Data: 100 F=40MB,24H",1.0,1.5
2,000081dd3245e6869a4a9c574c7050e7bb84c2c8,DAKAR,K > 24 month,3300.0,25.0,3400.0,1133.0,26.0,7150.0,0.0,2.0,5.0,,,57,"Data: 100 F=40MB,24H",22.0,1.04
3,0000b76d2145d9445d9ff6b65c9ebc4196c89337,,K > 24 month,,,,,,,,,,,,9,,,
4,0000bae5480628cf8fe51ad84bcb39772fc79224,,K > 24 month,,,,,,,,,,,,10,,,


for i in range (len(train)):
    if train.iloc[i]["sum_extra"]!=0:
        train.iloc[i]["RATIO_ON_NET_BY_EXTRA_NET"]=train.iloc[i]["ON_NET"]/(train.iloc[i]["sum_extra"])
    else:
        train.iloc[i]["RATIO_ON_NET_BY_EXTRA_NET"]=train.iloc[i]["ON_NET"]
for i in range (len(test)):
    if test.iloc[i]["sum_extra"]!=0:
        test.iloc[i]["RATIO_ON_NET_BY_EXTRA_NET"]=test.iloc[i]["ON_NET"]/(test.iloc[i]["sum_extra"])
    else:
        test.iloc[i]["RATIO_ON_NET_BY_EXTRA_NET"]=test.iloc[i]["ON_NET"]   

In [38]:
train["sum_extra"]=train["ORANGE"]+train["TIGO"]
test["sum_extra"]=test["ORANGE"]+test["TIGO"]
        


In [39]:
train["sum_extra"]=train["sum_extra"].map(lambda x : x if x!=0.0 else 0.5)
test["sum_extra"]=test["sum_extra"].map(lambda x : x if x!=0.0 else 0.5)

In [40]:
train["RATIO_ON_NET_BY_EXTRA_NET"]=train["ON_NET"]/train["sum_extra"] 
test["RATIO_ON_NET_BY_EXTRA_NET"]=test["ON_NET"]/test["sum_extra"] 

In [41]:
train["RATIO_ON_NET_BY_ORANGE"]=train["ON_NET"]/train["ORANGE"].map(lambda x : x if x!=0.0 else 0.5) 
train["RATIO_ON_NET_BY_TIGO"]=train["ON_NET"]/train["TIGO"].map(lambda x : x if x!=0.0 else 0.5) 
test["RATIO_ON_NET_BY_ORANGE"]=test["ON_NET"]/test["ORANGE"].map(lambda x : x if x!=0.0 else 0.5)
test["RATIO_ON_NET_BY_TIGO"]=test["ON_NET"]/test["TIGO"].map(lambda x : x if x!=0.0 else 0.5)

In [42]:
train["HYPOTHENUS_FREQ_AND_FREQ_RECH"]=np.square(train["FREQUENCE"])+np.square(train["FREQUENCE_RECH"])
test["HYPOTHENUS_FREQ_AND_FREQ_RECH"]=np.square(test["FREQUENCE"])+np.square(test["FREQUENCE_RECH"])

In [43]:
train["HYPOTHENUS_FREQ_AND_REGULARITY"]=np.square(train["FREQUENCE"])+np.square(train["REGULARITY"])
test["HYPOTHENUS_FREQ_AND_REGULARITY"]=np.square(test["FREQUENCE"])+np.square(test["REGULARITY"])

In [44]:


#generation of mean of the price of top pack
train["MEAN_PRICE_TOP_PACK"]=train["MONTANT"]/train["FREQ_TOP_PACK"]
test["MEAN_PRICE_TOP_PACK"]=test["MONTANT"]/test["FREQ_TOP_PACK"]

In [45]:
#generation of the ratio of the mensual revenuby the mean of the price of top pack
train["MENSUAL_BY_MEAN_PRICE_TOP_PACK"]=train["REVENUE"]/train["MEAN_PRICE_TOP_PACK"]
test["MENSUAL_BY_MEAN_PRICE_TOP_PACK"]=test["REVENUE"]/test["MEAN_PRICE_TOP_PACK"]

In [46]:
len(train.columns)
train.head(5)

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,...,CHURN,FREQUENCE_BY_FREQUENCE_RECH,sum_extra,RATIO_ON_NET_BY_EXTRA_NET,RATIO_ON_NET_BY_ORANGE,RATIO_ON_NET_BY_TIGO,HYPOTHENUS_FREQ_AND_FREQ_RECH,HYPOTHENUS_FREQ_AND_REGULARITY,MEAN_PRICE_TOP_PACK,MENSUAL_BY_MEAN_PRICE_TOP_PACK
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,FATICK,K > 24 month,4250.0,15.0,4251.0,1417.0,17.0,4.0,388.0,...,0,1.133333,47.0,8.255319,8.434783,388.0,514.0,3205.0,531.25,8.001882
1,00000cb4a5d760de88fecb38e2f71b7bec52e834,,I 18-21 month,,,,,,,,...,1,,,,,,,,,
2,00001654a9d9f96303d9969d0a4a851714a4bb57,,K > 24 month,3600.0,2.0,1020.0,340.0,2.0,,90.0,...,0,1.0,53.0,1.698113,1.956522,12.857143,8.0,293.0,3600.0,0.283333
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,DAKAR,K > 24 month,13500.0,15.0,13502.0,4501.0,18.0,43804.0,41.0,...,0,1.2,104.0,0.394231,0.401961,20.5,549.0,4168.0,1227.272727,11.00163
4,000028d9e13a595abe061f9b58f3d76ab907850f,DAKAR,K > 24 month,1000.0,1.0,985.0,328.0,1.0,,39.0,...,0,1.0,,,1.625,,2.0,122.0,500.0,1.97


In [47]:
#Cleaning the TENURE attribut .We classify the tenure by K,I,J,et
train["TENURE"].unique()
train["TENURE"]=train["TENURE"].map(lambda x:x[0])
test["TENURE"]=test["TENURE"].map(lambda x:x[0])
train["REGION"]=train["REGION"].fillna("UNKNOW")
test["REGION"]=test["REGION"].fillna("UNKNOW")
train["TOP_PACK"]=train["TOP_PACK"].fillna("UNKNOW")
test["TOP_PACK"]=test["TOP_PACK"].fillna("UNKNOW")


In [48]:
#Let's combine the thre cathegorical variable
cat_cols_combined=["TENURE-REGION","REGION-TOP_PACK","TOP_PACK-TENURE"]
for attribs in cat_cols_combined:
    indexes=attribs.split("-")
    train[indexes[0]+"_"+indexes[1]]=train[indexes[0]]+"_"+train[indexes[1]]
    test[indexes[0]+"_"+indexes[1]]=test[indexes[0]]+"_"+test[indexes[1]]
    
train["TENURE_REGION_TOP_PACK"]=train["TENURE"]+"_"+train["REGION"]+"_"+train["TOP_PACK"]
test["TENURE_REGION_TOP_PACK"]=test["TENURE"]+"_"+test["REGION"]+"_"+test["TOP_PACK"]

In [49]:
train.head(5)

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,...,RATIO_ON_NET_BY_ORANGE,RATIO_ON_NET_BY_TIGO,HYPOTHENUS_FREQ_AND_FREQ_RECH,HYPOTHENUS_FREQ_AND_REGULARITY,MEAN_PRICE_TOP_PACK,MENSUAL_BY_MEAN_PRICE_TOP_PACK,TENURE_REGION,REGION_TOP_PACK,TOP_PACK_TENURE,TENURE_REGION_TOP_PACK
0,00000bfd7d50f01092811bc0c8d7b0d6fe7c3596,FATICK,K,4250.0,15.0,4251.0,1417.0,17.0,4.0,388.0,...,8.434783,388.0,514.0,3205.0,531.25,8.001882,K_FATICK,FATICK_On net 200F=Unlimited _call24H,On net 200F=Unlimited _call24H_K,K_FATICK_On net 200F=Unlimited _call24H
1,00000cb4a5d760de88fecb38e2f71b7bec52e834,UNKNOW,I,,,,,,,,...,,,,,,,I_UNKNOW,UNKNOW_UNKNOW,UNKNOW_I,I_UNKNOW_UNKNOW
2,00001654a9d9f96303d9969d0a4a851714a4bb57,UNKNOW,K,3600.0,2.0,1020.0,340.0,2.0,,90.0,...,1.956522,12.857143,8.0,293.0,3600.0,0.283333,K_UNKNOW,UNKNOW_On-net 1000F=10MilF;10d,On-net 1000F=10MilF;10d_K,K_UNKNOW_On-net 1000F=10MilF;10d
3,00001dd6fa45f7ba044bd5d84937be464ce78ac2,DAKAR,K,13500.0,15.0,13502.0,4501.0,18.0,43804.0,41.0,...,0.401961,20.5,549.0,4168.0,1227.272727,11.00163,K_DAKAR,"DAKAR_Data:1000F=5GB,7d","Data:1000F=5GB,7d_K","K_DAKAR_Data:1000F=5GB,7d"
4,000028d9e13a595abe061f9b58f3d76ab907850f,DAKAR,K,1000.0,1.0,985.0,328.0,1.0,,39.0,...,1.625,,2.0,122.0,500.0,1.97,K_DAKAR,DAKAR_Mixt 250F=Unlimited_call24H,Mixt 250F=Unlimited_call24H_K,K_DAKAR_Mixt 250F=Unlimited_call24H


In [50]:
#The train set is very large let's get 800000 samples to train our model
import random
from sklearn.model_selection import train_test_split
random.seed(123)
sample= random.sample(range(0,len(train)), 500000)
train_sample=train.iloc[sample]
train_sample=train_sample.drop("user_id",axis=1)

In [51]:
train_sample

Unnamed: 0,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,...,RATIO_ON_NET_BY_ORANGE,RATIO_ON_NET_BY_TIGO,HYPOTHENUS_FREQ_AND_FREQ_RECH,HYPOTHENUS_FREQ_AND_REGULARITY,MEAN_PRICE_TOP_PACK,MENSUAL_BY_MEAN_PRICE_TOP_PACK,TENURE_REGION,REGION_TOP_PACK,TOP_PACK_TENURE,TENURE_REGION_TOP_PACK
219628,DIOURBEL,K,5500.0,9.0,5500.0,1833.0,9.0,9118.0,,,...,,,162.0,2106.0,5500.000000,1.000000,K_DIOURBEL,"DIOURBEL_Data:490F=1GB,7d","Data:490F=1GB,7d_K","K_DIOURBEL_Data:490F=1GB,7d"
1122719,DAKAR,K,1000.0,1.0,1000.0,333.0,2.0,,1.0,8.0,...,0.125000,,5.0,328.0,,,K_DAKAR,DAKAR_UNKNOW,UNKNOW_K,K_DAKAR_UNKNOW
365687,UNKNOW,K,,,,,,,,,...,,,,,,,K_UNKNOW,UNKNOW_UNKNOW,UNKNOW_K,K_UNKNOW_UNKNOW
1708095,KAOLACK,K,,,,,,,,,...,,,,,,,K_KAOLACK,KAOLACK_UNKNOW,UNKNOW_K,K_KAOLACK_UNKNOW
1118007,UNKNOW,K,,,,,,,,,...,,,,,,,K_UNKNOW,UNKNOW_UNKNOW,UNKNOW_K,K_UNKNOW_UNKNOW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013547,UNKNOW,K,,,,,,,,,...,,,,,,,K_UNKNOW,UNKNOW_UNKNOW,UNKNOW_K,K_UNKNOW_UNKNOW
990945,DAKAR,K,8900.0,16.0,9401.0,3134.0,22.0,,224.0,151.0,...,1.483444,17.230769,740.0,3733.0,741.666667,12.675506,K_DAKAR,DAKAR_All-net 500F=2000F;5d,All-net 500F=2000F;5d_K,K_DAKAR_All-net 500F=2000F;5d
322134,DAKAR,K,17100.0,11.0,18390.0,6130.0,12.0,,1346.0,559.0,...,2.407871,14.473118,265.0,3988.0,4275.000000,4.301754,K_DAKAR,DAKAR_MIXT: 200mnoff net _unl on net _5Go;30d,MIXT: 200mnoff net _unl on net _5Go;30d_K,K_DAKAR_MIXT: 200mnoff net _unl on net _5Go;30d
2151229,UNKNOW,K,,,,,,,,,...,,,,,,,K_UNKNOW,UNKNOW_UNKNOW,UNKNOW_K,K_UNKNOW_UNKNOW


# Regularization and mean encoding

In [None]:
"""from sklearn.model_selection import StratifiedKFold

y_tr=train_sample["CHURN"]
train_new=train_sample.copy()
train_new[cat_cols_mean]
skf=StratifiedKFold(n_splits=5,shuffle=True,random_state=123)
for tr_ind,val_ind in skf.split(train_sample,y_tr):
    X_tr,X_val=train_sample.iloc[tr_ind],train_sample.iloc[val_ind]
    for col in cat_cols:
        means=X_val[col].map(X_tr.groupby(col).CHURN.mean())
        X_val[col+'_MEAN_TARGET']=means
    #train_new.iloc[val_ind]=X_val
    train_new.loc[train_new.index[val_ind],cat_cols_mean]=X_val[cat_cols_mean]
prior=train_sample["CHURN"].mean()
#train_new.fillna(prior,inplace=True) 
train_new
"""

In [52]:
from sklearn import base
from sklearn.model_selection import KFold
class KFoldTargetEncoderTrain(base.BaseEstimator,
                               base.TransformerMixin):
    def __init__(self,colnames,targetName,
                  n_fold=5, verbosity=True,
                  discardOriginal_col=False):
        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col
    def fit(self, X, y=None):
        return self
    def transform(self,X):
        assert(type(self.targetName) == str)
        assert(type(self.colnames) == str)
        assert(self.colnames in X.columns)
        assert(self.targetName in X.columns)
        mean_of_target = X[self.targetName].mean()
        kf = KFold(n_splits = self.n_fold,
                   shuffle = True, random_state=2019)
        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc'
        X[col_mean_name] = np.nan
        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            X.loc[X.index[val_ind], col_mean_name] =X_val[self.colnames].map(X_tr.groupby(self.colnames)
                                     [self.targetName].mean())
            X[col_mean_name].fillna(mean_of_target, inplace = True)
        if self.verbosity:
            encoded_feature = X[col_mean_name].values
        if self.discardOriginal_col:
            X = X.drop(self.targetName, axis=1)
        return X


In [54]:
cat_cols_rafined=['TENURE',
 'REGION']
new_train = train_sample.copy()
for col in cat_cols_rafined:
    targetc = KFoldTargetEncoderTrain(col,"CHURN",n_fold=5)
    new_train = targetc.fit_transform(new_train)

In [55]:
new_train.head(5)
new_train.dtypes

REGION                             object
TENURE                             object
MONTANT                           float64
FREQUENCE_RECH                    float64
REVENUE                           float64
ARPU_SEGMENT                      float64
FREQUENCE                         float64
DATA_VOLUME                       float64
ON_NET                            float64
ORANGE                            float64
TIGO                              float64
ZONE1                             float64
ZONE2                             float64
REGULARITY                          int64
TOP_PACK                           object
FREQ_TOP_PACK                     float64
CHURN                               int64
FREQUENCE_BY_FREQUENCE_RECH       float64
sum_extra                         float64
RATIO_ON_NET_BY_EXTRA_NET         float64
RATIO_ON_NET_BY_ORANGE            float64
RATIO_ON_NET_BY_TIGO              float64
HYPOTHENUS_FREQ_AND_FREQ_RECH     float64
HYPOTHENUS_FREQ_AND_REGULARITY    

In [56]:
# mean coding the test set
class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self,train,colNames,encodedName):
        
        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName
        
    def fit(self, X, y=None):
        return self
    def transform(self,X):

        
        mean =  self.train[[self.colNames,
                self.encodedName]].groupby(
                                self.colNames).mean().reset_index() 
        print(mean)
        dd = {}
        for index, row in mean.iterrows():
            dd[row[self.colNames]] = row[self.encodedName]
        X[self.encodedName] = X[self.colNames]
        X = X.replace({self.encodedName: dd})
        return X

In [57]:
new_test=test.copy()
new_test

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,...,RATIO_ON_NET_BY_ORANGE,RATIO_ON_NET_BY_TIGO,HYPOTHENUS_FREQ_AND_FREQ_RECH,HYPOTHENUS_FREQ_AND_REGULARITY,MEAN_PRICE_TOP_PACK,MENSUAL_BY_MEAN_PRICE_TOP_PACK,TENURE_REGION,REGION_TOP_PACK,TOP_PACK_TENURE,TENURE_REGION_TOP_PACK
0,00001dbe00e56fc4b1c1b65dda63de2a5ece55f9,THIES,K,5000.0,5.0,5000.0,1667.0,5.0,,378.0,...,34.363636,75.6,50.0,1789.0,1000.000000,5.000000,K_THIES,THIES_On-net 1000F=10MilF;10d,On-net 1000F=10MilF;10d_K,K_THIES_On-net 1000F=10MilF;10d
1,000055d41c8a62052dd426592e8a4a3342bf565d,UNKNOW,I,300.0,2.0,326.0,109.0,3.0,397.0,,...,,,13.0,1690.0,300.000000,1.086667,I_UNKNOW,"UNKNOW_Data: 100 F=40MB,24H","Data: 100 F=40MB,24H_I","I_UNKNOW_Data: 100 F=40MB,24H"
2,000081dd3245e6869a4a9c574c7050e7bb84c2c8,DAKAR,K,3300.0,25.0,3400.0,1133.0,26.0,7150.0,0.0,...,0.000000,0.0,1301.0,3925.0,150.000000,22.666667,K_DAKAR,"DAKAR_Data: 100 F=40MB,24H","Data: 100 F=40MB,24H_K","K_DAKAR_Data: 100 F=40MB,24H"
3,0000b76d2145d9445d9ff6b65c9ebc4196c89337,UNKNOW,K,,,,,,,,...,,,,,,,K_UNKNOW,UNKNOW_UNKNOW,UNKNOW_K,K_UNKNOW_UNKNOW
4,0000bae5480628cf8fe51ad84bcb39772fc79224,UNKNOW,K,,,,,,,,...,,,,,,,K_UNKNOW,UNKNOW_UNKNOW,UNKNOW_K,K_UNKNOW_UNKNOW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380122,fffe7e03c7eede2ad0a728ee516c4d342dd16107,DAKAR,K,4000.0,8.0,3999.0,1333.0,8.0,1587.0,26.0,...,0.104000,26.0,128.0,2873.0,800.000000,4.998750,K_DAKAR,DAKAR_Mixt 250F=Unlimited_call24H,Mixt 250F=Unlimited_call24H_K,K_DAKAR_Mixt 250F=Unlimited_call24H
380123,fffec230e6a1aa51ab37d0051ece42de611e71c6,UNKNOW,K,,,,,,,,...,,,,,,,K_UNKNOW,UNKNOW_UNKNOW,UNKNOW_K,K_UNKNOW_UNKNOW
380124,ffff0dcc1ab9812bf205b6d76e9d084053cd96f5,UNKNOW,K,3950.0,7.0,3949.0,1316.0,10.0,1724.0,25.0,...,0.352113,,149.0,325.0,658.333333,5.998481,K_UNKNOW,UNKNOW_IVR Echat_Daily_50F,IVR Echat_Daily_50F_K,K_UNKNOW_IVR Echat_Daily_50F
380125,ffff91ea6a09a0c8ea42bc6ae33df4b5e06283dc,UNKNOW,K,3850.0,18.0,3955.0,1318.0,23.0,2962.0,0.0,...,0.000000,,853.0,1370.0,350.000000,11.300000,K_UNKNOW,"UNKNOW_Data: 100 F=40MB,24H","Data: 100 F=40MB,24H_K","K_UNKNOW_Data: 100 F=40MB,24H"


In [58]:

for col in cat_cols_rafined:
    test_targetc = KFoldTargetEncoderTest(new_train,
                                      col,
                                      col+'_Kfold_Target_Enc')
    new_test = test_targetc.fit_transform(new_test)

  TENURE  TENURE_Kfold_Target_Enc
0      D                 0.134964
1      E                 0.175105
2      F                 0.247292
3      G                 0.312336
4      H                 0.267059
5      I                 0.267591
6      J                 0.232440
7      K                 0.183263
         REGION  REGION_Kfold_Target_Enc
0         DAKAR                 0.019122
1      DIOURBEL                 0.027267
2        FATICK                 0.016561
3      KAFFRINE                 0.007062
4       KAOLACK                 0.022308
5      KEDOUGOU                 0.048766
6         KOLDA                 0.010690
7         LOUGA                 0.016348
8         MATAM                 0.021183
9   SAINT-LOUIS                 0.012655
10      SEDHIOU                 0.036558
11  TAMBACOUNDA                 0.015652
12        THIES                 0.016845
13       UNKNOW                 0.448232
14   ZIGUINCHOR                 0.028303


In [60]:
new_test.dtypes


user_id                            object
REGION                             object
TENURE                             object
MONTANT                           float64
FREQUENCE_RECH                    float64
REVENUE                           float64
ARPU_SEGMENT                      float64
FREQUENCE                         float64
DATA_VOLUME                       float64
ON_NET                            float64
ORANGE                            float64
TIGO                              float64
ZONE1                             float64
ZONE2                             float64
REGULARITY                          int64
TOP_PACK                           object
FREQ_TOP_PACK                     float64
FREQUENCE_BY_FREQUENCE_RECH       float64
sum_extra                         float64
RATIO_ON_NET_BY_EXTRA_NET         float64
RATIO_ON_NET_BY_ORANGE            float64
RATIO_ON_NET_BY_TIGO              float64
HYPOTHENUS_FREQ_AND_FREQ_RECH     float64
HYPOTHENUS_FREQ_AND_REGULARITY    

In [61]:
new_train.head(5)

Unnamed: 0,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,...,HYPOTHENUS_FREQ_AND_FREQ_RECH,HYPOTHENUS_FREQ_AND_REGULARITY,MEAN_PRICE_TOP_PACK,MENSUAL_BY_MEAN_PRICE_TOP_PACK,TENURE_REGION,REGION_TOP_PACK,TOP_PACK_TENURE,TENURE_REGION_TOP_PACK,TENURE_Kfold_Target_Enc,REGION_Kfold_Target_Enc
219628,DIOURBEL,K,5500.0,9.0,5500.0,1833.0,9.0,9118.0,,,...,162.0,2106.0,5500.0,1.0,K_DIOURBEL,"DIOURBEL_Data:490F=1GB,7d","Data:490F=1GB,7d_K","K_DIOURBEL_Data:490F=1GB,7d",0.183084,0.02729
1122719,DAKAR,K,1000.0,1.0,1000.0,333.0,2.0,,1.0,8.0,...,5.0,328.0,,,K_DAKAR,DAKAR_UNKNOW,UNKNOW_K,K_DAKAR_UNKNOW,0.183084,0.01888
365687,UNKNOW,K,,,,,,,,,...,,,,,K_UNKNOW,UNKNOW_UNKNOW,UNKNOW_K,K_UNKNOW_UNKNOW,0.183271,0.448232
1708095,KAOLACK,K,,,,,,,,,...,,,,,K_KAOLACK,KAOLACK_UNKNOW,UNKNOW_K,K_KAOLACK_UNKNOW,0.18305,0.021929
1118007,UNKNOW,K,,,,,,,,,...,,,,,K_UNKNOW,UNKNOW_UNKNOW,UNKNOW_K,K_UNKNOW_UNKNOW,0.18305,0.447872


# Correlation and vizualization

In [62]:
new_train.columns

Index(['REGION', 'TENURE', 'MONTANT', 'FREQUENCE_RECH', 'REVENUE',
       'ARPU_SEGMENT', 'FREQUENCE', 'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO',
       'ZONE1', 'ZONE2', 'REGULARITY', 'TOP_PACK', 'FREQ_TOP_PACK', 'CHURN',
       'FREQUENCE_BY_FREQUENCE_RECH', 'sum_extra', 'RATIO_ON_NET_BY_EXTRA_NET',
       'RATIO_ON_NET_BY_ORANGE', 'RATIO_ON_NET_BY_TIGO',
       'HYPOTHENUS_FREQ_AND_FREQ_RECH', 'HYPOTHENUS_FREQ_AND_REGULARITY',
       'MEAN_PRICE_TOP_PACK', 'MENSUAL_BY_MEAN_PRICE_TOP_PACK',
       'TENURE_REGION', 'REGION_TOP_PACK', 'TOP_PACK_TENURE',
       'TENURE_REGION_TOP_PACK', 'TENURE_Kfold_Target_Enc',
       'REGION_Kfold_Target_Enc'],
      dtype='object')

In [63]:
new_train.describe()

Unnamed: 0,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,...,sum_extra,RATIO_ON_NET_BY_EXTRA_NET,RATIO_ON_NET_BY_ORANGE,RATIO_ON_NET_BY_TIGO,HYPOTHENUS_FREQ_AND_FREQ_RECH,HYPOTHENUS_FREQ_AND_REGULARITY,MEAN_PRICE_TOP_PACK,MENSUAL_BY_MEAN_PRICE_TOP_PACK,TENURE_Kfold_Target_Enc,REGION_Kfold_Target_Enc
count,324260.0,324260.0,331503.0,331503.0,331503.0,253697.0,317669.0,292243.0,200578.0,39229.0,...,192387.0,183418.0,267365.0,189168.0,319165.0,331503.0,288682.0,288682.0,500000.0,500000.0
mean,5516.868374,11.51001,5497.334422,1832.450738,13.955077,3352.586,277.9884,94.761979,23.110585,8.431619,...,151.621887,7.468624,15.839372,99.139887,737.132753,2246.934124,938.207711,9.609301,0.187505,0.187506
std,7026.919283,13.222252,7148.399258,2382.795536,14.66136,13153.43,859.610861,200.254704,63.701545,39.922624,...,267.115529,83.471192,165.452419,539.563179,1579.427066,1954.61379,1279.566988,13.558757,0.019163,0.210218
min,10.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.5,0.0,0.0,0.0,2.0,2.0,3.389831,0.0055,0.122137,0.006253
25%,1000.0,2.0,1000.0,333.0,3.0,0.0,5.0,7.0,2.0,0.0,...,21.0,0.241935,0.306667,1.634582,20.0,485.0,500.0,2.0,0.18306,0.017457
50%,3000.0,7.0,3000.0,1000.0,9.0,254.0,27.0,29.0,6.0,1.0,...,64.0,0.841667,1.131148,7.174457,136.0,1882.0,637.5,5.0,0.183084,0.019397
75%,7300.0,16.0,7350.0,2450.0,20.0,2898.0,157.0,98.0,20.0,4.0,...,175.0,3.0,4.628959,34.5,666.0,3690.0,1000.0,12.024281,0.183271,0.447872
max,259500.0,133.0,397968.0,132656.0,91.0,1297464.0,29861.0,7660.0,3486.0,1804.0,...,7919.0,14456.0,23024.0,39640.0,25970.0,12125.0,120000.0,2006.885,0.314835,0.448842


In [64]:
new_train.corr()

Unnamed: 0,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,...,sum_extra,RATIO_ON_NET_BY_EXTRA_NET,RATIO_ON_NET_BY_ORANGE,RATIO_ON_NET_BY_TIGO,HYPOTHENUS_FREQ_AND_FREQ_RECH,HYPOTHENUS_FREQ_AND_REGULARITY,MEAN_PRICE_TOP_PACK,MENSUAL_BY_MEAN_PRICE_TOP_PACK,TENURE_Kfold_Target_Enc,REGION_Kfold_Target_Enc
MONTANT,1.0,0.797861,0.98148,0.981479,0.779294,0.295914,0.336082,0.659647,0.410688,0.378936,...,0.674765,0.017693,0.021994,0.06957,0.747046,0.723307,0.220491,0.682485,-0.011907,-0.150462
FREQUENCE_RECH,0.797861,1.0,0.803534,0.803533,0.955811,0.147304,0.410133,0.519051,0.335939,0.138222,...,0.526151,0.02874,0.024443,0.119408,0.923964,0.816865,-0.066497,0.7993,0.000392,-0.154184
REVENUE,0.98148,0.803534,1.0,1.0,0.784766,0.304548,0.338823,0.660712,0.407464,0.367184,...,0.673581,0.018464,0.0231,0.070989,0.753288,0.728939,0.177861,0.701531,-0.011812,-0.156698
ARPU_SEGMENT,0.981479,0.803533,1.0,1.0,0.784765,0.304549,0.338823,0.660712,0.407464,0.367183,...,0.673581,0.018464,0.023101,0.070988,0.753288,0.728938,0.177861,0.701531,-0.011812,-0.156698
FREQUENCE,0.779294,0.955811,0.784766,0.784765,1.0,0.165239,0.400346,0.471147,0.301265,0.120462,...,0.472792,0.027906,0.020974,0.121533,0.905685,0.848633,-0.019365,0.784884,0.001873,-0.173853
DATA_VOLUME,0.295914,0.147304,0.304548,0.304549,0.165239,1.0,-0.014003,0.063276,0.018056,0.044143,...,0.050127,-0.013738,-0.014191,-0.034789,0.13376,0.196988,0.22983,0.108573,0.012572,-0.019416
ON_NET,0.336082,0.410133,0.338823,0.338823,0.400346,-0.014003,1.0,0.227353,0.13538,0.015763,...,0.215788,0.189062,0.198542,0.439023,0.401399,0.387708,-0.015569,0.335184,-0.004709,-0.079303
ORANGE,0.659647,0.519051,0.660712,0.660712,0.471147,0.063276,0.227353,1.0,0.399023,0.059221,...,0.975021,-0.033741,-0.038049,0.012454,0.502961,0.447604,0.076529,0.513085,-0.020798,-0.101511
TIGO,0.410688,0.335939,0.407464,0.407464,0.301265,0.018056,0.13538,0.399023,1.0,0.00955,...,0.592722,-0.020373,-0.012664,-0.057629,0.327708,0.288298,0.017449,0.331876,-0.01249,-0.053304
ZONE1,0.378936,0.138222,0.367184,0.367183,0.120462,0.044143,0.015763,0.059221,0.00955,1.0,...,0.05773,-0.005987,-0.005511,-0.003698,0.115551,0.086573,0.044787,0.19779,0.016971,-0.004523


In [None]:
#sns.pairplot(train_sample)
new_test

In [66]:
new_train["CHURN"]=new_train["CHURN"].astype("category")

for attri in cat_cols:
    new_train[attri]=new_train[attri].astype("category")
    new_test[attri]=new_test[attri].astype("category")

In [67]:
new_test.tail(10)

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,...,HYPOTHENUS_FREQ_AND_FREQ_RECH,HYPOTHENUS_FREQ_AND_REGULARITY,MEAN_PRICE_TOP_PACK,MENSUAL_BY_MEAN_PRICE_TOP_PACK,TENURE_REGION,REGION_TOP_PACK,TOP_PACK_TENURE,TENURE_REGION_TOP_PACK,TENURE_Kfold_Target_Enc,REGION_Kfold_Target_Enc
380117,fffd954647f34b38b527dd2dfbfecb49edbdf8f7,THIES,K,20000.0,27.0,20001.0,6667.0,35.0,74350.0,55.0,...,1954.0,5069.0,1333.333333,15.00075,K_THIES,"THIES_Data:1000F=5GB,7d","Data:1000F=5GB,7d_K","K_THIES_Data:1000F=5GB,7d",0.183263,0.016845
380118,fffe4dfd57392c8d7ddfd48cb51bef3c8adb0892,UNKNOW,H,1000.0,2.0,991.0,330.0,2.0,1507.0,,...,8.0,20.0,500.0,1.982,H_UNKNOW,"UNKNOW_Data:490F=1GB,7d","Data:490F=1GB,7d_H","H_UNKNOW_Data:490F=1GB,7d",0.267059,0.448232
380119,fffe51167f1ad1bf26dda45ccfc40b5d7fab8384,SAINT-LOUIS,F,43500.0,75.0,44537.0,14846.0,75.0,0.0,1611.0,...,11250.0,9469.0,500.0,89.074,F_SAINT-LOUIS,SAINT-LOUIS_Jokko_Daily,Jokko_Daily_F,F_SAINT-LOUIS_Jokko_Daily,0.247292,0.012655
380120,fffe5c84e3db939182ee2a3a3123920025eca8a3,SAINT-LOUIS,K,3300.0,7.0,3299.0,1100.0,7.0,,80.0,...,98.0,2753.0,825.0,3.998788,K_SAINT-LOUIS,SAINT-LOUIS_All-net 500F=2000F;5d,All-net 500F=2000F;5d_K,K_SAINT-LOUIS_All-net 500F=2000F;5d,0.183263,0.012655
380121,fffe7c538a7d66446ee6f66c0b11b5446ec1be68,MATAM,K,4700.0,28.0,4915.0,1638.0,38.0,3942.0,1.0,...,2228.0,3944.0,142.424242,34.509574,K_MATAM,"MATAM_Data: 100 F=40MB,24H","Data: 100 F=40MB,24H_K","K_MATAM_Data: 100 F=40MB,24H",0.183263,0.021183
380122,fffe7e03c7eede2ad0a728ee516c4d342dd16107,DAKAR,K,4000.0,8.0,3999.0,1333.0,8.0,1587.0,26.0,...,128.0,2873.0,800.0,4.99875,K_DAKAR,DAKAR_Mixt 250F=Unlimited_call24H,Mixt 250F=Unlimited_call24H_K,K_DAKAR_Mixt 250F=Unlimited_call24H,0.183263,0.019122
380123,fffec230e6a1aa51ab37d0051ece42de611e71c6,UNKNOW,K,,,,,,,,...,,,,,K_UNKNOW,UNKNOW_UNKNOW,UNKNOW_K,K_UNKNOW_UNKNOW,0.183263,0.448232
380124,ffff0dcc1ab9812bf205b6d76e9d084053cd96f5,UNKNOW,K,3950.0,7.0,3949.0,1316.0,10.0,1724.0,25.0,...,149.0,325.0,658.333333,5.998481,K_UNKNOW,UNKNOW_IVR Echat_Daily_50F,IVR Echat_Daily_50F_K,K_UNKNOW_IVR Echat_Daily_50F,0.183263,0.448232
380125,ffff91ea6a09a0c8ea42bc6ae33df4b5e06283dc,UNKNOW,K,3850.0,18.0,3955.0,1318.0,23.0,2962.0,0.0,...,853.0,1370.0,350.0,11.3,K_UNKNOW,"UNKNOW_Data: 100 F=40MB,24H","Data: 100 F=40MB,24H_K","K_UNKNOW_Data: 100 F=40MB,24H",0.183263,0.448232
380126,ffffb393b346f5348034e6e22be93778d94d4beb,DIOURBEL,K,,,,,,,0.0,...,,,,,K_DIOURBEL,DIOURBEL_UNKNOW,UNKNOW_K,K_DIOURBEL_UNKNOW,0.183263,0.027267


In [68]:
new_test.dtypes

user_id                             object
REGION                            category
TENURE                            category
MONTANT                            float64
FREQUENCE_RECH                     float64
REVENUE                            float64
ARPU_SEGMENT                       float64
FREQUENCE                          float64
DATA_VOLUME                        float64
ON_NET                             float64
ORANGE                             float64
TIGO                               float64
ZONE1                              float64
ZONE2                              float64
REGULARITY                           int64
TOP_PACK                          category
FREQ_TOP_PACK                      float64
FREQUENCE_BY_FREQUENCE_RECH        float64
sum_extra                          float64
RATIO_ON_NET_BY_EXTRA_NET          float64
RATIO_ON_NET_BY_ORANGE             float64
RATIO_ON_NET_BY_TIGO               float64
HYPOTHENUS_FREQ_AND_FREQ_RECH      float64
HYPOTHENUS_

In [77]:
test_prepared=new_test
test_prepared

Unnamed: 0,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,...,HYPOTHENUS_FREQ_AND_FREQ_RECH,HYPOTHENUS_FREQ_AND_REGULARITY,MEAN_PRICE_TOP_PACK,MENSUAL_BY_MEAN_PRICE_TOP_PACK,TENURE_REGION,REGION_TOP_PACK,TOP_PACK_TENURE,TENURE_REGION_TOP_PACK,TENURE_Kfold_Target_Enc,REGION_Kfold_Target_Enc
0,THIES,K,5000.0,5.0,5000.0,1667.0,5.0,,378.0,11.0,...,50.0,1789.0,1000.000000,5.000000,K_THIES,THIES_On-net 1000F=10MilF;10d,On-net 1000F=10MilF;10d_K,K_THIES_On-net 1000F=10MilF;10d,0.183263,0.016845
1,UNKNOW,I,300.0,2.0,326.0,109.0,3.0,397.0,,0.0,...,13.0,1690.0,300.000000,1.086667,I_UNKNOW,"UNKNOW_Data: 100 F=40MB,24H","Data: 100 F=40MB,24H_I","I_UNKNOW_Data: 100 F=40MB,24H",0.267591,0.448232
2,DAKAR,K,3300.0,25.0,3400.0,1133.0,26.0,7150.0,0.0,2.0,...,1301.0,3925.0,150.000000,22.666667,K_DAKAR,"DAKAR_Data: 100 F=40MB,24H","Data: 100 F=40MB,24H_K","K_DAKAR_Data: 100 F=40MB,24H",0.183263,0.019122
3,UNKNOW,K,,,,,,,,,...,,,,,K_UNKNOW,UNKNOW_UNKNOW,UNKNOW_K,K_UNKNOW_UNKNOW,0.183263,0.448232
4,UNKNOW,K,,,,,,,,,...,,,,,K_UNKNOW,UNKNOW_UNKNOW,UNKNOW_K,K_UNKNOW_UNKNOW,0.183263,0.448232
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380122,DAKAR,K,4000.0,8.0,3999.0,1333.0,8.0,1587.0,26.0,250.0,...,128.0,2873.0,800.000000,4.998750,K_DAKAR,DAKAR_Mixt 250F=Unlimited_call24H,Mixt 250F=Unlimited_call24H_K,K_DAKAR_Mixt 250F=Unlimited_call24H,0.183263,0.019122
380123,UNKNOW,K,,,,,,,,,...,,,,,K_UNKNOW,UNKNOW_UNKNOW,UNKNOW_K,K_UNKNOW_UNKNOW,0.183263,0.448232
380124,UNKNOW,K,3950.0,7.0,3949.0,1316.0,10.0,1724.0,25.0,71.0,...,149.0,325.0,658.333333,5.998481,K_UNKNOW,UNKNOW_IVR Echat_Daily_50F,IVR Echat_Daily_50F_K,K_UNKNOW_IVR Echat_Daily_50F,0.183263,0.448232
380125,UNKNOW,K,3850.0,18.0,3955.0,1318.0,23.0,2962.0,0.0,7.0,...,853.0,1370.0,350.000000,11.300000,K_UNKNOW,"UNKNOW_Data: 100 F=40MB,24H","Data: 100 F=40MB,24H_K","K_UNKNOW_Data: 100 F=40MB,24H",0.183263,0.448232


In [70]:
train_labels=new_train["CHURN"].copy()
train_prepared=new_train.drop("CHURN",axis=1)
train_prepared.dtypes

REGION                            category
TENURE                            category
MONTANT                            float64
FREQUENCE_RECH                     float64
REVENUE                            float64
ARPU_SEGMENT                       float64
FREQUENCE                          float64
DATA_VOLUME                        float64
ON_NET                             float64
ORANGE                             float64
TIGO                               float64
ZONE1                              float64
ZONE2                              float64
REGULARITY                           int64
TOP_PACK                          category
FREQ_TOP_PACK                      float64
FREQUENCE_BY_FREQUENCE_RECH        float64
sum_extra                          float64
RATIO_ON_NET_BY_EXTRA_NET          float64
RATIO_ON_NET_BY_ORANGE             float64
RATIO_ON_NET_BY_TIGO               float64
HYPOTHENUS_FREQ_AND_FREQ_RECH      float64
HYPOTHENUS_FREQ_AND_REGULARITY     float64
MEAN_PRICE_

In [71]:
cat_cols=["TENURE","REGION","TOP_PACK","TENURE_REGION","REGION_TOP_PACK","TOP_PACK_TENURE","TENURE_REGION_TOP_PACK"]
num_cols=[ x for x in train_prepared.columns if x not in cat_cols ]
num_cols

['MONTANT',
 'FREQUENCE_RECH',
 'REVENUE',
 'ARPU_SEGMENT',
 'FREQUENCE',
 'DATA_VOLUME',
 'ON_NET',
 'ORANGE',
 'TIGO',
 'ZONE1',
 'ZONE2',
 'REGULARITY',
 'FREQ_TOP_PACK',
 'FREQUENCE_BY_FREQUENCE_RECH',
 'sum_extra',
 'RATIO_ON_NET_BY_EXTRA_NET',
 'RATIO_ON_NET_BY_ORANGE',
 'RATIO_ON_NET_BY_TIGO',
 'HYPOTHENUS_FREQ_AND_FREQ_RECH',
 'HYPOTHENUS_FREQ_AND_REGULARITY',
 'MEAN_PRICE_TOP_PACK',
 'MENSUAL_BY_MEAN_PRICE_TOP_PACK',
 'TENURE_Kfold_Target_Enc',
 'REGION_Kfold_Target_Enc']

## Preprocessing

In [72]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
num_pipeline = Pipeline([
('std_scaler', StandardScaler())
])
full_pipeline = ColumnTransformer([
         ("num",num_pipeline,num_cols),
        ("cat", OneHotEncoder(), cat_cols)
],remainder='passthrough')



In [73]:
new_train.dtypes

REGION                            category
TENURE                            category
MONTANT                            float64
FREQUENCE_RECH                     float64
REVENUE                            float64
ARPU_SEGMENT                       float64
FREQUENCE                          float64
DATA_VOLUME                        float64
ON_NET                             float64
ORANGE                             float64
TIGO                               float64
ZONE1                              float64
ZONE2                              float64
REGULARITY                           int64
TOP_PACK                          category
FREQ_TOP_PACK                      float64
CHURN                             category
FREQUENCE_BY_FREQUENCE_RECH        float64
sum_extra                          float64
RATIO_ON_NET_BY_EXTRA_NET          float64
RATIO_ON_NET_BY_ORANGE             float64
RATIO_ON_NET_BY_TIGO               float64
HYPOTHENUS_FREQ_AND_FREQ_RECH      float64
HYPOTHENUS_

In [78]:
test_prepared.head(5)

Unnamed: 0,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,...,HYPOTHENUS_FREQ_AND_FREQ_RECH,HYPOTHENUS_FREQ_AND_REGULARITY,MEAN_PRICE_TOP_PACK,MENSUAL_BY_MEAN_PRICE_TOP_PACK,TENURE_REGION,REGION_TOP_PACK,TOP_PACK_TENURE,TENURE_REGION_TOP_PACK,TENURE_Kfold_Target_Enc,REGION_Kfold_Target_Enc
0,THIES,K,5000.0,5.0,5000.0,1667.0,5.0,,378.0,11.0,...,50.0,1789.0,1000.0,5.0,K_THIES,THIES_On-net 1000F=10MilF;10d,On-net 1000F=10MilF;10d_K,K_THIES_On-net 1000F=10MilF;10d,0.183263,0.016845
1,UNKNOW,I,300.0,2.0,326.0,109.0,3.0,397.0,,0.0,...,13.0,1690.0,300.0,1.086667,I_UNKNOW,"UNKNOW_Data: 100 F=40MB,24H","Data: 100 F=40MB,24H_I","I_UNKNOW_Data: 100 F=40MB,24H",0.267591,0.448232
2,DAKAR,K,3300.0,25.0,3400.0,1133.0,26.0,7150.0,0.0,2.0,...,1301.0,3925.0,150.0,22.666667,K_DAKAR,"DAKAR_Data: 100 F=40MB,24H","Data: 100 F=40MB,24H_K","K_DAKAR_Data: 100 F=40MB,24H",0.183263,0.019122
3,UNKNOW,K,,,,,,,,,...,,,,,K_UNKNOW,UNKNOW_UNKNOW,UNKNOW_K,K_UNKNOW_UNKNOW,0.183263,0.448232
4,UNKNOW,K,,,,,,,,,...,,,,,K_UNKNOW,UNKNOW_UNKNOW,UNKNOW_K,K_UNKNOW_UNKNOW,0.183263,0.448232


In [79]:
train_transformed=full_pipeline.fit_transform(train_prepared)


In [167]:
test_prepared.dtypes

REGION                                     category
TENURE                                     category
MONTANT                                     float64
FREQUENCE_RECH                              float64
REVENUE                                     float64
ARPU_SEGMENT                                float64
FREQUENCE                                   float64
DATA_VOLUME                                 float64
ON_NET                                      float64
ORANGE                                      float64
TIGO                                        float64
ZONE1                                       float64
ZONE2                                       float64
REGULARITY                                    int64
TOP_PACK                                   category
FREQ_TOP_PACK                               float64
FREQUENCE_BY_FREQUENCE_RECH                 float64
sum_extra                                   float64
RATIO_ON_NET_BY_EXTRA_NET                   float64
RATIO_ON_NET

In [342]:
test_prepared

In [80]:
test_transformed=full_pipeline.fit_transform(test_prepared)

In [None]:
test_prepared.columns,train_prepared.columns

# Dimensuality reduction, feature  extraction

In [77]:
X_ALL=train_prepared.copy()
X_ALL.shape
X_train_test=X_ALL.append(test_prepared)
X_train_test.shape

(880127, 29)

In [46]:
#X_train_test.iloc[999999],train_prepared.iloc[999999]

In [42]:
train_transformed

<500000x4286 sparse matrix of type '<class 'numpy.float64'>'
	with 14500000 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.decomposition import SparsePCA
n_components=5
alpha=0.0001
random_state=2018
n_jobs=-1
#sparse_pca=SparsePCA(n_components=5,alpha=alpha,random_state=random_state,n_jobs=n_jobs)
#sparse_pca.fit(train_transformed)
#X_train_sparsePCA=np.concatenate([train_transformed,test_transformed])


In [78]:
train_prepared.dtypes

REGION                            category
TENURE                            category
MONTANT                            float64
FREQUENCE_RECH                     float64
REVENUE                            float64
ARPU_SEGMENT                       float64
FREQUENCE                          float64
DATA_VOLUME                        float64
ON_NET                             float64
ORANGE                             float64
TIGO                               float64
ZONE1                              float64
ZONE2                              float64
REGULARITY                           int64
TOP_PACK                          category
FREQ_TOP_PACK                      float64
FREQUENCE_BY_FREQUENCE_RECH        float64
sum_extra                          float64
RATIO_ON_NET_BY_EXTRA_NET          float64
RATIO_ON_NET_BY_ORANGE             float64
RATIO_ON_NET_BY_TIGO               float64
HYPOTHENUS_FREQ_AND_FREQ_RECH      float64
HYPOTHENUS_FREQ_AND_REGULARITY     float64
MEAN_PRICE_

In [83]:
X_train_test_transformed=full_pipeline.fit_transform(X_train_test).toarray()

NameError: name 'X_train_test' is not defined

In [None]:
from sklearn.decomposition import PCA
#pca=PCA(n_components=2)
#pca.fit(X_train_test_transformed)

In [None]:
#X_train_pca=pca.transform(X_train_test_transformed[:500000])
#X_test_pca=pca.transform(X_train_test_transformed[500000:])


In [51]:
X_train_test_transformed

NameError: name 'X_train_test_transformed' is not defined

# Training XGboost model

In [46]:
!pip3 install xgboost



In [48]:
!brew install libomp

zsh:1: command not found: brew


In [49]:
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# CV model
model = xgboost.XGBClassifier(objective='logloss')
kfold = KFold(n_splits=10, random_state=7)
results = cross_val_score(model, train_transformed, train_labels, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))




Accuracy: 87.94% (0.11%)


In [82]:
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# CV model
model = xgboost.XGBClassifier()
kfold = KFold(n_splits=10, random_state=7)
results = cross_val_score(model, train_transformed, train_labels, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))


Accuracy: 87.93% (0.09%)


In [89]:
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
param_test1 = {
 'max_depth' : range(1,10),
 'min_child_weight' : range(1,6)
}
xgb2 = xgboost.XGBClassifier(
        learning_rate =0.1, 
        n_estimators=65,
        max_depth=5,
        min_child_weight=1, 
        gamma=0, 
        subsample=0.8, 
        colsample_bytree=0.8,
        objective= 'binary:logistic',
        nthread=-1,
        scale_pos_weight=1, 
        seed=27
)

gsearch1 = GridSearchCV(
    estimator = xgb2, 
    param_grid = param_test1, scoring='neg_log_loss', n_jobs=-1, cv=5
)

gsearch1.fit(train_transformed, train_labels)
gsearch1.best_params_, gsearch1.best_score_





({'max_depth': 7, 'min_child_weight': 5}, -0.25260559367635915)

In [None]:
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
param_test1 = {
 'max_depth' : range(1,10),
 'min_child_weight' : range(1,6)
}
xgb2 = xgboost.XGBClassifier(
        learning_rate =0.1, 
        n_estimators=65,
        max_depth=5,
        min_child_weight=1, 
        gamma=0, 
        subsample=0.8, 
        colsample_bytree=0.8,
        objective= 'binary:logistic',
        nthread=-1,
        scale_pos_weight=1, 
        seed=27
)

gsearch1 = GridSearchCV(
    estimator = xgb2, 
    param_grid = param_test1, scoring='neg_log_loss', n_jobs=-1, cv=5
)

gsearch1.fit(train_transformed, train_labels)
gsearch1.best_params_, gsearch1.best_score_

In [None]:
tr