In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier,ExtraTreesClassifier,RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error,log_loss
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler
from math import sqrt
import warnings
warnings.filterwarnings("ignore")

# Set all options
%matplotlib inline
plt.style.use('seaborn-notebook')
plt.rcParams["figure.figsize"] = (20, 3)
pd.options.display.float_format = '{:20,.4f}'.format
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
sns.set(context="paper", font="monospace")

In [2]:
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')

In [3]:
train_data.set_index('user_id', inplace = True)
test_data.set_index('user_id',inplace=True)

In [4]:
train_data.drop(['ZONE1', 'ZONE2','MRG'], axis = 1, inplace = True)
test_data.drop(['ZONE1', 'ZONE2', 'MRG'], axis = 1, inplace = True)

In [5]:
test_data['CHURN'] = 'test_data' #for easy separation

# Combine train and test to avoid repetition
data = pd.concat([train_data, test_data]).reset_index(drop=True)

In [6]:
missing_col = data.columns[data.isna().any()]

In [7]:
for col in missing_col.drop(['REGION', 'TOP_PACK']):
    
    data[col].fillna(999, inplace = True)

In [8]:
for col in [ 'TOP_PACK', 'REGION']:
    
    data[col].fillna(data[col].mode(),inplace = True)

In [9]:
tenure = data.drop('CHURN',1).groupby('TENURE').median()
ten_cols = tenure.columns.tolist()
ten_mapper = {}

for c in ten_cols:
  ten_mapper[c] = c+'_mean_on_ten'

In [10]:
tenure.rename(columns=ten_mapper, inplace=True)
tenure['TENURE'] = tenure.index
tenure.reset_index(drop=True, inplace=True)

In [11]:
top_pack = data.drop('CHURN',1).groupby('TOP_PACK').median()
tp_cols = top_pack.columns.tolist()
tp_mapper = {}

for c in tp_cols:
  tp_mapper[c] = c+'_mean_on_tp'

In [12]:
top_pack.rename(columns=tp_mapper, inplace=True)
top_pack['TOP_PACK'] = top_pack.index
top_pack.reset_index(drop=True, inplace=True)

In [13]:
region = data.drop('CHURN',1).groupby('REGION').median()

In [14]:
reg_cols = region.columns.tolist()
reg_mapper = {}

for c in reg_cols:
  reg_mapper[c] = c+'_mean_on_reg'

In [15]:
region.rename(columns=reg_mapper, inplace=True)
region['REGION'] = region.index
region.reset_index(drop=True, inplace=True)

In [13]:
data = data.merge(top_pack, on='TOP_PACK', how='left')

In [14]:
data = data.merge(tenure, on='TENURE', how='left')

In [18]:
data = data.merge(region, on='REGION', how='left')

In [15]:
data['TOT_REVENUE'] = data.REVENUE * data.FREQUENCE
data['TOT_SPENT'] = data.MONTANT * data.FREQUENCE_RECH
data['GROSS'] = data['TOT_REVENUE'] - data['TOT_SPENT']
data['REVENUE-MONTANT'] = data['REVENUE'] - data['MONTANT']
data['REVENUE/MONTANT'] = data['REVENUE'] / data['MONTANT']

In [16]:
from scipy.stats import skew,norm

In [17]:
numeric = data.select_dtypes(include='number').columns
skew_features = data[numeric].apply(lambda x: skew(x)).sort_values(ascending=False)

high_skew = skew_features[skew_features > 0.6]
skew_index = high_skew.index

print("There are {} numerical features with Skew > 0.5 :".format(high_skew.shape[0]))
skewness = pd.DataFrame({'Skew' :high_skew})
skew_features.head(10)

There are 17 numerical features with Skew > 0.5 :


REVENUE/MONTANT                           50.6464
DATA_VOLUME                               36.4281
REVENUE_mean_on_ten                       33.5095
MONTANT_mean_on_ten                       33.5085
FREQ_TOP_PACK_mean_on_ten                 11.8216
ORANGE_mean_on_ten                        10.2562
ON_NET                                     6.7320
FREQUENCE_RECH_mean_on_ten                 4.7596
FREQUENCE_mean_on_ten                      4.7388
ARPU_SEGMENT                               4.6681
dtype: float64

In [18]:
for column in skew_index:
    data[column] = np.log1p(data[column])

In [19]:
cat_features = data.select_dtypes(exclude=np.number).columns.drop('CHURN')

In [20]:
for col in cat_features:
    print('{} has {} unique categories'.format(col, data[col].nunique()))

REGION has 14 unique categories
TENURE has 8 unique categories
TOP_PACK has 112 unique categories


In [21]:
data = pd.get_dummies(columns= ['REGION', 'TENURE'], data = data)

In [22]:
#Label encoding
data['TOP_PACK']  = pd.factorize(data['TOP_PACK'])[0]

In [23]:
train_data = data.loc[( data['CHURN'] != 'test_data')].reset_index(drop = True)
test_data = data.loc[( data['CHURN'] == 'test_data')].reset_index(drop = True)

In [24]:
train_data.columns

Index(['MONTANT', 'FREQUENCE_RECH', 'REVENUE', 'ARPU_SEGMENT', 'FREQUENCE',
       'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO', 'REGULARITY', 'TOP_PACK',
       'FREQ_TOP_PACK', 'CHURN', 'MONTANT_mean_on_tp',
       'FREQUENCE_RECH_mean_on_tp', 'REVENUE_mean_on_tp',
       'ARPU_SEGMENT_mean_on_tp', 'FREQUENCE_mean_on_tp',
       'DATA_VOLUME_mean_on_tp', 'ON_NET_mean_on_tp', 'ORANGE_mean_on_tp',
       'TIGO_mean_on_tp', 'REGULARITY_mean_on_tp', 'FREQ_TOP_PACK_mean_on_tp',
       'MONTANT_mean_on_ten', 'FREQUENCE_RECH_mean_on_ten',
       'REVENUE_mean_on_ten', 'ARPU_SEGMENT_mean_on_ten',
       'FREQUENCE_mean_on_ten', 'DATA_VOLUME_mean_on_ten',
       'ON_NET_mean_on_ten', 'ORANGE_mean_on_ten', 'TIGO_mean_on_ten',
       'REGULARITY_mean_on_ten', 'FREQ_TOP_PACK_mean_on_ten', 'TOT_REVENUE',
       'TOT_SPENT', 'GROSS', 'REVENUE-MONTANT', 'REVENUE/MONTANT',
       'REGION_DAKAR', 'REGION_DIOURBEL', 'REGION_FATICK', 'REGION_KAFFRINE',
       'REGION_KAOLACK', 'REGION_KEDOUGOU', 'REGION_K

In [25]:
train_data['CHURN'] = train_data['CHURN'].apply(int)

In [26]:
in_cols = train_data.drop(columns=['CHURN']).columns
in_cols

Index(['MONTANT', 'FREQUENCE_RECH', 'REVENUE', 'ARPU_SEGMENT', 'FREQUENCE',
       'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO', 'REGULARITY', 'TOP_PACK',
       'FREQ_TOP_PACK', 'MONTANT_mean_on_tp', 'FREQUENCE_RECH_mean_on_tp',
       'REVENUE_mean_on_tp', 'ARPU_SEGMENT_mean_on_tp', 'FREQUENCE_mean_on_tp',
       'DATA_VOLUME_mean_on_tp', 'ON_NET_mean_on_tp', 'ORANGE_mean_on_tp',
       'TIGO_mean_on_tp', 'REGULARITY_mean_on_tp', 'FREQ_TOP_PACK_mean_on_tp',
       'MONTANT_mean_on_ten', 'FREQUENCE_RECH_mean_on_ten',
       'REVENUE_mean_on_ten', 'ARPU_SEGMENT_mean_on_ten',
       'FREQUENCE_mean_on_ten', 'DATA_VOLUME_mean_on_ten',
       'ON_NET_mean_on_ten', 'ORANGE_mean_on_ten', 'TIGO_mean_on_ten',
       'REGULARITY_mean_on_ten', 'FREQ_TOP_PACK_mean_on_ten', 'TOT_REVENUE',
       'TOT_SPENT', 'GROSS', 'REVENUE-MONTANT', 'REVENUE/MONTANT',
       'REGION_DAKAR', 'REGION_DIOURBEL', 'REGION_FATICK', 'REGION_KAFFRINE',
       'REGION_KAOLACK', 'REGION_KEDOUGOU', 'REGION_KOLDA', 'REGION_L

In [27]:
ycol = 'CHURN'

In [28]:
kf = KFold(n_splits=5, shuffle=False)

In [29]:
train_data['CHURN'].value_counts()

0    325156
1     74844
Name: CHURN, dtype: int64

In [30]:
scores = []
for train, test in kf.split(train_data):
    lgb = LGBMClassifier()
    lgb.fit(train_data[in_cols].iloc[train], train_data[ycol].iloc[train],verbose=50)
    lloss = log_loss(train_data[ycol].iloc[test], lgb.predict_proba(train_data[in_cols].iloc[test])[:, 1])
    scores.append(lloss)
    print(lloss)

print("Average score in 5-fold CV:", np.mean(scores))

0.2525587010469911
0.25267069664015296
0.25403631323533366
0.2558632785264341
0.25194761699846524
Average score in 5-fold CV: 0.2534153212894754


In [31]:

params = {
#     'n_estimators': 2000,
#     'learning_rate': 0.15,
    'objective': 'Logloss',
#     'rsm': 0.7,
    'random_seed': 2020,
    'early_stopping_rounds': 200,
#     'use_best_model': True,
    'metric_period':100
}

In [None]:
scores = []
for train, test in kf.split(train_data):
    lgb = CatBoostClassifier(**params)
    lgb.fit(train_data[in_cols].iloc[train], train_data[ycol].iloc[train],eval_set=[(train_data[in_cols].iloc[test], train_data[ycol].iloc[test])])
    lloss = log_loss(train_data[ycol].iloc[test], lgb.predict_proba(train_data[in_cols].iloc[test])[:, 1])
    scores.append(lloss)
    print(lloss)

print("Average score in 5-fold CV:", np.mean(scores))

Learning rate set to 0.142986




0:	learn: 0.5391395	test: 0.5389473	best: 0.5389473 (0)	total: 299ms	remaining: 4m 59s
100:	learn: 0.2525176	test: 0.2525156	best: 0.2525156 (100)	total: 13.7s	remaining: 2m 1s
200:	learn: 0.2498505	test: 0.2518138	best: 0.2518122 (199)	total: 27s	remaining: 1m 47s
300:	learn: 0.2480849	test: 0.2516983	best: 0.2516850 (261)	total: 40.2s	remaining: 1m 33s
400:	learn: 0.2466126	test: 0.2517735	best: 0.2516815 (301)	total: 53.4s	remaining: 1m 19s
500:	learn: 0.2453395	test: 0.2519543	best: 0.2516815 (301)	total: 1m 6s	remaining: 1m 6s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.2516814575
bestIteration = 301

Shrink model to first 302 iterations.
0.251681457457433
Learning rate set to 0.142986




0:	learn: 0.5460348	test: 0.5457241	best: 0.5457241 (0)	total: 172ms	remaining: 2m 51s
100:	learn: 0.2523045	test: 0.2528773	best: 0.2528773 (100)	total: 13.7s	remaining: 2m 1s
200:	learn: 0.2497749	test: 0.2522764	best: 0.2522764 (200)	total: 27s	remaining: 1m 47s
300:	learn: 0.2480290	test: 0.2521865	best: 0.2521571 (277)	total: 40.4s	remaining: 1m 33s
400:	learn: 0.2465589	test: 0.2521751	best: 0.2521324 (371)	total: 53.9s	remaining: 1m 20s
500:	learn: 0.2452129	test: 0.2523216	best: 0.2521324 (371)	total: 1m 7s	remaining: 1m 6s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.2521323509
bestIteration = 371

Shrink model to first 372 iterations.
0.25213235090127495
Learning rate set to 0.142986




0:	learn: 0.5453288	test: 0.5457151	best: 0.5457151 (0)	total: 173ms	remaining: 2m 52s
100:	learn: 0.2519236	test: 0.2544132	best: 0.2544132 (100)	total: 13.7s	remaining: 2m 1s
200:	learn: 0.2494512	test: 0.2538512	best: 0.2538243 (194)	total: 27.2s	remaining: 1m 48s
300:	learn: 0.2476809	test: 0.2539579	best: 0.2538101 (230)	total: 40.8s	remaining: 1m 34s
400:	learn: 0.2460484	test: 0.2540410	best: 0.2538101 (230)	total: 54.2s	remaining: 1m 21s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.2538100977
bestIteration = 230

Shrink model to first 231 iterations.
0.2538100977108301
Learning rate set to 0.142986




0:	learn: 0.5396234	test: 0.5400546	best: 0.5400546 (0)	total: 165ms	remaining: 2m 45s
100:	learn: 0.2516618	test: 0.2559151	best: 0.2559151 (100)	total: 13.6s	remaining: 2m 1s


In [33]:
cat = CatBoostClassifier(**params)
cat.fit(train_data[in_cols], train_data[ycol])

Learning rate set to 0.091308
0:	learn: 0.5626761	total: 242ms	remaining: 4m 2s
100:	learn: 0.2517957	total: 16.6s	remaining: 2m 27s
200:	learn: 0.2504240	total: 32.5s	remaining: 2m 9s
300:	learn: 0.2493448	total: 48.5s	remaining: 1m 52s
400:	learn: 0.2483564	total: 1m 4s	remaining: 1m 36s
500:	learn: 0.2475298	total: 1m 20s	remaining: 1m 20s
600:	learn: 0.2467095	total: 1m 36s	remaining: 1m 4s
700:	learn: 0.2459561	total: 1m 52s	remaining: 48.1s
800:	learn: 0.2452932	total: 2m 9s	remaining: 32.1s
900:	learn: 0.2446214	total: 2m 25s	remaining: 16s
999:	learn: 0.2440026	total: 2m 41s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1ed014ad2e8>

In [34]:
test_data['CHURN'] = cat.predict_proba(test_data[in_cols])[:,1]
test_data.head()

Unnamed: 0,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN,MONTANT_mean_on_reg,FREQUENCE_RECH_mean_on_reg,REVENUE_mean_on_reg,ARPU_SEGMENT_mean_on_reg,FREQUENCE_mean_on_reg,DATA_VOLUME_mean_on_reg,ON_NET_mean_on_reg,ORANGE_mean_on_reg,TIGO_mean_on_reg,REGULARITY_mean_on_reg,FREQ_TOP_PACK_mean_on_reg,TOT_REVENUE,TOT_SPENT,GROSS,REVENUE-MONTANT,REVENUE/MONTANT,REGION_DAKAR,REGION_DIOURBEL,REGION_FATICK,REGION_KAFFRINE,REGION_KAOLACK,REGION_KEDOUGOU,REGION_KOLDA,REGION_LOUGA,REGION_MATAM,REGION_SAINT-LOUIS,REGION_SEDHIOU,REGION_TAMBACOUNDA,REGION_THIES,REGION_ZIGUINCHOR,TENURE_D 3-6 month,TENURE_E 6-9 month,TENURE_F 9-12 month,TENURE_G 12-15 month,TENURE_H 15-18 month,TENURE_I 18-21 month,TENURE_J 21-24 month,TENURE_K > 24 month
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,-1,0.0,0.7798,,,,,,,,,,,,0.0,0.0,0.0,0.0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0.0,0.0,2.3979,1.3863,0.6931,0.0,0.0,0.0,0.0,2,-1,0.0,0.6251,,,,,,,,,,,,2.3979,0.0,2.3979,10.0,inf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,7.5501,2.7726,7.7407,6.6425,3.091,6.0283,0.0,2.0794,1.0986,27,1,2.8904,0.1305,,,,,,,,,,,,10.7848,10.2577,9.8924,399.0,1.21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,8.0067,2.3026,7.8648,6.7673,2.7081,5.8081,0.0,3.1781,1.6094,46,12,1.3863,0.0006,7337.4817,14.4445,7382.6817,2460.8959,17.6607,5533.5742,248.5222,122.7052,26.8321,37.8616,11.4234,10.5035,10.2036,9.153,-397.0,0.8677,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,10.3735,3.8712,10.4043,9.3057,3.8712,0.0,4.8598,6.3208,5.6384,61,0,4.1897,0.0001,7337.4817,14.4445,7382.6817,2460.8959,17.6607,5533.5742,248.5222,122.7052,26.8321,37.8616,11.4234,14.2544,14.2236,10.7579,1000.0,1.0312,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [36]:
test = pd.read_csv('Test.csv')

In [37]:
test['CHURN'] = cat.predict_proba(test_data[in_cols])[:,1]
# test_data.head()

In [38]:
test[['user_id','CHURN']].to_csv('cat_2.csv', index=False)