In [2]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import ensemble

# Load the churn-score input table: ';'-separated, first row is the header,
# first column is used as the index.
data = pd.read_csv(
    "Churnscore Inputdata F-IT.csv",
    header=0,
    index_col=0,
    sep=";",
)

# Every column that was not parsed as int64 is normalised to float64: values are
# rendered as text, a ',' decimal separator is swapped for '.', and the text is
# cast to a number. The work is done per column with the vectorised string
# accessor instead of one Python-level regex call per cell.
# A column containing text that is not a number still raises ValueError on the
# final cast.
for col in data.columns:
    if data[col].dtype != 'int64':
        data[col] = (
            data[col]
            .astype(str)
            .str.replace(",", ".", regex=False)
            .astype(np.float64)
        )
            
# Build one modelling frame per customer segment.
#
# Columns removed from both frames:
#   * every column whose name matches the regex "salesorg*"
#   * 'prediction' and 'prediction_01'
# The labels are passed to `drop` explicitly via `.columns` rather than handing
# over the filtered DataFrame itself and relying on it being iterated as labels.
# NOTE(review): "salesorg*" means "salesor" followed by zero or more "g", found
# anywhere in the name - confirm whether a plain "salesorg" match was intended.
excluded_cols = list(data.filter(regex="salesorg*").columns) + ['prediction', 'prediction_01']

# Rows with segment_24 == 1 and rows with segment_24 > 1 respectively; rows that
# satisfy neither condition end up in neither frame. `drop` returns a new frame,
# so `data` itself is left unchanged.
df_equal_1 = data[data.segment_24 == 1].drop(columns=excluded_cols)
df_over_1 = data[data.segment_24 > 1].drop(columns=excluded_cols)

# Name of the label column that is split off from the features.
target = 'churn_36'

# Segment "equal 1": label vector and feature matrix.
y_equal_1 = df_equal_1[target]
x_equal_1 = df_equal_1.drop(columns=target)

# Segment "over 1": label vector and feature matrix.
y_over_1 = df_over_1[target]
x_over_1 = df_over_1.drop(columns=target)

# Fraction of each segment held out for testing.
testing_size = 0.3

# Both splits use the same fixed seed and are stratified on the label, so the
# class proportions of each segment are kept in its train and test parts.
(
    x_train_equal_1,
    x_test_equal_1,
    y_train_equal_1,
    y_test_equal_1,
) = train_test_split(
    x_equal_1,
    y_equal_1,
    test_size=testing_size,
    random_state=42,
    stratify=y_equal_1,
)

(
    x_train_over_1,
    x_test_over_1,
    y_train_over_1,
    y_test_over_1,
) = train_test_split(
    x_over_1,
    y_over_1,
    test_size=testing_size,
    random_state=42,
    stratify=y_over_1,
)

# Base gradient-boosted tree classifier; its hyper-parameters are supplied
# through the search grid below.
GBT = ensemble.GradientBoostingClassifier()

# Search grid with a single candidate per hyper-parameter
# (library defaults: n_estimators=100, max_depth=3, learning_rate=0.1).
# max_features is not part of the grid, so it keeps its default.
params = dict(
    n_estimators=[200],
    max_depth=[8],
    learning_rate=[0.5],
    random_state=[42],
)

# One 3-fold cross-validated search per segment, run on all available cores.
# For a classifier, an integer `cv` is documented to use stratified folds.
# `fit` returns the search object itself, so each name holds the fitted search.
GBT_model_equal_1 = GridSearchCV(estimator=GBT, param_grid=params, cv=3, n_jobs=-1)
GBT_model_equal_1.fit(x_train_equal_1, y_train_equal_1)

GBT_model_over_1 = GridSearchCV(estimator=GBT, param_grid=params, cv=3, n_jobs=-1)
GBT_model_over_1.fit(x_train_over_1, y_train_over_1)

# Score every row of each segment frame with that segment's refitted best
# estimator and store the results as two new columns on the frame:
#   prediction_grmk  - predicted class label
#   probability_grmk - probability of the second class (column 1), 3 decimals
# Note that the whole segment frame is scored, i.e. the rows the model was
# trained on as well as the held-out rows.
for frame, search, feature_cols in (
    (df_equal_1, GBT_model_equal_1, x_test_equal_1.columns),
    (df_over_1, GBT_model_over_1, x_test_over_1.columns),
):
    fitted = search.best_estimator_
    # Select exactly the training feature columns, so the label column and the
    # columns added below never reach the model.
    features = frame[feature_cols]
    frame['prediction_grmk'] = fitted.predict(features)
    frame['probability_grmk'] = fitted.predict_proba(features)[:, 1].round(3)

# Stack the two scored segment frames back into one table, keeping the original
# index labels. `DataFrame.append` was deprecated in pandas 1.4 and removed in
# pandas 2.0; `pd.concat` is the supported replacement and gives the same result.
final_df = pd.concat([df_over_1, df_equal_1], ignore_index=False)

# Bring 'prediction' and 'prediction_01' back in from `data`, aligned on the
# index labels (inner join: only labels present on both sides are kept).
final_df = pd.merge(
    final_df,
    data[['prediction', 'prediction_01']],
    left_index=True,
    right_index=True,
)

# Order the rows by index label.
final_df = final_df.sort_index(axis=0)

Unnamed: 0_level_0,month_n,cust_age,sales_net_eur,sales_net_eur_1,sales_net_eur_2,sales_net_eur_3,sales_net_eur_4,sales_net_eur_5,sales_net_eur_6,sales_net_eur_7,...,segment_24,rev_0_count,rev_not_0_count,turn_behav_counter,trend_counter,churn_36,prediction_grmk,probability_grmk,prediction,prediction_01
rowid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,-1,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,1.0,16,0,5,5,1,1,0.999,0.97725,1
2,1,0,269.82000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,3.0,0,1,1,1,0,0,0.001,0.98967,1
3,1,0,1070.93000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,3.0,0,1,1,1,0,0,0.002,0.17243,0
4,1,0,2880.20997,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,3.0,0,1,1,1,0,0,0.110,0.20407,0
5,1,0,20513.62990,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,3.0,0,1,1,1,0,0,0.000,-0.09169,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76177,12,18,81158.16998,104911.52020,89924.49000,96110.94985,48458.69981,89589.54989,71212.01022,85084.11019,...,1.0,0,84,3,24,0,0,0.000,0.01743,0
76178,12,18,91897.74033,61118.41970,76370.54998,161636.29060,61500.40976,62949.99064,61764.53988,92632.16951,...,1.0,0,72,5,7,0,0,0.000,0.01308,0
76179,12,18,127969.28920,120752.35170,186322.50810,141822.72860,85806.98045,127172.55940,176443.86780,134749.03030,...,1.0,0,84,5,9,0,0,0.000,0.00697,0
76180,12,18,135665.35110,58194.08967,68786.86004,40272.55018,29789.53977,39347.63986,21517.90005,33902.39991,...,1.0,0,84,1,4,0,0,0.000,0.00548,0
