In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, balanced_accuracy_score,roc_auc_score,make_scorer



In [2]:
df = pd.read_excel('/kaggle/input/telco-customer-churn-ibm-dataset/Telco_customer_churn.xlsx')

In [3]:
df.head()

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,67,2701,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,84,5003,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,89,5340,Competitor had better devices


In [4]:
df.shape

(7043, 33)

In [5]:
df.drop(['Churn Score','Churn Label','CLTV','Churn Reason'],axis = 1,inplace = True)
df.head()

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Value
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,1
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,1
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Yes,No,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,1


In [6]:
df.Count.unique()

array([1])

In [7]:
df.Country.unique()

array(['United States'], dtype=object)

In [8]:
df.State.unique()

array(['California'], dtype=object)

In [9]:
df.columns

Index(['CustomerID', 'Count', 'Country', 'State', 'City', 'Zip Code',
       'Lat Long', 'Latitude', 'Longitude', 'Gender', 'Senior Citizen',
       'Partner', 'Dependents', 'Tenure Months', 'Phone Service',
       'Multiple Lines', 'Internet Service', 'Online Security',
       'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV',
       'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method',
       'Monthly Charges', 'Total Charges', 'Churn Value'],
      dtype='object')

In [10]:
df.drop(['Count','Country','State','CustomerID','Lat Long'],inplace = True,axis = 1)
df.head(3)

Unnamed: 0,City,Zip Code,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,...,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Value
0,Los Angeles,90003,33.964131,-118.272783,Male,No,No,No,2,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
1,Los Angeles,90005,34.059281,-118.30742,Female,No,No,Yes,2,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1
2,Los Angeles,90006,34.048013,-118.293953,Female,No,No,Yes,8,Yes,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,1


In [11]:
df['City'].replace(' ','_',regex = True,inplace = True)
df.head(2)

Unnamed: 0,City,Zip Code,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,...,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Value
0,Los_Angeles,90003,33.964131,-118.272783,Male,No,No,No,2,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
1,Los_Angeles,90005,34.059281,-118.30742,Female,No,No,Yes,2,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


**Removing white space from columns** 

In [12]:
df.columns = df.columns.str.replace(' ','_')
df.head()

Unnamed: 0,City,Zip_Code,Latitude,Longitude,Gender,Senior_Citizen,Partner,Dependents,Tenure_Months,Phone_Service,...,Device_Protection,Tech_Support,Streaming_TV,Streaming_Movies,Contract,Paperless_Billing,Payment_Method,Monthly_Charges,Total_Charges,Churn_Value
0,Los_Angeles,90003,33.964131,-118.272783,Male,No,No,No,2,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
1,Los_Angeles,90005,34.059281,-118.30742,Female,No,No,Yes,2,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1
2,Los_Angeles,90006,34.048013,-118.293953,Female,No,No,Yes,8,Yes,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,1
3,Los_Angeles,90010,34.062125,-118.315709,Female,No,Yes,Yes,28,Yes,...,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,1
4,Los_Angeles,90015,34.039224,-118.266293,Male,No,No,Yes,49,Yes,...,Yes,No,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,1


In [13]:
df.isnull().sum().sort_values()

City                 0
Monthly_Charges      0
Payment_Method       0
Paperless_Billing    0
Contract             0
Streaming_Movies     0
Streaming_TV         0
Tech_Support         0
Device_Protection    0
Online_Backup        0
Online_Security      0
Internet_Service     0
Multiple_Lines       0
Phone_Service        0
Tenure_Months        0
Dependents           0
Partner              0
Senior_Citizen       0
Gender               0
Longitude            0
Latitude             0
Zip_Code             0
Total_Charges        0
Churn_Value          0
dtype: int64

One of the best part about xgboost is that, it knowns how to handel the missing data. Just put the missing data value to be zero. It does not matter if the columns already consists of zero as a known value, xgboost will still perform very well.

In [14]:
df.dtypes

City                  object
Zip_Code               int64
Latitude             float64
Longitude            float64
Gender                object
Senior_Citizen        object
Partner               object
Dependents            object
Tenure_Months          int64
Phone_Service         object
Multiple_Lines        object
Internet_Service      object
Online_Security       object
Online_Backup         object
Device_Protection     object
Tech_Support          object
Streaming_TV          object
Streaming_Movies      object
Contract              object
Paperless_Billing     object
Payment_Method        object
Monthly_Charges      float64
Total_Charges         object
Churn_Value            int64
dtype: object

In [15]:
df['Phone_Service'].unique()

array(['Yes', 'No'], dtype=object)

In [16]:
df.Total_Charges.unique()

array([108.15, 151.65, 820.5, ..., 7362.9, 346.45, 6844.5], dtype=object)

In [17]:
#df['Total_Charges'] = pd.to_numeric(df.Total_Charges)

The above code will give us an error with a column number which contains a white space rather a number

In [18]:
df['Total_Charges'][2230:2236]

2230     8003.8
2231    6130.85
2232       1415
2233    6201.95
2234           
2235    6597.25
Name: Total_Charges, dtype: object

In [19]:
len(df.loc[df.Total_Charges == " "])

11

In [20]:
df.loc[df.Total_Charges == " "]

Unnamed: 0,City,Zip_Code,Latitude,Longitude,Gender,Senior_Citizen,Partner,Dependents,Tenure_Months,Phone_Service,...,Device_Protection,Tech_Support,Streaming_TV,Streaming_Movies,Contract,Paperless_Billing,Payment_Method,Monthly_Charges,Total_Charges,Churn_Value
2234,San_Bernardino,92408,34.084909,-117.258107,Female,No,Yes,No,0,No,...,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,0
2438,Independence,93526,36.869584,-118.189241,Male,No,No,No,0,Yes,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,0
2568,San_Mateo,94401,37.590421,-122.306467,Female,No,Yes,No,0,Yes,...,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,0
2667,Cupertino,95014,37.306612,-122.080621,Male,No,Yes,Yes,0,Yes,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,0
2856,Redcrest,95569,40.363446,-123.835041,Female,No,Yes,No,0,No,...,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,0
4331,Los_Angeles,90029,34.089953,-118.294824,Male,No,Yes,Yes,0,Yes,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,0
4687,Sun_City,92585,33.739412,-117.173334,Male,No,Yes,Yes,0,Yes,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,,0
5104,Ben_Lomond,95005,37.078873,-122.090386,Female,No,Yes,Yes,0,Yes,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,0
5719,La_Verne,91750,34.144703,-117.770299,Male,No,Yes,Yes,0,Yes,...,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,0
6772,Bell,90201,33.970343,-118.171368,Female,No,Yes,Yes,0,Yes,...,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,0


You replace the " " with 0 by .replace as well

In [21]:
df.loc[(df.Total_Charges == " "),'Total_Charges'] = 0

In [22]:
df.loc[df.Tenure_Months == 0]

Unnamed: 0,City,Zip_Code,Latitude,Longitude,Gender,Senior_Citizen,Partner,Dependents,Tenure_Months,Phone_Service,...,Device_Protection,Tech_Support,Streaming_TV,Streaming_Movies,Contract,Paperless_Billing,Payment_Method,Monthly_Charges,Total_Charges,Churn_Value
2234,San_Bernardino,92408,34.084909,-117.258107,Female,No,Yes,No,0,No,...,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,0,0
2438,Independence,93526,36.869584,-118.189241,Male,No,No,No,0,Yes,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,0,0
2568,San_Mateo,94401,37.590421,-122.306467,Female,No,Yes,No,0,Yes,...,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,0,0
2667,Cupertino,95014,37.306612,-122.080621,Male,No,Yes,Yes,0,Yes,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,0,0
2856,Redcrest,95569,40.363446,-123.835041,Female,No,Yes,No,0,No,...,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,0,0
4331,Los_Angeles,90029,34.089953,-118.294824,Male,No,Yes,Yes,0,Yes,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,0,0
4687,Sun_City,92585,33.739412,-117.173334,Male,No,Yes,Yes,0,Yes,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,0,0
5104,Ben_Lomond,95005,37.078873,-122.090386,Female,No,Yes,Yes,0,Yes,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,0,0
5719,La_Verne,91750,34.144703,-117.770299,Male,No,Yes,Yes,0,Yes,...,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,0,0
6772,Bell,90201,33.970343,-118.171368,Female,No,Yes,Yes,0,Yes,...,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,0,0


In [23]:
df.Total_Charges = pd.to_numeric(df.Total_Charges)
df.dtypes

City                  object
Zip_Code               int64
Latitude             float64
Longitude            float64
Gender                object
Senior_Citizen        object
Partner               object
Dependents            object
Tenure_Months          int64
Phone_Service         object
Multiple_Lines        object
Internet_Service      object
Online_Security       object
Online_Backup         object
Device_Protection     object
Tech_Support          object
Streaming_TV          object
Streaming_Movies      object
Contract              object
Paperless_Billing     object
Payment_Method        object
Monthly_Charges      float64
Total_Charges        float64
Churn_Value            int64
dtype: object

In [24]:
df.replace(' ','_',regex = True, inplace = True)
df.head(3)

Unnamed: 0,City,Zip_Code,Latitude,Longitude,Gender,Senior_Citizen,Partner,Dependents,Tenure_Months,Phone_Service,...,Device_Protection,Tech_Support,Streaming_TV,Streaming_Movies,Contract,Paperless_Billing,Payment_Method,Monthly_Charges,Total_Charges,Churn_Value
0,Los_Angeles,90003,33.964131,-118.272783,Male,No,No,No,2,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed_check,53.85,108.15,1
1,Los_Angeles,90005,34.059281,-118.30742,Female,No,No,Yes,2,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic_check,70.7,151.65,1
2,Los_Angeles,90006,34.048013,-118.293953,Female,No,No,Yes,8,Yes,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic_check,99.65,820.5,1


In [25]:
X = df.drop('Churn_Value',axis = 1)
y = df['Churn_Value']
X.head()

Unnamed: 0,City,Zip_Code,Latitude,Longitude,Gender,Senior_Citizen,Partner,Dependents,Tenure_Months,Phone_Service,...,Online_Backup,Device_Protection,Tech_Support,Streaming_TV,Streaming_Movies,Contract,Paperless_Billing,Payment_Method,Monthly_Charges,Total_Charges
0,Los_Angeles,90003,33.964131,-118.272783,Male,No,No,No,2,Yes,...,Yes,No,No,No,No,Month-to-month,Yes,Mailed_check,53.85,108.15
1,Los_Angeles,90005,34.059281,-118.30742,Female,No,No,Yes,2,Yes,...,No,No,No,No,No,Month-to-month,Yes,Electronic_check,70.7,151.65
2,Los_Angeles,90006,34.048013,-118.293953,Female,No,No,Yes,8,Yes,...,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic_check,99.65,820.5
3,Los_Angeles,90010,34.062125,-118.315709,Female,No,Yes,Yes,28,Yes,...,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic_check,104.8,3046.05
4,Los_Angeles,90015,34.039224,-118.266293,Male,No,No,Yes,49,Yes,...,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Bank_transfer_(automatic),103.7,5036.3


In [26]:
y.head(2)

0    1
1    1
Name: Churn_Value, dtype: int64

In [27]:
y.value_counts()

Churn_Value
0    5174
1    1869
Name: count, dtype: int64

In [28]:
5174/7043

0.7346301292063041

#### One hot encoding

**There are two most common - ColumnTransfer and pd.get_dummies. ColumnTransfer is complex and works totally differently. But get_dummies is obvious what we use regularly**

In [29]:
df.Payment_Method.nunique()

4

In [30]:
pd.get_dummies(X, columns = ['Payment_Method']).head()

Unnamed: 0,City,Zip_Code,Latitude,Longitude,Gender,Senior_Citizen,Partner,Dependents,Tenure_Months,Phone_Service,...,Streaming_TV,Streaming_Movies,Contract,Paperless_Billing,Monthly_Charges,Total_Charges,Payment_Method_Bank_transfer_(automatic),Payment_Method_Credit_card_(automatic),Payment_Method_Electronic_check,Payment_Method_Mailed_check
0,Los_Angeles,90003,33.964131,-118.272783,Male,No,No,No,2,Yes,...,No,No,Month-to-month,Yes,53.85,108.15,False,False,False,True
1,Los_Angeles,90005,34.059281,-118.30742,Female,No,No,Yes,2,Yes,...,No,No,Month-to-month,Yes,70.7,151.65,False,False,True,False
2,Los_Angeles,90006,34.048013,-118.293953,Female,No,No,Yes,8,Yes,...,Yes,Yes,Month-to-month,Yes,99.65,820.5,False,False,True,False
3,Los_Angeles,90010,34.062125,-118.315709,Female,No,Yes,Yes,28,Yes,...,Yes,Yes,Month-to-month,Yes,104.8,3046.05,False,False,True,False
4,Los_Angeles,90015,34.039224,-118.266293,Male,No,No,Yes,49,Yes,...,Yes,Yes,Month-to-month,Yes,103.7,5036.3,True,False,False,False


In [31]:
X_encoded = pd.get_dummies(X,columns = [
    'City','Gender', 'Senior_Citizen','Partner','Dependents', 'Phone_Service','Multiple_Lines','Internet_Service','Online_Security',
    'Online_Backup','Device_Protection',
    'Tech_Support','Streaming_TV','Streaming_Movies','Contract','Paperless_Billing','Payment_Method'
],dtype = int)

X_encoded.head()

Unnamed: 0,Zip_Code,Latitude,Longitude,Tenure_Months,Monthly_Charges,Total_Charges,City_Acampo,City_Acton,City_Adelanto,City_Adin,...,Streaming_Movies_Yes,Contract_Month-to-month,Contract_One_year,Contract_Two_year,Paperless_Billing_No,Paperless_Billing_Yes,Payment_Method_Bank_transfer_(automatic),Payment_Method_Credit_card_(automatic),Payment_Method_Electronic_check,Payment_Method_Mailed_check
0,90003,33.964131,-118.272783,2,53.85,108.15,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1
1,90005,34.059281,-118.30742,2,70.7,151.65,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
2,90006,34.048013,-118.293953,8,99.65,820.5,0,0,0,0,...,1,1,0,0,0,1,0,0,1,0
3,90010,34.062125,-118.315709,28,104.8,3046.05,0,0,0,0,...,1,1,0,0,0,1,0,0,1,0
4,90015,34.039224,-118.266293,49,103.7,5036.3,0,0,0,0,...,1,1,0,0,0,1,1,0,0,0


In [32]:
X_train,X_test,y_train,y_test = train_test_split(X_encoded,y,random_state = 42,stratify = y)

In [33]:
y_test.value_counts()

Churn_Value
0    1294
1     467
Name: count, dtype: int64

In [34]:
467/(1294+467)

0.26519023282226006

In [35]:
clf_xgb = xgb.XGBClassifier(objective = 'binary:logistic',missing = None, seed = 42,tree_method = 'gpu_hist',
            predictor = 'gpu_predictor')

clf_xgb.fit(X_train,
           y_train,
           verbose = True,
           early_stopping_rounds = 10,
            eval_metric = 'aucpr',
            eval_set = [(X_test,y_test)],
            
           )



[0]	validation_0-aucpr:0.64324
[1]	validation_0-aucpr:0.64650
[2]	validation_0-aucpr:0.65405
[3]	validation_0-aucpr:0.65789
[4]	validation_0-aucpr:0.66177
[5]	validation_0-aucpr:0.65740
[6]	validation_0-aucpr:0.66141
[7]	validation_0-aucpr:0.66468
[8]	validation_0-aucpr:0.66500
[9]	validation_0-aucpr:0.66723
[10]	validation_0-aucpr:0.66610
[11]	validation_0-aucpr:0.66769
[12]	validation_0-aucpr:0.66871
[13]	validation_0-aucpr:0.67203
[14]	validation_0-aucpr:0.67213
[15]	validation_0-aucpr:0.67335
[16]	validation_0-aucpr:0.67002
[17]	validation_0-aucpr:0.66828
[18]	validation_0-aucpr:0.66878
[19]	validation_0-aucpr:0.66653
[20]	validation_0-aucpr:0.66719
[21]	validation_0-aucpr:0.66455
[22]	validation_0-aucpr:0.66363
[23]	validation_0-aucpr:0.66136
[24]	validation_0-aucpr:0.66261


Practically, if we just predict **employee does not left** as our output for all. The predictions gives us near to 70% accurracy. Therefore, evaluation metrics like roc auc are efficient in detecting this non practical behaviour.

Moreover, we want to train our model in a way we can punish the model more if it does not predict the minority correctly.

parameters - maxdepth, learning rate , gamma , reg_lambda,

In [36]:
import warnings
warnings.filterwarnings("ignore", message="A NumPy version.*SciPy", category=UserWarning)


#Round 1
param_grid = {
    'max_depth':[3,4,5],
    'learning_rate':[0.1,0.01,0.05],
    'gamma':[0,0.25,1.0],
    'reg_lambda':[0,1,10],
    'scale_pos_weight':[1,3,5]
}
#Round 2
# param_grid = {
#     'max_depth':[4],
#     'learning_rate':[0.1,0.5,1],
#     'gamma':[0.25],
#     'reg_lambda':[10,20,100],
#     'scale_pos_weight':[3]
# }

optimal_params = GridSearchCV(
    estimator = xgb.XGBClassifier(objective = 'binary:logistic',
                                  seed = 42, 
                                  subsample = 0.9,
                                  colsample_bytree=0.5,
                                 tree_method = 'gpu_hist',
                                 predictor = 'gpu_predictor'),
    param_grid = param_grid,
    scoring = 'roc_auc',
    verbose = 2,#set it to 0, if you do not want to see
    n_jobs = 10,
    cv = 3
)

optimal_params.fit(X_train,
                  y_train,
                  early_stopping_rounds = 10,
                  eval_metric = 'auc',
                  eval_set = [(X_test,y_test)],
                  verbose = False,
                    )
optimal_params.best_params_

Fitting 3 folds for each of 243 candidates, totalling 729 fits




[CV] END gamma=0, learning_rate=0.1, max_depth=3, reg_lambda=0, scale_pos_weight=5; total time=   8.1s
[CV] END gamma=0, learning_rate=0.1, max_depth=3, reg_lambda=1, scale_pos_weight=3; total time=   6.9s
[CV] END gamma=0, learning_rate=0.1, max_depth=3, reg_lambda=10, scale_pos_weight=3; total time=   7.6s
[CV] END gamma=0, learning_rate=0.1, max_depth=4, reg_lambda=0, scale_pos_weight=5; total time=   5.9s
[CV] END gamma=0, learning_rate=0.1, max_depth=4, reg_lambda=1, scale_pos_weight=5; total time=   6.4s
[CV] END gamma=0, learning_rate=0.1, max_depth=4, reg_lambda=10, scale_pos_weight=5; total time=   6.5s
[CV] END gamma=0, learning_rate=0.1, max_depth=5, reg_lambda=1, scale_pos_weight=1; total time=   6.2s
[CV] END gamma=0, learning_rate=0.1, max_depth=5, reg_lambda=10, scale_pos_weight=1; total time=   7.3s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, reg_lambda=0, scale_pos_weight=1; total time=   6.5s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, reg_lambda=1, scal



[CV] END gamma=0, learning_rate=0.1, max_depth=3, reg_lambda=0, scale_pos_weight=3; total time=   8.2s
[CV] END gamma=0, learning_rate=0.1, max_depth=3, reg_lambda=1, scale_pos_weight=3; total time=   8.1s
[CV] END gamma=0, learning_rate=0.1, max_depth=4, reg_lambda=0, scale_pos_weight=1; total time=   6.2s
[CV] END gamma=0, learning_rate=0.1, max_depth=4, reg_lambda=0, scale_pos_weight=3; total time=   6.2s
[CV] END gamma=0, learning_rate=0.1, max_depth=4, reg_lambda=1, scale_pos_weight=5; total time=   7.0s
[CV] END gamma=0, learning_rate=0.1, max_depth=5, reg_lambda=0, scale_pos_weight=1; total time=   6.0s
[CV] END gamma=0, learning_rate=0.1, max_depth=5, reg_lambda=1, scale_pos_weight=1; total time=   5.9s
[CV] END gamma=0, learning_rate=0.1, max_depth=5, reg_lambda=10, scale_pos_weight=1; total time=   8.2s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, reg_lambda=0, scale_pos_weight=5; total time=   6.9s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, reg_lambda=1, scale_



[CV] END gamma=0, learning_rate=0.1, max_depth=3, reg_lambda=1, scale_pos_weight=1; total time=   9.9s
[CV] END gamma=0, learning_rate=0.1, max_depth=4, reg_lambda=0, scale_pos_weight=1; total time=   6.4s
[CV] END gamma=0, learning_rate=0.1, max_depth=4, reg_lambda=1, scale_pos_weight=1; total time=   6.6s
[CV] END gamma=0, learning_rate=0.1, max_depth=4, reg_lambda=10, scale_pos_weight=3; total time=   7.1s
[CV] END gamma=0, learning_rate=0.1, max_depth=5, reg_lambda=0, scale_pos_weight=3; total time=   5.3s
[CV] END gamma=0, learning_rate=0.1, max_depth=5, reg_lambda=1, scale_pos_weight=3; total time=   5.7s
[CV] END gamma=0, learning_rate=0.1, max_depth=5, reg_lambda=10, scale_pos_weight=5; total time=   6.5s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, reg_lambda=0, scale_pos_weight=5; total time=   6.0s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, reg_lambda=1, scale_pos_weight=5; total time=   5.3s
[CV] END gamma=0, learning_rate=0.01, max_depth=3, reg_lambda=10, sca



{'gamma': 1.0,
 'learning_rate': 0.05,
 'max_depth': 4,
 'reg_lambda': 10,
 'scale_pos_weight': 3}

Using the optimized parameters

In [37]:
clf_xgb = xgb.XGBClassifier(seed = 42,
                           objective = 'binary:logistic',
                           gamma = 0.25,
                           learn_rate = 0.1,
                           max_depth = 4,
                           reg_lambda = 10,
                           scale_pos_weight = 3,
                           sub_sample = 0.9,
                           colsample_bytree = 0.5,
                           tree_method = 'gpu_hist',
                           predictor = 'gpu_predictor'
                           )

clf_xgb.fit(X_train,
           y_train,
           verbose = True,
           eval_metric = 'aucpr',
           early_stopping_rounds = 10,
           eval_set = [(X_test,y_test)],
           )



Parameters: { "learn_rate", "sub_sample" } are not used.

[0]	validation_0-aucpr:0.51256
[1]	validation_0-aucpr:0.61283
[2]	validation_0-aucpr:0.61501
[3]	validation_0-aucpr:0.62293
[4]	validation_0-aucpr:0.62685
[5]	validation_0-aucpr:0.63129
[6]	validation_0-aucpr:0.65078
[7]	validation_0-aucpr:0.64883
[8]	validation_0-aucpr:0.64095
[9]	validation_0-aucpr:0.64794
[10]	validation_0-aucpr:0.65562
[11]	validation_0-aucpr:0.65556
[12]	validation_0-aucpr:0.65885
[13]	validation_0-aucpr:0.65940
[14]	validation_0-aucpr:0.66027
[15]	validation_0-aucpr:0.65963
[16]	validation_0-aucpr:0.65939
[17]	validation_0-aucpr:0.66099
[18]	validation_0-aucpr:0.65962
[19]	validation_0-aucpr:0.66261
[20]	validation_0-aucpr:0.66413
[21]	validation_0-aucpr:0.66453
[22]	validation_0-aucpr:0.66506
[23]	validation_0-aucpr:0.66540
[24]	validation_0-aucpr:0.66578
[25]	validation_0-aucpr:0.66566
[26]	validation_0-aucpr:0.66479
[27]	validation_0-aucpr:0.66422
[28]	validation_0-aucpr:0.66504
[29]	validation_0-aucpr: