## Goal: The owner of an online shop wants to increase revenues. In particular, the owner is interested in making accurate predictions on whether her online customer will end up purchasing products.

In [1]:
import numpy as np 
import pandas as pd 
import io
from scipy import stats
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.metrics import roc_auc_score

In [2]:
train_df = pd.read_csv("online_shop_train.csv")
test_df = pd.read_csv("online_shop_test_final.csv")

### First, let's take a look at how the data looks like.

In [3]:
train_df.head()

Unnamed: 0,ID,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,3935,1.0,50.333333,0.0,0.0,21.0,586.189286,0.0,0.005556,0.0,0.6,May,1,1,1,2,Returning_Visitor,0,0
1,7866,6.0,299.0,1.0,41.0,36.0,696.402778,0.0,0.002632,0.0,0.0,May,2,2,2,4,New_Visitor,1,0
2,3727,0.0,0.0,0.0,0.0,19.0,620.033333,0.0,0.007895,0.0,0.0,Feb,1,1,4,2,Returning_Visitor,0,0
3,4454,0.0,0.0,0.0,0.0,4.0,46.25,0.05,0.066667,0.0,0.0,Nov,3,2,6,13,Returning_Visitor,0,0
4,6186,0.0,0.0,0.0,0.0,24.0,671.088653,0.026667,0.036566,0.0,0.0,Nov,1,1,3,2,Returning_Visitor,0,0


In [4]:
test_df.head()

Unnamed: 0,ID,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,2762,2.0,341.6,0.0,0.0,7.0,177.9,0.047619,0.062857,0.0,0.0,Aug,3,2,1,1,Returning_Visitor,0
1,308,0.0,0.0,0.0,0.0,8.0,335.0,0.025,0.05,63.891,0.0,May,3,2,4,1,Returning_Visitor,0
2,1018,2.0,32.0,0.0,0.0,16.0,230.1,0.0,0.0125,0.0,0.0,May,2,2,2,1,Returning_Visitor,1
3,426,0.0,0.0,0.0,0.0,4.0,36.0,0.1,0.15,0.0,0.6,May,2,5,1,3,Returning_Visitor,0
4,2754,0.0,0.0,0.0,0.0,37.0,1229.766667,0.005405,0.014414,0.0,0.0,Mar,2,2,1,3,Returning_Visitor,1


### Check missing values.

In [5]:
train_df.info()
train_df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8630 entries, 0 to 8629
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       8630 non-null   int64  
 1   Administrative           8622 non-null   float64
 2   Administrative_Duration  8622 non-null   float64
 3   Informational            8622 non-null   float64
 4   Informational_Duration   8622 non-null   float64
 5   ProductRelated           8622 non-null   float64
 6   ProductRelated_Duration  8622 non-null   float64
 7   BounceRates              8622 non-null   float64
 8   ExitRates                8622 non-null   float64
 9   PageValues               8630 non-null   float64
 10  SpecialDay               8630 non-null   float64
 11  Month                    8630 non-null   object 
 12  OperatingSystems         8630 non-null   int64  
 13  Browser                  8630 non-null   int64  
 14  Region                  

(8630, 19)

In [6]:
test_df.info()
test_df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3700 entries, 0 to 3699
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       3700 non-null   int64  
 1   Administrative           3694 non-null   float64
 2   Administrative_Duration  3694 non-null   float64
 3   Informational            3694 non-null   float64
 4   Informational_Duration   3694 non-null   float64
 5   ProductRelated           3694 non-null   float64
 6   ProductRelated_Duration  3694 non-null   float64
 7   BounceRates              3694 non-null   float64
 8   ExitRates                3694 non-null   float64
 9   PageValues               3700 non-null   float64
 10  SpecialDay               3700 non-null   float64
 11  Month                    3700 non-null   object 
 12  OperatingSystems         3700 non-null   int64  
 13  Browser                  3700 non-null   int64  
 14  Region                  

(3700, 18)

In [7]:
train_df.isnull().any(axis = 1).sum()

8

In [8]:
test_df.isnull().any(axis = 1).sum()

6

### train_df has 8 rows with missing values and test_df has 6 rows with missing values.

### Now, It would be good to know how many customers generate revenue out of total 8630 customers in train_df.

In [9]:
 train_w_revenue_df = train_df[train_df.Revenue == 1]
 train_w_revenue_df

Unnamed: 0,ID,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
10,4641,3.0,224.300000,5.0,487.00,74.0,1845.958333,0.002532,0.017679,34.678415,0.0,Nov,2,2,1,10,Returning_Visitor,1,1
13,4443,11.0,631.416667,5.0,1037.15,501.0,21672.244250,0.003965,0.014292,9.131387,0.0,Nov,2,2,1,2,Returning_Visitor,0,1
14,6435,4.0,218.966667,0.0,0.00,12.0,672.700000,0.000000,0.004444,83.101143,0.0,Nov,2,2,3,2,Returning_Visitor,1,1
15,5828,2.0,69.000000,2.0,92.00,37.0,796.957143,0.000000,0.024274,3.978276,0.0,Mar,2,2,7,2,Returning_Visitor,0,1
20,10988,1.0,29.200000,1.0,322.40,10.0,126.933333,0.014286,0.036905,13.594200,0.0,Sep,2,2,1,2,Returning_Visitor,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8607,5023,0.0,0.000000,0.0,0.00,31.0,1144.972222,0.006667,0.008333,77.041998,0.0,Dec,2,2,7,2,New_Visitor,0,1
8609,4628,7.0,421.250000,0.0,0.00,59.0,1945.896667,0.012240,0.010579,28.529190,0.0,June,2,6,1,20,Returning_Visitor,0,1
8613,4852,0.0,0.000000,0.0,0.00,50.0,1803.768849,0.001361,0.017428,3.609399,0.0,Mar,2,4,7,3,Returning_Visitor,1,1
8623,6453,3.0,275.000000,1.0,29.50,30.0,1107.047619,0.000000,0.017761,35.981066,0.0,Mar,3,2,3,1,Returning_Visitor,1,1


### Only 1335 customers generate revenue. Thus, the data is highly imbalanced. 

In [10]:
train_w_revenue_df.isnull().sum()

ID                         0
Administrative             0
Administrative_Duration    0
Informational              0
Informational_Duration     0
ProductRelated             0
ProductRelated_Duration    0
BounceRates                0
ExitRates                  0
PageValues                 0
SpecialDay                 0
Month                      0
OperatingSystems           0
Browser                    0
Region                     0
TrafficType                0
VisitorType                0
Weekend                    0
Revenue                    0
dtype: int64

### Fortunately, there is no missing values with customers that generate revenue. 

### Before we move on, check 8 rows with missing values in train_df to see if there is any noticeable pattern.

In [11]:
null_data = train_df[train_df.isnull().any(axis=1)]
null_data

Unnamed: 0,ID,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
502,3934,,,,,,,,,0.0,0.0,Mar,2,2,1,2,Returning_Visitor,0,0
1597,7032,,,,,,,,,0.0,0.0,Mar,2,2,3,1,Returning_Visitor,0,0
1974,5695,,,,,,,,,0.0,0.0,Mar,1,1,1,2,Returning_Visitor,0,0
3801,8387,,,,,,,,,0.0,0.0,Mar,3,2,1,1,Returning_Visitor,0,0
3833,7446,,,,,,,,,0.0,0.0,Mar,2,4,5,1,Returning_Visitor,0,0
4270,5563,,,,,,,,,0.0,0.0,Mar,2,2,1,1,Returning_Visitor,1,0
6665,12293,,,,,,,,,0.0,0.0,Mar,2,2,2,1,Returning_Visitor,0,0
8252,10419,,,,,,,,,0.0,0.0,Mar,2,2,1,2,Returning_Visitor,0,0


### Now, create dummy variables for categorical variables in train_df and test_df.

### Create dummies for VisitorType.

In [12]:
train_df = pd.get_dummies(train_df, columns=['VisitorType'])

In [13]:
train_df

Unnamed: 0,ID,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,...,Month,OperatingSystems,Browser,Region,TrafficType,Weekend,Revenue,VisitorType_New_Visitor,VisitorType_Other,VisitorType_Returning_Visitor
0,3935,1.0,50.333333,0.0,0.00,21.0,586.189286,0.000000,0.005556,0.0,...,May,1,1,1,2,0,0,0,0,1
1,7866,6.0,299.000000,1.0,41.00,36.0,696.402778,0.000000,0.002632,0.0,...,May,2,2,2,4,1,0,1,0,0
2,3727,0.0,0.000000,0.0,0.00,19.0,620.033333,0.000000,0.007895,0.0,...,Feb,1,1,4,2,0,0,0,0,1
3,4454,0.0,0.000000,0.0,0.00,4.0,46.250000,0.050000,0.066667,0.0,...,Nov,3,2,6,13,0,0,0,0,1
4,6186,0.0,0.000000,0.0,0.00,24.0,671.088653,0.026667,0.036566,0.0,...,Nov,1,1,3,2,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8625,5935,0.0,0.000000,0.0,0.00,29.0,873.750000,0.000000,0.031034,0.0,...,Nov,2,2,5,2,1,1,0,0,1
8626,7455,10.0,198.243056,3.0,276.25,151.0,4344.919986,0.002532,0.011966,0.0,...,Dec,4,2,4,2,0,0,0,0,1
8627,7853,0.0,0.000000,0.0,0.00,7.0,431.533333,0.000000,0.007143,0.0,...,Oct,2,2,1,2,0,0,0,0,1
8628,7355,2.0,16.000000,0.0,0.00,16.0,381.686508,0.011765,0.046569,0.0,...,Feb,2,4,2,1,0,0,0,0,1


### Create dummies for Month.

In [14]:
train_df.Month.unique()

array(['May', 'Feb', 'Nov', 'Oct', 'Dec', 'Sep', 'Aug', 'Mar', 'Jul',
       'June'], dtype=object)

In [15]:
train_df = pd.get_dummies(train_df, columns=['Month'])
train_df

Unnamed: 0,ID,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,...,Month_Aug,Month_Dec,Month_Feb,Month_Jul,Month_June,Month_Mar,Month_May,Month_Nov,Month_Oct,Month_Sep
0,3935,1.0,50.333333,0.0,0.00,21.0,586.189286,0.000000,0.005556,0.0,...,0,0,0,0,0,0,1,0,0,0
1,7866,6.0,299.000000,1.0,41.00,36.0,696.402778,0.000000,0.002632,0.0,...,0,0,0,0,0,0,1,0,0,0
2,3727,0.0,0.000000,0.0,0.00,19.0,620.033333,0.000000,0.007895,0.0,...,0,0,1,0,0,0,0,0,0,0
3,4454,0.0,0.000000,0.0,0.00,4.0,46.250000,0.050000,0.066667,0.0,...,0,0,0,0,0,0,0,1,0,0
4,6186,0.0,0.000000,0.0,0.00,24.0,671.088653,0.026667,0.036566,0.0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8625,5935,0.0,0.000000,0.0,0.00,29.0,873.750000,0.000000,0.031034,0.0,...,0,0,0,0,0,0,0,1,0,0
8626,7455,10.0,198.243056,3.0,276.25,151.0,4344.919986,0.002532,0.011966,0.0,...,0,1,0,0,0,0,0,0,0,0
8627,7853,0.0,0.000000,0.0,0.00,7.0,431.533333,0.000000,0.007143,0.0,...,0,0,0,0,0,0,0,0,1,0
8628,7355,2.0,16.000000,0.0,0.00,16.0,381.686508,0.011765,0.046569,0.0,...,0,0,1,0,0,0,0,0,0,0


### Let's do the same thing for test_df.

In [16]:
test_df = pd.get_dummies(test_df, columns=['VisitorType'])

In [17]:
test_df = pd.get_dummies(test_df, columns=['Month'])
test_df

Unnamed: 0,ID,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,...,Month_Aug,Month_Dec,Month_Feb,Month_Jul,Month_June,Month_Mar,Month_May,Month_Nov,Month_Oct,Month_Sep
0,2762,2.0,341.60,0.0,0.000000,7.0,177.900000,0.047619,0.062857,0.000,...,1,0,0,0,0,0,0,0,0,0
1,308,0.0,0.00,0.0,0.000000,8.0,335.000000,0.025000,0.050000,63.891,...,0,0,0,0,0,0,1,0,0,0
2,1018,2.0,32.00,0.0,0.000000,16.0,230.100000,0.000000,0.012500,0.000,...,0,0,0,0,0,0,1,0,0,0
3,426,0.0,0.00,0.0,0.000000,4.0,36.000000,0.100000,0.150000,0.000,...,0,0,0,0,0,0,1,0,0,0
4,2754,0.0,0.00,0.0,0.000000,37.0,1229.766667,0.005405,0.014414,0.000,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3695,1199,0.0,0.00,0.0,0.000000,3.0,42.000000,0.000000,0.066667,0.000,...,0,0,0,0,0,1,0,0,0,0
3696,1036,0.0,0.00,0.0,0.000000,2.0,0.000000,0.200000,0.200000,0.000,...,0,0,0,0,0,0,1,0,0,0
3697,14,4.0,104.50,4.0,261.833333,13.0,214.692208,0.033333,0.028760,0.000,...,0,0,0,0,0,0,1,0,0,0
3698,3396,4.0,52.25,0.0,0.000000,3.0,28.000000,0.000000,0.025000,0.000,...,0,0,0,0,0,0,0,1,0,0


### Since missing values represent less than 0.1% in train_df, let's fill missing values with 0.

In [18]:
train_df['Administrative'] = train_df['Administrative'].fillna(0)
train_df['Administrative_Duration'] = train_df['Administrative_Duration'].fillna(0)
train_df['Informational'] = train_df['Informational'].fillna(0)
train_df['Informational_Duration'] = train_df['Informational_Duration'].fillna(0)
train_df['ProductRelated'] = train_df['ProductRelated'].fillna(0)
train_df['ProductRelated_Duration'] = train_df['ProductRelated_Duration'].fillna(0)
train_df['BounceRates'] = train_df['BounceRates'].fillna(0)
train_df['ExitRates'] = train_df['ExitRates'].fillna(0)

In [19]:
train_df.isnull().sum()

ID                               0
Administrative                   0
Administrative_Duration          0
Informational                    0
Informational_Duration           0
ProductRelated                   0
ProductRelated_Duration          0
BounceRates                      0
ExitRates                        0
PageValues                       0
SpecialDay                       0
OperatingSystems                 0
Browser                          0
Region                           0
TrafficType                      0
Weekend                          0
Revenue                          0
VisitorType_New_Visitor          0
VisitorType_Other                0
VisitorType_Returning_Visitor    0
Month_Aug                        0
Month_Dec                        0
Month_Feb                        0
Month_Jul                        0
Month_June                       0
Month_Mar                        0
Month_May                        0
Month_Nov                        0
Month_Oct           

### Remove ID and Revenue from X DataFrame before we train models.

In [20]:
# delete ID
feature_names = train_df.columns.values[1:]

# delete revenue
revenue_index = np.argwhere(feature_names == 'Revenue')
feature_names = np.delete(feature_names, revenue_index)
feature_names

array(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated',
       'ProductRelated_Duration', 'BounceRates', 'ExitRates',
       'PageValues', 'SpecialDay', 'OperatingSystems', 'Browser',
       'Region', 'TrafficType', 'Weekend', 'VisitorType_New_Visitor',
       'VisitorType_Other', 'VisitorType_Returning_Visitor', 'Month_Aug',
       'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June', 'Month_Mar',
       'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep'], dtype=object)

In [21]:
X = train_df[feature_names]

In [22]:
Y = train_df[['Revenue']]

### Now, let's create XGBoost model.

In [60]:
clf_xgb = XGBClassifier(objective = 'binary:logistic')

param_dist = {'n_estimators': [100],
              'subsample': [0.7,0.8,0.9],
              'max_depth': [3, 4, 6, 8],
              'colsample_bytree': [0.7,0.8,0.9],
              'min_child_weight': [1,2,3,4],
              'gamma': [1,3,5],
              'eta': [0.0001,0.001,0.1]
             }


clf = RandomizedSearchCV(clf_xgb, param_distributions = param_dist, n_iter = 100, scoring = 'roc_auc', error_score = 0, verbose = 3, n_jobs = -1)

numFolds = 10
folds = KFold(n_splits = numFolds, shuffle = True)
print(folds)

estimators = []
results = np.zeros(len(X))
score = 0.0
for train_index, val_index in folds.split(X):
    print("new fold")
    X_train, X_val = X.iloc[train_index,:], X.iloc[val_index,:]
    y_train, y_val = Y.iloc[train_index].values.ravel(), Y.iloc[val_index].values.ravel()
    print("fitting")
    clf.fit(X_train, y_train)

    estimators.append(clf.best_estimator_)
    print(clf.best_estimator_)
    results[val_index] = clf.predict(X_val)
    auc_score = roc_auc_score(y_val, results[val_index])
    score += auc_score
    print("score",auc_score)
score /= numFolds

KFold(n_splits=10, random_state=None, shuffle=True)
new fold
fitting
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  6.7min finished


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9, eta=0.001, gamma=7,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=3, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.9, verbosity=1)
score 0.8055567931211156
new fold
fitting
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  6.2min finished


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eta=0.001, gamma=7,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=2, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.9, verbosity=1)
score 0.7740644668315537
new fold
fitting
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   28.8s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  6.5min finished


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, eta=0.0001, gamma=7,
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=3, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
score 0.7526036981509245
new fold
fitting
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  6.3min finished


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eta=0.001, gamma=7,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=2, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.9, verbosity=1)
score 0.806683562208994
new fold
fitting
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  6.4min finished


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eta=0.0001, gamma=7,
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=3, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.9, verbosity=1)
score 0.7725259733445273
new fold
fitting
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   21.8s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  6.3min finished


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eta=0.0001, gamma=5,
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=3, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
score 0.783874018455895
new fold
fitting
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  6.3min finished


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eta=0.001, gamma=7,
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=4, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.9, verbosity=1)
score 0.7608822236871017
new fold
fitting
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  6.2min finished


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, eta=0.0001, gamma=7,
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=2, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
score 0.7857044062438253
new fold
fitting
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  6.4min finished


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.001, gamma=7,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=4, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
score 0.7692435301924353
new fold
fitting
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  6.5min finished


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eta=0.0001, gamma=7,
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=3, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
score 0.779612176721754


### Let's pick the best model with the highest AUC.

In [61]:
model = estimators[3]

In [62]:
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eta=0.001, gamma=7,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=2, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.9, verbosity=1)

### Now, make predictions with test_df (test set data).

In [63]:
feature_names = list(X.columns)
pred = model.predict(test_df[feature_names])

In [65]:
unique, counts = np.unique(pred, return_counts=True)
dict(zip(unique, counts))

{0: 3216, 1: 484}

In [66]:
pred = model.predict_proba(test_df[feature_names])[:,1]

In [68]:
submission = pd.DataFrame({'ID':test_df['ID'],'Revenue':pred})

submission.head(50)

Unnamed: 0,ID,Revenue
0,2762,0.02763
1,308,0.700384
2,1018,0.005495
3,426,0.00186
4,2754,0.00798
5,1663,0.002747
6,3473,0.002809
7,2060,0.023263
8,2445,0.002794
9,882,0.010118


In [69]:
filename = 'Predictions24.csv'

submission.to_csv(filename,index=False)


print('Saved file: ' + filename)

Saved file: Predictions24.csv
