# XGBoost Gender Classification using Manual Fucntion

### Imports

In [150]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

### Load Data

In [151]:
df = pd.read_csv('gender_classification_v7.csv')

In [152]:
df.head()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,Male
0,1,11.8,6.1,1,0,1,1,1
1,0,14.0,5.4,0,0,1,0,0
2,0,11.8,6.3,1,1,1,1,1
3,0,14.4,6.1,0,1,1,1,1
4,1,13.5,5.9,0,0,0,0,0


### Split Data

In [153]:
X = df.drop('Male', axis=1)
y = df['Male']

In [154]:
from sklearn.model_selection import train_test_split

In [155]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

### Try XGBoost with Default Parameters

In [156]:
from xgboost import XGBClassifier
XGBClassifier()

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)

In [157]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
train_pred = xgb_model.predict(X_train) #To check the training accuracy

In [158]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print('Train Accuracy score is:')
print(accuracy_score(y_train, train_pred))
print('---------------------------------')
print('Test Accuracy score is:')
print(accuracy_score(y_test, xgb_pred))
print('---------------------------------')
print('Confusion matrix:')
print(confusion_matrix(y_test, xgb_pred))
print('---------------------------------')
print('Classification Report:')
print(classification_report(y_test, xgb_pred))

Train Accuracy score is:
0.9988571428571429
---------------------------------
Test Accuracy score is:
0.9600266489007329
---------------------------------
Confusion matrix:
[[711  28]
 [ 32 730]]
---------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       739
           1       0.96      0.96      0.96       762

    accuracy                           0.96      1501
   macro avg       0.96      0.96      0.96      1501
weighted avg       0.96      0.96      0.96      1501



## Random Search Tuning

### Random search for best parameters

In [159]:
params = {
    'base_score':[0.4,0.41,0.42,0.43,0.44,0.45,0.46,0.47,0.48,0.49,0.5,0.51,0.52,0.53,0.54,0.55,0.56,0.57,0.58,0.59,0.6],
    'max_depth':[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],
    'subsample':[0.35,0.36,0.37,0.38,0.39,0.40,0.41,0.42,0.43,0.44,0.45,0.46,0.47,0.48,0.49,0.5],
    'n_estimators': [20,40,60,80,100,120,140,160,180,200,220,240,260,280,300],
    'learning_rate':  [0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.7,0.8,0.9,1],
    'min_child_weight': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2,3,4,5,6,7,8,9,10],
    'gamma': [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2,2.5,3,3.5,4,4.5,5,5.5,6,6.5,7,8,9,10],
    }

In [160]:
from sklearn.model_selection import RandomizedSearchCV
xgb_search_model = XGBClassifier()

In [161]:
random_search = RandomizedSearchCV(xgb_search_model, param_distributions=params, 
    n_iter=10,scoring='roc_auc', n_jobs=-1,cv=5,verbose=3)

In [162]:
random_search.fit(X,y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    9.3s finished


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
                                        'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                      10, 11, 12, 13, 14, 15],
                     

In [163]:
random_search.best_estimator_

XGBClassifier(base_score=0.57, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=9, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=11,
              min_child_weight=1.4, missing=nan, monotone_constraints='()',
              n_estimators=220, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.45,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [164]:
random_search.best_params_

{'subsample': 0.45,
 'n_estimators': 220,
 'min_child_weight': 1.4,
 'max_depth': 11,
 'learning_rate': 0.3,
 'gamma': 9,
 'base_score': 0.57}

In [165]:
xgb_model = XGBClassifier(base_score=0.48, gamma=7, learning_rate=0.35, max_depth=3,
              min_child_weight=1.3, n_estimators=120, subsample=0.37)

In [166]:
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
train_pred = xgb_model.predict(X_train)

In [167]:
print('Train Accuracy score is:')
print(accuracy_score(y_train, train_pred))
print('---------------------------------')
print('Test Accuracy score is:')
print(accuracy_score(y_test, xgb_pred))
print('---------------------------------')
print('Confusion matrix:')
print(confusion_matrix(y_test, xgb_pred))
print('---------------------------------')
print('Classification Report:')
print(classification_report(y_test, xgb_pred))

Train Accuracy score is:
0.9797142857142858
---------------------------------
Test Accuracy score is:
0.9693537641572285
---------------------------------
Confusion matrix:
[[728  11]
 [ 35 727]]
---------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       739
           1       0.99      0.95      0.97       762

    accuracy                           0.97      1501
   macro avg       0.97      0.97      0.97      1501
weighted avg       0.97      0.97      0.97      1501

