# XGBoost Gender Classification using Manual Fucntion

### Imports

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

### Load Data

In [22]:
df = pd.read_csv('gender_classification_v7.csv')

In [23]:
df.head()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,Male
0,1,11.8,6.1,1,0,1,1,1
1,0,14.0,5.4,0,0,1,0,0
2,0,11.8,6.3,1,1,1,1,1
3,0,14.4,6.1,0,1,1,1,1
4,1,13.5,5.9,0,0,0,0,0


### Split Data

In [24]:
X = df.drop('Male', axis=1)
y = df['Male']

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

### Try XGBoost with Default Parameters

In [27]:
from xgboost import XGBClassifier
XGBClassifier()

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)

In [28]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
train_pred = xgb_model.predict(X_train) #To check the training accuracy

In [29]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print('Train Accuracy score is:')
print(accuracy_score(y_train, train_pred))
print('---------------------------------')
print('Test Accuracy score is:')
print(accuracy_score(y_test, xgb_pred))
print('---------------------------------')
print('Confusion matrix:')
print(confusion_matrix(y_test, xgb_pred))
print('---------------------------------')
print('Classification Report:')
print(classification_report(y_test, xgb_pred))

Train Accuracy score is:
0.9988571428571429
---------------------------------
Test Accuracy score is:
0.9600266489007329
---------------------------------
Confusion matrix:
[[711  28]
 [ 32 730]]
---------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       739
           1       0.96      0.96      0.96       762

    accuracy                           0.96      1501
   macro avg       0.96      0.96      0.96      1501
weighted avg       0.96      0.96      0.96      1501



## Manual Tuning

### Manually search for best parameters

In [30]:
def base_score():
    warn = 'The BEST VALUE is reaching the highest given parameter. You may alter the parameter for better performance.'
    base_score = [x/100 for x in range(40,81)] # 4,4.1,....8
    max_test_acc = 0
    best_value = 0
    for value in base_score:
        xgb_model = XGBClassifier(base_score=value)
        xgb_model.fit(X_train, y_train)
        xgb_pred = xgb_model.predict(X_test)
        test_accuracy = accuracy_score(y_test, xgb_pred)
        if test_accuracy > max_test_acc:
            max_test_acc = test_accuracy
            best_value = value
    if best_value == max(base_score):
        print(warn)
    return best_value

base_score = base_score()
base_score

0.49

In [31]:
def max_depth():
    warn = 'The BEST VALUE is reaching the highest given parameter. You may alter the parameter for better performance.'
    max_depth = [x for x in range(1,16)] # 1,2,...15
    max_test_acc = 0
    best_value = 0
    for value in max_depth:
        xgb_model = XGBClassifier(base_score=base_score, max_depth=value)
        xgb_model.fit(X_train, y_train)
        xgb_pred = xgb_model.predict(X_test)
        test_accuracy = accuracy_score(y_test, xgb_pred)
        if test_accuracy > max_test_acc:
            max_test_acc = test_accuracy
            best_value = value
    if best_value == max(max_depth):
        print(warn)
    return best_value

max_depth = max_depth()
max_depth

1

In [32]:
def subsample():
    warn = 'The BEST VALUE is reaching the highest given parameter. You may alter the parameter for better performance.'
    subsample = [x/100 for x in range(0,51)] # 0,0.01,0.02,....5.0
    max_test_acc = 0
    best_value = 0
    for value in subsample:
        xgb_model = XGBClassifier(base_score=base_score, max_depth=max_depth,subsample=value)
        xgb_model.fit(X_train, y_train)
        xgb_pred = xgb_model.predict(X_test)
        test_accuracy = accuracy_score(y_test, xgb_pred)
        if test_accuracy > max_test_acc:
            max_test_acc = test_accuracy
            best_value = value
    if best_value == max(subsample):
        print(warn)
    return best_value

subsample = subsample()
subsample

0.4

In [33]:
def n_estimators():
    warn = 'The BEST VALUE is reaching the highest given parameter. You may alter the parameter for better performance.'
    n_estimators = [x for x in range(10,500,20)] # 10,30,50,....990
    max_test_acc = 0
    best_value = 0
    for value in n_estimators:
        xgb_model = XGBClassifier(base_score=base_score, max_depth=max_depth,
                                  subsample=subsample,n_estimators=value)
        xgb_model.fit(X_train, y_train)
        xgb_pred = xgb_model.predict(X_test)
        test_accuracy = accuracy_score(y_test, xgb_pred)
        if test_accuracy > max_test_acc:
            max_test_acc = test_accuracy
            best_value = value
    if best_value == max(n_estimators):
        print(warn)
    return best_value

n_estimators = n_estimators()
n_estimators

30

In [34]:
def learning_rate():
    warn = 'The BEST VALUE is reaching the highest given parameter. You may alter the parameter for better performance.'
    learning_rate = [x/100 for x in range(5,105,5)] # 0.05,0.01,....1.0
    max_test_acc = 0
    best_value = 0
    for value in learning_rate:
        xgb_model = XGBClassifier(base_score=base_score, max_depth=max_depth,
                                  subsample=subsample,n_estimators=n_estimators,
                                  learning_rate=value)
        xgb_model.fit(X_train, y_train)
        xgb_pred = xgb_model.predict(X_test)
        test_accuracy = accuracy_score(y_test, xgb_pred)
        if test_accuracy > max_test_acc:
            max_test_acc = test_accuracy
            best_value = value
    if best_value == max(learning_rate):
        print(warn)
    return best_value

learning_rate = learning_rate()
learning_rate

0.9

In [35]:
def min_child_weight():
    warn = 'The BEST VALUE is reaching the highest given parameter. You may alter the parameter for better performance.'
    min_child_weight = [x/10 for x in range(1,301)] # 0.1,0.2,....30.0
    max_test_acc = 0
    best_value = 0
    for value in min_child_weight:
        xgb_model = XGBClassifier(base_score=base_score, max_depth=max_depth,
                                  subsample=subsample,n_estimators=n_estimators,
                                  learning_rate=learning_rate,min_child_weight=value)
        xgb_model.fit(X_train, y_train)
        xgb_pred = xgb_model.predict(X_test)
        test_accuracy = accuracy_score(y_test, xgb_pred)
        if test_accuracy > max_test_acc:
            max_test_acc = test_accuracy
            best_value = value
    if best_value == max(min_child_weight):
        print(warn)
    return best_value

min_child_weight = min_child_weight()
min_child_weight

0.1

In [36]:
def gamma():
    warn = 'The BEST VALUE is reaching the highest given parameter. You may alter the parameter for better performance.'
    gamma = [x/10 for x in range(1,101)] # 0.1,0.2,....10.0
    max_test_acc = 0
    best_value = 0
    for value in gamma:
        xgb_model = XGBClassifier(base_score=base_score, max_depth=max_depth,
                                  subsample=subsample,n_estimators=n_estimators,
                                  learning_rate=learning_rate,min_child_weight=min_child_weight,
                                  gamma=value)
        xgb_model.fit(X_train, y_train)
        xgb_pred = xgb_model.predict(X_test)
        test_accuracy = accuracy_score(y_test, xgb_pred)
        if test_accuracy > max_test_acc:
            max_test_acc = test_accuracy
            best_value = value
    if best_value == max(gamma):
        print(warn)
    return best_value

gamma = gamma()
gamma

0.1

In [37]:
def colsample_bytree():
    warn = 'The BEST VALUE is reaching the highest given parameter. You may alter the parameter for better performance.'
    colsample_bytree = [x/100 for x in range(0,101)] # 0.01,0.02,....1.0
    max_test_acc = 0
    best_value = 0
    for value in colsample_bytree:
        xgb_model = XGBClassifier(base_score=base_score, max_depth=max_depth,
                                  subsample=subsample,n_estimators=n_estimators,
                                  learning_rate=learning_rate,min_child_weight=min_child_weight,
                                  gamma=gamma,colsample_bytree=value)
        xgb_model.fit(X_train, y_train)
        xgb_pred = xgb_model.predict(X_test)
        test_accuracy = accuracy_score(y_test, xgb_pred)
        if test_accuracy > max_test_acc:
            max_test_acc = test_accuracy
            best_value = value
    if best_value == max(colsample_bytree):
        print(warn)
    return best_value

colsample_bytree = colsample_bytree()
colsample_bytree

0.0

In [38]:
print('base_score: ', base_score)
print('colsample_bytree: ', colsample_bytree)
print('gamma: ', gamma)
print('learning_rate: ', learning_rate)
print('max_depth: ', max_depth)
print('min_child_weight: ', min_child_weight)
print('n_estimators: ', n_estimators)
print('subsample: ', subsample)

base_score:  0.49
colsample_bytree:  0.0
gamma:  0.1
learning_rate:  0.9
max_depth:  1
min_child_weight:  0.1
n_estimators:  30
subsample:  0.4


### Train the model with best parameters

In [39]:
def best_xgb_model():
    xgb_model = XGBClassifier(base_score=base_score,colsample_bytree=colsample_bytree,
                                  gamma=gamma,learning_rate=learning_rate,
                                  max_depth=max_depth,min_child_weight=min_child_weight,
                                  n_estimators=n_estimators,subsample=subsample)
    xgb_model.fit(X_train, y_train)
    xgb_pred = xgb_model.predict(X_test)
    train_pred = xgb_model.predict(X_train)
    return train_pred, xgb_pred

best_xgb_model()
train_pred = best_xgb_model()[0]
xgb_pred = best_xgb_model()[1]

In [40]:
print('Train Accuracy score is:')
print(accuracy_score(y_train, train_pred))
print('---------------------------------')
print('Test Accuracy score is:')
print(accuracy_score(y_test, xgb_pred))
print('---------------------------------')
print('Confusion matrix:')
print(confusion_matrix(y_test, xgb_pred))
print('---------------------------------')
print('Classification Report:')
print(classification_report(y_test, xgb_pred))

Train Accuracy score is:
0.9771428571428571
---------------------------------
Test Accuracy score is:
0.972018654230513
---------------------------------
Confusion matrix:
[[728  11]
 [ 31 731]]
---------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       739
           1       0.99      0.96      0.97       762

    accuracy                           0.97      1501
   macro avg       0.97      0.97      0.97      1501
weighted avg       0.97      0.97      0.97      1501

