Sample notebook

In [44]:
import os # It will open the default notebook location path
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.metrics import make_scorer, f1_score, classification_report

# Switch the working directory to two levels above the notebook's own folder
# (presumably the project root, so that the `age_conditions` package imported
# right after this line can be found — confirm against the repository layout).
# The starting directory is captured only the first time this cell runs in a
# kernel, so re-running the cell does not climb another two levels each time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, "..", ".."))
from age_conditions import DATA_PATH
from age_conditions.interfaces import (
    GetCVIndexer,
    HyperTunner,
    TrainTestSplitter,
    obtain_exhaustive_grid,
)

# Identifier of the model family; passed to obtain_exhaustive_grid below.
MODEL = 'xgboost'
# Search space returned by the project helper for MODEL. It is later indexed by
# the 'scale_pos_weight' key, so it is presumably a mapping of parameter name to
# a list of candidate values — confirm in age_conditions.interfaces.
hypers_model = obtain_exhaustive_grid(MODEL)

In [45]:
# Read the four CSV tables stored under DATA_PATH.
train = pd.read_csv(DATA_PATH / "train.csv")
test = pd.read_csv(DATA_PATH / "test.csv")
submission = pd.read_csv(DATA_PATH / "sample_submission.csv")
greeks = pd.read_csv(DATA_PATH / "greeks.csv")

# Report the train/test shapes, then preview the first two rows of each table.
print("Full train dataset shape is {}".format(train.shape))
print("Full test dataset shape is {}".format(test.shape))
display(train.head(2), test.head(2), greeks.head(2))

Full train dataset shape is (617, 58)
Full test dataset shape is (5, 57)


Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0


Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,00eed32682bb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,010ebe33f668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,Id,Alpha,Beta,Gamma,Delta,Epsilon
0,000ff2bfdfe9,B,C,G,D,3/19/2019
1,007255e47698,A,C,M,B,Unknown


Dataset exploration

In [46]:
# `Class` only ranges from 0 to 1; its mean is ~0.175 and even its 75th
# percentile is 0, so well under a quarter of the rows are positive (1):
# the target is imbalanced towards 0.
train.describe()

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
count,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,617.0,...,616.0,617.0,615.0,617.0,617.0,617.0,617.0,617.0,616.0,617.0
mean,0.477149,3502.013221,118.624513,38.968552,10.128242,5.545576,0.06032,10.566447,8.053012,5350.388655,...,5.433199,3.533905,0.421501,20.724856,131.714987,14679.595398,31.489716,50.584437,8.530961,0.175041
std,0.468388,2300.322717,127.83895,69.728226,10.518877,2.551696,0.416817,4.350645,65.166943,3021.326641,...,11.496257,50.181948,1.305365,9.991907,144.181524,19352.959387,9.864239,36.266251,10.32701,0.38031
min,0.081187,192.59328,85.200147,3.177522,8.138688,0.699861,0.025578,3.396778,1.2299,1693.62432,...,0.173229,0.49706,0.06773,4.102182,72.611063,13.038894,9.432735,0.897628,0.001129,0.0
25%,0.252107,2197.34548,85.200147,12.270314,8.138688,4.128294,0.025578,8.12958,1.2299,4155.70287,...,0.173229,0.49706,0.06773,14.036718,72.611063,2798.992584,25.034888,23.011684,0.124392,0.0
50%,0.354659,3120.31896,85.200147,20.53311,8.138688,5.031912,0.025578,10.46132,1.2299,4997.96073,...,3.028141,1.131,0.250601,18.771436,72.611063,7838.27361,30.608946,41.007968,0.337827,0.0
75%,0.559763,4361.63739,113.73954,39.139886,8.138688,6.431634,0.036845,12.969516,5.081244,6035.8857,...,6.238814,1.51206,0.535067,25.608406,127.591671,19035.70924,36.863947,67.931664,21.978,0.0
max,6.161666,28688.18766,1910.123198,630.51823,178.943634,38.27088,10.315851,38.971568,1463.693448,53060.59924,...,137.932739,1244.22702,31.365763,135.781294,1497.351958,143790.0712,81.210825,191.194764,21.978,1.0


In [47]:
# Define features & target
target = 'Class'

# Columns that must never be used as model inputs. The identifier column in the
# data is spelled 'Id'; 'ID' is kept as well so the exclusion is at least as
# strict as before.
non_feature_columns = {'Id', 'ID', target}

# Keep every numeric column except the excluded ones, preserving the column
# order of `train`. Building the list through a set would make the feature
# order depend on string hashing, which is randomised per Python process, so
# the matrix handed to the model could differ between kernel restarts.
features = [
    column
    for column in train.select_dtypes(include=['number']).columns
    if column not in non_feature_columns
]

# Number of cross-validation folds used for hyper-parameter tuning.
k_folds = 5

In [48]:
# Rather than downsampling the majority class (which would shrink the training
# set), derive per-class weights that give the rarer class more importance.

# Per-label sample counts: index 0 holds the 0-labelled rows, index 1 the 1-labelled rows.
neg, pos = np.bincount(train[target])
total = neg + pos

# Each weight is (1 / class count) * (total / 2).
weight_for_0, weight_for_1 = ((1 / count) * (total / 2.0) for count in (neg, pos))
class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

# NOTE(review): class_weight is not referenced by the cells visible here, and the
# positive-class scale is pinned to one hand-picked candidate — confirm this is intended.
hypers_model['scale_pos_weight'] = [2]  # Adjust to account for bias

Weight for class 0: 0.61
Weight for class 1: 2.86


Train-test split

In [49]:
# Split `train` into a training part and a hold-out part via the project's
# TrainTestSplitter.
# NOTE(review): the recorded outputs show 28 positives in y_train versus 80 in
# y_test, and the classification report further down has a support of 494 rows
# out of 617 — i.e. the model is fitted on roughly 20% of the data and evaluated
# on roughly 80%. Confirm whether 0.2 is meant to be the hold-out fraction and
# whether the tuple is unpacked in the order split_by_percentage() returns.
X_train, y_train, X_test, y_test = TrainTestSplitter(train, features, target, 0.2).split_by_percentage()

# Ensure that y_train & y_test have at least some 1s
# (the target is 0/1, so the sum equals the number of positive rows).
print(f'Sum of training target observations: {sum(y_train)}')
print(f'Sum of testing target observations: {sum(y_test)}')

# Define K-Folds: fold indexes are built on the training part only and later
# handed to the hyper-parameter tuner.
cv_indexes = GetCVIndexer(X_train).get_index_kfolds(k_folds)

Sum of training target observations: 28
Sum of testing target observations: 80


In [50]:
# Base XGBoost classifier whose hyper-parameters are searched below.
estimator = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

# Candidates are scored by F1 on the positive class (label 1).
f1_positive_scorer = make_scorer(f1_score, pos_label=1)

# NOTE(review): the trailing 0 and the 2 given to exhaustive_tunner are
# positional options of the project's HyperTunner — check their meaning there.
hp_tunner = HyperTunner(estimator, hypers_model, X_train, y_train, cv_indexes, f1_positive_scorer, 0)
best_params = hp_tunner.exhaustive_tunner(2)

In [51]:
# Rebuild the classifier with the tuned hyper-parameters and fit it on the
# training split.
best_estimator = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    **best_params
)
model = best_estimator.fit(X_train, y_train)

# Predict the hold-out labels and summarise per-class precision / recall / F1.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.92      0.93       414
           1       0.64      0.70      0.67        80

    accuracy                           0.89       494
   macro avg       0.79      0.81      0.80       494
weighted avg       0.89      0.89      0.89       494



  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [52]:
# Recompute and print the classification report for the hold-out predictions
# (built from the existing y_test / y_pred; nothing is refitted here).
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.92      0.93       414
           1       0.64      0.70      0.67        80

    accuracy                           0.89       494
   macro avg       0.79      0.81      0.80       494
weighted avg       0.89      0.89      0.89       494

