In [1]:
# import
import pandas as pd

# read the file
df_adult = pd.read_csv('https://gist.githubusercontent.com/fmnobar/992233799dcbd9418f009b0d6c4422ee/raw/66d7cffec0308b71d4aeceb1a9d9763c8515311a/adult.data', header=None, names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
        'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
        'hours-per-week', 'native-country', 'income'])

# return top 5 rows of the dataframe
df_adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [4]:
# import libraries
import pandas as pd
import time # this is used to calculate the training time of each model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [5]:
def load_and_preprocess_data(filepath):
    # read the data set into as a dataframe
    data = pd.read_csv(filepath, header=None, names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
        'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
        'hours-per-week', 'native-country', 'income'])

    # initialize a label encoder
    label_encoder = LabelEncoder()

    # encode columns
    for col in data.select_dtypes(include=['object']).columns:
        data[col] = label_encoder.fit_transform(data[col])

    # Separate features as X and target as y
    X = data.drop('income', axis=1)
    y = data['income']

    return X, y

In [6]:
def train_and_evaluate_classifier(clf, X_train, y_train, X_test, y_test):
    # starting to track the time so that we can report the total training time
    start_time = time.time()

    # train the classifier
    clf.fit(X_train, y_train)

    # stop tracking time, now that training is completed
    end_time = time.time()

    # calculated how long training took
    elapsed_time = end_time - start_time

    # make predictions using the trained model. this will be used for evaluation of the trained model
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1] if hasattr(clf, 'predict_proba') else None

    # here are the metrics we use for evaluations
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'AUC-ROC': roc_auc_score(y_test, y_proba) if y_proba is not None else None,
        'AUC-PR': average_precision_score(y_test, y_proba) if y_proba is not None else None,
        'Training Time (s)': elapsed_time
    }

    return metrics

In [8]:
def main():
    # initialize a dataframe to store the results in
    results_df = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC', 'AUC-PR', 'Training Time (s)'])

    # load and preprocess the data, using our previously-defined function
    X, y = load_and_preprocess_data('https://gist.githubusercontent.com/fmnobar/992233799dcbd9418f009b0d6c4422ee/raw/66d7cffec0308b71d4aeceb1a9d9763c8515311a/adult.data')

    # split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

    # initialize the classifiers
    classifiers = {
        'XGBoost': xgb.XGBClassifier(),
        'Random Forest': RandomForestClassifier(),
        'SVM': SVC(probability=True),
        'k-NN': KNeighborsClassifier(),
        'LightGBM': lgb.LGBMClassifier(),
        'CatBoost': CatBoostClassifier(verbose=0),
        'GradientBoosting': GradientBoostingClassifier(),
        'AdaBoost': AdaBoostClassifier()
    }

    # train and evaluate the trained classifiers
    for name, clf in classifiers.items():
        metrics = train_and_evaluate_classifier(clf, X_train, y_train, X_test, y_test)
        results_df.loc[name] = metrics

    # sort classifiers alphabetically
    results_df.sort_index(inplace=True)

    # add row numbers
    results_df.reset_index(drop=False, inplace=True)
    results_df.index.name = '#'

    return results_df

if __name__ == "__main__":
    df = main()

[LightGBM] [Info] Number of positive: 6313, number of negative: 19735
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003602 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 692
[LightGBM] [Info] Number of data points in the train set: 26048, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.242360 -> initscore=-1.139783
[LightGBM] [Info] Start training from score -1.139783


In [9]:
df

Unnamed: 0_level_0,index,Accuracy,Precision,Recall,F1-Score,AUC-ROC,AUC-PR,Training Time (s)
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,AdaBoost,0.856595,0.747088,0.587696,0.657875,0.90712,0.781152,0.97293
1,CatBoost,0.873177,0.779459,0.640707,0.703305,0.922701,0.816062,8.364704
2,GradientBoosting,0.860894,0.771379,0.578534,0.661182,0.912957,0.796603,4.309609
3,LightGBM,0.872563,0.776108,0.642016,0.702722,0.92169,0.814042,0.620832
4,Random Forest,0.856134,0.738114,0.599476,0.661611,0.901514,0.767279,2.974992
5,SVM,0.802702,0.987952,0.160995,0.276871,0.622961,0.475018,132.437501
6,XGBoost,0.869492,0.760369,0.647906,0.699647,0.920417,0.812,0.342035
7,k-NN,0.777829,0.543974,0.32788,0.409147,0.682065,0.458023,0.094085


In [10]:
# sort results
df.sort_values(by=['F1-Score', 'AUC-ROC', 'Training Time (s)'], ascending=[False, False, True]).reset_index(drop=True).round(3)

Unnamed: 0,index,Accuracy,Precision,Recall,F1-Score,AUC-ROC,AUC-PR,Training Time (s)
0,CatBoost,0.873,0.779,0.641,0.703,0.923,0.816,8.365
1,LightGBM,0.873,0.776,0.642,0.703,0.922,0.814,0.621
2,XGBoost,0.869,0.76,0.648,0.7,0.92,0.812,0.342
3,Random Forest,0.856,0.738,0.599,0.662,0.902,0.767,2.975
4,GradientBoosting,0.861,0.771,0.579,0.661,0.913,0.797,4.31
5,AdaBoost,0.857,0.747,0.588,0.658,0.907,0.781,0.973
6,k-NN,0.778,0.544,0.328,0.409,0.682,0.458,0.094
7,SVM,0.803,0.988,0.161,0.277,0.623,0.475,132.438
