In [1]:
# import stuff

%matplotlib inline 

# libraries
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
import numpy.random as rng
np.set_printoptions(precision=6)
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

#classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

#metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from time import time

# Fixing random state for reproducibility
np.random.seed(10)

# PART 2
## import data

In [2]:
aTr = pd.read_csv('a4-data\\part2classification\\adult.data', header=None)
aTe = pd.read_csv('a4-data\\part2classification\\adult.test', header=None)

col_labels = ['age','workclass','fnlwgt','education','education.num','marital.status','occupation','relationship','race','sex','capital.gain','capital.loss','hours.per.week','native.country','income']
aTr.columns=col_labels
aTe.columns=col_labels

## clean training and test set 'income' column

In [3]:
aTe['income'] = aTe['income'].str.replace(r'K.', 'K')
aTe['income'] = aTe['income'].str.replace(r' ', '')
aTr['income'] = aTr['income'].str.replace(r' ', '')
print(aTe['income'].values)
aTr['income'].values

['<=50K' '<=50K' '>50K' ... '<=50K' '<=50K' '>50K']


array(['<=50K', '<=50K', '<=50K', ..., '<=50K', '<=50K', '>50K'],
      dtype=object)

## check for missing vals

In [4]:
aTe.isna().any()

age               False
workclass         False
fnlwgt            False
education         False
education.num     False
marital.status    False
occupation        False
relationship      False
race              False
sex               False
capital.gain      False
capital.loss      False
hours.per.week    False
native.country    False
income            False
dtype: bool

## scale numerical columns

In [5]:
# define the columns to be scaled
cols_to_scale = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']


aTr_scaled = aTr
aTe_scaled = aTe

scaler = StandardScaler()

for col in cols_to_scale:
    colarray = np.asarray(aTr[col]).reshape(-1,1)
    colscaled = scaler.fit_transform(colarray)
    aTr_scaled[col] = colscaled
    colarray = np.asarray(aTe[col]).reshape(-1,1)
    colscaled = scaler.fit_transform(colarray)
    aTe_scaled[col] = colscaled

aTe_scaled.shape

(16281, 15)

## onehot encoding on categorical columns

In [6]:
# perform onehot encoding
cols = ['workclass','education','marital.status','occupation','relationship','race','sex','native.country']

aTr_encoded = aTr_scaled

aTr_encoded = pd.get_dummies(aTr_scaled, columns=cols)
aTe_encoded = pd.get_dummies(aTe_scaled, columns=cols)

## split the target variable from training and test sets

In [7]:
trX = aTr_encoded.drop(columns=['income'])
trY = aTr_encoded['income']
trX.shape

(32561, 108)

In [8]:
trY.shape

(32561,)

In [9]:
teX = aTe_encoded.drop(columns=['income'])
teY = aTe_encoded['income']

In [10]:
teX.shape

(16281, 107)

In [11]:
teY.shape

(16281,)

## insert 'native.country_ Holand-Netherlands' into test set

In [12]:
idx = trX.columns.get_loc('native.country_ Holand-Netherlands')
if ('native.country_ Holand-Netherlands' not in teX.columns):
    teX.insert(idx,'native.country_ Holand-Netherlands',0)
holand = 'native.country_ Holand-Netherlands'
teX[holand] = teX[holand].replace(0, False)
teX.shape

(16281, 108)

## encode positive class

In [13]:
trX_vals=trX.values.astype(np.float64)
trY_vals=trY.values
teX_vals=teX.values.astype(np.float64)
teY_vals=teY.values
teY_vals

array(['<=50K', '<=50K', '>50K', ..., '<=50K', '<=50K', '>50K'],
      dtype=object)

In [14]:
positive_class_label = ">50K"

teY_vals_ = np.where(teY_vals == positive_class_label, 1, 0)
trY_vals_ = np.where(trY_vals == positive_class_label, 1, 0)
teY_vals_

array([0, 0, 1, ..., 0, 0, 1])

## test classifiers

In [15]:
classifiers = {
    'KNeighbours Classifier':KNeighborsClassifier(),
    'Gaussian Naive Bayes':GaussianNB(),
    'SVC':SVC(),
    'Decision Tree Classifier':DecisionTreeClassifier(),
    'Random Forest Classifier':RandomForestClassifier(),
    'Ada Boost Classifier':AdaBoostClassifier(),
    'Gradient Boosting Classifier':GradientBoostingClassifier(),
    'Linear Discriminant Analysis':LinearDiscriminantAnalysis(),
    'MLPClassifier':MLPClassifier(),
    'Logisitic Regression':LogisticRegression()
}

cPerf = pd.DataFrame(columns=['Classifier','Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC'])

In [16]:
for key, model in classifiers.items():
    # Fit the model on the training data
    model.fit(trX_vals, trY_vals_)

    start_time = time()  # Record start time
    
    # Predict on the test data
    predictions = model.predict(teX_vals)
    
    end_time = time()  # Record end time

    # Calculate classification metrics
    accuracy = accuracy_score(teY_vals_, predictions)
    precision = precision_score(teY_vals_, predictions)
    recall = recall_score(teY_vals_, predictions)
    f1 = f1_score(teY_vals_, predictions)

    # For AUC predict class probabilities for binary classifiers
    if hasattr(model, 'predict_proba'):
        class_probs = model.predict_proba(teX_vals)[:, 1]  # Probability of the positive class
        auc = roc_auc_score(teY_vals_, class_probs)
    else:
        auc = None

    # store results for each algorithm      
    row = {'Classifier': key, 'Accuracy': accuracy, 'Precision': precision,'Recall': recall, 'F1-Score': f1, 'AUC': auc}
    cPerf.loc[len(cPerf)] = row

  cPerf.loc[len(cPerf)] = row
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
cPerf.round(2).head(10)

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1-Score,AUC
0,KNeighbours Classifier,0.83,0.67,0.6,0.63,0.86
1,Gaussian Naive Bayes,0.55,0.34,0.94,0.5,0.76
2,SVC,0.86,0.76,0.59,0.67,
3,Decision Tree Classifier,0.81,0.6,0.61,0.6,0.74
4,Random Forest Classifier,0.85,0.71,0.59,0.65,0.9
5,Ada Boost Classifier,0.86,0.74,0.6,0.66,0.91
6,Gradient Boosting Classifier,0.87,0.79,0.6,0.68,0.92
7,Linear Discriminant Analysis,0.84,0.72,0.56,0.63,0.89
8,MLPClassifier,0.84,0.67,0.61,0.64,0.89
9,Logisitic Regression,0.85,0.73,0.6,0.66,0.91


## Performance Ratings

In [18]:
cPerf.drop(columns='Classifier').idxmax(axis=0)

Accuracy     6
Precision    6
Recall       1
F1-Score     6
AUC          6
dtype: int64

In [19]:
cPerf.drop(columns='Classifier').idxmin(axis=0)

Accuracy     1
Precision    1
Recall       7
F1-Score     1
AUC          3
dtype: int64

From the above, we can see that Gradient Boosting had the highest accuracy, precision, F1-Score and AUC. Gaussian Naive Bayes had the highest Recall. 

Meanwhile, GNB had the lowest accuracy, precision and F1-Score.
Linear Discriminant Analysis had the lowest Recall.
Decision tree classifier had the lowest AUC.

SVC did not have a value for AUC.