Load the dataset and handle missing values

In [1]:
import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype

DATASET_PATH = "./income_dataset.csv"
df = pd.read_csv(DATASET_PATH, sep=",")

# handle null values in the dataset
df.replace('?', np.nan)
df["age"] = pd.to_numeric(df["age"], errors="coerce")

# replace null values with the median
print(df.isnull().sum())
df = df.apply(lambda x: x.fillna(x.median()) if is_numeric_dtype(x) else x.fillna(x.mode().iloc[0]))
print(df.isnull().sum())

age                        4
workclass                  0
fnlwgt                     0
education                  0
education_num              0
marital_status             0
occupation                 0
relationship               0
race                       0
sex                        0
capital_gain               0
capital_loss               0
hours_per_week             0
native_country             0
income_greater_than_50k    0
dtype: int64
age                        0
workclass                  0
fnlwgt                     0
education                  0
education_num              0
marital_status             0
occupation                 0
relationship               0
race                       0
sex                        0
capital_gain               0
capital_loss               0
hours_per_week             0
native_country             0
income_greater_than_50k    0
dtype: int64


Encoding categorical features: one-hot encoding (create a binary vector for each category)

In [2]:
# get all columns which are not numeric and not the loan status
categorical_features = [col for col in df.columns if not is_numeric_dtype(df[col])]
df_onehot = pd.get_dummies(df, columns=categorical_features)

print(df["income_greater_than_50k"].value_counts()[1])
print(df["income_greater_than_50k"].value_counts()[-1])

252
748


Building a predictive model

In [3]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# use the index location function on all the rows and all the columns
# except the last one
X = df_onehot.iloc[:, :-1]

y = df_onehot.income_greater_than_50k

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337, stratify=y, shuffle=True)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape {X_test.shape}")

Training set shape: (800, 89)
Test set shape (200, 89)


In [4]:
def evaluate(true_values, predicted_values):
  print(f"accuracy = {accuracy_score(true_values, predicted_values):.3f}")

model = LogisticRegression(solver="liblinear")

model.fit(X_train, y_train)
print(f"***** performance on the test set *****")
evaluate(y_test, model.predict(X_test))

print(f"***** classification report *****")
print(classification_report(y_test, model.predict(X_test)))

***** performance on the test set *****
accuracy = 0.795
***** classification report *****
              precision    recall  f1-score   support

          -1       0.81      0.95      0.87       150
           1       0.70      0.32      0.44        50

    accuracy                           0.80       200
   macro avg       0.75      0.64      0.66       200
weighted avg       0.78      0.80      0.77       200



Let's use cross validation

In [5]:
# model evaluation using cross-validation
k_fold = KFold(n_splits=10, shuffle=True, random_state=1337)
cv = cross_validate(model, X, y, cv=k_fold, scoring=("roc_auc", "accuracy"), return_train_score=True)
pd.DataFrame(cv)
print("***** Evaluate Average Performance on Cross-Validation Set *****")
print("Avg. Test Set Accuracy = {:.3f}".format(np.mean(cv["test_accuracy"])))


***** Evaluate Average Performance on Cross-Validation Set *****
Avg. Test Set Accuracy = 0.822


Let's try to find the best hyperparameter for a model (the code works also for a family of models)

In [6]:
models_hyperparams = {
  "LogisticRegression" : (
    LogisticRegression(solver="liblinear"),
    {"C" : [0.01, 0.05, 0.1, 0.5, 1, 2]}
  )
}

X_dataset, X_test, y_dataset, y_test = train_test_split(X, y, test_size=0.2, random_state=1337, stratify=y)

X_train, X_validation, y_train, y_validation = train_test_split(X_dataset, y_dataset, test_size=0.2, random_state=1337, stratify=y_dataset)

validation_scores = {}
best_validation_score = {}

model = models_hyperparams["LogisticRegression"][0]
hyperparams = models_hyperparams["LogisticRegression"][1]

for hp in hyperparams:
  validation_scores[hp] = {}

  for val in hyperparams[hp]:
    model.set_params(**{hp: val})
    model.fit(X_train, y_train)

    validation_score = accuracy_score(y_validation, model.predict(X_validation))
    validation_scores[hp][val] = validation_score

    if not best_validation_score:
      best_validation_score[hp] = (val, validation_score)
    else:
      if best_validation_score[hp][1] < validation_score:
        best_validation_score[hp] = (val, validation_score)

print("***** Evaluate Performance on Validation Set *****")
print(validation_scores)
print("***** Best Accuracy Score on Validation Set *****")
print(best_validation_score)

# we set the model's hyperparameters to those leading to the best score on the validation test
best_params = dict([(list(best_validation_score.keys())[0], list(best_validation_score.values())[0][0])])
model.set_params(**best_params)

# we fit this model to the whole training set portion
model.fit(X_train, y_train)
print("***** Evaluate Performance on the whole Test Set *****")
evaluate(y_test, model.predict(X_test))

***** Evaluate Performance on Validation Set *****
{'C': {0.01: 0.8, 0.05: 0.8, 0.1: 0.8, 0.5: 0.8, 1: 0.8, 2: 0.8}}
***** Best Accuracy Score on Validation Set *****
{'C': (0.01, 0.8)}
***** Evaluate Performance on the whole Test Set *****
accuracy = 0.795


Let's use cross-validation to find the best hyperparameter

In [7]:
models_and_hyperparams = {
  "LogisticRegression": (
    LogisticRegression(solver="liblinear"),
    {"C": [0.01, 0.05, 0.1, 0.5, 1, 2]},
  )
}

X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=1337, stratify=y
)

k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1337)

model = models_and_hyperparams["LogisticRegression"][0]
hyperparams = models_and_hyperparams["LogisticRegression"][1]

gs = GridSearchCV(
  estimator=model,
  param_grid=hyperparams,
  cv=k_fold,
  scoring="accuracy",
  verbose=True,
  return_train_score=True,
)
gs.fit(X_train, y_train)
pd.DataFrame(gs.cv_results_)

print(f"Best hyperparameter: {gs.best_params_}")
print(f"Best accuracy score: {gs.best_score_:.3f}")
evaluate(y_test, gs.predict(X_test))

Fitting 10 folds for each of 6 candidates, totalling 60 fits
Best hyperparameter: {'C': 0.05}
Best accuracy score: 0.818
accuracy = 0.795
