Load the dataset and handle missing values

In [7]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.DataFrame(sns.load_dataset("penguins"))

print(f"The shape of the dataset is {df.shape}")
if df.isnull().any().any():
  print(f"There are missing values in the dataset")
else:
  print(f"There are no missing values in the dataset")

from pandas.api.types import is_numeric_dtype
df = df.apply(lambda x: x.fillna(x.median()) if is_numeric_dtype(x) else x.fillna(x.mode().iloc[0]))

print(df.head())

print(df.describe())

The shape of the dataset is (344, 7)
There are missing values in the dataset
  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen           39.10           18.7              181.0   
1  Adelie  Torgersen           39.50           17.4              186.0   
2  Adelie  Torgersen           40.30           18.0              195.0   
3  Adelie  Torgersen           44.45           17.3              197.0   
4  Adelie  Torgersen           36.70           19.3              193.0   

   body_mass_g     sex  
0       3750.0    Male  
1       3800.0  Female  
2       3250.0  Female  
3       4050.0    Male  
4       3450.0  Female  
       bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g
count      344.000000     344.000000         344.000000   344.000000
mean        43.925000      17.152035         200.892442  4200.872093
std          5.443792       1.969060          14.023826   799.696532
min         32.100000      13.100000         172.000

Encode categorical features with one-hot encoding and format data

In [8]:
categorical_features = [col for col in df.columns if not is_numeric_dtype(df[col]) and col != "sex"]
data_with_dummies = pd.get_dummies(df, columns=categorical_features)

# as a convention, I prefer to place the column to be predicted as the last one
columns = data_with_dummies.columns.tolist()
columns.insert(len(columns), columns.pop(columns.index("sex")))
data_with_dummies = data_with_dummies.loc[:, columns]

df = data_with_dummies

df.sex = df.sex.map(lambda x: 1 if x == "Male" else -1)

print(df.head())

X = df.iloc[:, :-1]
y = df.sex

   bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  \
0           39.10           18.7              181.0       3750.0   
1           39.50           17.4              186.0       3800.0   
2           40.30           18.0              195.0       3250.0   
3           44.45           17.3              197.0       4050.0   
4           36.70           19.3              193.0       3450.0   

   species_Adelie  species_Chinstrap  species_Gentoo  island_Biscoe  \
0            True              False           False          False   
1            True              False           False          False   
2            True              False           False          False   
3            True              False           False          False   
4            True              False           False          False   

   island_Dream  island_Torgersen  sex  
0         False              True    1  
1         False              True   -1  
2         False              True   -1  


train the model with hold-out approach

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.linear_model import Perceptron

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337, stratify=y)

print(f"Train set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

def evaluate(true_values, predicted_values):
  print(f"accuracy = {accuracy_score(true_values, predicted_values):.3f}")

# test logistic regression
model = LogisticRegression(solver="liblinear", verbose=False)

model.fit(X_train, y_train)

print(f"***** performance on the test set *****")
evaluate(y_test, model.predict(X_test))

print(f"***** classification report *****")
print(classification_report(y_test, model.predict(X_test)))

# test perceptron
model = Perceptron()

model.fit(X_train, y_train)

print(f"***** performance on the test set *****")
evaluate(y_test, model.predict(X_test))

print(f"***** classification report *****")
print(classification_report(y_test, model.predict(X_test)))

Train set shape: (275, 10)
Test set shape: (69, 10)
***** performance on the test set *****
accuracy = 0.841
***** classification report *****
              precision    recall  f1-score   support

          -1       0.87      0.79      0.83        33
           1       0.82      0.89      0.85        36

    accuracy                           0.84        69
   macro avg       0.84      0.84      0.84        69
weighted avg       0.84      0.84      0.84        69

***** performance on the test set *****
accuracy = 0.536
***** classification report *****
              precision    recall  f1-score   support

          -1       1.00      0.03      0.06        33
           1       0.53      1.00      0.69        36

    accuracy                           0.54        69
   macro avg       0.76      0.52      0.38        69
weighted avg       0.75      0.54      0.39        69



let's try to use cross-validation

In [10]:
model = LogisticRegression(solver="liblinear")
cv = cross_validate(model, X, y, cv=10, scoring=("roc_auc", "accuracy"), return_train_score=True)
pd.DataFrame(cv)

# model evaluation using cross-validation
print("***** Evaluate Average Performance on Cross-Validation Set *****")
print("Avg. Test Set Accuracy = {:.3f}".format(np.mean(cv["test_accuracy"])))

model = LogisticRegression(solver = "liblinear")
k_fold = KFold(n_splits=10, shuffle=True, random_state=1337)
cv = cross_validate(model, X, y, cv=k_fold, scoring=("roc_auc", "accuracy"), return_train_score=True)

# model evaluation using cross-validation
print("***** Evaluate Average Performance on Cross-Validation Set *****")
print("Avg. Test Set Accuracy = {:.3f}".format(np.mean(cv["test_accuracy"])))

***** Evaluate Average Performance on Cross-Validation Set *****
Avg. Test Set Accuracy = 0.802
***** Evaluate Average Performance on Cross-Validation Set *****
Avg. Test Set Accuracy = 0.820


Model selection and evaluation

In [11]:
models_and_hyperparams = {
  "LogisticRegression": (
    LogisticRegression(solver="liblinear"),
    {"C": [0.01, 0.05, 0.1, 0.5, 1, 2]},
  )
}


# outer splitting: training vs test set (80/20)
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=73, stratify=y
)

# inner splitting (within the outer training set): training vs validation (80/20)
# training set is used to train the model, validation set is used to select the best hyperparameters
X_train_train, X_validation, y_train_train, y_validation = train_test_split(
  X_train, y_train, test_size=0.2, random_state=1337, stratify=y_train
)

training_scores = {}
validation_scores = {}

best_training_score = {}
best_validation_score = {}

model = models_and_hyperparams["LogisticRegression"][0]
hyperparams = models_and_hyperparams["LogisticRegression"][1]

for hp in hyperparams:
  training_scores[hp] = {}
  validation_scores[hp] = {}
  
  for val in hyperparams[hp]:
    model.set_params(**{hp: val})
    
    model.fit(X_train_train, y_train_train)
    
    training_score = accuracy_score(y_train_train, model.predict(X_train_train))
    training_scores[hp][val] = training_score
    
    validation_score = accuracy_score(y_validation, model.predict(X_validation))
    validation_scores[hp][val] = validation_score
    
    if not best_validation_score:
      best_validation_score[hp] = (val, validation_score)
    else:
      if best_validation_score[hp][1] < validation_score:
        best_validation_score[hp] = (val, validation_score)

print("***** Evaluate Performance on Validation Set *****")
print(validation_scores)
print("***** Best Accuracy Score on Validation Set *****")
print(best_validation_score)

# we set the model's hyperparameters to those leading to the best score on the validation test
best_params = dict([(list(best_validation_score.keys())[0], list(best_validation_score.values())[0][0])])
model.set_params(**best_params)

# we fit this model to the whole training set portion
model.fit(X_train, y_train)
print("***** Evaluate Performance on the whole Test Set *****")
evaluate(y_test, model.predict(X_test))

***** Evaluate Performance on Validation Set *****
{'C': {0.01: 0.8, 0.05: 0.8, 0.1: 0.8, 0.5: 0.8, 1: 0.8, 2: 0.8}}
***** Best Accuracy Score on Validation Set *****
{'C': (0.01, 0.8)}
***** Evaluate Performance on the whole Test Set *****
accuracy = 0.812


In [12]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=1337, stratify=y
)

models = {
  # "LogisticRegression": LogisticRegression(solver="liblinear", max_iter=1000),
  # "LinearSVC": LinearSVC(),
  # "DecisionTreeClassifier": DecisionTreeClassifier(),
  "RandomForestClassifier": RandomForestClassifier(),
  "GradientBoostingClassifier": GradientBoostingClassifier(),
  # "Knn": KNeighborsClassifier(),
  "MLP": MLPClassifier()
  # i could add more models here
}

k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1337)
cv_scores = {}
for model_name, model in models.items():
  cv_scores[model_name] = cross_val_score(model, X_train, y_train, cv=k_fold, scoring="accuracy")

cv_df = pd.DataFrame(cv_scores).transpose()

cv_df['avg_cv'] = np.mean(cv_df, axis=1)
cv_df['std_cv'] = np.std(cv_df, axis=1)
cv_df = cv_df.sort_values(['avg_cv', 'std_cv'], ascending=[False,True])

print(cv_df.head())

# model Selection: Logistic Regression is the best overall method, therefore we pick that!
# now we need to provide an estimate of its generalization performance. 
# to do so, we evaluate it against the test set portion we previously held out.
model = models[cv_df.index[0]]
# re-fit the best selected model on the whole training set
model.fit(X_train, y_train)
# evaluation
print("***** Evaluate Performance on Training Set *****")
evaluate(y_train, model.predict(X_train))
print("***** Evaluate Performance on Test Set *****")
evaluate(y_test, model.predict(X_test))

                                   0         1         2         3         4  \
RandomForestClassifier      0.892857  0.892857  0.857143  0.785714  0.785714   
GradientBoostingClassifier  0.785714  0.928571  0.857143  0.750000  0.750000   
MLP                         0.535714  0.464286  0.535714  0.607143  0.500000   

                                   5         6         7         8         9  \
RandomForestClassifier      0.925926  0.925926  0.925926  0.851852  0.888889   
GradientBoostingClassifier  0.962963  0.925926  0.888889  0.851852  0.777778   
MLP                         0.518519  0.555556  0.666667  0.592593  0.851852   

                              avg_cv    std_cv  
RandomForestClassifier      0.873280  0.048034  
GradientBoostingClassifier  0.847884  0.071076  
MLP                         0.582804  0.100146  
***** Evaluate Performance on Training Set *****
accuracy = 1.000
***** Evaluate Performance on Test Set *****
accuracy = 0.957
