In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, f1_score, log_loss, roc_curve, roc_auc_score, ConfusionMatrixDisplay

In [None]:
# Load the mushrooms dataset; the file uses ';' as its field separator
mushrooms = pd.read_csv("secondary_data.csv", delimiter=";")

In [None]:
# Label-encode the response and every non-numeric feature, while keeping
# originally-missing entries as NaN.
mask = mushrooms.isna()  # positions that were missing before encoding
le = LabelEncoder()
cols_to_encode = mushrooms.columns.drop(["cap-diameter", "stem-width", "stem-height"])

# The single encoder is refit on each column in turn, so after the loop it
# only holds the mapping of the last encoded column.
for column_name in cols_to_encode:
    mushrooms[column_name] = le.fit_transform(mushrooms[column_name])

# Put NaN back wherever the raw data had a missing value
mushrooms = mushrooms.mask(mask, np.nan)

In [None]:
# Remove NaN values from the dataset: drop six columns outright, then drop any
# row still missing "cap-surface", "gill-attachment" or "ring-type", and
# renumber the remaining rows from 0.
mushrooms = (
    mushrooms
    .drop(columns=["veil-type", "veil-color", "spore-print-color", "stem-root", "stem-surface", "gill-spacing"])
    .dropna(subset=["cap-surface", "gill-attachment", "ring-type"])
    .reset_index(drop=True)
)

In [None]:
# Separate the response (y) from the predictors (X)
y = mushrooms['class']
X = mushrooms.drop(columns=['class'])

In [None]:
# Square-root transform the three numeric features
for numeric_feature in ("stem-height", "stem-width", "cap-diameter"):
    X[numeric_feature] = np.sqrt(X[numeric_feature])

In [None]:
# Z-transform the numeric features
#
# The numeric columns are selected from X without mutating it, so this cell
# can be re-run on the same kernel without raising a KeyError.
# NOTE(review): the scaler is fit on every row before the train/test split,
# so test-set statistics influence the scaling. Consider fitting it on the
# training split only.
numeric_cols = ["cap-diameter", "stem-width", "stem-height"]
quantitative_vars = X[numeric_cols]

sc = StandardScaler()
x_scaled = sc.fit_transform(quantitative_vars)

# Reuse X's index so the concat below aligns row-for-row by construction
# rather than relying on X happening to carry a default RangeIndex.
quant_scaled = pd.DataFrame(data=x_scaled, columns=numeric_cols, index=X.index)
X_scaled = pd.concat([quant_scaled, X.drop(columns=numeric_cols)], axis=1)

# These encoded columns had NaN re-inserted earlier (presumably leaving them
# as floats); cast them back to integer codes now that those rows are gone.
nan_filtered_cols = ["cap-surface", "gill-attachment", "ring-type"]
X_scaled[nan_filtered_cols] = X_scaled[nan_filtered_cols].astype(int)

In [None]:
# Hold out 20% of the rows as a test set (fixed seed for a repeatable split)
split_parts = train_test_split(X_scaled, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Running Logistic Regression model

# Initialize logistic regression model (seeded so the liblinear/saga solvers
# give repeatable fits)
model_lr = LogisticRegression(random_state=42)

# param grid for Logistic Regression
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 300, 500]
}

# 5-fold cross-validated grid search on the training split
lr_grid = GridSearchCV(model_lr, param_grid, cv=5)
lr_grid.fit(X_train, y_train)
print(lr_grid.best_params_)

# Predict on the test data using the best model
lr_grid_predictions = lr_grid.predict(X_test)

# Calculate model performance metrics
lr_acc = accuracy_score(y_test, lr_grid_predictions)
lr_prec = precision_score(y_test, lr_grid_predictions)
lr_f1 = f1_score(y_test, lr_grid_predictions)
# Log loss is defined on predicted probabilities, not on hard class labels
lr_logloss = log_loss(y_test, lr_grid.predict_proba(X_test))

# Print the performance metrics
print("Accuracy:", round(lr_acc, ndigits = 3))
print("Precision:", round(lr_prec, ndigits = 3))
print("F1 Score:", round(lr_f1, ndigits = 3))
print("Log Loss:", round(lr_logloss, ndigits = 3))

In [None]:
# Running Linear Discriminant Analysis model

# Initialize LDA model
model_lda = LinearDiscriminantAnalysis()

# param grid for LDA
param_grid = {
    'solver': ['lsqr', 'eigen'],
    'shrinkage': [None, 'auto'] + list(np.linspace(0, 1, 50)),
    'n_components': [None, 1],
    'store_covariance': [True, False]
}

# 5-fold cross-validated grid search on the training split
lda_grid = GridSearchCV(model_lda, param_grid, cv=5)
lda_grid.fit(X_train, y_train)
print(lda_grid.best_params_)

# Predict on the test data using the best model
lda_grid_predictions = lda_grid.predict(X_test)

print('--------------------------------------------------')

# Calculate model performance metrics
lda_acc = accuracy_score(y_test, lda_grid_predictions)
lda_prec = precision_score(y_test, lda_grid_predictions)
lda_f1 = f1_score(y_test, lda_grid_predictions)
# Log loss is defined on predicted probabilities, not on hard class labels
lda_logloss = log_loss(y_test, lda_grid.predict_proba(X_test))

# Print the performance metrics
print("Accuracy:", round(lda_acc, ndigits = 3))
print("Precision:", round(lda_prec, ndigits = 3))
print("F1 Score:", round(lda_f1, ndigits = 3))
print("Log Loss:", round(lda_logloss, ndigits = 3))

In [None]:
# Running k-Nearest Neighbours model

# Initialize kNN model
model_knn = KNeighborsClassifier()

# param_grid for k-Nearest Neighbours
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2]
}

# 5-fold cross-validated grid search on the training split
knn_grid = GridSearchCV(model_knn, param_grid, cv=5)
knn_grid.fit(X_train, y_train)
print(knn_grid.best_params_)

# Predict on the test data using the best model
knn_grid_predictions = knn_grid.predict(X_test)

print('--------------------------------------------------')

# Calculate model performance metrics
knn_acc = accuracy_score(y_test, knn_grid_predictions)
knn_prec = precision_score(y_test, knn_grid_predictions)
knn_f1 = f1_score(y_test, knn_grid_predictions)
# Log loss is defined on predicted probabilities, not on hard class labels
knn_logloss = log_loss(y_test, knn_grid.predict_proba(X_test))

# Print the performance metrics
print("Accuracy:", round(knn_acc, ndigits = 3))
print("Precision:", round(knn_prec, ndigits = 3))
print("F1 Score:", round(knn_f1, ndigits = 3))
print("Log Loss:", round(knn_logloss, ndigits = 3))

In [None]:
# Running Random Forest model

# Initialize random forest classifier (seeded so the bootstrap samples and
# feature sub-sampling, and hence the reported results, are reproducible)
model_rf = RandomForestClassifier(random_state=42)

# param_grid for random forest classifier
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [1, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# 5-fold cross-validated grid search on the training split
rf_grid = GridSearchCV(model_rf, param_grid, cv=5)
rf_grid.fit(X_train, y_train)
print(rf_grid.best_params_)

# Predict on the test data using the best model
rf_grid_predictions = rf_grid.predict(X_test)

print('--------------------------------------------------')

# Calculate model performance metrics
rf_acc = accuracy_score(y_test, rf_grid_predictions)
rf_prec = precision_score(y_test, rf_grid_predictions)
rf_f1 = f1_score(y_test, rf_grid_predictions)
# Log loss is defined on predicted probabilities, not on hard class labels
rf_logloss = log_loss(y_test, rf_grid.predict_proba(X_test))

# Print the performance metrics
print("Accuracy:", round(rf_acc, ndigits = 3))
print("Precision:", round(rf_prec, ndigits = 3))
print("F1 Score:", round(rf_f1, ndigits = 3))
print("Log Loss:", round(rf_logloss, ndigits = 3))

In [None]:
# Running Boosting model

# Initialize boosting classifier (seeded so repeated runs give the same fit)
gbc_model = GradientBoostingClassifier(random_state=42)

# param_grid for boosting classifier
param_grid = {
    'loss': ['log_loss', 'exponential'],
    'learning_rate': [0.01, 0.5, 1],
    'max_depth': [1, 3, 5],
}

# 5-fold cross-validated grid search on the training split
gbc_grid = GridSearchCV(gbc_model, param_grid, cv=5)
gbc_grid.fit(X_train, y_train)
print(gbc_grid.best_params_)

# Predict on the test data using the best model
gbc_grid_predictions = gbc_grid.predict(X_test)

print('--------------------------------------------------')

# Calculate model performance metrics
gbc_acc = accuracy_score(y_test, gbc_grid_predictions)
gbc_prec = precision_score(y_test, gbc_grid_predictions)
gbc_f1 = f1_score(y_test, gbc_grid_predictions)
# Log loss is defined on predicted probabilities, not on hard class labels
gbc_logloss = log_loss(y_test, gbc_grid.predict_proba(X_test))

# Print the performance metrics
print("Accuracy:", round(gbc_acc, ndigits = 3))
print("Precision:", round(gbc_prec, ndigits = 3))
print("F1 Score:", round(gbc_f1, ndigits = 3))
print("Log Loss:", round(gbc_logloss, ndigits = 3))

In [None]:
# Confusion Matrices: one panel per tuned model on a 3x2 grid

fig, axes = plt.subplots(3, 2, figsize=(15, 20))

# Confusion matrix of each model's test-set predictions
cm_lr = confusion_matrix(y_test, lr_grid_predictions)
cm_lda = confusion_matrix(y_test, lda_grid_predictions)
cm_knn = confusion_matrix(y_test, knn_grid_predictions)
cm_rf = confusion_matrix(y_test, rf_grid_predictions)
cm_gbc = confusion_matrix(y_test, gbc_grid_predictions)

# Display wrappers for the matrices
disp_lr = ConfusionMatrixDisplay(confusion_matrix=cm_lr)
disp_lda = ConfusionMatrixDisplay(confusion_matrix=cm_lda)
disp_knn = ConfusionMatrixDisplay(confusion_matrix=cm_knn)
disp_rf = ConfusionMatrixDisplay(confusion_matrix=cm_rf)
disp_gbc = ConfusionMatrixDisplay(confusion_matrix=cm_gbc)

# (display, target axes, title) for each panel, in drawing order
cm_panels = [
    (disp_lr, axes[0, 0], 'Confusion Matrix - Logistic Regression'),
    (disp_lda, axes[0, 1], 'Confusion Matrix - Linear Discriminant Analysis'),
    (disp_knn, axes[1, 0], 'Confusion Matrix - K Nearest Neighbours'),
    (disp_rf, axes[1, 1], 'Confusion Matrix - Random Forest'),
    (disp_gbc, axes[2, 0], 'Confusion Matrix - Boosting'),
]
for cm_display, panel_ax, panel_title in cm_panels:
    cm_display.plot(ax=panel_ax)
    panel_ax.set_title(panel_title, fontsize=16)

# The sixth grid cell has no model; hide it
axes[2, 1].axis('off')
plt.tight_layout()
plt.show()

In [None]:
# ROC curves

# Predicted probability of class 1 on the test set, per model
lr_grid_probabilities = lr_grid.predict_proba(X_test)[:, 1]
lda_grid_probabilities = lda_grid.predict_proba(X_test)[:, 1]
knn_grid_probabilities = knn_grid.predict_proba(X_test)[:, 1]
rf_grid_probabilities = rf_grid.predict_proba(X_test)[:, 1]
gbc_grid_probabilities = gbc_grid.predict_proba(X_test)[:, 1]

# ROC curve points per model
fpr_lr, tpr_lr, thresholds_lr = roc_curve(y_test, lr_grid_probabilities)
fpr_lda, tpr_lda, thresholds_lda = roc_curve(y_test, lda_grid_probabilities)
fpr_knn, tpr_knn, thresholds_knn = roc_curve(y_test, knn_grid_probabilities)
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, rf_grid_probabilities)
fpr_gbc, tpr_gbc, thresholds_gbc = roc_curve(y_test, gbc_grid_probabilities)

# Area under each ROC curve
roc_auc_lr = roc_auc_score(y_test, lr_grid_probabilities)
roc_auc_lda = roc_auc_score(y_test, lda_grid_probabilities)
roc_auc_knn = roc_auc_score(y_test, knn_grid_probabilities)
roc_auc_rf = roc_auc_score(y_test, rf_grid_probabilities)
roc_auc_gbc = roc_auc_score(y_test, gbc_grid_probabilities)

plt.figure(figsize=(14, 18))

# (false positive rates, true positive rates, AUC, title) for each panel,
# in drawing order
roc_panels = [
    (fpr_lr, tpr_lr, roc_auc_lr, 'ROC Curve - Logistic Regression'),
    (fpr_lda, tpr_lda, roc_auc_lda, 'ROC Curve - Linear Discriminant Analysis'),
    (fpr_knn, tpr_knn, roc_auc_knn, 'ROC Curve - K Nearest Neighbours'),
    (fpr_rf, tpr_rf, roc_auc_rf, 'ROC Curve - Random Forest'),
    (fpr_gbc, tpr_gbc, roc_auc_gbc, 'ROC Curve - Gradient Boost'),
]

for panel_number, (panel_fpr, panel_tpr, panel_auc, panel_title) in enumerate(roc_panels, start=1):
    plt.subplot(3, 2, panel_number)
    plt.plot(panel_fpr, panel_tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {panel_auc:.2f})')
    # Diagonal reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(panel_title, fontsize=16)
    plt.legend(loc='lower right')

# Adjust layout
plt.tight_layout()
plt.show()




In [None]:
# Feature importance in the random forest and boosting models

# Importances from the best random forest estimator, sorted largest first
feature_importances_rf = rf_grid.best_estimator_.feature_importances_
feature_importance_df_rf = (
    pd.DataFrame({'Feature': X_scaled.columns, 'Importance': feature_importances_rf})
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)

# Importances from the best boosting estimator, sorted largest first
feature_importances_gbc = gbc_grid.best_estimator_.feature_importances_
feature_importance_df_gbc = (
    pd.DataFrame({'Feature': X_scaled.columns, 'Importance': feature_importances_gbc})
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)

fig, ax = plt.subplots(1, 2, figsize=(16, 8))

importance_panels = [
    (ax[0], feature_importance_df_rf, 'Feature Importance in Random Forest Model'),
    (ax[1], feature_importance_df_gbc, 'Feature Importance in Boosting Model'),
]
for panel_ax, importance_df, panel_title in importance_panels:
    panel_ax.barh(importance_df['Feature'], importance_df['Importance'])
    panel_ax.set_xlabel('Importance')
    panel_ax.set_ylabel(None)
    panel_ax.set_title(panel_title, fontsize=16)
    # Flip the y-axis so the first (largest) row is drawn at the top
    panel_ax.invert_yaxis()

plt.tight_layout()
plt.show()
