In [20]:
!pip install ucimlrepo
!pip install category_encoders
!pip install xgboost



In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.datasets import make_regression
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint, uniform
from ucimlrepo import fetch_ucirepo
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
import category_encoders as ce
import xgboost as xgb
import time


import numpy as np
import pandas as pd

In [22]:
mushroom = fetch_ucirepo(id=73)

X = mushroom.data.features
y = mushroom.data.targets

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
column_names = ["class", "cap-shape", "cap-surface", "cap-color", "bruises", "odor", "gill-attachment", "gill-spacing", "gill-size", "gill-color", "stalk-shape", "stalk-root", "stalk-surface-above-ring", "stalk-surface-below-ring", "stalk-color-above-ring", "stalk-color-below-ring", "veil-type", "veil-color", "ring-number", "ring-type", "spore-print-color", "population", "habitat"]
df = pd.read_csv(url, header=None, names=column_names)

print(df)

     class cap-shape cap-surface cap-color bruises odor gill-attachment  \
0        p         x           s         n       t    p               f   
1        e         x           s         y       t    a               f   
2        e         b           s         w       t    l               f   
3        p         x           y         w       t    p               f   
4        e         x           s         g       f    n               f   
...    ...       ...         ...       ...     ...  ...             ...   
8119     e         k           s         n       f    n               a   
8120     e         x           s         n       f    n               a   
8121     e         f           s         n       f    n               a   
8122     p         k           y         n       f    y               f   
8123     e         x           s         n       f    n               a   

     gill-spacing gill-size gill-color  ... stalk-surface-below-ring  \
0               c         n

In [23]:
#NAIVE BAYES CLASSIFER METHOD
y = df['class'].apply(lambda x: 1 if x == 'p' else 0)

X = df.drop('class', axis=1)

encoder = ce.OrdinalEncoder(cols=X.columns)
X_encoded = encoder.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

clf = CategoricalNB()

start_time = time.time()
clf.fit(X_train, y_train)
end_time = time.time()

execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

conf_matrix = confusion_matrix(y_test, y_pred)

# precision, recall, and F1-score
precision = conf_matrix[1, 1] / (conf_matrix[0, 1] + conf_matrix[1, 1])
recall = conf_matrix[1, 1] / (conf_matrix[1, 0] + conf_matrix[1, 1])
f1_score = 2 * (precision * recall) / (precision + recall)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1_score:.2f}")

print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.95
Precision: 0.99
Recall: 0.91
F1-Score: 0.95
Confusion Matrix:
[[837   6]
 [ 74 708]]


In [24]:
#RANDOMFOREST CLASSIFER METHOD
#THIS CODE MAY TAKE A MINUTE
X_encoded = pd.get_dummies(df.drop('class', axis=1))
y = df['class'].apply(lambda x: 1 if x == 'p' else 0)

X_train2, X_test2, y_train2, y_test2 = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(random_state=42)
param_distributions = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 10),
    'min_samples_split': randint(2, 15),
    'min_samples_leaf': randint(1, 10)
}

search = RandomizedSearchCV(clf, param_distributions, n_iter=50, cv=3, random_state=42)

start_time = time.time()
search.fit(X_train2, y_train2)
end_time = time.time()

execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

best_model = search.best_estimator_
y_pred2 = best_model.predict(X_test2)
accuracy = accuracy_score(y_test2, y_pred2)
print(f"Test set accuracy: {accuracy:.2f}")

conf_matrix2 = confusion_matrix(y_test2, y_pred2)

# precision, recall, and F1-score
precision = conf_matrix2[1, 1] / (conf_matrix2[0, 1] + conf_matrix2[1, 1])
recall = conf_matrix2[1, 1] / (conf_matrix2[1, 0] + conf_matrix2[1, 1])
f1_score = 2 * (precision * recall) / (precision + recall)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1_score:.2f}")

print("Confusion Matrix:")
print(conf_matrix2)

importances = best_model.feature_importances_
feature_importances = pd.Series(importances, index=X_encoded.columns).sort_values(ascending=False)
print(feature_importances.head(10))


Execution time: 56.5552818775177 seconds
Test set accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-Score: 1.00
Confusion Matrix:
[[843   0]
 [  0 782]]
odor_n                        0.144967
odor_f                        0.083856
gill-size_n                   0.081690
gill-size_b                   0.057996
stalk-surface-below-ring_k    0.048769
stalk-surface-above-ring_k    0.038443
spore-print-color_h           0.038109
bruises_f                     0.034472
gill-color_b                  0.031948
ring-type_p                   0.027848
dtype: float64


In [25]:
#XGBOOST CLASSIFIER METHOD
#THIS MAY TAKE A FEW MINS

y = df['class'].apply(lambda x: 1 if x == 'p' else 0)

X = df.drop('class', axis=1)

encoder = ce.OneHotEncoder(cols=X.columns)
X_encoded = encoder.fit_transform(X)

X_train3, X_test3, y_train3, y_test3 = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

param_distributions = {
    'n_estimators': randint(50, 300),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
    'min_child_weight': randint(1, 10)
}

random_search = RandomizedSearchCV(xgb_clf, param_distributions, n_iter=50, scoring='accuracy', cv=5, random_state=42)

start_time = time.time()
random_search.fit(X_train3, y_train3)
end_time = time.time()

execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

best_hyperparams = random_search.best_params_
print("Best hyperparameters:")
print(best_hyperparams)

best_model = random_search.best_estimator_
y_pred3 = best_model.predict(X_test3)

accuracy = accuracy_score(y_test3, y_pred3)
print(f"Test set accuracy: {accuracy:.2f}")

conf_matrix3 = confusion_matrix(y_test3, y_pred3)

# precision, recall, and F1-score
precision = conf_matrix3[1, 1] / (conf_matrix3[0, 1] + conf_matrix3[1, 1])
recall = conf_matrix3[1, 1] / (conf_matrix3[1, 0] + conf_matrix3[1, 1])
f1_score = 2 * (precision * recall) / (precision + recall)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1_score:.2f}")

print("Confusion Matrix:")
print(conf_matrix3)

Execution time: 111.79937100410461 seconds
Best hyperparameters:
{'learning_rate': 0.1890550473839461, 'max_depth': 4, 'min_child_weight': 3, 'n_estimators': 264}
Test set accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-Score: 1.00
Confusion Matrix:
[[843   0]
 [  0 782]]
