In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset
# Read the mushroom CSV (path is relative to the working directory) into the
# DataFrame that every later section operates on.
csv_path = './mushroom-dataset/mushrooms.csv'
df = pd.read_csv(csv_path)

# Format attribute values
# ... (code to format attribute values)

# Graph dataset
# Class balance: one bar per distinct value of the 'class' column.
plt.figure(figsize=(15, 10))
sns.countplot(data=df, x='class')
plt.title('Distribution of Edible vs Poisonous Mushrooms', fontsize=16)
plt.show()

# Pairwise relationships, coloured by class.
# pairplot is a figure-level seaborn function: it creates and manages its own
# figure, so no plt.figure() call precedes it (that would only leave an extra,
# empty figure to be shown alongside the grid).
# NOTE(review): two markers are supplied, which assumes 'class' has exactly
# two levels, and pairplot only draws numeric columns -- confirm that the
# attribute-formatting step leaves some numeric features to plot.
sns.pairplot(df, hue='class', markers=['o', 's'], palette='Set2')
plt.show()

# Feature Correlation
# Correlation is only defined for numeric data. Selecting the numeric columns
# explicitly keeps this working on pandas >= 2.0, where DataFrame.corr() no
# longer drops string columns silently and raises instead.
corr_matrix = df.select_dtypes(include='number').corr()

if corr_matrix.empty:
    # Nothing numeric to correlate; an empty matrix would make heatmap fail.
    print('No numeric columns available for a correlation heatmap.')
else:
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
    plt.title('Feature Correlation Heatmap', fontsize=16)
    plt.show()

# Missing Value Handling
# Per-column tally of cells equal to the '?' placeholder; the values are only
# counted and reported here, not replaced.
missing_values = df.eq('?').sum()
print(f'Total missing values denoted as "?": {missing_values.sum()}')

# Feature Engineering
# Combine cap shape and cap colour into one "<shape>_<color>" string feature.
shape_prefix = df['cap-shape'] + '_'
df['cap_appearance'] = shape_prefix + df['cap-color']

# Train-Test Split
# The target is separated first so that no transformer below ever sees the
# label column.
y = df['class']

# One-hot encode the non-numeric features (this includes the engineered
# 'cap_appearance' strings): PCA and the scikit-learn estimators require
# numeric input. Columns that are already numeric are passed through as-is.
X = pd.get_dummies(df.drop('class', axis=1), dtype=float)

# Split before fitting any transformer so that statistics learned during
# preprocessing come from the training rows only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Explicit copies, so the column assignments below write to independent frames.
X_train = X_train.copy()
X_test = X_test.copy()

# Data Normalization
numeric_cols = ['gill-spacing', 'gill-size', 'ring-number']
# Only columns that survived encoding under their original name are numeric;
# any of these that held strings were expanded into dummy columns above.
# NOTE(review): whether these three columns are numeric depends on the
# attribute-formatting step, which is not shown -- confirm.
scaled_cols = [col for col in numeric_cols if col in X_train.columns]
scaler = StandardScaler()
if scaled_cols:
    # Fit on the training rows, then reuse the same mean/scale for the test
    # rows. Unlike the earlier version, df itself is left unscaled.
    X_train[scaled_cols] = scaler.fit_transform(X_train[scaled_cols])
    X_test[scaled_cols] = scaler.transform(X_test[scaled_cols])

# Dimensionality Reduction
# Fitted on the encoded training features only (never the target). The
# projection is kept for inspection; the models below are trained on the
# full feature matrix. Use pca.transform(X_test) to project the test rows.
pca = PCA(n_components=5)
df_pca = pca.fit_transform(X_train)

# Model Selection and Evaluation
# random_state is pinned on the randomized estimators so results are
# repeatable, matching the seeded split. max_iter is raised above the
# LogisticRegression default to give the solver more room on the wide
# one-hot matrix.
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
]

# Label treated as the positive class by precision/recall/F1.
# NOTE(review): assumes the attribute-formatting step renames the class
# values so that 'edible' is one of them -- confirm.
positive_label = 'edible'

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, pos_label=positive_label)
    rec = recall_score(y_test, y_pred, pos_label=positive_label)
    f1 = f1_score(y_test, y_pred, pos_label=positive_label)
    print(f'{name} Results:')
    print(f'Accuracy: {acc:.3f}, Precision: {prec:.3f}, Recall: {rec:.3f}, F1-Score: {f1:.3f}')