<center><h1 style="font-size:48px"> 🍄 Mushroom Classification 🍄‍🟫 </h1>
<img src="https://i.giphy.com/media/v1.Y2lkPTc5MGI3NjExMDg3ODMzZmJtaWYzcWdmdGpjejh2cGVmcnhvODljcXFiNzZvam9vNSZlcD12MV9pbnRlcm5hbF9naWZfYnlfaWQmY3Q9Zw/yh3E9kaUHjKaXtF9v1/giphy.gif"></center>


# Feature Description
<div class="alert alert-block alert-info" style="font-family: verdana; font-size: 16px; line-height: 1.7;">
    <ol>
        <li><p><strong>Cap Diameter</strong>: The measurement of the cap across its widest point. It helps in identifying the mushroom’s size and can range from a few millimeters to several centimeters.</p>
        </li>
        <li><p><strong>Cap Shape</strong>: The overall shape of the cap, such as conical, bell-shaped, flat, or wavy. This characteristic helps in distinguishing different species.</p>
        </li>
        <li><p><strong>Cap Surface</strong>: The texture and appearance of the cap’s surface. It can be smooth, scaly, sticky, or wrinkled, providing clues about the mushroom’s identity.</p>
        </li>
        <li><p><strong>Cap Color</strong>: The color of the cap, which can vary widely and may change as the mushroom matures. Color can be a key factor in identifying species.</p>
        </li>
        <li><p><strong>Does Bruise or Bleed</strong>: Refers to whether the mushroom changes color when bruised or if it releases a colored liquid. This reaction can be important for identification.</p>
        </li>
        <li><p><strong>Gill Attachment</strong>: How the gills are attached to the stem. They can be free (not touching the stem), attached (joined to the stem), or decurrent (running down the stem).</p>
        </li>
        <li><p><strong>Gill Spacing</strong>: The distance between adjacent gills. Gills can be crowded (close together), distant (widely spaced), or intermediate.</p>
        </li>
        <li><p><strong>Gill Color</strong>: The color of the gills, which can help in distinguishing species and may change with age.</p>
        </li>
        <li><p><strong>Stem Height</strong>: The length of the stem from the ground to where the cap is attached. Variations in stem height can aid in identification.</p>
        </li>
        <li><p><strong>Stem Width</strong>: The diameter of the stem. It can be narrow, medium, or wide, and varies among species.</p>
        </li>
        <li><p><strong>Stem Root</strong>: The base of the stem, which can be swollen, bulbous, or tapering. Observing the stem’s base can help identify certain mushrooms.</p>
        </li>
        <li><p><strong>Stem Surface</strong>: The texture and appearance of the stem’s surface. It can be smooth, fibrous, scaly, or rough.</p>
        </li>
        <li><p><strong>Stem Color</strong>: The color of the stem, which might be uniform or vary along its length.</p>
        </li>
        <li><p><strong>Veil Type</strong>: Refers to the type of veil present on the mushroom, such as a partial veil (covering the gills and often forming a ring) or a universal veil (enveloping the entire mushroom in its early stages).</p>
        </li>
        <li><p><strong>Veil Color</strong>: The color of the veil. It can be a key characteristic for identifying mushrooms, especially in distinguishing between species with similar appearances.</p>
        </li>
        <li><p><strong>Has Ring</strong>: Indicates whether the mushroom has a ring (also called an annulus) around the stem, which is a remnant of the partial veil.</p>
        </li>
        <li><p><strong>Ring Type</strong>: Describes the type of ring present, such as a single ring, double ring, or a ring that is flaring or hanging.</p>
        </li>
        <li><p><strong>Spore Print Color</strong>: The color of the spore print obtained by placing the cap on a white sheet of paper. This is a critical identifying feature.</p>
        </li>
        <li><p><strong>Habitat</strong>: The environment where the mushroom is found, such as woodlands, grasslands, or urban areas. The habitat can help narrow down the possible species.</p>
        </li>
        <li><p><strong>Season</strong>: The time of year when the mushroom appears. Different species fruit in different seasons, which helps in identification.</p>
        </li>
    </ol>
</div>

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score, cross_validate
from sklearn.metrics import matthews_corrcoef, f1_score, confusion_matrix, ConfusionMatrixDisplay, make_scorer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.stats import mode

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# Loading Data

In [2]:
# Competition training split, indexed by its `id` column.
train_df = pd.read_csv(
    "/kaggle/input/playground-series-s4e8/train.csv",
    index_col="id",
)
# Original secondary-mushroom dataset: semicolon-delimited, read without an index column.
orig_df = pd.read_csv(
    "/kaggle/input/secondary-mushroom-dataset-data-set/MushroomDataset/secondary_data.csv",
    sep=";",
)
# Competition test split, indexed the same way as the training split.
test_df = pd.read_csv(
    "/kaggle/input/playground-series-s4e8/test.csv",
    index_col="id",
)

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/playground-series-s4e8/train.csv'

In [None]:
train_df = pd.concat([train_df, orig_df], ignore_index=True)

In [None]:
train_df.head()

In [None]:
test_df.head()

# EDA

In [None]:
train_df.info()

In [None]:
# Per-column missing-value summary, most incomplete columns first.
null_counts = train_df.isna().sum().values
null_summary = pd.DataFrame({
    'column': train_df.columns,
    'null-count': null_counts,
    '% null-count': np.round(null_counts*100/len(train_df), 6),
})
null_summary.sort_values(by='null-count', ascending=False).reset_index(drop=True)

In [None]:
train_df.describe().T

In [None]:
train_df.describe(include='O').T

In [None]:
train_df.duplicated().sum()

In [None]:
train_df.drop_duplicates(inplace=True)

In [None]:
target = 'class'

In [None]:
# Every column except the target is treated as a model input.
features = train_df.columns.drop(target).to_list()
features

In [None]:
# Features for which more than 20% of the training rows are missing.
null_totals = train_df.isna().sum()
n_rows = len(train_df)
features_with_high_null_values = [
    name for name in features
    if null_totals[name] / n_rows * 100 > 20
]
features_with_high_null_values

In [None]:
# Categorical inputs: the feature columns that pandas stores with object dtype.
categorical_features = train_df[features].select_dtypes(include='object').columns.to_list()
categorical_features

In [None]:
# Numerical inputs are the features that are not categorical.
# Built by filtering `features` (rather than a set difference) so the order
# follows the DataFrame's column order and is identical on every run; a set of
# strings iterates in an order that can change between interpreter sessions,
# which would shuffle the subplot order below.
_categorical_lookup = set(categorical_features)
numerical_features = [name for name in features if name not in _categorical_lookup]
numerical_features

In [None]:
# Summarise the categorical columns over train and test together; only the
# categorical columns are stacked instead of the complete frames.
pd.concat(
    [train_df[categorical_features], test_df[categorical_features]]
).describe(include='O').T

## Distribution of Categorical Features

In [None]:
# One count plot per categorical feature, split by the target class.
for col in categorical_features:
    # Drop categories that occur fewer than 100 times before plotting.
    level_counts = train_df[col].value_counts()
    frequent_levels = level_counts[level_counts >= 100].index
    frequent_rows = train_df[train_df[col].isin(frequent_levels)]

    plt.figure(figsize=(8, 6))
    sns.countplot(data=frequent_rows, x=col, hue=target)
    plt.title(f"Count Plot of {col}", size=20)
    plt.show()

In [None]:
# Histogram (with KDE) of every numerical feature, split by the target class.
# The grid is sized from the number of numerical features instead of a fixed
# three rows, so the cell keeps working if that number changes.
n_numerical = len(numerical_features)
plt.figure(figsize=(8, 5 * max(n_numerical, 1)))
for position, col in enumerate(numerical_features, start=1):
    plt.subplot(n_numerical, 1, position)
    sns.histplot(x=col, hue=target, data=train_df, kde=True, bins=20)
    plt.title(f"Histogram of {col}", size=20)
    sns.despine()
plt.tight_layout()
plt.suptitle("Distribution of Numerical Features", y=1.05, size=28)
plt.show()

In [None]:
# Horizontal box plot of every numerical feature, one box per target class.
# The grid is sized from the number of numerical features instead of a fixed
# three rows, so the cell keeps working if that number changes.
n_numerical = len(numerical_features)
plt.figure(figsize=(8, 5 * max(n_numerical, 1)))
for position, col in enumerate(numerical_features, start=1):
    plt.subplot(n_numerical, 1, position)
    sns.boxplot(x=col, y=target, hue=target, data=train_df)
    plt.title(f"Boxplot of {col}", size=20)
plt.tight_layout()
plt.suptitle("Boxplot of Numerical Features", y=1.05, size=28)
plt.show()

In [None]:
# Violin plot of every numerical feature, one violin per target class.
# The grid is sized from the number of numerical features instead of a fixed
# three rows, so the cell keeps working if that number changes.
n_numerical = len(numerical_features)
plt.figure(figsize=(8, 5 * max(n_numerical, 1)))
for position, col in enumerate(numerical_features, start=1):
    plt.subplot(n_numerical, 1, position)
    sns.violinplot(x=target, y=col, hue=target, data=train_df)
    plt.title(f"Violin Plot of {col}", size=20)
plt.tight_layout()
plt.suptitle("Violin Plots of Numerical Features", y=1.05, size=28)
plt.show()

## Target Distribution

In [None]:
# Class balance of the target column.
plt.figure(figsize=(10, 6))
sns.countplot(x=target, data=train_df)
plt.title("Target Distribution", size=28)
# Render the figure; `plt.plot()` with no arguments only adds an empty line
# artist and echoes `[]` as the cell output.
plt.show()

# Imputing Null Values

In [None]:
def cleaner(df, columns=None, min_count=100):
    """Normalise categorical columns of ``df`` in place and return ``df``.

    For every selected column:

    1. missing values are replaced by the literal ``'missing'``;
    2. values that occur fewer than ``min_count`` times in that column of
       ``df`` are replaced by the literal ``"noise"``;
    3. the column is converted to pandas' ``category`` dtype.

    Counts are taken from ``df`` alone, so two frames cleaned separately can
    end up with different category sets.

    Args:
        df: DataFrame to clean; its columns are overwritten.
        columns: Column names to process. Defaults to the module-level
            ``categorical_features`` list.
        min_count: Minimum number of occurrences a value needs to be kept.

    Returns:
        The same ``df`` object that was passed in.
    """
    if columns is None:
        columns = categorical_features

    for col in columns:
        # Work on an object-dtype copy: filling or assigning a label that is
        # not yet a known category fails on a categorical column, which made
        # re-running this function on already cleaned data raise.
        values = df[col].astype(object).fillna('missing')
        occurrences = values.map(values.value_counts())
        df[col] = values.mask(occurrences < min_count, "noise").astype('category')

    return df

In [None]:
# Clean each split on its own; the rare-value counts are therefore computed
# per frame, not jointly over train and test.
train_df, test_df = cleaner(train_df), cleaner(test_df)

In [None]:
# Fill missing cap diameters with the mean taken over train and test together.
cap_diameter_mean = pd.concat([train_df['cap-diameter'], test_df['cap-diameter']]).mean()
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that form is chained assignment, which pandas warns about
# and which leaves the DataFrame unchanged once Copy-on-Write is active.
train_df['cap-diameter'] = train_df['cap-diameter'].fillna(cap_diameter_mean)
test_df['cap-diameter'] = test_df['cap-diameter'].fillna(cap_diameter_mean)

# Model Training

In [None]:
# Model inputs: a copy of the training frame without the target column.
X = train_df.drop(columns=target)

# Encode the class labels as integers; the fitted encoder is kept so that
# predictions can be decoded back to the original labels later.
lab_enc = LabelEncoder()
y = lab_enc.fit_transform(train_df[target])

In [None]:
def model_report(estimator, X, y, cv=5):
    """Fit ``estimator`` on a stratified sample and print a quick evaluation.

    The data is split once with ``train_size=1/cv``: the estimator is fitted
    on a ``1/cv`` fraction of the rows and evaluated on the remaining rows.
    The F1 score and Matthews correlation coefficient are printed and the
    confusion matrix is plotted.

    NOTE(review): with the default ``cv=5`` this trains on 20% and evaluates
    on 80% of the data; confirm that this (fast baseline) split is intended
    rather than ``test_size=1/cv``.

    Args:
        estimator: Classifier exposing ``fit`` and ``predict``; it is fitted
            in place.
        X: Feature matrix.
        y: Binary target aligned with ``X`` (``f1_score`` is called with its
            default settings).
        cv: Divisor that determines the training fraction ``1/cv``.

    Returns:
        None. Results are printed and plotted.
    """
    banner = "=" * 80
    print(banner)
    print(f"    Model: {estimator.__class__.__name__}")
    print(banner)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=1/cv, shuffle=True, stratify=y, random_state=42
    )

    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)

    # Both metrics are single scalars, so they are formatted directly; calling
    # `.mean()` on them added nothing and fails when a plain float is returned.
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    print(f"F1 Score : {f1:.6f}")
    print(f"MCC Score: {mcc:.6f}")

    ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred)).plot()
    plt.title("Confusion Matrix")
    plt.show()

    print()

In [None]:
def model_trainer(model, X, y, n_splits=5, random_state=42, X_pred=None):
    """Cross-validate ``model`` and collect per-fold predictions for new data.

    The same estimator instance is refitted on every stratified fold, scored
    with the Matthews correlation coefficient on the held-out rows, and then
    asked for class probabilities on ``X_pred``.

    Args:
        model: Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
        X: Feature DataFrame; rows are selected positionally via ``iloc``.
        y: Array-like of encoded labels aligned with the rows of ``X``.
        n_splits: Number of stratified folds.
        random_state: Seed used to shuffle the folds.
        X_pred: Data to score after each fold. Defaults to the module-level
            ``test_df``, which is what the function always used before this
            parameter existed.

    Returns:
        tuple: ``(oof_probs, oof_mccs)`` where ``oof_probs`` is a list with
        one ``predict_proba(X_pred)`` result per fold (despite the name these
        are predictions for ``X_pred``, not out-of-fold predictions) and
        ``oof_mccs`` is the list of per-fold validation MCC scores.
    """
    if X_pred is None:
        X_pred = test_df
    # Positional indexing below needs an array; a pandas Series would be
    # indexed by label instead.
    y = np.asarray(y)

    skfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    oof_probs, oof_mccs = [], []
    print("="*80)
    print(f"Training {model.__class__.__name__}")
    print("="*80, end="\n")
    for fold, (train_idx, test_idx) in enumerate(skfold.split(X, y)):
        X_train, y_train = X.iloc[train_idx, :], y[train_idx]
        X_test, y_test = X.iloc[test_idx, :], y[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # (y_true, y_pred) order, matching model_report; MCC is symmetric so
        # the score itself is unchanged.
        mcc = matthews_corrcoef(y_test, y_pred)
        oof_mccs.append(mcc)
        oof_probs.append(model.predict_proba(X_pred))
        print(f"--- Fold {fold+1} MCC Score: {mcc:.6f}")
    print(f"\n---> Mean MCC Score: {np.mean(oof_mccs):.6f} \xb1 {np.std(oof_mccs):.6f}\n\n")
    return oof_probs, oof_mccs

## Baseline Models

In [None]:
# Untuned XGBoost baseline: only the categorical-support and device settings are given.
xgb_baseline_kwargs = {
    "tree_method": "hist",
    "device": "cuda",
    "enable_categorical": True,
}
xgb_clf = XGBClassifier(**xgb_baseline_kwargs)

model_report(xgb_clf, X, y)

In [None]:
# Untuned CatBoost baseline; the categorical columns are passed by name.
cat_baseline_kwargs = {
    "task_type": "GPU",
    "allow_writing_files": False,
    "verbose": False,
    "cat_features": categorical_features,
}
cat_clf = CatBoostClassifier(**cat_baseline_kwargs)

model_report(cat_clf, X, y)

In [None]:
# Untuned LightGBM baseline with logging silenced.
lgb_baseline_kwargs = {"verbosity": -1, "device": "gpu"}
lgb_clf = LGBMClassifier(**lgb_baseline_kwargs)

model_report(lgb_clf, X, y)

## Hyperparameter Tuning

In [None]:
# import optuna
# from optuna.samplers import TPESampler

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, shuffle=True, stratify=y, random_state=101)

In [None]:
# def objective(trial):
#     params = {
#         "n_estimators": trial.suggest_int("n_estimators", 2000, 3000),
#         "eta": trial.suggest_float("eta", 1e-3, 1e-2),
#         "gamma": trial.suggest_float("gamma", 0, 5.0),
#         "max_depth": trial.suggest_int("max_depth", 2, 32),
#         "min_child_weight": trial.suggest_int("min_child_weight", 40, 100),
#         "subsample": trial.suggest_float("subsample", 0.1, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
#         "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
#         "max_leaves": trial.suggest_int("max_leaves", 16, 84)
#     }
#     params['device'] = 'cuda'
#     params['tree_method'] = 'hist'
#     params['enable_categorical'] = True
    
#     model = XGBClassifier(**params)
#     model.fit(X_train, y_train)
    
#     y_pred = model.predict(X_test)
#     return matthews_corrcoef(y_test, y_pred)


# study_name = "xgb"
# storage = "sqlite:///xgb.db"

# study = optuna.create_study(storage=storage,
#                             study_name=study_name,
#                             direction="maximize",
# #                             sampler=TPESampler(),
#                             load_if_exists=True)

# study.optimize(objective, n_trials=100)

# print(study.best_params)


In [None]:
# def objective(trial):
#     params = {
#         "iterations": trial.suggest_int("iterations", 100, 3000),
#         "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
#         "depth": trial.suggest_int("depth", 4, 10),
#         "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
#         "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
#         "random_strength": trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
#         "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 10.0),
#         "od_type": trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
#         "od_wait": trial.suggest_int("od_wait", 10, 50),
#         "verbose": False,
#         "allow_writing_files": False,
#         "task_type": 'GPU',
#         "cat_features": categorical_features
#     }

#     model = CatBoostClassifier(**params)

#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     return matthews_corrcoef(y_test, y_pred)


# study_name = "cat"
# storage = "sqlite:///cat.db"

# study = optuna.create_study(storage=storage,
#                             study_name=study_name,
#                             direction="maximize",
#                             sampler=TPESampler(n_startup_trials=20, multivariate=True),
#                             load_if_exists=True)

# study.optimize(objective, n_trials=50)

# print(study.best_params)

In [None]:
# def objective(trial):
#     params = {
#         "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
#         "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
#         "num_leaves": trial.suggest_int("num_leaves", 2, 256),
#         "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
#         "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
#         "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
#         "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
#         "device": 'gpu',
#         "verbosity": -1
#     }

#     model = LGBMClassifier(**params)

#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     return matthews_corrcoef(y_test, y_pred)


# study_name = "lgb"
# storage = "sqlite:///lgb.db"

# study = optuna.create_study(storage=storage,
#                             study_name=study_name,
#                             direction="maximize",
#                             sampler=TPESampler(n_startup_trials=20, multivariate=True),
#                             load_if_exists=True)

# study.optimize(objective, n_trials=100)

# print(study.best_params)

In [None]:
# Fixed hyperparameters for the final XGBoost model.
# The trailing number is presumably the score reached during tuning — TODO confirm.
xgb_params = {
    "n_estimators": 2407,
    "eta": 0.009462133032592785,
    "gamma": 0.2865859948765318,
    "max_depth": 31,
    "min_child_weight": 47,
    "subsample": 0.6956431754146083,
    "colsample_bytree": 0.3670732604094118,
    "grow_policy": "lossguide",
    "max_leaves": 73,
    "enable_categorical": True,
    "n_jobs": -1,
    "device": "cuda",
    "tree_method": "hist",
}  # 0.9844272567086021

# Fixed hyperparameters for the final CatBoost model.
# The trailing number is presumably the score reached during tuning — TODO confirm.
cat_params = {
    "iterations": 1041,
    "learning_rate": 0.08777255350163136,
    "depth": 10,
    "l2_leaf_reg": 0.1259643500248322,
    "bootstrap_type": "Bayesian",
    "random_strength": 4.276181166674371e-08,
    "bagging_temperature": 0.35995482350907326,
    "od_type": "Iter",
    "od_wait": 39,
    "verbose": False,
    "allow_writing_files": False,
    "task_type": "GPU",
    "cat_features": categorical_features,
}  # 0.9841773055825763

# Fixed hyperparameters for the final LightGBM model (GPU device left disabled).
lgb_params = {
    "n_estimators": 2500,
    "random_state": 42,
    "max_bin": 1024,
    "colsample_bytree": 0.6,
    "reg_lambda": 80,
    # "device": "gpu",
    "verbosity": -1,
}

## Prediction

In [None]:
# Maps a model key to the list of per-fold probability arrays returned by model_trainer.
oof_probs = {}
# oof_probs['xgb'], _ = model_trainer(XGBClassifier(**xgb_params), X, y, random_state=101)
# oof_probs['cat'], _ = model_trainer(CatBoostClassifier(**cat_params), X, y, random_state=101)
lgb_fold_probs, _ = model_trainer(LGBMClassifier(**lgb_params), X, y, random_state=101)
oof_probs['lgb'] = lgb_fold_probs

In [None]:
# Average each model's per-fold probabilities, then take the most probable class index.
oof_preds = {
    name: np.mean(fold_probs, axis=0).argmax(axis=1)
    for name, fold_probs in oof_probs.items()
}

# Submission

In [None]:
# Build the submission from this notebook's own models.
sub = pd.read_csv("/kaggle/input/playground-series-s4e8/sample_submission.csv")
preds = list(oof_preds.values())
# Majority vote across models when there is more than one. The mode is
# flattened because, depending on the SciPy release, `mode(...)[0]` keeps a
# leading length-1 axis, which `inverse_transform` does not accept.
md = np.ravel(mode(preds, axis=0)[0]) if len(preds) > 1 else preds[0]
sub[target] = lab_enc.inverse_transform(md)
sub.to_csv("submission.csv", index=False)

In [None]:
# Class labels predicted by five externally published submissions.
# `to_numpy()` replaces `Series.ravel()`, which is deprecated in recent pandas;
# both yield a 1-D array of the column's values.
ext1 = pd.read_csv("/kaggle/input/mario-s-nightmare-15-th-place-solution/submission.csv")[target].to_numpy()
ext2 = pd.read_csv("/kaggle/input/ps4e8-binary-class-mathews-correlation-coeff/submission.csv")[target].to_numpy()
ext3 = pd.read_csv("/kaggle/input/playgrounds4e08-modeblend/submission.csv")[target].to_numpy()
ext4 = pd.read_csv("/kaggle/input/autogloun-t8-dslanders/submission.csv")[target].to_numpy()
ext5 = pd.read_csv("/kaggle/input/mario-s-nightmare-denselight-0-990/submission_test7.csv")[target].to_numpy()

In [None]:
# Encode every external submission's labels with the fitted label encoder.
preds = [
    lab_enc.transform(labels)
    for labels in (ext1, ext2, ext3, ext4, ext5)
]

In [None]:
# Row-wise majority vote over the encoded predictions. The result is flattened
# because, depending on the SciPy release, `mode(...)[0]` keeps a leading
# length-1 axis, which `inverse_transform` does not accept.
md = np.ravel(mode(preds, axis=0)[0])

In [None]:
# Decode the voted class indices back to labels and write them out. The path
# is the same "submission.csv" used for the model-only submission, so that
# file is replaced.
blended_labels = lab_enc.inverse_transform(md)
sub[target] = blended_labels
sub.to_csv("submission.csv", index=False)