# Import Libraries

In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score, cross_validate
from sklearn.metrics import matthews_corrcoef, f1_score, confusion_matrix, ConfusionMatrixDisplay, make_scorer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.stats import mode

from xgboost import XGBClassifier

from lightgbm import LGBMClassifier

# Loading Data

In [3]:
train_df = pd.read_csv("/Users/no.2/Desktop/mush/playground-series-s4e8/train.csv", index_col='id')
orig_df = pd.read_csv("/Users/no.2/Desktop/mush/playground-series-s4e8/secondary_data.csv", sep=";")
test_df = pd.read_csv("/Users/no.2/Desktop/mush/playground-series-s4e8/test.csv", index_col='id')

In [5]:
cat_feature = orig_df.select_dtypes(exclude=np.number).columns.tolist()
cat_dic = {}
for i in cat_feature:
    cat_dic[i] = orig_df[i].unique()

In [9]:
from tqdm import tqdm
for column, valid_values in tqdm(cat_dic.items()):
    if column in train_df.columns:
        train_df[column] = train_df[column].apply(lambda x: x if x in valid_values else np.nan)

100%|██████████| 18/18 [01:31<00:00,  5.06s/it]


In [10]:
for i in cat_feature:
    print(train_df[i].value_counts())

class
p    1705396
e    1411549
Name: count, dtype: int64
cap-shape
x    1436026
f     676238
s     365146
b     318646
o     108835
p     106967
c     104520
Name: count, dtype: int64
cap-surface
t    460777
s    384970
y    327826
h    284460
g    263729
d    206832
k    128875
e    119712
i    113440
w    109840
l     44859
Name: count, dtype: int64
cap-color
n    1359542
y     386627
w     379442
g     210825
e     197290
o     178847
p      91838
r      78236
u      73172
b      61313
k      59888
l      39558
Name: count, dtype: int64
does-bruise-or-bleed
f    2569743
t     547085
Name: count, dtype: int64
gill-attachment
a    646034
d    589236
x    360878
e    301858
s    295439
p    279110
f    119953
Name: count, dtype: int64
gill-spacing
c    1331054
d     407932
f     119380
Name: count, dtype: int64
gill-color
w    931538
n    543386
y    469464
p    343626
g    212164
o    157119
k    127970
f    119694
r     62799
e     56047
b     47247
u     45399
Name: count, dtype: i

In [12]:
train_df = pd.concat([train_df, orig_df], ignore_index=True)

In [13]:
test_df.head()

Unnamed: 0_level_0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3116945,8.64,x,,n,t,,,w,11.13,17.12,b,,w,u,w,t,g,,d,a
3116946,6.9,o,t,o,f,,c,y,1.27,10.75,,,n,,,f,f,,d,a
3116947,2.0,b,g,n,f,,c,n,6.18,3.14,,,n,,,f,f,,d,s
3116948,3.47,x,t,n,f,s,c,n,4.98,8.51,,,w,,n,t,z,,d,u
3116949,6.17,x,h,y,f,p,,y,6.73,13.7,,,y,,y,t,,,d,u


# EDA

In [None]:
train_df.info()

In [None]:
pd.DataFrame({
    'column': train_df.columns,
    'null-count': train_df.isna().sum().values,
    '% null-count': np.round(train_df.isna().sum().values*100/len(train_df),6)
}).sort_values(by='null-count', ascending=False).reset_index(drop=True)

In [None]:
train_df.describe().T

In [None]:
train_df.describe(include='O').T

In [None]:
train_df.duplicated().sum()

In [None]:
train_df.drop_duplicates(inplace=True)

In [14]:
target = 'class'

In [15]:
features = train_df.drop(target, axis=1).columns.to_list()
features

['cap-diameter',
 'cap-shape',
 'cap-surface',
 'cap-color',
 'does-bruise-or-bleed',
 'gill-attachment',
 'gill-spacing',
 'gill-color',
 'stem-height',
 'stem-width',
 'stem-root',
 'stem-surface',
 'stem-color',
 'veil-type',
 'veil-color',
 'has-ring',
 'ring-type',
 'spore-print-color',
 'habitat',
 'season']

In [16]:
features_with_high_null_values = [feature for feature in features if (train_df[feature].isna().sum()/len(train_df)*100)>20]
features_with_high_null_values

['cap-surface',
 'gill-spacing',
 'stem-root',
 'stem-surface',
 'veil-type',
 'veil-color',
 'spore-print-color']

In [23]:
categorical_features = train_df[features].select_dtypes(include='object').columns.to_list()
categorical_features

['cap-shape',
 'cap-surface',
 'cap-color',
 'does-bruise-or-bleed',
 'gill-attachment',
 'gill-spacing',
 'gill-color',
 'stem-root',
 'stem-surface',
 'stem-color',
 'veil-type',
 'veil-color',
 'has-ring',
 'ring-type',
 'spore-print-color',
 'habitat',
 'season']

In [24]:
numerical_features = list(set(features) - set(categorical_features))
numerical_features

['stem-width', 'cap-diameter', 'stem-height']

In [None]:
pd.concat([train_df, test_df])[categorical_features].describe(include='O').T

Unnamed: 0,count,unique,top,freq
cap-shape,5255380,62,x,2420909
cap-surface,4123329,59,t,775825
cap-color,5255598,57,n,2288067
does-bruise-or-bleed,5255851,22,f,4333884
gill-attachment,4371836,66,a,1089692
gill-spacing,3132741,35,c,2242740
gill-color,5255437,56,w,1570832
stem-root,609222,31,b,279559
stem-surface,1915109,54,s,552131
stem-color,5255661,55,w,2016928


## Distribution of Categorical Features

In [None]:
for i, col in enumerate(categorical_features):
    plt.figure(figsize=(8, 6))

    fil_data = train_df[col].value_counts()
    fil_cat = fil_data[fil_data>=100].index
    fil_df = train_df[train_df[col].isin(fil_cat)]

    sns.countplot(x=col, hue=target, data=fil_df)

    plt.title(f"Count Plot of {col}", size=20)
    plt.show()

In [None]:
plt.figure(figsize=(8, 15))
for i, col in enumerate(numerical_features):
    plt.subplot(3, 1, i+1)
    sns.histplot(x=col, hue=target, data=train_df, kde=True, bins=20)
    plt.title(f"Histogram of {col}", size=20)
    sns.despine()
plt.tight_layout()
plt.suptitle("Distribution of Numerical Features", y=1.05, size=28)
plt.show()

In [None]:
plt.figure(figsize=(8, 15))
for i, col in enumerate(numerical_features):
    plt.subplot(3, 1, i+1)
    sns.boxplot(x=col, y=target, hue=target, data=train_df)
    plt.title(f"Boxplot of {col}", size=20)
plt.tight_layout()
plt.suptitle("Boxplot of Numerical Features", y=1.05, size=28)
plt.show()

In [None]:
plt.figure(figsize=(8, 15))
for i, col in enumerate(numerical_features):
    plt.subplot(3, 1, i+1)
    sns.violinplot(x=target, y=col, hue=target, data=train_df)
    plt.title(f"Violin Plot of {col}", size=20)
plt.tight_layout()
plt.suptitle("Violin Plots of Numerical Features", y=1.05, size=28)
plt.show()

## Target Distribution

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(x=target, data=train_df)
plt.title("Target Distribution", size=28)
plt.plot()

# Imputing Null Values

In [25]:
def cleaner(df):
    for col in categorical_features:
        df[col] = df[col].fillna('missing')
        df.loc[df[col].value_counts(dropna=False)[df[col]].values < 100, col] = "noise"
        df[col] = df[col].astype('category')

    return df

In [26]:
train_df = cleaner(train_df)
test_df = cleaner(test_df)

In [27]:
cap_diameter_mean = pd.concat([train_df['cap-diameter'], test_df['cap-diameter']]).mean(numeric_only=True)
train_df['cap-diameter'].fillna(cap_diameter_mean, inplace=True)
test_df['cap-diameter'].fillna(cap_diameter_mean, inplace=True)

# Model Training

In [28]:
X = train_df.copy()
y = X.pop(target)

lab_enc = LabelEncoder().fit(y)
y = lab_enc.transform(y)

In [29]:
def model_report(estimator, X, y, cv=5):
    print("="*80)
    print(f"    Model: {estimator.__class__.__name__}")
    print("="*80)

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=1/cv, shuffle=True, stratify=y, random_state=42)

    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    print(f"F1 Score : {f1.mean():.6f}")
    print(f"MCC Score: {mcc.mean():.6f}")

    ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred)).plot()
    plt.title("Confusion Matrix")
    plt.show()

    print()

In [30]:
def model_trainer(model, X, y, n_splits=5, random_state=42):
    skfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    oof_probs, oof_mccs = [], []
    print("="*80)
    print(f"Training {model.__class__.__name__}")
    print("="*80, end="\n")
    for fold, (train_idx, test_idx) in enumerate(skfold.split(X, y)):
        X_train, y_train = X.iloc[train_idx, :], y[train_idx] # train 분할
        X_test, y_test = X.iloc[test_idx, :], y[test_idx] # validation 분할

        model.fit(X_train, y_train) # train 학습
        y_pred = model.predict(X_test) # validaton 예측

        mcc = matthews_corrcoef(y_pred, y_test) # validation 성능
        oof_mccs.append(mcc)
        oof_probs.append(model.predict_proba(test_df)) # test 예측
        print(f"--- Fold {fold+1} MCC Score: {mcc:.6f}")
    print(f"\n---> Mean MCC Score: {np.mean(oof_mccs):.6f} \xb1 {np.std(oof_mccs):.6f}\n\n")
    return oof_probs, oof_mccs

## Baseline Models

In [None]:
xgb_clf = XGBClassifier(enable_categorical=True, device="cuda", tree_method="hist")

model_report(xgb_clf, X, y)

In [None]:
cat_clf = CatBoostClassifier(
    cat_features=categorical_features,
    verbose=False,
    allow_writing_files=False,
    task_type="GPU"
)

model_report(cat_clf, X, y)

In [None]:
lgb_clf = LGBMClassifier(device='gpu', verbosity=-1)

model_report(lgb_clf, X, y)

## Hyperparameter Tuning

In [None]:
# import optuna
# from optuna.samplers import TPESampler

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, shuffle=True, stratify=y, random_state=101)

In [None]:
# def objective(trial):
#     params = {
#         "n_estimators": trial.suggest_int("n_estimators", 2000, 3000),
#         "eta": trial.suggest_float("eta", 1e-3, 1e-2),
#         "gamma": trial.suggest_float("gamma", 0, 5.0),
#         "max_depth": trial.suggest_int("max_depth", 2, 32),
#         "min_child_weight": trial.suggest_int("min_child_weight", 40, 100),
#         "subsample": trial.suggest_float("subsample", 0.1, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
#         "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
#         "max_leaves": trial.suggest_int("max_leaves", 16, 84)
#     }
#     params['device'] = 'cuda'
#     params['tree_method'] = 'hist'
#     params['enable_categorical'] = True

#     model = XGBClassifier(**params)
#     model.fit(X_train, y_train)

#     y_pred = model.predict(X_test)
#     return matthews_corrcoef(y_test, y_pred)


# study_name = "xgb"
# storage = "sqlite:///xgb.db"

# study = optuna.create_study(storage=storage,
#                             study_name=study_name,
#                             direction="maximize",
# #                             sampler=TPESampler(),
#                             load_if_exists=True)

# study.optimize(objective, n_trials=100)

# print(study.best_params)


In [None]:
# def objective(trial):
#     params = {
#         "iterations": trial.suggest_int("iterations", 100, 3000),
#         "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
#         "depth": trial.suggest_int("depth", 4, 10),
#         "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
#         "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
#         "random_strength": trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
#         "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 10.0),
#         "od_type": trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
#         "od_wait": trial.suggest_int("od_wait", 10, 50),
#         "verbose": False,
#         "allow_writing_files": False,
#         "task_type": 'GPU',
#         "cat_features": categorical_features
#     }

#     model = CatBoostClassifier(**params)

#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     return matthews_corrcoef(y_test, y_pred)


# study_name = "cat"
# storage = "sqlite:///cat.db"

# study = optuna.create_study(storage=storage,
#                             study_name=study_name,
#                             direction="maximize",
#                             sampler=TPESampler(n_startup_trials=20, multivariate=True),
#                             load_if_exists=True)

# study.optimize(objective, n_trials=50)

# print(study.best_params)

In [None]:
# def objective(trial):
#     params = {
#         "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
#         "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
#         "num_leaves": trial.suggest_int("num_leaves", 2, 256),
#         "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
#         "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
#         "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
#         "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
#         "device": 'gpu',
#         "verbosity": -1
#     }

#     model = LGBMClassifier(**params)

#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     return matthews_corrcoef(y_test, y_pred)


# study_name = "lgb"
# storage = "sqlite:///lgb.db"

# study = optuna.create_study(storage=storage,
#                             study_name=study_name,
#                             direction="maximize",
#                             sampler=TPESampler(n_startup_trials=20, multivariate=True),
#                             load_if_exists=True)

# study.optimize(objective, n_trials=100)

# print(study.best_params)

In [31]:
xgb_params = {
    'n_estimators': 2407,
    'eta': 0.009462133032592785,
    'gamma': 0.2865859948765318,
    'max_depth': 31,
    'min_child_weight': 47,
    'subsample': 0.6956431754146083,
    'colsample_bytree': 0.3670732604094118,
    'grow_policy': 'lossguide',
    'max_leaves': 73,
    'enable_categorical': True,
    'n_jobs': -1,
    'device': 'cuda',
    'tree_method': 'hist'
} # 0.9844272567086021

cat_params = {
    'iterations': 1041,
    'learning_rate': 0.08777255350163136,
    'depth': 10,
    'l2_leaf_reg': 0.1259643500248322,
    'bootstrap_type': 'Bayesian',
    'random_strength': 4.276181166674371e-08,
    'bagging_temperature': 0.35995482350907326,
    'od_type': 'Iter',
    'od_wait': 39,
    "verbose": False,
    "allow_writing_files": False,
    "task_type": 'GPU',
    "cat_features": categorical_features
} # 0.9841773055825763

lgb_params = {
    'n_estimators': 2500,
    'random_state':42,
    'max_bin':1024,
    'colsample_bytree':0.6,
    'reg_lambda': 80,
#     'device': 'gpu',
    'verbosity': -1
}

## Prediction

In [32]:
oof_probs = {}
# oof_probs['xgb'], _ = model_trainer(XGBClassifier(**xgb_params), X, y, random_state=101)
# oof_probs['cat'], _ = model_trainer(CatBoostClassifier(**cat_params), X, y, random_state=101)
oof_probs['lgb'], _ = model_trainer(LGBMClassifier(**lgb_params), X, y, random_state=101)

Training LGBMClassifier
--- Fold 1 MCC Score: 0.984924
--- Fold 2 MCC Score: 0.985051
--- Fold 3 MCC Score: 0.985136
--- Fold 4 MCC Score: 0.985082
--- Fold 5 MCC Score: 0.984670

---> Mean MCC Score: 0.984973 ± 0.000167




In [49]:
oof_preds = {}
for model in oof_probs.keys():
    oof_preds[model] = np.argmax(np.mean(oof_probs[model], axis=0), axis=1)

# Submission

In [35]:
test_df

Unnamed: 0_level_0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3116945,8.64,x,missing,n,t,missing,missing,w,11.13,17.12,b,missing,w,u,w,t,g,missing,d,a
3116946,6.90,o,t,o,f,missing,c,y,1.27,10.75,missing,missing,n,missing,missing,f,f,missing,d,a
3116947,2.00,b,g,n,f,missing,c,n,6.18,3.14,missing,missing,n,missing,missing,f,f,missing,d,s
3116948,3.47,x,t,n,f,s,c,n,4.98,8.51,missing,missing,w,missing,n,t,z,missing,d,u
3116949,6.17,x,h,y,f,p,missing,y,6.73,13.70,missing,missing,y,missing,y,t,missing,missing,d,u
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5194904,0.88,x,g,w,f,a,d,w,2.67,1.35,missing,missing,e,missing,missing,f,f,missing,d,u
5194905,3.12,x,s,w,f,d,c,w,2.69,7.38,missing,missing,w,missing,missing,f,f,missing,g,a
5194906,5.73,x,e,e,f,a,missing,w,6.16,9.74,missing,missing,y,missing,w,t,z,missing,d,a
5194907,5.03,b,g,n,f,a,d,g,6.00,3.46,missing,s,g,missing,missing,f,f,missing,d,a


In [37]:
sub = pd.read_csv("/Users/no.2/Desktop/mush/playground-series-s4e8/sample_submission.csv")

Unnamed: 0,id,class
0,3116945,e
1,3116946,e
2,3116947,e
3,3116948,e
4,3116949,e


In [38]:
# 전체 데이터로 학습, test 예측
model = LGBMClassifier(**lgb_params)
model.fit(X, y)
pred = model.predict(test_df)
print('done!')


done!


In [40]:
# target 변환 및 csv 생성
sub[target] = lab_enc.inverse_transform(pred)
sub.to_csv("submission.csv", index=False)

In [50]:
sub = pd.read_csv("/Users/no.2/Desktop/mush/playground-series-s4e8/sample_submission.csv")
preds = [pred for model, pred in oof_preds.items()]
md = mode(preds, axis=0)[0] if len(preds)>1 else preds[0]
sub[target] = lab_enc.inverse_transform(md)
sub.to_csv("submission_2.csv", index=False)

In [51]:
preds

[array([0, 1, 1, ..., 1, 0, 0])]