In [None]:
# Holy grail
import pandas as pd
import numpy as np

# Sklearn
from sklearn.preprocessing import StandardScaler, QuantileTransformer, OrdinalEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score # evaluation metric used for leaderboard scoring in this competition

# Statistics
import scipy.stats as stats

# Visualization
from matplotlib import pyplot as plt # pyplot is an easy to use scripting interface for plotting as oppose to more advanced artistic interface
import seaborn as sns # seaborn is even higher level graphing library built on top of matplotlib

# Machine learning
import optuna # used for finding good hyperparameters for a model
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
# Load the competition's train and test tables, using the `id` column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '/kaggle/input/playground-series-s3e22/train.csv',
        '/kaggle/input/playground-series-s3e22/test.csv',
    )
)

In [None]:
# Print a summary of the training frame: column names, non-null counts and dtypes.
train.info()

In [None]:
# Missing values per column, sorted ascending so the columns with the most gaps come last.
missing_counts = train.isna().sum()
missing_counts.sort_values()

In [None]:
# Columns of `train` stored as int64/float64; the same columns are then read from `test`.
num = train.select_dtypes(include=['int64', 'float64']).columns

# Stack train and test into one long frame, tagging each row with where it came from.
df = pd.concat(
    [
        train[num].assign(Source='Train'),
        test[num].assign(Source='Test'),
    ],
    ignore_index=True,
)

# One colour per data source, reused by the density curves and the box plots.
source_colors = {'Train': '#456cf0', 'Test': '#ed7647'}

# Explicit figure/axes interface: one row per feature, a wide density panel
# followed by two narrow box-plot panels (train, test).
fig, axes = plt.subplots(
    len(num), 3,
    figsize=(16, len(num) * 4),
    gridspec_kw={'hspace': 0.35, 'wspace': 0.3, 'width_ratios': [0.80, 0.20, 0.20]},
)

for row, col in enumerate(num):
    # Left panel: overlaid train/test density estimates for this feature.
    density_ax = axes[row, 0]
    sns.kdeplot(
        data=df[[col, 'Source']],
        x=col,
        hue='Source',
        palette=list(source_colors.values()),
        linewidth=2.1,
        warn_singular=False,
        ax=density_ax,
    )
    density_ax.set_title(f"\n{col}", fontsize=9)
    density_ax.grid(visible=True, which='both', linestyle='--', color='lightgrey', linewidth=0.75)
    density_ax.set(xlabel='', ylabel='')

    # Remaining panels: one box plot per source, in the same order as `source_colors`.
    for panel, (source, color) in enumerate(source_colors.items(), start=1):
        box_ax = axes[row, panel]
        sns.boxplot(
            data=df.loc[df.Source == source, [col]],
            y=col,
            width=0.25,
            linewidth=0.90,
            fliersize=2.25,
            color=color,
            ax=box_ax,
        )
        box_ax.set(xlabel='', ylabel='')
        box_ax.set_title(source, fontsize=9)

plt.suptitle('\nDistribution analysis - numerical features\n', fontsize=12, y=0.9, x=0.57)
plt.show()

In [None]:
# Features to screen against the target with a chi-square test of independence.
categorical_cols = [
    'temp_of_extremities', 'peripheral_pulse', 'mucous_membrane', 'capillary_refill_time',
    'pain', 'peristalsis', 'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux',
    'rectal_exam_feces', 'abdomen', 'abdomo_appearance', 'lesion_2', 'surgery', 'age',
    'surgical_lesion', 'lesion_3', 'cp_data',
]
# Significance level: a feature "passes" when its p-value falls below this.
threshold = .05

print(f'{"Column":<25} | Test result')
print('----------------------------------------')

for column in categorical_cols:
    # Observed counts of each (feature value, outcome) combination.
    observed = pd.crosstab(train[column], train['outcome'])

    # chi2_contingency returns (statistic, p-value, dof, expected); only the p-value is needed.
    p_value = stats.chi2_contingency(observed)[1]

    # ANSI colours: green for "Passed", red for "Failed", then reset.
    verdict = '\033[32mPassed' if p_value < threshold else '\033[31mFailed'
    print(f'{column:<25} |   ', verdict, '\033[0m')

In [None]:
# Remove the `lesion_3` feature from the training frame
# (presumably because of the chi-square screen above — confirm against that cell's output).
# Rebinding instead of `inplace=True`, combined with `errors='ignore'`, keeps this cell
# idempotent: running it a second time is a no-op instead of raising KeyError.
train = train.drop(columns='lesion_3', errors='ignore')

In [None]:
# Features: every column except the target.
X_train = train.drop('outcome', axis=1)
# Target kept as a one-column DataFrame (2-D), the shape the OrdinalEncoder used later is fitted on.
y_train = train.loc[:, ['outcome']]

## Pipelines

In [None]:
# Numeric features: map each column to a normal distribution through its quantiles,
# then standardise. No imputation step on purpose — the author's note is that tree-based
# models such as LightGBM deal with missing values better than SimpleImputer(strategy='mean').
quantile_to_normal = QuantileTransformer(random_state=42, output_distribution='normal')
numerical_pipeline = make_pipeline(quantile_to_normal, StandardScaler())

# Categorical features: integer-encode each category. Categories not seen during fit are
# mapped to the sentinel code 10 (SimpleImputer(strategy='most_frequent') is likewise left out).
category_to_code = OrdinalEncoder(unknown_value=10, handle_unknown='use_encoded_value')
categorical_pipeline = make_pipeline(category_to_code)

In [None]:
# Route columns to a pipeline by dtype: numeric columns go through `numerical_pipeline`,
# object (string) columns go through `categorical_pipeline`.
numeric_columns = make_column_selector(dtype_include=np.number)
string_columns = make_column_selector(dtype_include=object)

transformer = make_column_transformer(
    (numerical_pipeline, numeric_columns),
    (categorical_pipeline, string_columns),
    # Columns matched by neither selector are kept as-is rather than dropped. According to the
    # original author every column is already covered, so this is only a safety net.
    remainder='passthrough',
    # Keep the original column names: no "<transformer name>__" prefix on the output features.
    verbose_feature_names_out=False,
)

transformer

In [None]:
# Pin the label order explicitly so the codes are died -> 0, euthanized -> 1, lived -> 2.
outcome_labels = ['died', 'euthanized', 'lived']
target_encoder = OrdinalEncoder(categories=[outcome_labels])

In [None]:
# Fit the preprocessing on the raw training features and encode the target as a flat array.
#
# Both inputs are re-derived from `train` instead of reading the `X_train` / `y_train`
# names this cell overwrites. That keeps the cell idempotent: re-running it no longer feeds
# already-transformed (all-numeric) features back into the dtype-based column selectors,
# nor a flattened 1-D target into the encoder.
X_train = transformer.fit_transform(train.drop(columns='outcome'))
y_train = target_encoder.fit_transform(train[['outcome']]).ravel()

In [None]:
# Put the transformer's output back into a DataFrame, restoring the feature names
# reported by the transformer and the original row ids.
feature_names = transformer.get_feature_names_out()
X_train = pd.DataFrame(X_train, index=train.index, columns=feature_names)
X_train.head()

In [None]:
# Hold out 10% of the training data as a validation set for hyperparameter search.
# `random_state` makes the split (and therefore any tuning result) reproducible across
# kernel restarts; `stratify` keeps the class proportions of the target the same in both
# parts, which matters for a validation set this small.
X_train_optuna, X_val_optuna, y_train_optuna, y_val_optuna = train_test_split(
    X_train,
    y_train,
    train_size=0.9,
    random_state=42,
    stratify=y_train,
)

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

In [None]:
# def objective(trial):
# #     model = LGBMClassifier(
# #         n_estimators = trial.suggest_int('n_estimators', 32, 1024),
# #         learning_rate = trial.suggest_float('learning_rate', 0.001, 0.5),
# #         max_depth = trial.suggest_int('max_depth', 1, 10),
# #         num_leaves = trial.suggest_int('num_leaves', 2, 1024),
# #         reg_lambda  = trial.suggest_float('reg_lambda', 0.001, 10),
# #         reg_alpha = trial.suggest_float('reg_alpha', 0, 10),
# #         subsample = trial.suggest_float('subsample', 0.001, 1),
# #         colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1),
# #         min_child_samples = trial.suggest_int('min_child_samples', 2, 1024),
# #         min_child_weight = trial.suggest_int('min_child_weight', 1, 10),
# #         objective = trial.suggest_categorical('objective', ['multiclass']),
# #         metric = trial.suggest_categorical('metric', ['multi_logloss']),
# #         boosting_type = trial.suggest_categorical('boosting_type', ['gbdt','goss']),
# #     )
    
# #     model = CatBoostClassifier(
# #         iterations = trial.suggest_int('iterations', 32, 1024),
# #         learning_rate = trial.suggest_float('learning_rate', 0.001, 0.3),
# #         depth = trial.suggest_int('depth', 1, 10),
# #         l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 0.01, 10),
# #         grow_policy = trial.suggest_categorical('grow_policy', ['Depthwise']),
# #         bootstrap_type = trial.suggest_categorical('bootstrap_type', ['Bayesian']),
# #         od_type = trial.suggest_categorical('od_type', ['Iter']),
# #         eval_metric = trial.suggest_categorical('eval_metric', ['TotalF1']),
# #         loss_function = trial.suggest_categorical('loss_function', ['MultiClass']),
# #         random_state = trial.suggest_categorical('random_state', [42]),
# #         verbose = trial.suggest_categorical('verbose', [0])
# #     )

# #     model = XGBClassifier(
# #         eta = trial.suggest_float('eta', 0.001, 0.3),
# #         n_estimators = trial.suggest_int('n_estimators', 32, 1024),
# #         max_depth = trial.suggest_int('max_depth', 1, 10),
# #         reg_lambda = trial.suggest_float('reg_lambda', 0.01, 10),
# #         subsample = trial.suggest_float('subsample', 0.01, 1),
# #         min_child_weight = trial.suggest_int('min_child_weight', 1, 10),
# #         colsample_bytree = trial.suggest_float('colsample_bytree', 0.01, 1),
# #         objective = trial.suggest_categorical('objective', ['multi:softmax'])
# #     )

# # "max_depth":4,          
# #     "max_iter":80,         
# #     "learning_rate":0.1,     
# #     "random_state":42,   
# #     "scoring":'f1_micro',          
# #     "max_leaf_nodes" : 21,
# #     "l2_regularization" : 0.1

#     model = HistGradientBoostingClassifier(
#         max_depth = trial.suggest_int('max_depth', 1, 10),
#         max_iter = trial.suggest_int('max_iter', 2, 1024),
#         learning_rate = trial.suggest_float('learning_rate', 0.01, 1),
#         random_state=42, 
#         scoring = trial.suggest_categorical('scoring', ['f1_micro']),
#         max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 3, 1024),
#         l2_regularization = trial.suggest_float('l2_regularization', 0.01, 1),
#     )
    
#     model.fit(
#         X_train_optuna, y_train_optuna,
# #         eval_set=[(X_train_optuna, y_train_optuna), (X_val_optuna, y_val_optuna)],
# #         verbose=False
#     )
    
#     return f1_score(y_val_optuna, model.predict(X_val_optuna), average='micro') # micro F1 is the evaluation metric of this competition, so we also use it for hyperparameter optimization

# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=50)

# best_hyperparams = study.best_params

In [None]:

# best_hyperparams

In [None]:
# Fixed LightGBM settings, unpacked into LGBMClassifier further down.
# (Presumably the best trial of an earlier Optuna run — the search code is commented out
# in this notebook, so that cannot be verified from here.)
best_hyperparams_lgbm = dict(
    n_estimators=571,
    learning_rate=0.009567264504750483,
    max_depth=8,
    num_leaves=528,
    reg_lambda=4.943892160945249,
    reg_alpha=4.951692722456863,
    subsample=0.5161202709895614,
    colsample_bytree=0.6329309337468372,
    min_child_samples=2,
    min_child_weight=2,
    objective='multiclass',
    metric='multi_logloss',
    boosting_type='goss',
)

# Fixed CatBoost settings, unpacked into CatBoostClassifier further down.
# (Presumably taken from an earlier Optuna run — not verifiable from this notebook.)
best_hyperparams_cb = dict(
    iterations=633,
    learning_rate=0.23815686060214483,
    depth=2,
    l2_leaf_reg=6.097299159781945,
    grow_policy='Depthwise',
    bootstrap_type='Bayesian',
    od_type='Iter',
    eval_metric='TotalF1',
    loss_function='MultiClass',
    random_state=42,
    verbose=0,
)


# Fixed XGBoost settings, unpacked into XGBClassifier further down.
# (Presumably taken from an earlier Optuna run — not verifiable from this notebook.)
#
# The objective is 'multi:softprob' rather than 'multi:softmax': both optimise the same
# softmax loss, but softprob makes the booster emit per-class probabilities while softmax
# makes it emit class labels. This model is used inside a soft-voting ensemble, which relies
# on predict_proba, so the probability-emitting objective avoids version-dependent handling
# of softmax in XGBClassifier.predict_proba. Predicted labels are unchanged.
best_hyperparams_xgb = {
    'eta': 0.19046183608938072,
    'n_estimators': 54,
    'max_depth': 7,
    'reg_lambda': 8.406857972859333,
    'subsample': 0.6727156942479483,
    'min_child_weight': 3,
    'colsample_bytree': 0.4325971234965794,
    'objective': 'multi:softprob',
}

# Fixed settings for HistGradientBoostingClassifier, unpacked into it further down.
# (Presumably taken from an earlier Optuna run — not verifiable from this notebook.)
best_hyperparams_hist = dict(
    max_depth=3,
    max_iter=82,
    learning_rate=0.19906722613772995,
    random_state=42,
    scoring='f1_micro',
    max_leaf_nodes=903,
    l2_regularization=0.6342976753397682,
)


In [None]:
# Instantiate one base model per library from its fixed hyperparameter dictionary.
xgb_model = XGBClassifier(**best_hyperparams_xgb)
lgbm_model = LGBMClassifier(**best_hyperparams_lgbm)
catboost_model = CatBoostClassifier(**best_hyperparams_cb)
hist_model = HistGradientBoostingClassifier(**best_hyperparams_hist)

# The order matters: later cells pick the models out of this list by position.
models = [xgb_model, lgbm_model, catboost_model, hist_model]

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Pair each base model with a short name, in the same order as `models`, and combine
# them into a soft-voting ensemble (per-class probabilities are averaged across members).
voting_members = list(zip(['xgb', 'lgbm', 'cb', 'hgbc'], models))
eclf1 = VotingClassifier(estimators=voting_members, voting='soft')

In [None]:
# Train every member of the ensemble on the full training set. `fit` returns the
# estimator itself, so `ecmod1` is just another name for the now-fitted `eclf1`.
eclf1.fit(X_train, y_train)
ecmod1 = eclf1

In [None]:
# np.atleast_2d(y_train).reshape(1235,1)

In [None]:
# Sanity check: predicted class codes for the training rows.
# The DataFrame is passed as-is (not converted with np.array) so the feature names the
# ensemble was fitted with are kept; stripping them triggers "X does not have valid
# feature names" warnings and is inconsistent with the other predict calls in this notebook.
ecmod1.predict(X_train)

In [None]:
# Micro-averaged F1 on the very rows the ensemble was trained on. This is an in-sample
# score, so it will be optimistic compared to cross-validation or the leaderboard.
train_predictions = ecmod1.predict(X_train)
f1_score(y_train, train_predictions, average='micro')

In [None]:
# cross_val_score(ecmod1, X_train, y_train, cv=5, scoring='f1_micro').sum() / 5

In [None]:
# eclf2 = VotingClassifier(estimators=[('xgb', models[0]), ('lgbm', models[1]), 
#                                      ('cb', models[2]),('hgbc', models[3])], voting='hard')

# ecmod2=eclf2.fit(X_train, y_train)

# ecmod2.predict(np.array(X_train))

In [None]:
# for model in models:
#     cv_score = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_micro').sum() / 5

#     print(f'{model.__class__.__name__} micro F1 cross-validation score: {cv_score:.3f}')

In [None]:
## Training

# for model in models:
#     model.fit(X_train, y_train)

In [None]:
# for model in models:
#     train_score = f1_score(y_train, model.predict(X_train), average='micro')

#     print(f'{model.__class__.__name__} micro F1 training score: {train_score:.3f}')

## Submission

In [None]:
# Mirror the training-side feature removal on the test frame.
# Rebinding with `errors='ignore'` (instead of an in-place drop) keeps the cell idempotent:
# re-running it does not raise KeyError once the column is already gone.
test = test.drop(columns='lesion_3', errors='ignore')

# Apply the preprocessing fitted on the training data (transform only, no refit) and
# restore the feature names and row ids.
X_test = pd.DataFrame(
    data=transformer.transform(test),
    columns=transformer.get_feature_names_out(),
    index=test.index,
)

In [None]:
# predictions = []
# for model in models:
#     predictions.append(model.predict(X_test).ravel()) # CatBoostClassifier's predictions are of shape (n ,1) and not (n, ) like other models so we have to use .ravel()

# # Mode doesn't work on python lists only on ndarray (numpy arrays)
# predictions = np.array(predictions)

# # Take the most frequent prediction out of 3 models
# # final_predictions, _ = stats.mode(predictions, axis=0)

In [None]:
# final_predictions = np.median(predictions,axis=0)

In [None]:
# Predicted class codes for the test set from the fitted voting ensemble.
final_predictions = ecmod1.predict(X_test)

In [None]:
# [int(a) for a in np.median(predictions,axis=0)]

In [None]:
# final_predictions

In [None]:
# Start from the sample submission so the file has exactly the expected ids and layout.
submission = pd.read_csv('/kaggle/input/playground-series-s3e22/sample_submission.csv', index_col='id')

# Turn the numeric class codes back into the original outcome labels
# (the encoder expects a 2-D column, hence the reshape).
predicted_labels = target_encoder.inverse_transform(final_predictions.reshape(-1, 1)).ravel()

# Attach the test ids to the predictions so the assignment aligns on `id` instead of
# silently relying on the sample submission having the same row order as the test set.
submission['outcome'] = pd.Series(predicted_labels, index=X_test.index)

# An id present in the sample submission but absent from the test set would surface as NaN.
if submission['outcome'].isna().any():
    raise ValueError('Some submission ids have no matching prediction in X_test')

submission.to_csv('/kaggle/working/submission.csv')

In [None]:
# target_encoder.inverse_transform(final_predictions.reshape(-1, 1)).ravel()