In [None]:
import pandas as pd
from IPython.display import display
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer
import warnings
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
warnings.simplefilter(action='ignore', category=FutureWarning)


In [None]:
# Load the main training split, using its 'id' column as the index.
X = pd.read_csv('data/train.csv', index_col='id')
# Load the supplementary training split.
# NOTE(review): no index_col is given here, so if this file carries an 'id'
# column it stays in the frame as a regular feature — confirm it does not.
X_bis = pd.read_csv('data/train_bis.zip')
# Stack both splits; ignore_index=True discards the original indexes.
X = pd.concat([X, X_bis], ignore_index=True)
# Separate the target from the features (pop removes 'outcome' from X).
y = X.pop('outcome')
X_test = pd.read_csv('data/test.csv', index_col='id')
# Display the shapes as a quick sanity check.
X.shape, y.shape, X_test.shape

In [None]:
# Preview the first rows of the combined training features.
X.head()

In [None]:
# Split the feature names by dtype: numeric columns vs. object (string) columns.
numeric_cols = X.select_dtypes(include='number').columns
category_cols = X.select_dtypes(include='object').columns
# Every column must fall into exactly one of the two groups.
assert len(X.columns) == len(category_cols) + len(numeric_cols)

## Categorical values checking

We check whether the object (categorical) columns of the test data contain values that never appear in the train data.
Such values are replaced with NaN in the test set, so that they are imputed later together with the other missing values.

In [None]:
def build_missing_term_dict(cols, X_train, X_test):
    """Find the values of the test set that are not in the training set.

    Parameters
    ----------
    cols : iterable
        Column labels to compare; each must exist in both frames.
    X_train : pandas.DataFrame
        Reference frame whose values count as "known".
    X_test : pandas.DataFrame
        Frame that is searched for values unseen in ``X_train``.

    Returns
    -------
    dict
        Maps a column label to the list of values found in ``X_test[col]``
        but not in ``X_train[col]``. Columns without unseen values are
        omitted. NaN is never reported: it marks a missing entry, not a value.
    """
    missing_term_dict = {}
    for col in cols:
        # Drop NaN before the set difference: NaN != NaN, so two distinct NaN
        # objects would otherwise survive the subtraction and be reported.
        train_values = set(X_train[col].dropna().unique())
        test_values = set(X_test[col].dropna().unique())
        # Sort (by string form, so mixed types cannot raise) for a stable,
        # reproducible output instead of arbitrary set order.
        missing_values = sorted(test_values - train_values, key=str)
        if missing_values:
            missing_term_dict[col] = missing_values
    return missing_term_dict

In [None]:
# Collect, per categorical column, the test-set values never seen in training,
# and display the resulting dict.
missing_terms = build_missing_term_dict(category_cols, X, X_test)
missing_terms

In [None]:
# Set every test-set category that never occurs in training to NaN
for col, unseen_values in missing_terms.items():
    X_test[col] = X_test[col].replace(to_replace=unseen_values, value=np.nan)

## Feature selection
We can start by analyzing the distribution of each feature.
### categories

In [None]:
# Display the names of the object-typed (categorical) columns.
category_cols

In [None]:

print(len(category_cols))
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(25, 20))
# Turn the 2-D grid into a 1-D array so one index addresses each panel
axes = axes.flatten()
for i, cat in enumerate(category_cols):
    panel = axes[i]
    # One count plot per categorical feature, split by outcome
    sns.countplot(x=X[cat], hue=y, ax=panel)
    panel.set_title(f"Bar Plot for {cat}")
plt.tight_layout()
plt.show()

We see that some features might have an important influence on the outcome, for example age or surgery. For now, we decide to keep every categorical feature.
We will, however, need to encode them.


surgery: ['yes' 'no'] 

age: ['adult' 'young']

temp_of_extremities: ['cool' 'cold' 'normal' 'warm' nan]

peripheral_pulse: ['reduced' 'normal' nan 'absent' 'increased']

mucous_membrane: ['dark_cyanotic' 'pale_cyanotic' 'pale_pink' 'normal_pink' 'bright_pink'
 'bright_red' nan]

capillary_refill_time: ['more_3_sec' 'less_3_sec' nan '3']

pain: ['depressed' 'mild_pain' 'extreme_pain' 'alert' 'severe_pain' nan 'slight']

peristalsis: ['absent' 'hypomotile' 'normal' 'hypermotile' nan 'distend_small']

abdominal_distention: ['slight' 'moderate' 'none' 'severe' nan]

nasogastric_tube: ['slight' 'none' 'significant' nan]

nasogastric_reflux: ['less_1_liter' 'more_1_liter' 'none' nan 'slight']

rectal_exam_feces: ['decreased' 'absent' nan 'normal' 'increased' 'serosanguious']

abdomen: ['distend_small' 'distend_large' 'normal' 'firm' nan 'other']

abdomo_appearance: ['serosanguious' 'cloudy' 'clear' nan]

surgical_lesion: ['yes' 'no']

cp_data: ['no' 'yes']

The ordered ones will be: 'temp_of_extremities', 'peripheral_pulse',
       'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis',
       'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux',
       'rectal_exam_feces', 'abdomen', 'abdomo_appearance'

We then define the following ordinal encodings.


In [None]:
# Integer encodings for the categorical features.
# Each dict maps a raw category string to the integer that replaces it.

# ordinal features
# Ordered from coldest to warmest ('cold' is colder than 'cool').
temp_of_extremities = {'cold': 0, 'cool': 1, 'normal': 2, 'warm': 3}
peripheral_pulse = {'absent': 0, 'reduced': 1, 'normal': 2, 'increased': 3}
mucous_membrane = {
    "normal_pink": 0,
    "bright_pink": 1,
    "pale_pink": 2,
    "bright_red": 3,
    "pale_cyanotic": 4,
    "dark_cyanotic": 5,
}
capillary_refill_time = {'less_3_sec': 0, '3': 1, 'more_3_sec': 2}
# NOTE(review): 'alert' is ranked between 'mild_pain' and 'severe_pain' —
# confirm this is the intended severity order.
pain = {'slight': 0, 'depressed': 1, 'mild_pain': 2, 'alert': 3, 'severe_pain': 4, 'extreme_pain': 5}
# NOTE(review): 'normal' is ranked below 'hypomotile', and 'distend_small'
# also appears among the abdomen categories — confirm the intended ordering.
peristalsis = {'absent': 0, 'normal': 1, 'distend_small': 2, 'hypomotile': 3, 'hypermotile': 4}
abdominal_distention = {'none': 0, 'slight': 1, 'moderate': 2, 'severe': 3}
nasogastric_tube = {'none': 0, 'slight': 1, 'significant': 2}
nasogastric_reflux = {'none': 0, 'slight': 1, 'less_1_liter': 2, 'more_1_liter': 3}
rectal_exam_feces = {'absent': 0, 'decreased': 1, 'normal': 2, 'increased': 3, 'serosanguious': 4}
abdomen = {"other": 0, "firm": 1, "normal": 2, "distend_large": 3, "distend_small": 4}
abdomo_appearance = {'clear': 0, 'serosanguious': 1, 'cloudy': 2}

# non ordinal features (binary flags)
surgery = {'no': 0, 'yes': 1}
age = {'adult': 0, 'young': 1}
surgical_lesion = {'no': 0, 'yes': 1}
cp_data = {'no': 0, 'yes': 1}

# Target encoding and its inverse (used to turn predictions back into labels).
outcome = {'lived': 0, 'died': 1, 'euthanized': 2}
outcome_reverse = {code: label for label, code in outcome.items()}

# Column name -> encoding dict, for every categorical feature.
category_mapping = {
    'temp_of_extremities': temp_of_extremities,
    'peripheral_pulse': peripheral_pulse,
    'mucous_membrane': mucous_membrane,
    'capillary_refill_time': capillary_refill_time,
    'pain': pain,
    'peristalsis': peristalsis,
    'abdominal_distention': abdominal_distention,
    'nasogastric_tube': nasogastric_tube,
    'nasogastric_reflux': nasogastric_reflux,
    'rectal_exam_feces': rectal_exam_feces,
    'abdomen': abdomen,
    'abdomo_appearance': abdomo_appearance,
    'surgery': surgery,
    'age': age,
    'surgical_lesion': surgical_lesion,
    'cp_data': cp_data,
}


### Numeric

In [None]:

print(len(numeric_cols))
fig, axes = plt.subplots(3, 4, figsize=(20, 20))
# Flatten the axes array for easier iteration
axes = axes.flatten()
for i, cat in enumerate(numeric_cols):
    # One histogram (with KDE overlay) per numeric feature, split by outcome
    sns.histplot(x=X[cat], hue=y, ax=axes[i], kde=True)
    axes[i].set_title(f"Histogram for {cat}")
# Hide any grid cell that did not receive a plot, so no empty axes are drawn
for unused_ax in axes[len(numeric_cols):]:
    unused_ax.set_visible(False)
plt.tight_layout()
# Show the plots
plt.show()

The hospital number seems hard to exploit as-is, so we drop it for now.

lesion_2 and lesion_3 almost always have the same value, so we drop them.

In [None]:
# Columns judged uninformative in the analysis above.
to_drop = ['hospital_number', 'lesion_2', 'lesion_3']
# errors='ignore' skips columns that are already gone, so the cell can be
# re-run safely and still drops the remaining ones. A try/except around drop()
# would drop nothing at all as soon as a single column is missing.
X = X.drop(columns=to_drop, errors='ignore')
X_test = X_test.drop(columns=to_drop, errors='ignore')

## Replacing missing value

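We replace missing values with the most frequent value of each column, computed on the training data. This currently applies to both categorical and numerical columns; using the median for numerical columns is still to be tried.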

In [None]:
# TODO try both with median for numerical
# Most frequent training value of each column; the same values fill both frames
# so the test set is imputed with training statistics only.
fill_values = {col: X[col].mode()[0] for col in X.columns}
# Fill at frame level and rebind. `X[col].fillna(..., inplace=True)` acts on a
# temporary column object: it is deprecated and does nothing under
# pandas Copy-on-Write.
X = X.fillna(value=fill_values)
X_test = X_test.fillna(value=fill_values)
assert X.isna().sum().sum() == 0
assert X_test.isna().sum().sum() == 0

## Mapping

We then map the categorical values to integers using the mappings defined above.

In [None]:
# Encode each categorical column, in both frames, through its own mapping dict
for frame in (X, X_test):
    for col in category_cols:
        frame[col] = frame[col].map(category_mapping[col])
X.head()

In [None]:
# Preview the first rows of the encoded test features.
X_test.head()

In [None]:
# Encode the target labels as integers using the `outcome` mapping.
# NOTE: this cell is not idempotent — running it a second time pushes the
# already-encoded integers through the string-keyed dict and yields NaN.
y = y.map(outcome)
y.head()

In [None]:
# Hold out 20% of the training data for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model training

## CatBoost classifier

In [162]:
from catboost import CatBoostClassifier
import optuna
def objective_cat(trial):
    """Optuna objective: validation accuracy of a CatBoost classifier.

    Draws one hyper-parameter configuration from ``trial``, fits a
    ``CatBoostClassifier`` on the notebook-level ``X_train``/``y_train`` and
    returns its accuracy on ``X_val``/``y_val``.
    """
    search_space = dict(
        iterations=trial.suggest_int("iterations", 100, 1000),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 1.0),
        depth=trial.suggest_int("depth", 1, 10),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        bootstrap_type=trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
        random_strength=trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        bagging_temperature=trial.suggest_float("bagging_temperature", 0.0, 10.0),
        od_type=trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        od_wait=trial.suggest_int("od_wait", 10, 50),
        verbose=False,
    )

    candidate = CatBoostClassifier(**search_space)
    candidate.fit(X_train, y_train)
    val_predictions = candidate.predict(X_val)
    return accuracy_score(y_val, val_predictions)

In [163]:
# Lower the log level before creating the study, so the creation message is
# suppressed as well.
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Seed the sampler so the search proposes the same trials on every run.
study_cat = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_cat.optimize(objective_cat, n_trials=50, show_progress_bar=True)

Best trial: 9. Best value: 0.775244: 100%|██████████| 50/50 [00:23<00:00,  2.10it/s]


In [165]:
# Refit CatBoost with the best hyper-parameters found by the study
cat = CatBoostClassifier(verbose=False, **study_cat.best_params)
cat.fit(X_train, y_train)
# Score the refitted model on the held-out validation split
y_pred = cat.predict(X_val)
print('Accuracy: ', accuracy_score(y_true=y_val, y_pred=y_pred))

Accuracy:  0.7752442996742671


In [None]:
# Hand-picked CatBoost configuration, wrapped in a single-step pipeline
CatBoostClassifier_params = dict(
    learning_rate=0.015,
    max_depth=6,
    iterations=2000,
    verbose=False,
)
model = CatBoostClassifier(**CatBoostClassifier_params)
clf = Pipeline(steps=[("classifier", model)])
clf.fit(X_train, y_train)
# Pipeline.score delegates to the final estimator's score method
accuracy = clf.score(X_val, y_val)
print(f"Accuracy: {accuracy}")

In [None]:
# Predict on the test set, flatten to 1-D and decode the integer classes to labels
y_pred = pd.Series(np.ravel(clf.predict(X_test))).map(outcome_reverse)
# Assemble the submission table (test ids next to predicted labels) and save it
submission = pd.DataFrame({'id': X_test.index, 'outcome': y_pred})
submission.to_csv('data/submission_hope_0.csv', index=False)

## LightGBM classifier

In [None]:
!pip install lightgbm --quiet
!pip install optuna --quiet

In [None]:
from lightgbm import LGBMClassifier
import optuna


def objective_lgb(trial):
    """Optuna objective: validation accuracy of a LightGBM classifier.

    Draws one hyper-parameter configuration from ``trial``, fits an
    ``LGBMClassifier`` on the notebook-level ``X_train``/``y_train`` and
    returns its accuracy on ``X_val``/``y_val``.
    """
    search_space = dict(
        # Fixed settings
        objective='multiclass',
        metric='multi_logloss',
        random_state=42,
        # Tuned settings
        max_depth=trial.suggest_int('max_depth', 1, 15),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 1.0),
        n_estimators=trial.suggest_int('n_estimators', 50, 2000),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
        subsample=trial.suggest_float('subsample', 0.01, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.01, 1.0),
        reg_alpha=trial.suggest_float('reg_alpha', 0.01, 1.0),
        reg_lambda=trial.suggest_float('reg_lambda', 0.01, 1.0),
    )

    candidate = LGBMClassifier(**search_space)
    candidate.fit(X_train, y_train)
    val_predictions = candidate.predict(X_val)
    return accuracy_score(y_val, val_predictions)

In [None]:
# Lower the log level before creating the study, so the creation message is
# suppressed as well.
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Seed the sampler so the search proposes the same trials on every run.
study_lgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_lgb.optimize(objective_lgb, n_trials=50, show_progress_bar=True)

In [158]:
# Show the hyper-parameters of the best LightGBM trial.
print('Best parameters', study_lgb.best_params)

Best parameters {'max_depth': 5, 'learning_rate': 0.20248805571738931, 'n_estimators': 450, 'min_child_weight': 2, 'subsample': 0.16160269960607387, 'colsample_bytree': 0.7056957943468938, 'reg_alpha': 0.4637763742002264, 'reg_lambda': 0.45344865264116246}


In [160]:
# Refit LightGBM with the best hyper-parameters found by the study
best_lgb_params = study_lgb.best_params
lgb = LGBMClassifier(**best_lgb_params)
lgb.fit(X_train, y_train)
# Score the refitted model on the held-out validation split
y_pred = lgb.predict(X_val)
print('Accuracy: ', accuracy_score(y_true=y_val, y_pred=y_pred))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000982 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 408
[LightGBM] [Info] Number of data points in the train set: 1227, number of used features: 24
[LightGBM] [Info] Start training from score -0.720410
[LightGBM] [Info] Start training from score -1.146181
[LightGBM] [Info] Start training from score -1.631689
Accuracy:  0.7752442996742671


In [161]:
# Predict on the test set, flatten to 1-D and decode the integer classes to labels
y_pred = pd.Series(np.ravel(lgb.predict(X_test))).map(outcome_reverse)
# Assemble the submission table (test ids next to predicted labels) and save it
submission = pd.DataFrame({'id': X_test.index, 'outcome': y_pred})
submission.to_csv('data/submission_hope_lgb.csv', index=False)



## XGBoost Classifier

In [166]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import optuna
def objective_xg(trial):
    """Optuna objective: validation accuracy of an XGBoost classifier.

    Draws one hyper-parameter configuration from ``trial``, fits an
    ``XGBClassifier`` on the notebook-level ``X_train``/``y_train`` and
    scores it on ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Accuracy on the validation split (higher is better).
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    # and samples from the same log-uniform distribution.
    params = {
        'booster': trial.suggest_categorical('booster', ['gbtree']),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 300, 700),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 0.5, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'eval_metric': 'mlogloss',
        # NOTE(review): recent xgboost releases may ignore or warn about this
        # flag — confirm against the installed version.
        'use_label_encoder': False
    }
    model_xgb = XGBClassifier(**params)
    model_xgb.fit(X_train, y_train)
    y_pred = model_xgb.predict(X_val)
    return accuracy_score(y_val, y_pred)

In [167]:
# Lower the log level before creating the study, so the creation message is
# suppressed as well.
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Seed the sampler so the search proposes the same trials on every run.
study_xgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb.optimize(objective_xg, n_trials=50, show_progress_bar=True)

Best trial: 6. Best value: 0.788274: 100%|██████████| 50/50 [00:33<00:00,  1.48it/s]


In [168]:
# Refit XGBoost with the best hyper-parameters found by the study
best_xgb_params = study_xgb.best_params
xgb = XGBClassifier(**best_xgb_params)
xgb.fit(X_train, y_train)
# Score the refitted model on the held-out validation split
y_pred = xgb.predict(X_val)
print('Accuracy: ', accuracy_score(y_true=y_val, y_pred=y_pred))

Accuracy:  0.7882736156351792


In [169]:
# Predict on the test set, flatten to 1-D and decode the integer classes to labels
y_pred = pd.Series(np.ravel(xgb.predict(X_test))).map(outcome_reverse)
# Assemble the submission table (test ids next to predicted labels) and save it
submission = pd.DataFrame({'id': X_test.index, 'outcome': y_pred})
submission.to_csv('data/submission_hope_xgb.csv', index=False)

In [176]:
from sklearn.ensemble import VotingClassifier
# Combine the three tuned models in a soft-voting ensemble
ensemble_members = [('cat', cat), ('lgbm', lgb), ('xgb', xgb)]
voting = VotingClassifier(estimators=ensemble_members, voting='soft')
voting.fit(X_train, y_train)
voting_pred = voting.predict(X_val)

print('Accuracy: ', accuracy_score(y_true=y_val, y_pred=voting_pred))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000649 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 408
[LightGBM] [Info] Number of data points in the train set: 1227, number of used features: 24
[LightGBM] [Info] Start training from score -0.720410
[LightGBM] [Info] Start training from score -1.146181
[LightGBM] [Info] Start training from score -1.631689
Accuracy:  0.7850162866449512


In [177]:
# Predict on the test set, flatten to 1-D and decode the integer classes to labels
y_pred = pd.Series(np.ravel(voting.predict(X_test))).map(outcome_reverse)
# Assemble the submission table (test ids next to predicted labels) and save it
submission = pd.DataFrame({'id': X_test.index, 'outcome': y_pred})
submission.to_csv('data/submission_hope_voting.csv', index=False)

