# Loading Libraries and Datasets

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from category_encoders import OneHotEncoder, GLMMEncoder, TargetEncoder, CatBoostEncoder
from sklearn import set_config
from sklearn.inspection import permutation_importance
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.metrics import roc_auc_score, roc_curve, make_scorer, f1_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer, StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import optuna

sns.set_theme(style = 'white', palette = 'viridis')
pal = sns.color_palette('viridis')

pd.set_option('display.max_rows', 100)
set_config(transform_output = 'pandas')
pd.options.mode.chained_assignment = None

In [2]:
train_df = pd.read_csv('train.csv', index_col = 'id')
test_df = pd.read_csv('test.csv', index_col = 'id')
orig_train = pd.read_csv('original.csv')

# Descriptive Statistics

Let's begin by taking a peek at our original training dataset first

In [3]:
train_df.head()

Unnamed: 0_level_0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,more_3_sec,...,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died
1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,more_3_sec,...,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized
2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,less_3_sec,...,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived
3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,more_3_sec,...,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived
4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,less_3_sec,...,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived


So we have 1235 rows at most, with some of them having missing value... This won't be an easy competition for sure. Another interesting point is that most features are categorical. Even some numerical features can be considered as categorical.

In [4]:
test_df.head()

Unnamed: 0_level_0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1235,no,adult,534053,38.6,40.0,20.0,normal,normal,normal_pink,less_3_sec,...,distend_small,42.0,7.5,clear,2.3,no,0,0,0,no
1236,yes,adult,528469,38.2,112.0,48.0,cool,reduced,bright_pink,more_3_sec,...,distend_small,44.0,6.0,serosanguious,2.6,no,2208,0,0,yes
1237,yes,adult,528178,37.7,66.0,12.0,cool,normal,bright_red,less_3_sec,...,distend_small,31.5,6.0,cloudy,1.6,yes,2205,0,0,yes
1238,no,adult,534784,37.1,88.0,20.0,cool,reduced,pale_cyanotic,less_3_sec,...,distend_large,75.0,81.0,,1.0,yes,1400,0,0,no
1239,yes,adult,529840,38.3,50.0,12.0,,normal,bright_pink,less_3_sec,...,distend_small,37.0,6.8,cloudy,2.6,yes,2208,0,0,yes


Again, we have a lot of missing values in the test dataset. Another funny thing is that, we only have one unique value for `lesion_3` feature in the test dataset.

Let's try grouping the categorical and numerical features now.

In [5]:
numerical_features = test_df._get_numeric_data().drop('lesion_3', axis = 1).columns
categorical_features = test_df.drop(numerical_features, axis = 1).drop('lesion_3', axis = 1).columns

The train and test datasets have similar distribution as expected, with original dataset being somewhat different.

There are... a lot of information to take in here. For example, we can see that most horses have taken surgeries before. Another one will be that most of the horses are adult.

Another peculiarity you might notice is that there are some categories in train dataset that aren't in test dataset. Features with such oddity are `peristalsis`, `nasogastric_reflux`, `rectal_exam_feces`. However, the missing values might be located within those null values across features. There is also the issue of one of the value in `pain` being different between training dataset (slight) and test dataset (moderate). We don't know if both values are supposed to exist or only one of them is.

# Metrics

As a reminder, we are using F1-score with micro-averaging for metric. The formula for F1-score in general is as follows

$$F_1 = 2 \frac{\mathrm{precision} \cdot \mathrm{recall}}{\mathrm{precision} + \mathrm{recall}} = \frac{2\mathrm{tp}}{2\mathrm{tp} + \mathrm{fp} + \mathrm{fn}}$$

In [6]:
def f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average = 'micro')

# Preprocessing

Because we know that there is only one unique value of `lesion_3` in test dataset, we can just remove any rows that have different value and drop the feature to make it simpler.

In [7]:
preprocessed_train = train_df[train_df.lesion_3 == 0].drop('lesion_3', axis = 1)
preprocessed_test = test_df.drop('lesion_3', axis = 1)
preprocessed_orig_train = orig_train[orig_train.lesion_3 == 0].drop('lesion_3', axis = 1)

# Preparation

This is where we start preparing everything if we want to start building machine learning models.

In [8]:
X = pd.concat([preprocessed_train, preprocessed_orig_train])
y = X.pop('outcome')

y_map = {k : v for v, k in enumerate(train_df.outcome.unique())}
y_reverse_map = {v : k for k, v in y_map.items()}

y = y.map(y_map)

OH = OneHotEncoder(cols = categorical_features)
X = OH.fit_transform(X, y)
preprocessed_test = OH.transform(preprocessed_test)

f1_scorer = make_scorer(f1)

seed = 42
splits = 5
repeats = 4
rskf = RepeatedStratifiedKFold(n_splits = splits, random_state = seed, n_repeats = repeats)
skf = StratifiedKFold(n_splits = splits, random_state = seed, shuffle = True)
np.random.seed(seed)

# Model

Let's start by evaluating the performance of our model first without any data modification.

In [9]:
def cross_val_score(model, cv = rskf, label = '', include_original = False):
    
    #initiate prediction arrays and score lists
    val_predictions = np.zeros((len(X)))
    #train_predictions = np.zeros((len(sample)))
    train_scores, val_scores = [], []
    
    #training model, predicting prognosis probability, and evaluating metrics
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        
        #define train set
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        
        #define validation set
        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx]
        
        if include_original:
            X_train = pd.concat([X_train, OH.transform(preprocessed_orig_train.drop('outcome', axis = 1))])
            y_train = pd.concat([y_train, preprocessed_orig_train.outcome.map(y_map)])
        
        #train model
        model.fit(X_train, y_train)
        
        #make predictions
        train_preds = model.predict(X_train)
        val_preds = model.predict(X_val)
                  
        #val_predictions[val_idx] += val_preds / repeats
        
        #evaluate model for a fold
        train_score = f1(y_train, train_preds)
        val_score = f1(y_val, val_preds)
        
        #append model score for a fold to list
        train_scores.append(train_score)
        val_scores.append(val_score)
    
    print(f'Val Score: {np.mean(val_scores):.5f} ± {np.std(val_scores):.5f} | Train Score: {np.mean(train_scores):.5f} ± {np.std(train_scores):.5f} | {label}')
    
    return val_scores, val_predictions

# Inference and Submission

Finally, let's train our chosen model on the whole train dataset and do inference on the test dataset.

In [10]:
# import knn imputer

# Create the pipeline for your model
model = make_pipeline(SimpleImputer(), 
                      HistGradientBoostingClassifier(max_iter=244,
                                                     max_leaf_nodes=103,
                                                     learning_rate=0.07839950629804086,
                                                     min_samples_leaf=23,
                                                     random_state=42))

# Initialize score_list and oof_list
score_list, oof_list = pd.DataFrame(), pd.DataFrame()

# Define the label (make sure to define this)
label = "HistGradientBoosting_maxIter244_maxLeaf103"

# Run cross_val_score
score_list[label], _ = cross_val_score(model, label=label, include_original=True)

Val Score: 0.78299 ± 0.01816 | Train Score: 1.00000 ± 0.00000 | HistGradientBoosting_maxIter244_maxLeaf103


- rskf - Val Score: 0.78299 ± 0.01816 | Train Score: 1.00000 ± 0.00000 | HistGradientBoosting_maxIter244_maxLeaf103
- skf - Val Score: 0.78055 ± 0.02586 | Train Score: 1.00000 ± 0.00000 | HistGradientBoosting_maxIter244_maxLeaf103
- rskf/knn - Val Score: 0.77923 ± 0.01817 | Train Score: 1.00000 ± 0.00000 | HistGradientBoosting_maxIter244_maxLeaf103

In [11]:
model.fit(X, y)
submission = preprocessed_test.copy()
submission['outcome'] = model.predict(submission)
submission.outcome = submission.outcome.map(y_reverse_map)

In [None]:
submission.outcome.to_csv('submission.csv')