## Contents

1. [Initial Data Exploration](https://www.kaggle.com/code/valentinbelyaev/1-0-intitial-eda-playground-series-s3e24)
2. [Building Baseline Models](https://www.kaggle.com/code/valentinbelyaev/2-0-baseline-model-playground-series-s3e24)
3. [Model ensembles & handle outliers](https://www.kaggle.com/valentinbelyaev/3-0-model-ensemble-playground-series-s3e24)
4. [Tuning XGBoost hyperparameters](https://www.kaggle.com/valentinbelyaev/4-0-model-ensemble-playground-series-s3e24)

In [1]:
import pandas as pd

In [2]:
# Read the competition's labelled (train) and unlabelled (test) tables
# from the Kaggle input directory.
df_train, df_test = (
    pd.read_csv("/kaggle/input/playground-series-s3e24/train.csv"),
    pd.read_csv("/kaggle/input/playground-series-s3e24/test.csv"),
)

In [3]:
# Preview the first five training rows (features plus the `smoking` target).
df_train.head(n=5)

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0,55,165,60,81.0,0.5,0.6,1,1,135,...,40,75,16.5,1,1.0,22,25,27,0,1
1,1,70,165,65,89.0,0.6,0.7,2,2,146,...,57,126,16.2,1,1.1,27,23,37,1,0
2,2,20,170,75,81.0,0.4,0.5,1,1,118,...,45,93,17.4,1,0.8,27,31,53,0,1
3,3,35,180,95,105.0,1.5,1.2,1,1,131,...,38,102,15.9,1,1.0,20,27,30,1,0
4,4,30,165,60,80.5,1.5,1.0,1,1,121,...,44,93,15.4,1,0.8,19,13,17,0,1


In [4]:
# Preview the first five test rows; the `smoking` column is absent here.
df_test.head(n=5)

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries
0,159256,40,165,70,84.0,1.2,1.2,1,1,130,...,186,49,115,14.2,1,0.9,19,25,32,0
1,159257,80,160,60,93.0,1.0,1.0,2,2,144,...,158,35,104,13.0,1,1.1,20,12,24,0
2,159258,60,170,70,86.5,0.6,0.7,1,1,117,...,173,39,88,15.4,1,1.4,38,60,36,0
3,159259,40,160,50,67.0,0.3,0.4,1,1,116,...,47,75,128,14.5,1,0.6,25,18,10,1
4,159260,40,170,75,89.4,1.0,0.9,1,1,132,...,100,39,123,16.5,1,1.0,30,39,27,1


## Train & Test split

In [5]:
# Features: every training column except the row identifier and the target.
# `columns=` already selects the column axis, so no separate `axis` argument is needed.
X = df_train.drop(columns=['id', 'smoking'])
# Target: the `smoking` label column.
y = df_train['smoking']

In [6]:
# Test features mirror the training features: drop only the row identifier.
# `columns=` already selects the column axis, so no separate `axis` argument is needed.
X_test = df_test.drop(columns=['id'])
# Keep the identifiers aside; they are re-attached to predictions when writing submissions.
test_ids = df_test['id']

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for validation; the fixed seed makes the
# split repeatable across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [8]:
# Sanity check: report the row/column counts of both splits.
print(
    f"Train shape: {X_train.shape}, {y_train.shape} | Validation shape: {X_valid.shape}, {y_valid.shape}"
)

Train shape: (111479, 22), (111479,) | Validation shape: (47777, 22), (47777,)


## Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with default hyperparameters.
# random_state pins the bootstrap / feature sampling so the fit is repeatable;
# n_jobs=-1 builds the trees on all available cores, which only affects training time.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

In [10]:
from sklearn.metrics import accuracy_score

print("Random Forest")

# Predict both splits, then score each; the gap between the two numbers
# indicates how much the model overfits the training data.
y_pred_train = rf.predict(X_train)
y_pred_valid = rf.predict(X_valid)

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_valid = accuracy_score(y_valid, y_pred_valid)

print(f"Accuracy score on training set: {accuracy_train*100}%")
print(f"Accuracy score on validation set: {accuracy_valid*100}%")

Random Forest
Accuracy score on training set: 99.99730891019834%
Accuracy score on validation set: 77.33637524331792%


In [11]:
# Predict the unlabelled test rows with the fitted random forest.
# NOTE(review): predict() yields class labels; if the competition is scored on
# probabilities (e.g. ROC AUC), predict_proba would be needed instead — confirm.
y_pred_test = rf.predict(X_test)

# Submission table: one row per test id with its predicted `smoking` value.
predictions = pd.DataFrame({"id": test_ids, "smoking": y_pred_test})
predictions.to_csv("baseline_random_forest.csv", index=False)

## Gradient Boosting (sklearn)

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline scikit-learn gradient boosting with default hyperparameters.
# The seed is fixed (same value as the split and the random forest) so that
# re-running the notebook reproduces the same fitted model.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)

In [13]:
print("Gradient Boosting (sklearn)")

# Predict both splits, then score each against its true labels.
y_pred_train = gb.predict(X_train)
y_pred_valid = gb.predict(X_valid)

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_valid = accuracy_score(y_valid, y_pred_valid)

print(f"Accuracy score on training set: {accuracy_train*100}%")
print(f"Accuracy score on validation set: {accuracy_valid*100}%")

Gradient Boosting (sklearn)
Accuracy score on training set: 77.79402398658043%
Accuracy score on validation set: 77.24428072084895%


In [14]:
# Predict the unlabelled test rows with the fitted gradient boosting model.
# NOTE(review): predict() yields class labels; if the competition is scored on
# probabilities (e.g. ROC AUC), predict_proba would be needed instead — confirm.
y_pred_test = gb.predict(X_test)

# Submission table: one row per test id with its predicted `smoking` value.
predictions = pd.DataFrame({"id": test_ids, "smoking": y_pred_test})
predictions.to_csv("baseline_gradient_boosting.csv", index=False)

## XGBoost

In [15]:
from xgboost import XGBClassifier

# Baseline XGBoost classifier, constructed with the library's default hyperparameters.
xgb = XGBClassifier()
xgb.fit(X=X_train, y=y_train)

In [16]:
print("XGBoost")

# Predict both splits, then score each against its true labels.
y_pred_train = xgb.predict(X_train)
y_pred_valid = xgb.predict(X_valid)

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_valid = accuracy_score(y_valid, y_pred_valid)

print(f"Accuracy score on training set: {accuracy_train*100}%")
print(f"Accuracy score on validation set: {accuracy_valid*100}%")

XGBoost
Accuracy score on training set: 82.2065142313799%
Accuracy score on validation set: 77.93289658203739%


In [17]:
# Predict the unlabelled test rows with the fitted XGBoost model.
# NOTE(review): predict() yields class labels; if the competition is scored on
# probabilities (e.g. ROC AUC), predict_proba would be needed instead — confirm.
y_pred_test = xgb.predict(X_test)

# Submission table: one row per test id with its predicted `smoking` value.
predictions = pd.DataFrame({"id": test_ids, "smoking": y_pred_test})
predictions.to_csv("baseline_xgboost.csv", index=False)

## CatBoost

In [18]:
from catboost import CatBoostClassifier

# Baseline CatBoost classifier; verbose=False turns off the training log output.
# All other hyperparameters are left at the library defaults.
cb = CatBoostClassifier(verbose=False)
cb.fit(X=X_train, y=y_train)

<catboost.core.CatBoostClassifier at 0x7ab66060faf0>

In [19]:
print("CatBoost")

# Predict both splits, then score each against its true labels.
y_pred_train = cb.predict(X_train)
y_pred_valid = cb.predict(X_valid)

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_valid = accuracy_score(y_valid, y_pred_valid)

print(f"Accuracy score on training set: {accuracy_train*100}%")
print(f"Accuracy score on validation set: {accuracy_valid*100}%")

CatBoost
Accuracy score on training set: 81.64497349276544%
Accuracy score on validation set: 78.37871779308036%


In [20]:
# Predict the unlabelled test rows with the fitted CatBoost model.
# NOTE(review): predict() yields class labels; if the competition is scored on
# probabilities (e.g. ROC AUC), predict_proba would be needed instead — confirm.
y_pred_test = cb.predict(X_test)

# Submission table: one row per test id with its predicted `smoking` value.
predictions = pd.DataFrame({"id": test_ids, "smoking": y_pred_test})
predictions.to_csv("baseline_catboost.csv", index=False)

## LightGBM

In [21]:
from lightgbm import LGBMClassifier

# Baseline LightGBM classifier, constructed with the library's default hyperparameters.
lgbm = LGBMClassifier()
lgbm.fit(X=X_train, y=y_train)

In [22]:
print("LightGBM")

# Predict both splits, then score each against its true labels.
y_pred_train = lgbm.predict(X_train)
y_pred_valid = lgbm.predict(X_valid)

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_valid = accuracy_score(y_valid, y_pred_valid)

print(f"Accuracy score on training set: {accuracy_train*100}%")
print(f"Accuracy score on validation set: {accuracy_valid*100}%")

LightGBM
Accuracy score on training set: 79.17545008476932%
Accuracy score on validation set: 77.97057161395651%


In [23]:
# Predict the unlabelled test rows with the fitted LightGBM model.
# NOTE(review): predict() yields class labels; if the competition is scored on
# probabilities (e.g. ROC AUC), predict_proba would be needed instead — confirm.
y_pred_test = lgbm.predict(X_test)

# Submission table: one row per test id with its predicted `smoking` value.
predictions = pd.DataFrame({"id": test_ids, "smoking": y_pred_test})
predictions.to_csv("baseline_lightgbm.csv", index=False)

## Test scores

1. Random Forest - 0.7786
2. Gradient Boosting - 0.7807
3. XGBoost - 0.7827
4. CatBoost - 0.7894
5. LightGBM - 0.7857