# Import Packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from pathlib import Path
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import fetch_california_housing

from sklearn.feature_extraction.text import CountVectorizer

import optuna
from optuna.samplers import TPESampler

In [None]:
# Directory holding the competition CSVs. Edit this single constant to point
# the notebook at a different copy of the data.
DATA_DIR = Path(r"D:\source\repos\Kaggle_Tabular_Playground_Series-ML\Jan-2023-S2\data")

train_set = pd.read_csv(DATA_DIR / "train.csv")
test_set = pd.read_csv(DATA_DIR / "test.csv")
sample_sub = pd.read_csv(DATA_DIR / "sample_submission.csv")

In [None]:
# Integer-encode every categorical column into a new "<column>_v" column.
#
# The vocabulary is learned from the TRAINING split only and reused for the
# test split. Re-fitting on the test split (as was done before) can produce a
# different vocabulary, so the same category could receive different codes in
# train and test.
columns_to_vectorize = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
for vector_target in columns_to_vectorize:
    vectorizer = CountVectorizer()
    vectorizer.fit(train_set[vector_target])
    for frame in (train_set, test_set):
        token_counts = vectorizer.transform(frame[vector_target]).toarray()
        # argmax returns the index of the first highest token count in each row.
        # NOTE(review): a test value made only of tokens unseen in training
        # yields an all-zero row and therefore code 0 -- confirm this cannot occur.
        frame[f'{vector_target}_v'] = token_counts.argmax(axis=1)

In [None]:
train_set

In [None]:
test_set

In [None]:
train_set.columns

In [None]:
# Report (rows, columns) for both splits.
for frame_name, frame in (('train_set', train_set), ('test_set', test_set)):
    print(f'{frame_name} shape: ', frame.shape)

In [None]:
train_set.info()

In [None]:
train_set.describe()

## Missing Data

In [None]:
# Per-column missing-value summary: absolute count and fraction of rows,
# each sorted from most to least missing.
null_counts = train_set.isnull().sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / train_set.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat({'Total': total, 'Percent': percent}, axis=1)
missing_data.head(10)

## Data visualisation

In [None]:
# Model input columns: the numeric columns plus the integer-encoded "<name>_v"
# categorical columns.
# NOTE(review): 'id' is included as a feature; it looks like a row identifier --
# confirm that feeding it to the models is intentional.
features = ['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'gender_v', 'ever_married_v', 'work_type_v', 'Residence_type_v', 'smoking_status_v']
# Prediction target, kept as a one-element list so that train_set[target]
# selects a DataFrame rather than a Series.
target = ['stroke']

In [None]:
# One histogram per feature plus the target, laid out on a 3x4 grid.
plot_columns = features + target
fig, axes = plt.subplots(3, 4, figsize=(20, 12))
for column_name, axis in zip(plot_columns, axes.flatten()):
    sns.histplot(train_set[column_name], ax=axis)
plt.show()

### Correlation matrix (heatmap style)

In [None]:
# Pairwise correlations between all features and the target, drawn as an
# annotated heatmap on a fixed [-1, 1] colour scale.
corrmat = train_set[features + target].corr()
f, ax = plt.subplots(figsize=(12, 9))
heatmap_style = dict(square=True, annot=True, fmt='.2f', cmap='seismic', vmin=-1, vmax=1)
sns.heatmap(corrmat, ax=ax, **heatmap_style)

# Train Model

In [None]:
import lightgbm as lgbm
from xgboost import XGBRegressor
import xgboost as xgb
from catboost import CatBoostRegressor
from lightgbm.sklearn import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import roc_curve, auc,recall_score,precision_score

In [None]:
# Min-max scale the features: the scaler is fit on the training rows and the
# same transform is applied to the test rows.
scaler = MinMaxScaler()
X = scaler.fit(train_set[features]).transform(train_set[features])
X_test = scaler.transform(test_set[features])

# The target gets its own scaler; note that `scaler` now refers to the target
# scaler, not the feature scaler above.
scaler = MinMaxScaler()
Y = scaler.fit(train_set[target]).transform(train_set[target])

In [None]:
# Shuffled 10-fold splitter with a fixed seed, plus empty containers for the
# fitted models and their per-fold scores.
kf = KFold(shuffle=True, n_splits=10, random_state=1)
clfs, err = [], []

In [None]:
# Observed range of the scaled target; my_rmse clips to these bounds.
y_min, y_max = Y.min(), Y.max()

print(y_min, y_max)

def my_rmse(y_true, y_hat, lower=None, upper=None):
    """Root-mean-squared error after clipping both inputs and imputing NaNs.

    Both arrays are copied, clipped to ``[lower, upper]``, and any NaN entries
    are replaced by the mean of the remaining (clipped) values in the same
    column. The number of NaNs found is printed as a diagnostic. The caller's
    arrays are never modified.

    Parameters
    ----------
    y_true, y_hat : array-like
        Ground truth and predictions, shaped ``(n,)`` or ``(n, k)``. A 1-D
        input is treated as a single column, so ``(n,)`` and ``(n, 1)`` can
        be mixed.
    lower, upper : float, optional
        Clipping bounds. When omitted, the module-level ``y_min`` / ``y_max``
        are used.

    Returns
    -------
    float
        RMSE computed per output column and averaged over columns.

    Raises
    ------
    ValueError
        If an input is empty or the two inputs disagree in shape.
    """
    lower = y_min if lower is None else lower
    upper = y_max if upper is None else upper

    def _prepare(values):
        # np.array(...) copies, so clipping/imputing cannot leak back to the caller.
        arr = np.array(values, dtype=float)
        if arr.size == 0:
            raise ValueError('my_rmse received an empty array')
        arr = np.clip(arr, lower, upper)  # NaNs pass through clip unchanged
        nan_mask = np.isnan(arr)
        if nan_mask.any():
            print(nan_mask.sum())
            # The np.where result must be kept; previously it was discarded,
            # leaving the NaNs in place.
            arr = np.where(nan_mask, np.nanmean(arr, axis=0), arr)
        return arr.reshape(arr.shape[0], -1)

    true_2d = _prepare(y_true)
    hat_2d = _prepare(y_hat)
    if true_2d.shape != hat_2d.shape:
        raise ValueError(
            'my_rmse shape mismatch: {} vs {}'.format(true_2d.shape, hat_2d.shape)
        )

    # Computed with NumPy instead of mean_squared_error(..., squared=False),
    # whose `squared` argument has been removed from recent scikit-learn.
    per_output_rmse = np.sqrt(np.mean((true_2d - hat_2d) ** 2, axis=0))
    return float(per_output_rmse.mean())

In [None]:
def xgb_objective(trial):
    """Optuna objective: fit one XGBoost model and return its validation ROC AUC.

    A new stratified 60/40 split of the module-level ``X`` / ``Y`` is drawn on
    every call. The fitted model is stored in the module-level ``xgb_ranks``
    dict keyed by its AUC so the best trials can be ensembled afterwards; two
    trials with exactly the same AUC overwrite each other.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Area under the ROC curve on the held-out validation part.
    """
    # Split the train data for each trial (unseeded, so every trial sees a different split).
    X_train, X_valid, y_train, y_valid = train_test_split(X, Y, stratify=Y, test_size=0.4)

    param_grid = {
        'max_depth': trial.suggest_int('max_depth', 4, 20),  # Extremely prone to overfitting!
        # `step` defaults to 1; it is no longer passed positionally because
        # newer Optuna releases deprecate positional `step`.
        'n_estimators': trial.suggest_int('n_estimators', 2, 100),  # Extremely prone to overfitting!
        'eta': trial.suggest_float('eta', 0.0007, 0.113),  # Most important parameter.
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 40),  # L2 regularization
    }

    reg = xgb.XGBModel(
        # These parameters should help with trial speed.
        # NOTE(review): 'gpu_hist' / 'gpu_predictor' look tied to older XGBoost
        # releases -- confirm against the installed version.
        objective='binary:logistic',
        tree_method='gpu_hist',
        booster='gbtree',
        predictor='gpu_predictor',
        n_jobs=4,
        eval_metric='auc',
        **param_grid
    )

    reg.fit(X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=False)

    preds = reg.predict(X_valid)
    fpr, tpr, _ = roc_curve(y_valid, preds)
    roc_auc = auc(fpr, tpr)

    # Keep the fitted model so the top trials can be averaged later.
    xgb_ranks[roc_auc] = reg

    # Returns the validation ROC AUC for the trial (the study maximizes it).
    # Readers may want to try returning a cross validation score here.
    print(roc_auc)
    return roc_auc

In [None]:
# AUC -> fitted model, filled in by xgb_objective during the search.
xgb_ranks = {}

# Time-boxed search: one hour, maximizing the validation AUC.
train_time = 1 * 60 * 60
study = optuna.create_study(direction='maximize', sampler=TPESampler(), study_name='XGBRegressor')
study.optimize(xgb_objective, timeout=train_time)

print('Number of finished trials: ', len(study.trials))
print('Best trial:')
trial = study.best_trial

print(f'\tValue: {trial.value}')
print('\tParams: ')
for key, value in trial.params.items():
    print(f'\t\t{key}: {value}')

In [None]:
trial.params

In [None]:
# AUC keys of the (up to) 50 best XGBoost trials, in ascending order.
top_50 = sorted(xgb_ranks)[-50:]

In [None]:
top_50

In [None]:
# Average the test-set predictions of the retained top-scoring XGBoost trials.
# `X_test` is the scaled test matrix built in the scaling cell; the name
# `test_scaled` used here before is never defined in this notebook.
me_preds = [xgb_ranks[key].predict(X_test) for key in top_50]

final_preds = np.stack(me_preds).mean(0)

In [None]:
# Write the averaged XGBoost predictions in submission format (id, stroke).
XGB_submission = pd.DataFrame({'id': test_set.id, 'stroke': final_preds})
XGB_submission.to_csv(
    r'D:\source\repos\Kaggle_Tabular_Playground_Series-ML\Jan-2023-S2\XGB.csv',
    index=False,
)

In [None]:
XGB_submission

In [None]:
def lgbm_objective(trial):
    """Optuna objective: fit one LightGBM regressor and return its validation ROC AUC.

    A new stratified 60/40 split of the module-level ``X`` / ``Y`` is drawn on
    every call. Training stops early after 100 rounds without improvement on
    the validation set. The fitted model is stored in the module-level
    ``lgbm_ranks`` dict keyed by its AUC; two trials with exactly the same AUC
    overwrite each other.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Area under the ROC curve on the held-out validation part.
    """
    # Split the train data for each trial.
    X_train, X_valid, y_train, y_valid = train_test_split(X, Y, stratify=Y, test_size=0.4)

    param_grid = {
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 100, 1000),
        'num_leaves': trial.suggest_int('num_leaves', 100, 10000),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 1000),
        'reg_lambda': trial.suggest_int('reg_lambda', 1, 100),
        'n_estimators': trial.suggest_int('n_estimators', 10, 100000),
    }

    clf = lgbm.LGBMRegressor(**param_grid,
                             metric='AUC',
                             random_state=1)

    # The `verbose` argument of fit() was removed in LightGBM 4 and raises a
    # TypeError there; logging is configured through callbacks instead.
    # log_evaluation(period=0) keeps per-iteration metric output silent,
    # matching the former verbose=False.
    clf.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[
            lgbm.early_stopping(100, verbose=True),
            lgbm.log_evaluation(period=0),
        ],
    )
    preds = clf.predict(X_valid)

    fpr, tpr, _ = roc_curve(y_valid, preds)
    roc_auc = auc(fpr, tpr)

    # Keep the fitted model so the top trials can be averaged later.
    lgbm_ranks[roc_auc] = clf

    # Returns the validation ROC AUC for the trial (the study maximizes it).
    # Readers may want to try returning a cross validation score here.
    print(roc_auc)
    return roc_auc

In [None]:
# AUC -> fitted model, filled in by lgbm_objective during the search.
lgbm_ranks = {}

# Time-boxed search: one hour, maximizing the validation AUC.
train_time = 1 * 60 * 60
# The study name previously read 'XGBRegressor' (copy-paste from the XGBoost
# cell); it now names the model family actually being tuned.
study = optuna.create_study(direction='maximize', sampler=TPESampler(), study_name='LGBMRegressor')
study.optimize(lgbm_objective, timeout=train_time)

print('Number of finished trials: ', len(study.trials))
print('Best trial:')
trial = study.best_trial

print('\tValue: {}'.format(trial.value))
print('\tParams: ')
for key, value in trial.params.items():
    print('\t\t{}: {}'.format(key, value))

In [None]:
trial.params

In [None]:
# AUC keys of the (up to) 50 best LightGBM trials, in ascending order.
top_50 = sorted(lgbm_ranks)[-50:]
top_50

In [None]:
# Average the test-set predictions of the retained top-scoring LightGBM trials.
# `X_test` is the scaled test matrix built in the scaling cell; the name
# `test_scaled` used here before is never defined in this notebook.
me_preds = [lgbm_ranks[key].predict(X_test) for key in top_50]

final_preds = np.stack(me_preds).mean(0)

In [None]:
# Write the averaged LightGBM predictions in submission format (id, stroke).
LGBM_submission = pd.DataFrame({'id': test_set.id, 'stroke': final_preds})
LGBM_submission.to_csv(
    r'D:\source\repos\Kaggle_Tabular_Playground_Series-ML\Jan-2023-S2\LGBM.csv',
    index=False,
)

In [None]:
def cat_objective(trial):
    """Optuna objective: fit one CatBoost regressor and return its validation ROC AUC.

    Every call draws a new stratified 60/40 split of the module-level
    ``X`` / ``Y``, trains with early stopping (100 rounds) against the
    validation part, and records the fitted model in the module-level
    ``cat_ranks`` dict under its AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Area under the ROC curve on the validation part.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(X, Y, stratify=Y, test_size=0.4)

    # Sampled in a fixed order so the trial sees the same sequence of suggestions.
    sampled = dict(
        depth=trial.suggest_int('depth', 1, 16),
        learning_rate=trial.suggest_float('learning_rate', 0.001, 0.1),
        rsm=trial.suggest_float('rsm', 0.001, 0.9),
        subsample=trial.suggest_float('subsample', 0.1, 1),
        min_data_in_leaf=trial.suggest_int('min_data_in_leaf', 1, 100),
        l2_leaf_reg=trial.suggest_int('l2_leaf_reg', 1, 20),
        random_strength=trial.suggest_float('random_strength', 0.001, 0.9),
    )

    model = CatBoostRegressor(
        iterations=20000,
        bootstrap_type='Bernoulli',
        grow_policy='SymmetricTree',
        eval_metric='AUC',
        task_type="CPU",
        random_state=1,
        **sampled,
    )
    model.fit(
        X_train,
        y_train,
        eval_set=(X_valid, y_valid),
        early_stopping_rounds=100,
        verbose=1000,
    )

    valid_scores = model.predict(X_valid)
    fpr, tpr, _ = roc_curve(y_valid, valid_scores)
    score = auc(fpr, tpr)

    # Remember the fitted model so the best trials can be averaged later.
    cat_ranks[score] = model

    # The trial's value is the validation ROC AUC.
    print(score)
    return score

In [None]:
# AUC -> fitted model, filled in by cat_objective during the search.
cat_ranks = {}

# Time-boxed search: one hour, maximizing the validation AUC.
train_time = 1 * 60 * 60
# The study name previously read 'XGBRegressor' (copy-paste from the XGBoost
# cell); it now names the model family actually being tuned.
study = optuna.create_study(direction='maximize', sampler=TPESampler(), study_name='CatBoostRegressor')
study.optimize(cat_objective, timeout=train_time)

print('Number of finished trials: ', len(study.trials))
print('Best trial:')
trial = study.best_trial

print('\tValue: {}'.format(trial.value))
print('\tParams: ')
for key, value in trial.params.items():
    print('\t\t{}: {}'.format(key, value))

In [None]:
trial.params

In [None]:
# AUC keys of the (up to) 50 best CatBoost trials, in ascending order.
top_50 = sorted(cat_ranks)[-50:]
top_50

In [None]:
# Average the test-set predictions of the retained top-scoring CatBoost trials.
# `X_test` is the scaled test matrix built in the scaling cell; the name
# `test_scaled` used here before is never defined in this notebook.
me_preds = [cat_ranks[key].predict(X_test) for key in top_50]

final_preds = np.stack(me_preds).mean(0)

In [None]:
# Write the averaged CatBoost predictions in submission format (id, stroke).
cat_submission = pd.DataFrame({'id': test_set.id, 'stroke': final_preds})
cat_submission.to_csv(
    r'D:\source\repos\Kaggle_Tabular_Playground_Series-ML\Jan-2023-S2\CAT.csv',
    index=False,
)

In [None]:
cat_submission

In [None]:
# Weighted blend of the three per-model submissions. LightGBM currently has
# weight 0, so the blend is 60% XGBoost + 40% CatBoost.
w_xgb, w_lgbm, w_cat = 0.60, 0.00, 0.40
final_test_preds = (
    XGB_submission['stroke'] * w_xgb
    + LGBM_submission['stroke'] * w_lgbm
    + cat_submission['stroke'] * w_cat
)
final_submission = pd.DataFrame({'id': test_set.id, 'stroke': final_test_preds})
final_submission.to_csv(
    r'D:\source\repos\Kaggle_Tabular_Playground_Series-ML\Jan-2023-S2\final_submission.csv',
    index=False,
)

In [None]:
# 10-fold cross-validation of a CatBoost regressor with fixed hyper-parameters.
# One model per fold is kept in `clfs` and its validation ROC AUC in `err`.
kf = KFold(n_splits=10, random_state=1, shuffle=True)
# Both containers are reset so re-running this cell does not keep appending
# models and scores from earlier runs.
clfs = []
err = []

# `X` is the scaled training matrix from the scaling cell (`train_scaled`,
# used here before, is never defined in this notebook).
y_all = train_set['stroke']

for i, (train_index, val_index) in enumerate(kf.split(X)):
    X_train, X_val = X[train_index, :], X[val_index, :]
    # KFold yields positional indices, so index the target positionally too.
    y_train, y_val = y_all.iloc[train_index], y_all.iloc[val_index]

    clf = CatBoostRegressor(iterations=20000,
                            depth=9,
                            learning_rate=0.01,
                            rsm=0.88,
                            subsample=0.795,
                            min_data_in_leaf=35,
                            l2_leaf_reg=8,
                            random_strength=0.63,
                            bootstrap_type='Bernoulli',
                            grow_policy='SymmetricTree',
                            eval_metric='AUC',
                            task_type="CPU",
                            random_state=1,)

    clf.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100, verbose=1000)
    preds = clf.predict(X_val)

    fpr, tpr, _ = roc_curve(y_val, preds)
    roc_auc = auc(fpr, tpr)
    err.append(roc_auc)
    clfs.append(clf)
    print(f'roc_auc on fold {i}: {roc_auc}')
    print('-'*50)

# Average over the folds actually run; the message previously said "five fold"
# while dividing by a hard-coded 10.
print(f'Average roc_auc ({len(err)} fold): {sum(err)/len(err)}')

In [None]:
len(clfs)

## Feature importance

In [None]:
# Mean feature importance over the models stored in clfs[:10].
fold_models = clfs[:10]
if fold_models:
    # Sized from `features` rather than a hard-coded 11.
    imp = np.zeros(len(features))
    for clf in fold_models:
        imp += clf.feature_importances_
    # Divide by the number of models actually present, not a fixed 10.
    mean_imp = imp / len(fold_models)
    order = np.argsort(mean_imp)

    # The header names the model class actually stored in this slice; the
    # fixed 'XGBoost' label did not have to match the contents of `clfs`.
    model_name = type(fold_models[0]).__name__
    print(f'----------------------------{model_name}----------------------------')
    plt.barh([features[i] for i in order], mean_imp[order])
    plt.show()
else:
    print('No fitted models in clfs[:10]; skipping feature-importance plot.')

In [None]:
# Mean feature importance over the models stored in clfs[10:20].
fold_models = clfs[10:20]
if fold_models:
    # Sized from `features` rather than a hard-coded 11.
    imp = np.zeros(len(features))
    for clf in fold_models:
        imp += clf.feature_importances_
    # Divide by the number of models actually present, not a fixed 10.
    mean_imp = imp / len(fold_models)
    order = np.argsort(mean_imp)

    # The header names the model class actually stored in this slice; the
    # fixed 'LGBM' label did not have to match the contents of `clfs`.
    model_name = type(fold_models[0]).__name__
    print(f'----------------------------{model_name}----------------------------')
    plt.barh([features[i] for i in order], mean_imp[order])
    plt.show()
else:
    # An empty slice used to produce an all-zero bar chart.
    print('No fitted models in clfs[10:20]; skipping feature-importance plot.')

In [None]:
# Mean feature importance over the models stored in clfs[20:30].
fold_models = clfs[20:30]
if fold_models:
    # Sized from `features` rather than a hard-coded 11.
    imp = np.zeros(len(features))
    for clf in fold_models:
        imp += clf.feature_importances_
    # Divide by the number of models actually present, not a fixed 10.
    mean_imp = imp / len(fold_models)
    order = np.argsort(mean_imp)

    # The header names the model class actually stored in this slice; the
    # fixed 'CatBoost' label did not have to match the contents of `clfs`.
    model_name = type(fold_models[0]).__name__
    print(f'----------------------------{model_name}----------------------------')
    plt.barh([features[i] for i in order], mean_imp[order])
    plt.show()
else:
    # An empty slice used to produce an all-zero bar chart.
    print('No fitted models in clfs[20:30]; skipping feature-importance plot.')

# Making submission

In [None]:
# Test-set predictions of every stored fold model, grouped by position in
# `clfs` (slots 0-9, 10-19 and 20-29). `X_test` is the scaled test matrix from
# the scaling cell; `test_scaled`, used here before, is never defined.
# NOTE(review): the visible cross-validation cell only appends 10 models to
# `clfs`, so the second and third groups may be empty -- confirm.
test_preds1 = [clf.predict(X_test) for clf in clfs[:10]]
test_preds2 = [clf.predict(X_test) for clf in clfs[10:20]]
test_preds3 = [clf.predict(X_test) for clf in clfs[20:30]]

In [None]:
clfs[:5]

In [None]:
test_preds3

In [None]:
def _average_fold_predictions(fold_preds):
    """Average a list of per-model prediction arrays into one 1-D array.

    - A 1-D ndarray is treated as an already averaged result and returned
      unchanged, so re-running this cell does not collapse it to a scalar.
    - An empty list (no models in that group) yields zeros, with a warning,
      instead of the ValueError that np.stack raises on an empty sequence.
    """
    if isinstance(fold_preds, np.ndarray) and fold_preds.ndim == 1:
        return fold_preds
    if len(fold_preds) == 0:
        print('Warning: no predictions in this group; using zeros.')
        return np.zeros(len(test_set))
    return np.stack(fold_preds).mean(0)


test_preds1 = _average_fold_predictions(test_preds1)
test_preds2 = _average_fold_predictions(test_preds2)
test_preds3 = _average_fold_predictions(test_preds3)

In [None]:
# Candidate blend weights, one row per attempt, in the order
# (test_preds1, test_preds2, test_preds3).
# NOTE(review): this list is reassigned by the following cells before it is
# used, so it has no effect when the notebook is run top to bottom.
# NOTE(review): the second row sums to 1.4, unlike the other rows (1.0) --
# possibly a typo.
attempts = [
    [0.2, 0.6, 0.2],
    [0.6, 0.2, 0.6],
    [0.3, 0.3, 0.4],
    [0.3, 0.4, 0.3],
    [0.5, 0.3, 0.2]
]

In [None]:
# Candidate blend weights, one row per attempt, in the order
# (test_preds1, test_preds2, test_preds3).
# NOTE(review): this list is reassigned by the next cell before it is used,
# so it has no effect when the notebook is run top to bottom.
attempts = [
    [0.6, 0.3, 0.1],
    [0.7, 0.2, 0.1],
    [0.7, 0.1, 0.2],
    [0.6, 0.2, 0.2],
    [0.6, 0.1, 0.3]
]

In [None]:
# Candidate blend weights, one row per attempt, in the order
# (test_preds1, test_preds2, test_preds3). This is the definition in effect
# when the blend loop below runs.
attempts = [
    [0.6, 0.25, 0.15],
    [0.6, 0.2, 0.2],
    [0.6, 0.15, 0.25],
    [0.65, 0.15, 0.2],
    [0.65, 0.1, 0.25]
]

In [None]:
# Write one submission file per candidate weight row in `attempts`.
for i, row in enumerate(attempts):
    test_preds = test_preds1*row[0] + test_preds2*row[1] + test_preds3*row[2]
    # The target column is 'stroke', as in the other submissions in this
    # notebook; 'MedHouseVal' was a leftover from a different dataset.
    submission = pd.DataFrame(data={'id': test_set.id, 'stroke': test_preds})
    submission.to_csv(f'submission{i}.csv', index=False)

In [None]:
# Final blend of the three averaged prediction groups (weights 0.60 / 0.00 / 0.40),
# written out in submission format.
w1, w2, w3 = 0.60, 0.00, 0.40
test_preds = test_preds1 * w1 + test_preds2 * w2 + test_preds3 * w3
submission = pd.DataFrame({'id': test_set.id, 'stroke': test_preds})
submission.to_csv(
    r'D:\source\repos\Kaggle_Tabular_Playground_Series-ML\Jan-2023-S2\submission.csv',
    index=False,
)

In [None]:
submission.describe()

In [None]:
# Rebuild the submission frame from the final blend and preview it. The target
# column is 'stroke'; 'MedHouseVal' was a leftover from a different dataset and
# would have been written to submission.csv by the next cell.
submission = pd.DataFrame(data={'id': test_set.id, 'stroke': test_preds})
submission.head()

In [None]:
submission.to_csv('submission.csv', index=False)

**If you found this notebook useful, please upvote!**  
**Thank you!**🙏