In [None]:
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-2.10.1-py3-none-any.whl (308 kB)
[K     |████████████████████████████████| 308 kB 7.4 MB/s 
Collecting alembic
  Downloading alembic-1.8.0-py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 65.6 MB/s 
[?25hCollecting cliff
  Downloading cliff-3.10.1-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 8.7 MB/s 
Collecting colorlog
  Downloading colorlog-6.6.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting Mako
  Downloading Mako-1.2.0-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 7.2 MB/s 
[?25hCollecting autopage>=0.4.0
  Downloading autopage-0.5.1-py3-none-any.whl (29 kB)
Collecting pbr!=2.1.0,>=2.0.0
  Downloading pbr-5.9.0-py2.py3-none-any.whl (112 kB)
[K     |████████████████████████████████| 112 kB 53

In [None]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [None]:
# Analysis and Preprocessing
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# Modeling and Evaluation
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import optuna
from lightgbm import LGBMClassifier

In [None]:
# Alternative input (down-sampled variant stored on Google Drive), kept for reference:
# df = pd.read_csv('/content/drive/MyDrive/SEMESTER 4 COURSES/Artificial Intelligence/Prototype Project/Datasets/downsampled_df.csv')

# Read the over-sampled dataset from the Colab working directory and preview it.
DATA_PATH = '/content/oversampled_df.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Gender,Age,Hypertension,Heart Disease,Ever Married,Avg Glucose Level,Bmi,Work Type_Private,Work Type_Self-employed,Work Type_Govt_job,Work Type_children,Work Type_Never_worked,Residence Type_Urban,Residence Type_Rural,Smoking Status_formerly smoked,Smoking Status_never smoked,Smoking Status_smokes,Smoking Status_Unknown,Stroke
0,1,1.051434,0,1,1,2.706375,1.005086,0,0,1,0,0,0,1,0,1,0,0,1
1,0,0.78607,0,0,1,2.121559,-0.098981,0,0,0,1,0,1,0,0,0,1,0,1
2,1,1.62639,0,1,1,-0.005028,0.472536,0,0,1,0,0,1,0,0,0,1,0,1
3,0,0.255342,0,0,1,1.437358,0.719327,0,0,1,0,0,0,1,0,0,0,1,1
4,0,1.582163,1,0,1,1.501184,-0.631531,0,0,0,1,0,1,0,0,0,1,0,1


In [None]:
# Separate the target column from the predictor columns.
TARGET = 'Stroke'
y = df[TARGET]
X = df.drop(columns = TARGET)

# Stratified 80/20 hold-out split with a fixed seed so the partition is repeatable.
# NOTE(review): the input file is named "oversampled_df.csv"; if over-sampling was
# applied before this split, resampled copies of training rows may land in the
# test set -- confirm against the preprocessing notebook.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    stratify = y,
    random_state = 1,
)

In [None]:
def objective(trial):
    """Optuna objective: fit one LightGBM candidate and score it by validation recall.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters below.

    Returns
    -------
    float
        Recall on the validation split (scikit-learn's default binary recall,
        i.e. for label 1). The study is expected to maximize this value.
    """
    # Fixed random_state: every trial is scored on the same train/validation split.
    X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size = 0.2,
                                                          stratify = y_train_full, random_state = 1)

    # Search space. 'min_child_samples' is intentionally not sampled: it is an
    # alias of 'min_data_in_leaf' in LightGBM, so passing both only adds a
    # search dimension whose value is ignored.
    params_grid = {'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-5, 100, log = True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-5, 100, log = True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 5000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'max_bin': trial.suggest_int('max_bin', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.2),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 200, 1000),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000)}

    # Train with early stopping on the validation split, then predict on it.
    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords are
    # accepted by the LightGBM release this notebook ran on; newer releases are
    # believed to require callbacks instead -- confirm before upgrading.
    lgbm = LGBMClassifier(**params_grid)
    lgbm.fit(X_train, y_train, eval_set = [(X_valid, y_valid)], verbose = 0, early_stopping_rounds = 100)
    predictions = lgbm.predict(X_valid)

    # recall_score takes (y_true, y_pred); with the arguments swapped the
    # returned number is precision, not recall.
    recall = recall_score(y_valid, predictions)
    return recall

In [None]:
# TPE is Optuna's default sampler; seeding it makes the sequence of sampled
# hyperparameters (and therefore best_params) reproducible across reruns.
study = optuna.create_study(direction = 'maximize',
                            sampler = optuna.samplers.TPESampler(seed = 1))
study.optimize(objective, n_trials = 10)

[32m[I 2022-06-17 04:44:25,667][0m A new study created in memory with name: no-name-865971ed-de9a-4a30-bbb7-d9f3bdf94ffb[0m
[32m[I 2022-06-17 04:44:30,304][0m Trial 0 finished with value: 0.8903775883069428 and parameters: {'lambda_l1': 0.030649986480190607, 'lambda_l2': 0.013469831374964346, 'num_leaves': 386, 'max_depth': 5, 'max_bin': 337, 'learning_rate': 0.10179025996337786, 'min_data_in_leaf': 525, 'feature_fraction': 0.6413228111649174, 'bagging_fraction': 0.5153386031426539, 'bagging_freq': 3, 'min_child_samples': 94, 'n_estimators': 6753}. Best is trial 0 with value: 0.8903775883069428.[0m
[32m[I 2022-06-17 04:45:01,701][0m Trial 1 finished with value: 0.924812030075188 and parameters: {'lambda_l1': 3.375923636525349e-05, 'lambda_l2': 2.7106218792169736e-05, 'num_leaves': 4344, 'max_depth': 14, 'max_bin': 467, 'learning_rate': 0.029737799876625354, 'min_data_in_leaf': 318, 'feature_fraction': 0.7306531087307528, 'bagging_fraction': 0.7005078027255269, 'bagging_freq': 5

In [None]:
# Hyperparameter values of the best-scoring trial in the study; the bare name
# on the last line displays the dict as the cell output.
best_params = study.best_params
best_params

{'bagging_fraction': 0.9260262499752487,
 'bagging_freq': 4,
 'feature_fraction': 0.554404152192108,
 'lambda_l1': 0.0015395748402496933,
 'lambda_l2': 0.5766787304983864,
 'learning_rate': 0.16373463943037975,
 'max_bin': 447,
 'max_depth': 3,
 'min_child_samples': 17,
 'min_data_in_leaf': 457,
 'n_estimators': 4980,
 'num_leaves': 2450}

In [None]:
# Present the tuned hyperparameters as a two-column (Parameter, Value) table:
# one-row frame -> transpose -> move the parameter names out of the index.
best_params_df = (
    pd.DataFrame([best_params])
    .T
    .reset_index()
    .set_axis(['Parameter', 'Value'], axis = 1)
)
best_params_df

Unnamed: 0,Parameter,Value
0,lambda_l1,0.00154
1,lambda_l2,0.576679
2,num_leaves,2450.0
3,max_depth,3.0
4,max_bin,447.0
5,learning_rate,0.163735
6,min_data_in_leaf,457.0
7,feature_fraction,0.554404
8,bagging_fraction,0.926026
9,bagging_freq,4.0


In [None]:
# Recreate the same train/validation split the objective used (same seed).
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size = 0.2,
                                                      stratify = y_train_full, random_state = 1)

# Fit a model with the best hyperparameters found by the study.
lgbm_optuna = LGBMClassifier(**best_params)
lgbm_optuna.fit(X_train, y_train)

# classification_report takes (y_true, y_pred); swapping them transposes the
# reported precision and recall.
predictions = lgbm_optuna.predict(X_valid)
print('Validation split (also used to select the hyperparameters):')
print(classification_report(y_valid, predictions))

# The validation split guided the hyperparameter search, so its score is
# optimistic; also report on the held-out test split, which tuning never saw.
test_predictions = lgbm_optuna.predict(X_test)
print('Held-out test split:')
print(classification_report(y_test, test_predictions))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       768
           1       0.95      0.94      0.94       788

    accuracy                           0.94      1556
   macro avg       0.94      0.94      0.94      1556
weighted avg       0.94      0.94      0.94      1556

