In [None]:
import csv
import numpy as np
import pandas as pd

In [None]:
# Build the feature matrix `x` and the integer label vector `y` from the CSV.
DATA_PATH = '/content/data-overweight.csv'

# Categorical encodings. A value that is not listed in a table falls back to
# the default given to .get() below (the code the original else-branch used).
FREQUENCY_CODES = {'no': 0, 'Sometimes': 1, 'Frequently': 2}  # fallback: 3
TRANSPORT_CODES = {'Walking': 0, 'Bike': 1, 'Motorbike': 2, 'Automobile': 3}  # fallback: 4
LABEL_CODES = {
    'Insufficient_Weight': 0,
    'Normal_Weight': 1,
    'Overweight': 2,
    'Obesity_Type_I': 3,
    'Obesity_Type_II': 4,
}  # fallback: 5


def encode_yes_no(value):
    """Return 0 when `value` is exactly 'yes' and 1 for anything else."""
    return 0 if value == 'yes' else 1


x = []
y = []
# newline='' is what the csv module asks for so quoted newlines are parsed correctly.
with open(DATA_PATH, newline='') as file:
    for row in csv.reader(file):
        # Skip blank lines and the header row (first cell is the literal 'ID').
        if not row or row[0] == 'ID':
            continue

        # Height and weight must be numeric: leaving them as strings would turn
        # the whole matrix into a string array once np.array() is applied.
        height = float(row[3])
        weight = float(row[4])

        x.append([
            height,
            weight,
            0 if row[1] == 'Female' else 1,
            float(row[2]),
            weight / height ** 2,  # BMI
            encode_yes_no(row[5]),
            encode_yes_no(row[6]),
            float(row[7]),
            float(row[8]),
            FREQUENCY_CODES.get(row[9], 3),
            # row[10] (a yes/no column) is not used as a feature; its encoding
            # was commented out in the earlier version of this cell.
            float(row[11]),
            encode_yes_no(row[12]),
            float(row[13]),
            float(row[14]),
            FREQUENCY_CODES.get(row[15], 3),
            TRANSPORT_CODES.get(row[16], 4),
        ])
        y.append(LABEL_CODES.get(row[17], 5))

# 16 numeric features per sample; labels are integer class codes 0-5.
x = np.array(x)
y = np.array(y)

In [None]:
!pip install optuna

In [None]:
import optuna  # pip install optuna
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from optuna.integration import LightGBMPruningCallback

In [None]:
import warnings
# Install a blanket 'ignore' filter: every warning raised after this cell runs
# is suppressed. This keeps the long tuning logs readable, but it also hides
# deprecation and convergence warnings, so disable it when debugging.
warnings.filterwarnings('ignore')

In [None]:
def objective(trial, X, y):
    """
    Optuna objective: mean multi-class log loss of a LightGBM classifier.

    Samples one hyper-parameter configuration from `trial`, evaluates it with
    10-fold stratified cross-validation (shuffled, random_state=0) and returns
    the average held-out log loss, which the study minimises.

    Parameters
    ----------
    trial : optuna trial object
        Used to draw the hyper-parameters and handed to the pruning callback.
    X : array indexable by integer index arrays
        Feature matrix, one row per sample.
    y : array indexable by integer index arrays
        Class labels aligned with the rows of `X`.

    Returns
    -------
    float
        Mean of the per-fold log-loss values.

    Notes
    -----
    The held-out fold is used both for early stopping / pruning and for the
    reported score, so the returned value is somewhat optimistic.
    """
    n_splits = 10
    early_stopping_rounds = 100

    param = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "num_class": 6,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

    # One slot per fold; sized from n_splits so the two cannot drift apart.
    cv_scores = np.empty(n_splits)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = lgb.LGBMClassifier(**param)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="multi_logloss",
            callbacks=[
                # Early stopping is passed as a callback: the
                # `early_stopping_rounds` keyword of fit() was removed in
                # LightGBM 4.0, while the callback works on older releases too.
                lgb.early_stopping(
                    stopping_rounds=early_stopping_rounds, verbose=False
                ),
                # Lets Optuna stop unpromising trials based on the eval metric.
                LightGBMPruningCallback(trial, "multi_logloss"),
            ],
        )
        preds = model.predict_proba(X_test)
        # Pass the label set explicitly so the probability columns still line
        # up when a held-out fold happens to miss one of the classes.
        cv_scores[idx] = log_loss(y_test, preds, labels=model.classes_)

    return np.mean(cv_scores)

In [None]:
!pip install shap

In [None]:
import shap

# Seeds NumPy's global random generator only.
np.random.seed(0)
num_classes = 6

# Nested bookkeeping for SHAP values collected during cross-validation:
#   shap_values_per_cv[class_index][sample_index] -> values for that sample.
# Every (class, sample) slot starts out as an empty dict.
shap_values_per_cv = {
    class_index: {sample_index: {} for sample_index in range(len(x))}
    for class_index in range(num_classes)
}

In [None]:
from statistics import mean
from sklearn.metrics import accuracy_score

# Outer loop of the nested cross-validation: hyper-parameters are tuned on the
# outer-training part only (the inner CV lives in `objective`), the refitted
# model is scored on the untouched outer-test part, and SHAP values are
# collected for exactly those outer-test samples.
n_outer_splits = 5
cv_outer = StratifiedKFold(n_splits=n_outer_splits, shuffle=True, random_state=0)
cv_scores = np.empty(n_outer_splits)  # one accuracy per outer fold
for idx, (train_outer_idx, test_outer_idx) in enumerate(cv_outer.split(x, y)):
    X_train, X_test = x[train_outer_idx], x[test_outer_idx]
    y_train, y_test = y[train_outer_idx], y[test_outer_idx]

    study = optuna.create_study(direction="minimize", study_name="LGBM Classifier")
    func = lambda trial: objective(trial, X_train, y_train)
    study.optimize(func, n_trials=500)

    # The study minimises the mean multi-class log loss returned by
    # `objective`, so label the value accordingly (it is not an RMSE).
    print(f"\tBest value (multi_logloss): {study.best_value:.5f}")
    print("\tBest params:")
    for key, value in study.best_params.items():
        print(f"\t\t{key}: {value}")

    # Refit on the whole outer-training part with the tuned hyper-parameters.
    model = lgb.LGBMClassifier(**study.best_params)
    result = model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    cv_scores[idx] = accuracy

    explainer = shap.TreeExplainer(result)
    shap_values = explainer.shap_values(X_test)
    # Older SHAP releases return a list with one (n_samples, n_features) array
    # per class; newer ones return a single (n_samples, n_features, n_classes)
    # array. Normalise to the per-class layout so the indexing below is valid
    # for both.
    if not isinstance(shap_values, list) and np.ndim(shap_values) == 3:
        shap_values = [
            shap_values[:, :, class_idx]
            for class_idx in range(np.shape(shap_values)[2])
        ]

    # Store the SHAP vector of every outer-test sample under its original
    # row index, separately for each class.
    for j in range(num_classes):
        for k, test_index in enumerate(test_outer_idx):
            shap_values_per_cv[j][test_index] = shap_values[j][k]

In [None]:
# Per-class containers for the aggregated SHAP statistics of every sample:
# the mean, the standard deviation and the range (max - min).
average_shap_values, stds, ranges = [], [], []

for class_idx in range(num_classes):
    class_means, class_stds, class_ranges = [], [], []
    for sample_idx in range(len(shap_values_per_cv[0])):
        # Everything recorded for this sample under this class, as a frame
        # whose rows are reduced with axis=1 below.
        obs_frame = pd.DataFrame.from_dict(shap_values_per_cv[class_idx][sample_idx])
        row_max = obs_frame.max(axis=1).values
        row_min = obs_frame.min(axis=1).values
        class_means.append(obs_frame.mean(axis=1).values)
        class_stds.append(obs_frame.std(axis=1).values)
        class_ranges.append(row_max - row_min)
    # Only the means are turned into an array; the other two stay lists.
    average_shap_values.append(np.array(class_means))
    stds.append(class_stds)
    ranges.append(class_ranges)

In [None]:
import matplotlib.pyplot as plt

# Display names for the 16 columns of `x`, in the order the loader builds them.
# NOTE(review): index 10 holds the numeric value parsed from CSV column 11; the
# yes/no Smoke column (CSV column 10) is not part of `x`, so the former 'Smoke'
# label was attached to the wrong feature. 'CH2O' follows the column order of
# the UCI obesity-levels dataset - confirm against the CSV header.
feature_names = [
    'Height', 'Weight', 'Gender', 'Age', 'BMI', 'family_history', 'FCHCF', 'FCV',
    'NMM', 'CFBM', 'CH2O', 'CCM', 'PAF', 'TUT', 'CA', 'Transportation',
]
# Display names for the integer class codes 0-5.
class_names = [
    'Insufficient_Weight', 'Normal_Weight', 'Overweight',
    'Obesity_Level_I', 'Obesity_Level_II', 'Obesity_Level_III',
]

# Open a fresh figure so the summary plot does not draw onto a previous one.
fig = plt.figure()

shap.summary_plot(
    average_shap_values,
    x,
    feature_names=feature_names,
    class_names=class_names,
    show=False,  # keep the figure open so it can be resized before showing
    class_inds=model.classes_,  # `model` is the classifier left by the CV loop
)
plt.gcf().set_size_inches(20, 10)
plt.show()

# Accuracy of the refitted model on each outer fold of the nested CV.
print("Nested Stratified Cross Validation Scores: " + str(cv_scores))
print("Average CV Score: " + str(mean(cv_scores)))
print("Number of CV Scores used in Average: " + str(len(cv_scores)))

In [None]:
def func(trial):
    """Evaluate one Optuna trial on the complete data set (x, y)."""
    return objective(trial, x, y)


# Hyper-parameter search over the full data: 500 trials, minimising the value
# returned by `objective`.
study = optuna.create_study(study_name="LGBM Classifier", direction="minimize")
study.optimize(func, n_trials=500)

In [None]:
# Report the best trial of the study. The optimised value is the mean
# multi-class log loss returned by `objective`, not an RMSE.
print(f"\tBest value (multi_logloss): {study.best_value:.5f}")
print("\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# Score a classifier built from the study's best hyper-parameters with
# 5-fold stratified cross-validation (folds are taken without shuffling).
# NOTE(review): the hyper-parameters were tuned on this same data, so these
# scores are presumably optimistic compared with the nested CV above.
skf = StratifiedKFold(n_splits=5)
model = lgb.LGBMClassifier(**study.best_params)
scores = cross_val_score(estimator=model, X=x, y=y, cv=skf)

In [None]:
# Print the per-fold scores, their mean and how many folds were averaged.
for label, value in (
    ("Cross Validation Scores: ", scores),
    ("Average CV Score: ", scores.mean()),
    ("Number of CV Scores used in Average: ", len(scores)),
):
    print(label, value)