In [10]:
cd /content/drive/MyDrive/parttime/allmodels

/content/drive/MyDrive/parttime/allmodels


In [11]:
import pandas as pd
import numpy as np


In [12]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to local helper modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 0. Data preprocessing

In [13]:
# Load the peak table into a DataFrame. Per the preview in the next cell, each
# row carries a 'group' and a 'label' column followed by numeric columns whose
# names are floating-point values (presumably peak positions — TODO confirm).
data = pd.read_csv('/content/drive/MyDrive/parttime/peak_table.csv')

In [14]:
# Preview the first rows to sanity-check that the CSV was parsed as expected.
data.head()

Unnamed: 0,group,label,2007.149708,2011.916563,2022.180352,2026.246581,2050.086635,2069.461955,2083.405537,2094.926944,...,17201.72098,17736.69124,17866.5492,17969.22753,18227.08714,18465.30837,18561.8822,18655.69653,19197.86253,19982.44636
0,B,B1.1,3.06e-07,7e-06,3.1e-05,4.3e-05,5e-06,8e-06,5e-05,1.4e-05,...,9e-06,8e-06,5.37e-07,7e-06,8.98e-06,8e-06,1.3e-05,1.3e-05,8e-06,1.09e-06
1,B,B1.2,2.12e-07,1e-06,3.5e-05,5.2e-05,5e-06,5e-06,3.2e-05,1.3e-05,...,4e-06,1.7e-05,1.01e-05,1.5e-05,2.24e-07,1.4e-05,1e-05,1.3e-05,1.3e-05,1.67e-05
2,B,B1.3,1.22e-06,9e-06,3.9e-05,7e-05,2e-06,9e-06,4.3e-05,7e-06,...,1.3e-05,1e-05,1.56e-05,9e-06,1.01e-05,6e-06,1.6e-05,2e-05,1.2e-05,2.41e-08
3,B,B10.1,7.67e-06,5e-06,2.9e-05,4.6e-05,1.4e-05,1.3e-05,2.6e-05,1.7e-05,...,1.1e-05,4e-06,3.23e-06,3e-06,9.71e-06,9e-06,1.2e-05,6e-06,1e-05,1.31e-05
4,B,B10.2,3.08e-06,2e-06,2.7e-05,3.9e-05,1.2e-05,1.3e-05,2e-05,2.1e-05,...,1.1e-05,5e-06,7.67e-06,1.1e-05,8.81e-06,1.2e-05,1.8e-05,8e-06,1.5e-05,4.41e-06


### Extract labels

In [15]:
# `get_labels` lives in the project-local `util` module (not shown here).
# Judging by the cell output it returns one integer class id per row of `data`
# and prints the first few labels itself — NOTE(review): confirm the encoding
# (expected to be 0 .. n_classes - 1) against util.py.
from util import get_labels
y = get_labels(data)
# One label per sample is expected, i.e. a 1-D array of length len(data).
print(y.shape)

[0 0 0 0 0]
(1072,)


In [16]:
# from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score, accuracy_score

def _safe_ratio(numerator, denominator):
    """Element-wise ``numerator / denominator`` that yields NaN where the denominator is 0."""
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    ratio = np.full(numerator.shape, np.nan)
    np.divide(numerator, denominator, out=ratio, where=denominator != 0)
    return ratio


def report_metrics(y_pred_prob, y_test, model_name):
    """Print per-class evaluation metrics and return a summary dict.

    Parameters
    ----------
    y_pred_prob : array-like of shape (n_samples, n_classes)
        Predicted class scores/probabilities; column ``k`` is treated as the
        score of class id ``k`` (the predicted class is the row-wise argmax).
    y_test : array-like of shape (n_samples,)
        True class ids, expected to be integers in ``0 .. n_classes - 1``.
    model_name : str
        Identifier stored under the ``'Models'`` key of the result.

    Returns
    -------
    dict
        Keys: 'Models', 'AUC' (per-class one-vs-rest AUC array),
        'Avg. Sensitivity', 'Avg. Specificity', 'PPV (Recall)',
        'NPV (Precision)', 'Accuracy', 'F1 Score' (per-class array).
    """
    y_pred_prob = np.asarray(y_pred_prob)
    y_pred = y_pred_prob.argmax(axis=1)

    # Take the number of classes from the score matrix instead of hard-coding it,
    # so the function works for any multi-class problem.
    num_classes = y_pred_prob.shape[1]
    class_ids = np.arange(num_classes)

    # Per-class one-vs-rest AUC (average=None keeps one value per class).
    auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr', average=None)
    print(f"AUC: {auc}")

    # Passing `labels` pins the matrix to (num_classes, num_classes) even when a
    # class is missing from both y_test and y_pred; rows are true classes and
    # columns are predicted classes.
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)

    # One-vs-rest counts for every class at once.
    true_positive = np.diag(conf_matrix)
    false_negative = conf_matrix.sum(axis=1) - true_positive
    false_positive = conf_matrix.sum(axis=0) - true_positive
    true_negative = conf_matrix.sum() - true_positive - false_positive - false_negative

    # NaN (rather than a divide-by-zero warning) marks a class with no support.
    sensitivity = _safe_ratio(true_positive, true_positive + false_negative)
    specificity = _safe_ratio(true_negative, true_negative + false_positive)

    # Per-class precision, recall and F1 (average=None), plus overall accuracy.
    precision = precision_score(y_test, y_pred, average=None)
    recall = recall_score(y_test, y_pred, average=None)
    f1 = f1_score(y_test, y_pred, average=None)
    accuracy = accuracy_score(y_test, y_pred)

    # Reporting the evaluation metrics (class numbers are printed 1-based).
    for i in range(num_classes):
        print(f"Class {i+1} - Sensitivity: {sensitivity[i]}, Specificity: {specificity[i]}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"Averaged sensitivity: {np.mean(sensitivity)}")
    print(f"Averaged specificity: {np.mean(specificity)}")

    # NOTE(review): the key names below are kept unchanged so that existing
    # result tables stay compatible, but they are misleading: 'PPV (Recall)'
    # stores macro-averaged recall (PPV is precision), and 'NPV (Precision)'
    # stores macro-averaged precision (which is PPV, not NPV).
    newd = {'Models': model_name,
            'AUC': auc,
            'Avg. Sensitivity': np.mean(sensitivity),
            'Avg. Specificity': np.mean(specificity),
            'PPV (Recall)': np.mean(recall),
            'NPV (Precision)': np.mean(precision),
            'Accuracy': np.mean(accuracy),
            'F1 Score': f1
            }
    return newd


In [17]:
# y_fold_val

In [18]:
from sklearn.model_selection import KFold, StratifiedKFold
import lightgbm as lgb
import warnings
# NOTE: this silences *all* warnings for the rest of the kernel session.
warnings.simplefilter("ignore")

# Feature matrix: drop the two target/identifier columns.
X = data.drop(columns=['group', 'label'])
# If the CSV was saved with its index, pandas reads it back as an 'Unnamed: 0'
# row-counter column. The rows in the preview appear to be ordered by group, so
# such a counter would leak the class into the features; drop it when present
# (no-op otherwise).
X = X.drop(columns=['Unnamed: 0'], errors='ignore')

# LightGBM requires labels in [0, num_class); derive the count from the labels
# instead of hard-coding it.
NUM_CLASSES = int(np.max(y)) + 1

# Loop-invariant classifier parameters.
params = {
    'objective': 'multiclass',
    'num_class': NUM_CLASSES
}

# Stratified folds keep the class proportions of `y` in every split.
# NOTE(review): folds are not shuffled and not grouped; if several rows are
# replicates of one sample (labels such as B1.1/B1.2/B1.3 suggest so), consider
# a group-aware splitter — TODO confirm.
kf = StratifiedKFold(n_splits=5)

# One metrics dict per fold, as returned by report_metrics.
fold_results = []
for i, (train_index, test_index) in enumerate(kf.split(X, y)):
    x_fold_train, x_fold_val = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_val = y[train_index], y[test_index]

    # Creating the LightGBM training dataset
    train_data = lgb.Dataset(x_fold_train, label=y_fold_train)

    # Training the classifier. No validation set is passed, so the
    # log_evaluation callback has nothing to report during training.
    model = lgb.train(params, train_data,
                      callbacks=[lgb.log_evaluation(10)])

    # Making predictions on the validation set: one score column per class
    y_pred_proba = model.predict(x_fold_val)
    print(f'**********fold--{i}**********')
    print()
    fold_results.append(report_metrics(y_pred_proba, y_fold_val, f'fold {i}'))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.102735 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 182276
[LightGBM] [Info] Number of data points in the train set: 857, number of used features: 716
[LightGBM] [Info] Start training from score -1.965946
[LightGBM] [Info] Start training from score -2.008506
[LightGBM] [Info] Start training from score -1.965946
[LightGBM] [Info] Start training from score -2.371411
[LightGBM] [Info] Start training from score -2.026050
[LightGBM] [Info] Start training from score -1.991264
[LightGBM] [Info] Start training from score -2.476772
[LightGBM] [Info] Start training from score -1.965946
**********fold--0**********

AUC: [0.99351351 0.98683723 0.94234234 1.         0.99847212 0.99891892
 0.99351382 0.99351351]
Class 1 - Sensitivity: 0.9, Specificity: 0.9837837837837838
Class 2 - Sensitivity: 0.896551724137931, Specificity: 0.9731182795698925
Class 3 - Sensitivi

## 1. LightGBM model

In [19]:
# # split data
# import lightgbm as lgb
# from sklearn.model_selection import train_test_split

# # Splitting the data into features (X) and target (y)
# X = data.drop(columns=['group', 'label'], axis=1)

# # Splitting the data into train and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# print(X_train.shape, X_val.shape)

In [21]:
# # Creating LightGBM datasets
# train_data = lgb.Dataset(X_train, label=y_train)
# val_data = lgb.Dataset(X_val, label=y_val)

# # Setting up the parameters for the classifier
# params = {
#     'objective': 'multiclass',
#     'num_class': 8
# }

# # Training the classifier
# model = lgb.train(params, train_data,
#                 #   valid_sets=[train_data, val_data],
#                   callbacks=[lgb.log_evaluation(10)])

# # Making predictions on the validation set
# y_pred = model.predict(X_val)

In [22]:
# y_pred.shape

In [23]:
# # calculate auc
# from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, classification_report, precision_score, recall_score, f1_score
# # Making predictions on the validation set
# y_pred_prob = model.predict(X_val)
# y_pred = y_pred_prob.argmax(axis=1)

# # Calculating AUC
# auc = roc_auc_score(y_val, y_pred_prob, multi_class='ovr')
# print(f"AUC: {auc}")
# # Calculating confusion matrix
# conf_matrix = confusion_matrix(y_val, y_pred)

In [24]:
# conf_matrix

In [25]:
# results = pd.read_csv('allmodels.csv')

In [26]:
# results = results.drop(columns=['Unnamed: 0'])
# results

In [27]:
# # Calculating sensitivity and specificity for each class
# num_classes = 8
# sensitivity = np.zeros(num_classes)
# specificity = np.zeros(num_classes)
# for i in range(num_classes):
#     true_positive = conf_matrix[i, i]
#     false_negative = np.sum(conf_matrix[i, :]) - true_positive
#     false_positive = np.sum(conf_matrix[:, i]) - true_positive
#     true_negative = np.sum(conf_matrix) - true_positive - false_positive - false_negative
#     sensitivity[i] = true_positive / (true_positive + false_negative)
#     specificity[i] = true_negative / (true_negative + false_positive)

# # Calculating precision, recall, and F1 score
# precision = precision_score(y_val, y_pred, average='macro')
# recall = recall_score(y_val, y_pred, average='macro')
# f1 = f1_score(y_val, y_pred, average='macro')
# accuracy = accuracy_score(y_val, y_pred)
# # Reporting the evaluation metrics
# for i in range(num_classes):
#     print(f"Class {i+1} - Sensitivity: {sensitivity[i]}, Specificity: {specificity[i]}")
# print(f"Precision: {precision}")
# print(f"Recall: {recall}")
# print(f"F1 Score: {f1}")
# print(f"Averaged sensitivity: {np.mean(sensitivity)}")
# print(f"Averaged specificity: {np.mean(specificity)}")

# newd = {'Models': 'LGB',
#         'AUC': auc,
#         'Avg. Sensitivity': np.mean(sensitivity),
#         'Avg. Specificity': np.mean(specificity),
#         'PPV (Recall)': recall,
#         'NPV (Precision)': precision,
#         'Accuracy': accuracy,
#         'F1 Score': f1
#         }

In [28]:
# results = results.append(newd, ignore_index=True)
# results

In [29]:
# results.to_csv('allmodels.csv')