In [1]:
import lightgbm as lgb
import numpy as np # linear algebra
import pandas as pd 
from pytorch_tabular.models import *
from pytorch_tabular.config import (
    DataConfig, 
    OptimizerConfig,
    TrainerConfig,
)
from sklearn.model_selection import cross_validate # allows multiple metrics 
import seaborn as sns
from pytorch_tabular import TabularModel
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import *

In [2]:
X = pd.read_csv("../X_with_Ids.csv")
y = pd.read_csv("../y_for_XwithIds.csv", index_col=False)
X.drop("ProteinID", inplace=True, axis = 1)

In [3]:
def F1_max_calc(y_true, y_proba1):
    precision, recall, thresholds = precision_recall_curve(y_true, y_proba1)
    numerator = 2 * recall * precision
    denom = recall + precision
    f1_scores = np.divide(numerator, denom, out=np.zeros_like(denom), where=(denom!=0))
    max_f1 = np.max(f1_scores)
    # max_f1_thresh = thresholds[np.argmax(f1_scores)]
    return max_f1

def APS_calc(y_true, y_pred):
    return average_precision_score(y_true, y_pred)

def AUC_calc(y_true, y_pred):
    return roc_auc_score(y_true, y_pred)

In [4]:
data_config = DataConfig(
    target=["Target"],
    continuous_cols= X.columns.to_list()
    )

trainer_config = TrainerConfig(
        # Lowered batch size from 1024 for FTTransformer
        batch_size=1024,
        max_epochs=20,
    )
optimizer_config = OptimizerConfig()

gandalf_config = GANDALFConfig(
        task="classification"
    )
gandalf = TabularModel(
        data_config=data_config,
        model_config=gandalf_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
        verbose=False
    )

In [5]:
LGBMModel = lgb.LGBMClassifier()

In [52]:
models = ((LGBMModel, "LGBM"), (gandalf, "gandalf"))
kf = StratifiedKFold(n_splits=10)

the_split = kf.split(X, y)
datamodule = None
currentModel = None

In [30]:
for train_index, val_index in the_split:
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    print(X_train.head())

In [63]:
def cross_validate(model, name):
    datamodule = None
    f1_metrics = []
    aps_metrics = []
    auc_metrics = []
    scores = {}
    for train_index, val_index in the_split:
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        y_pred = []
        if name == "LGBM":
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
        else:
            if datamodule is None:
                # Initialize datamodule and model in the first fold
                # uses train data from this fold to fit all transformers
                datamodule = model.prepare_dataloader(train=X_train, validation=X_val)
                currentModel = model.prepare_model(datamodule)
            else:
                datamodule = datamodule.copy(train=X_train, validation=X_val)
            model.train(currentModel, datamodule)
            y_pred = model.predict()
        f1stats = F1_max_calc(y_val, y_pred)
        print("f1 stats:", f1stats)
        f1_metrics.append(F1_max_calc(y_val, y_pred))
        aps_metrics.append(APS_calc(y_val, y_pred))
        auc_metrics.append(AUC_calc(y_val, y_pred))
    scores["f1"] = f1_metrics
    scores["aps"] = aps_metrics
    scores["auc"] = auc_metrics
    return scores

In [64]:
for (model, name) in models:
    print(name)
    results = cross_validate(model, name)
    print(results)

LGBM
{'f1': [], 'aps': [], 'auc': []}
gandalf
{'f1': [], 'aps': [], 'auc': []}
