In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline


import pandas as pd
from sklearn.metrics import precision_score, recall_score,\
    f1_score, balanced_accuracy_score, roc_auc_score, accuracy_score
import pickle
from src.mlPipeline.plotting import findBestModelPerDataset
import glob
import copy
from collections import defaultdict

In [3]:
# Load the pickled model dictionaries, one file per dataset group.
# Later cells index each dictionary by dataset prefix, then by model name,
# and read the "gridcv" entry of each model.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load these files if they come from a trusted source.
model_pkl_path_17_18 = "data/oli_gwas_cross_validation/modelDictMultiDataRun_dataset_17_18_20190826.pkl"
with open(model_pkl_path_17_18, "rb") as pklFile:
    model_dict_17_18 = pickle.load(pklFile)
model_pkl_path_19_20_21 = "data/oli_gwas_cross_validation/modelDictMultiDataRun_dataset_19_20_21_20190826.pkl"
with open(model_pkl_path_19_20_21, "rb") as pklFile:
    model_dict_19_20_21 = pickle.load(pklFile)

In [4]:
# Dataset prefixes that are evaluated with model_dict_17_18. The prefix is used
# as a key into the model dictionary; its trailing "_" is stripped when the
# data file paths are built.
dataset_prefix_list_1 = ['dataset_17_', 'dataset_18_']
# Dataset prefixes that are evaluated with model_dict_19_20_21.
dataset_prefix_list_2 = ['dataset_19_', 'dataset_20_', 'dataset_21_']

In [5]:
def read_in_data(data_path):
    """Load the train, test and GWAS tables that share a common path prefix.

    Parameters
    ----------
    data_path : str
        Path prefix of the split. The suffixes "_train.csv", "_test.csv" and
        "_gwas.csv" are appended to it to locate the three files.

    Returns
    -------
    tuple
        ``(trainDF, valDF, GWASDF)``: the train and test frames indexed by
        their "isolate" column, and the GWAS frame exactly as read.

    Raises
    ------
    KeyError
        If the train or test file has no "isolate" column.
    """
    # Read all three files first so a missing file is reported before any
    # column problem, as in the original implementation.
    trainDF = pd.read_csv(data_path + "_train.csv")
    valDF = pd.read_csv(data_path + "_test.csv")
    GWASDF = pd.read_csv(data_path + "_gwas.csv")
    # The concatenated copy and row counts computed previously were never
    # used, so they are no longer built on every call.
    trainDF = trainDF.set_index("isolate")
    valDF = valDF.set_index("isolate")
    return (trainDF, valDF, GWASDF)

In [7]:
# Refit each dataset's best grid-search estimator on ten numbered train/test
# splits and collect the hold-out metrics per dataset/model pair.
dataset_model_bootstrap_performance_dict = defaultdict(dict)

for dataset_prefix_list in [dataset_prefix_list_1, dataset_prefix_list_2]:
    for dataset_prefix in dataset_prefix_list:
        # Prefixes from the first list use the 17-18 model dictionary; every
        # other prefix uses the 19-20-21 one.
        in_first_group = dataset_prefix in dataset_prefix_list_1
        model_dict_all = model_dict_17_18 if in_first_group else model_dict_19_20_21
        model_dict_id = "17-18" if in_first_group else "19-20-21"
        print("running dataset: {}, with model_dict for datasets: {}".format(dataset_prefix, model_dict_id))

        for model_name, model_dict in model_dict_all[dataset_prefix].items():
            # Work on a copy so the estimator stored in the dictionary is left untouched.
            best_model = copy.deepcopy(model_dict["gridcv"].best_estimator_)
            result_key = dataset_prefix + model_name

            for data_idx in range(10):
                # File prefix: dataset name without its trailing "_", then the split index.
                data_path = "data/oli_gwas_cross_validation/" + \
                    dataset_prefix[:-1] + ".{}".format(data_idx)
                # Read in all data
                trainDF, valDF, gwasDF = read_in_data(data_path)

                # Split training and testing matrices into features and labels
                X_train = trainDF.drop(columns=["pbr_res"]).values
                Y_train = trainDF["pbr_res"].values
                X_val = valDF.drop(columns=["pbr_res"]).values
                Y_val = valDF["pbr_res"].values

                # Refit model on this split and predict the held-out rows
                best_model.fit(X_train, Y_train)
                preds = best_model.predict(X_val)

                # ROC AUC needs class probabilities; an SVC fitted without
                # probability estimates gets None instead.
                # NOTE(review): assumes the SVC is the second pipeline step - confirm.
                if model_name != "SVC" or best_model.steps[1][1].probability:
                    scores = best_model.predict_proba(X_val)
                    rocauc = roc_auc_score(Y_val, scores[:, 1])
                else:
                    scores = None
                    rocauc = None

                f1 = f1_score(Y_val, preds)
                prec = precision_score(Y_val, preds)
                rec = recall_score(Y_val, preds)
                bal_acc = balanced_accuracy_score(Y_val, preds)
                acc = accuracy_score(Y_val, preds)

                # Store every metric under "<metric>_<split index>", in a fixed order.
                split_metrics = (
                    ("f1_{}", f1),
                    ("prec_{}", prec),
                    ("rec_{}", rec),
                    ("balanced_acc_{}", bal_acc),
                    ("acc_{}", acc),
                    ("rocauc_{}", rocauc),
                )
                for key_template, metric_value in split_metrics:
                    dataset_model_bootstrap_performance_dict[result_key][
                        key_template.format(data_idx)
                    ] = metric_value

running dataset: dataset_17_, with model_dict for datasets: 17-18
running dataset: dataset_18_, with model_dict for datasets: 17-18
running dataset: dataset_19_, with model_dict for datasets: 19-20-21
running dataset: dataset_20_, with model_dict for datasets: 19-20-21
running dataset: dataset_21_, with model_dict for datasets: 19-20-21


In [8]:
# Tabulate the collected metrics (one row per dataset/model key, one column
# per metric/split key) and save the table as CSV.
results_df = pd.DataFrame.from_dict(
    dataset_model_bootstrap_performance_dict,
    orient="index",
)
out_path = "data/oli_gwas_cross_validation/bootstrap_results.csv"
results_df.to_csv(out_path)

In [9]:
# Show the full metrics dictionary as the cell output.
dataset_model_bootstrap_performance_dict

defaultdict(dict,
            {'dataset_17_logistic': {'f1_0': 0.5,
              'prec_0': 0.3793103448275862,
              'rec_0': 0.7333333333333333,
              'balanced_acc_0': 0.7191256830601093,
              'acc_0': 0.7105263157894737,
              'rocauc_0': 0.7770491803278688,
              'f1_1': 0.6666666666666667,
              'prec_1': 0.5333333333333333,
              'rec_1': 0.8888888888888888,
              'balanced_acc_1': 0.8922056384742951,
              'acc_1': 0.8947368421052632,
              'rocauc_1': 0.8822553897180763,
              'f1_2': 0.45454545454545453,
              'prec_2': 0.45454545454545453,
              'rec_2': 0.45454545454545453,
              'balanced_acc_2': 0.6811188811188811,
              'acc_2': 0.8421052631578947,
              'rocauc_2': 0.6496503496503496,
              'f1_3': 0.5806451612903226,
              'prec_3': 0.5294117647058824,
              'rec_3': 0.6428571428571429,
              'balanced_acc_3': 