# Whole-genome analysis workflow

In [1]:
# ~2 minutes to install 
#%pip install -U --no-cache-dir scikit-learn scikit-optimize prefect prefect-ray ray plotly openpyxl shap lion_pytorch pytorch_tabnet xgboost neptune pyspark pyarrow dill fastnumbers

In [2]:
from prefect import task, flow
from prefect.task_runners import ConcurrentTaskRunner
from prefect_ray.task_runners import RayTaskRunner
from DillSerializer import DillSerializer
import ray

import pandas as pd
import numpy as np

import logging

!export PREFECT_LOGGING_LEVEL="WARNING"
ray.shutdown()
parallelRunner = ray.init(
  configure_logging=True,
  logging_level=logging.ERROR,
)
parallelRunner

0,1
Python version:,3.10.10
Ray version:,2.3.1


In [3]:
from sklearn.ensemble import (
    AdaBoostClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier
from pytorch_tabnet.tab_model import TabNetClassifier
from lion_pytorch import Lion

from skopt.space import Categorical, Integer, Real

from env import neptune_api_token

RadialBasisSVC = SVC
RadialBasisSVC.__name__ = "RadialBasisSVC"

clearHistory = True
config = {
  'vcfLike': {  
    'path': 'Variant_report_NUPs_fixed_2022-03-28.xlsx',             # variant call table with annotations
    'sheet': "all cases vs all controls",                             # sheet name if Excel spreadsheet
    'indexColumn': ['chrom', 'position', 'Gene'],       # header that indexes variants (set as list with multiple columns)
    'binarize': True,                           # binarize variants to 0/1, or sum to weigh allele frequency
    'minAlleleFrequency': 0.05,           # filter out variants with allele frequency less than this
  # 'alleleModel': ['dominant', 'recessive', 'overDominant'],  # biallelic allele models to test on gene sets
    'filters': {
      } 
  }, # TODO handle genotypes from related individuals
  
  'geneSets' : { # TODO gene sets
      },
  
  'tracking': {
    'name': 'Nucleoporin genes, well-classified cases', # name of the experiment
    'entity': 'ejmockler',
    'project': 'ALS-NUPs-NoHyperParamOptimization',
    'plotAllSampleImportances': True,  # if calculating Shapely explanations, plot each sample in Neptune
    'token': neptune_api_token
  },

  'clinicalTable': {
      'path': 'ACWM.xlsx',                      # clinical data as Excel spreadsheet
      'idColumn': 'ExternalSampleId',           # genotype ID header
      'uniqueIdColumn': 'ExternalSubjectId',    # unique ID for each patient
      'labelColumn': 'Subject Group',                # header that has case/control labels
      'controlLabels': ['Non-Neurological Control'], # these labels include external sample IDs (like 1000 Genomes)
      'caseLabels': ['ALS Spectrum MND'],
      'controlAlias': 'control',
      'caseAlias': 'case',
      'filters': 'pct_european>=0.85',             # filter out nonhomogenous samples with less than 85% European ancestry
  },

  'externalTables': {
      'path': ['igsr-1000 genomes phase 3 release.tsv'],  # external sample table
      'label': ['control'], # case | control | mixed (mixed labels are held out as an external test set)
      'idColumn': ['Sample name'],                        # sample ID header
      'filters': ["`Superpopulation code`=='EUR' & `Population name`!='Finnish'"], # remove finnish samples due to unusual homogeneity (verify w/ PCA)
  },

  'sampling': {
    'bootstrapIterations': 60, 
    'crossValIterations': 10,   # number of validations per bootstrap iteration
    'holdoutSplit': 0.1,
  },
  
  'model': {
    'stack': {
      LinearSVC(): {
            "tol": Real(1e-6, 1e+1, prior="log-uniform"),
            "C": Real(1e-4, 1e+1, prior="log-uniform"),
        },
      RadialBasisSVC(probability=True, kernel="rbf"): {
          "tol": Real(1e-4, 1e+1, prior="log-uniform"),
          "C": Real(1e-4, 1e+1, prior="log-uniform"),
          "gamma": Categorical(["scale", "auto"]),
      },
      LogisticRegression(penalty="l2", solver="saga"): {
          "tol": Real(1e-6, 1e+1, prior="log-uniform"),
          "C": Real(1e-4, 1e+1, prior="log-uniform"),
      },
      # TabNetClassifier: {
      #     "n_d": Integer(8, 64),
      #     "n_a": Integer(8, 64),
      #     "n_steps": Integer(3, 10),
      #     "lambda_sparse": Real(1e-4, 1e+1, prior="log-uniform"),
      # },
      MultinomialNB(): {"alpha": Real(1e-10, 1e+1, prior="log-uniform")},
      AdaBoostClassifier(): {
          "n_estimators": Integer(25, 75),
          "learning_rate": Real(1e-6, 1e+1, prior="log-uniform"),
      },
      XGBClassifier(): {
          "learning_rate": Real(1e-6, 1e+1, prior="log-uniform"),
          "n_estimators": Integer(10, 100),
      },
      RandomForestClassifier(): { 
          "n_estimators": Integer(75, 200),
      },
    },
    'hyperparameterOptimization': False,
    'calculateShapelyExplanations': False,
  }
}
 
async def remove_all_flows():
  from prefect.client import get_client
  orion_client = get_client()
  flows = await orion_client.read_flows()
  for flow in flows:
    flow_id = flow.id
    print(f"Deleting flow: {flow.name}, {flow_id}")
    await orion_client._client.delete(f"/flows/{flow_id}")
    print(f"Flow with UUID {flow_id} deleted")

if clearHistory: await remove_all_flows()

In [4]:
from prefect import unmapped
from tqdm import tqdm

@task()
def filterTable(table, filterString):
    if not filterString: return table
    print(f"Filtering: {filterString}")
    filteredTable = table.query(filterString)
    return filteredTable

@task()
def applyAlleleModel(values, columns, genotypeIDs):
    # some genotype IDs are subset of column names (or vice versa)
    genotypeDict = dict()
    resolvedGenotypeIDs = set()
    for id in tqdm(genotypeIDs, unit='id'):
        for j, column in enumerate(columns):
            if id in column or column in id:
                # implement allele model
                genotypeDict[f"{column}"] = [(
                                np.sum([int(allele) for allele in genotype.replace("'", "").split("/")]) # split by allele delimiter 
                                if not config["vcfLike"]["binarize"] 
                                else np.clip(
                                        np.sum([int(allele) for allele in genotype.replace("'", "").split("/")]), 
                                        a_max=1, a_min=None)
                                )
                            if any(char.isdigit() for char in genotype)
                            else np.nan
                            for genotype in values[:,j]
                        ]
                columns = np.delete(columns, j)
                values = np.delete(values, j, axis=1)
                resolvedGenotypeIDs.update({id})
                break
    missingGenotypeIDs = set(genotypeIDs) - resolvedGenotypeIDs  # leftover columns are missing
    return genotypeDict, missingGenotypeIDs, resolvedGenotypeIDs

@task()
def load():
    clinicalData = pd.read_excel(config['clinicalTable']['path'], index_col=config['clinicalTable']['idColumn']
                                ).drop_duplicates(subset=config['clinicalTable']['uniqueIdColumn'])
    externalSamples = [pd.read_csv(path, sep='\t', index_col=idColumn) for path, idColumn in zip(config['externalTables']['path'], config['externalTables']['idColumn'])]
    annotatedVCF = pd.read_csv(
        config['vcfLike']['path'], sep='\t', dtype=str, index_col=config['vcfLike']['indexColumn'], 
        ) if "xlsx" not in config['vcfLike']['path'] else pd.read_excel(
            config['vcfLike']['path'], sheet_name=(config['vcfLike']['sheet'] if config['vcfLike']['sheet'] else None), 
            dtype=str, na_values=['.'], keep_default_na=False
        )
    # remove null chromosome positions
    annotatedVCF[config['vcfLike']['indexColumn']] = annotatedVCF[config['vcfLike']['indexColumn']].astype(str).replace('', np.nan)
    return clinicalData, externalSamples, annotatedVCF.dropna(subset=config['vcfLike']['indexColumn']).set_index(config['vcfLike']['indexColumn'])

@flow(task_runner=ConcurrentTaskRunner(), log_prints=True)
async def processInputFiles():
    clinicalData, externalSamples, annotatedVCF = load()
    
    filteredClinicalData = filterTable(clinicalData, config['clinicalTable']['filters'])
    print(f"filtered {len(clinicalData) - len(filteredClinicalData)} samples from clinical data")
    filteredExternalSamples = [filterTable(externalSampleTable, filterString) for externalSampleTable, filterString in zip(externalSamples, config['externalTables']['filters'])]
    for i, (externalSampleTable, path) in enumerate(zip(filteredExternalSamples, config['externalTables']['path'])):
        print(f"filtered {len(externalSamples[i]) - len(externalSampleTable)} samples from external data {path}")
    filteredVCF = filterTable(annotatedVCF, config['vcfLike']['filters'])
    print(f"filtered {annotatedVCF.shape[0] - filteredVCF.shape[0]} variants from VCF")
    
    caseIDsMask, controlIDsMask = [
        filteredClinicalData[config['clinicalTable']['labelColumn']].isin(labels).dropna()
        for labels in (config['clinicalTable']['caseLabels'], config['clinicalTable']['controlLabels'])]
    
    caseIDs = caseIDsMask[caseIDsMask].index.to_numpy()
    controlIDs = controlIDsMask[controlIDsMask].index.to_numpy()
    for i, label in enumerate(config['externalTables']['label']):
        if label == config['clinicalTable']['caseAlias']:
            caseIDs = np.append(caseIDs, filteredExternalSamples[i].index.to_numpy())
        elif label == config['clinicalTable']['controlAlias']:
            controlIDs = np.append(controlIDs, filteredExternalSamples[i].index.to_numpy())

    # cast genotypes as numeric, drop chromosome positions with missing values
    caseGenotypeFutures, controlGenotypeFutures = applyAlleleModel.map(
            unmapped(filteredVCF.to_numpy()),
            unmapped(filteredVCF.columns.to_numpy()), 
            genotypeIDs=[IDs for IDs in (caseIDs, controlIDs)]
        ) 
    caseGenotypeDict, missingCaseIDs, resolvedCaseIDs = caseGenotypeFutures.result()
    controlGenotypeDict, missingControlIDs, resolvedControlIDs = controlGenotypeFutures.result()
    
    if len(missingCaseIDs) > 0 or len(missingControlIDs) > 0:
        for alias, IDs in {"caseAlias": missingCaseIDs, "controlAlias": missingControlIDs}.items():
            print(f"\nmissing {len(IDs)} {config['clinicalTable'][alias]} IDs:\n {IDs}")
    caseGenotypes = pd.DataFrame.from_dict(caseGenotypeDict)
    caseGenotypes.index.name = filteredVCF.index.name
    caseGenotypes.index = filteredVCF.index
    controlGenotypes = pd.DataFrame.from_dict(controlGenotypeDict)
    controlGenotypes.index.name = filteredVCF.index.name
    controlGenotypes.index = filteredVCF.index
    
    caseIDs = resolvedCaseIDs
    controlIDs = resolvedControlIDs
    
    print(f"\n{len(caseIDs)} cases:\n {caseIDs}")
    print(f"\n{len(controlIDs)} controls:\n {controlIDs}")
    # filter allele frequencies
    allGenotypes = pd.concat([caseGenotypes.dropna(how='any', axis=0), controlGenotypes.dropna(how='any', axis=0)], axis=1)
    filteredGenotypes = allGenotypes.loc[
        allGenotypes.gt(0).sum(axis=1).divide(len(allGenotypes.columns)) >= config['vcfLike']['minAlleleFrequency']]
    print(f"Filtered {len(filteredVCF) - len(filteredGenotypes)} alleles with frequency below {'{:.3%}'.format(config['vcfLike']['minAlleleFrequency'])}")
    print(f"Kept {len(filteredGenotypes)} alleles")
    
    caseGenotypes = filteredGenotypes.loc[:,caseGenotypes.columns]
    controlGenotypes = filteredGenotypes.loc[:,controlGenotypes.columns]
    
    return [caseGenotypes, caseIDs, controlGenotypes, controlIDs, filteredClinicalData]

caseGenotypes, caseIDs, controlGenotypes, controlIDs, clinicalData = await processInputFiles()
print(f"\nclinical data:\n{clinicalData.head()}")

  warn(msg)


100%|██████████| 922/922 [01:11<00:00, 12.85id/s]



100%|██████████| 2904/2904 [02:23<00:00, 20.17id/s][A



clinical data:
                       Quote    Data File ID ExternalSubjectId   
ExternalSampleId                                                 
CGND-HDA-05557    CGND_14852  CGND-HDA-05557       NEUUF013XXL  \
CGND-HDA-05556    CGND_14852  CGND-HDA-05556       NEUHM496PGR   
CGND-HDA-05555    CGND_14852  CGND-HDA-05555       NEUPK599KHH   
CGND-HDA-05554    CGND_14852  CGND-HDA-05554       NEUHD589CVP   
CGND-HDA-05553    CGND_14852  CGND-HDA-05553       NEUXX223WT8   

                              Project     Site Sample Collected   
ExternalSampleId                                                  
CGND-HDA-05557    ALS Natural History  Henry Ford Health System  \
CGND-HDA-05556    ALS Natural History  Henry Ford Health System   
CGND-HDA-05555    ALS Natural History  Henry Ford Health System   
CGND-HDA-05554    ALS Natural History  Henry Ford Health System   
CGND-HDA-05553    ALS Natural History  Henry Ford Health System   

                   Site Specimen Collected     Sex 

## Evaluate model stack

In [6]:
from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibrationDisplay
from sklearn.metrics import RocCurveDisplay, roc_auc_score, auc
from sklearn.preprocessing import MinMaxScaler
from skopt.plots import plot_convergence

from skopt import BayesSearchCV

from fastnumbers import check_real

from types import SimpleNamespace
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

import asyncio

import neptune
from neptune.types import File

import shap

from inspect import isclass
from io import StringIO
import traceback

# stop API errors when awaiting results
# !prefect config set PREFECT_RESULTS_PERSIST_BY_DEFAULT=True

@task()
def getFeatureImportances(model, data, featureLabels):
  """Get feature importances from fitted model and create SHAP explainer"""
  if model.__class__.__name__ == "MultinomialNB":
    modelCoefficientDF = pd.DataFrame()
    for i, c in enumerate(model.feature_count_[0] if len(model.feature_count_.shape) > 1 else model.feature_count_):
      modelCoefficientDF.loc[i, f"feature_importances_{config['clinicalTable']['controlAlias']}"] = model.feature_log_prob_[0][i]
      modelCoefficientDF.loc[i, f"feature_importances_{config['clinicalTable']['caseAlias']}"] = model.feature_log_prob_[1][i]
  elif hasattr(model, "coef_"):
    modelCoefficientDF = pd.DataFrame()
    if len(model.coef_.shape) > 1:
      try:
        modelCoefficientDF[f"feature_importances_{config['clinicalTable']['controlAlias']}"] = model.coef_[0]
        modelCoefficientDF[f"feature_importances_{config['clinicalTable']['caseAlias']}"] = model.coef_[1]
      except IndexError:
        modelCoefficientDF[f"feature_importances"] = model.coef_[0]                 
    else:
      modelCoefficientDF[f"feature_importances"] = model.coef_[0]
  elif hasattr(model, "feature_importances_"):
    modelCoefficientDF = pd.DataFrame()
    modelCoefficientDF[f"feature_importances"] = model.feature_importances_
  else:
    modelCoefficientDF = None

  if type(modelCoefficientDF) == pd.DataFrame:
      modelCoefficientDF.index = featureLabels
      modelCoefficientDF.index.name = "features"
      
  if config['model']['calculateShapelyExplanations']:
    # Cluster correlated and hierarchical features using masker
    masker = shap.maskers.Partition(data, clustering="correlation")
    shapExplainer = shap.explainers.Permutation(
      model.predict_proba if hasattr(model, "predict_proba") 
      else model.predict, 
      masker, 
      feature_names=["_".join(label)
        for label in featureLabels])
    shapValues = shapExplainer(data)
  else:
    shapExplainer = None
    shapValues = None
    masker = None
  return modelCoefficientDF, shapValues, shapExplainer, masker


@task()
def plotCalibration(title, labelsPredictionsByInstance):
  # code from https://scikit-learn.org/stable/auto_examples/calibration/plot_calibration_curve.html
  fig, ax_calibration_curve = plt.subplots(figsize=(10, 10))
  colors = plt.cm.get_cmap("Dark2")

  calibration_displays = {}
  for i, (name, (labels, predictions)) in enumerate(labelsPredictionsByInstance.items()):
      display = CalibrationDisplay.from_predictions(
          [config["clinicalTable"]["caseAlias"] if label == 1 else label 
            for label in labels],
          predictions,
          pos_label=config["clinicalTable"]["caseAlias"],
          n_bins=10,
          name=name,
          ax=ax_calibration_curve,
          color=colors(i),
      )
      calibration_displays[name] = display

  ax_calibration_curve.grid()
  ax_calibration_curve.set_title(title)

  # Add histogram
  # grid_positions = [(i+2,j) for i in range(len(predictionsByModelName.keys())//2) for j in range(2)]
  # for i, modelName in enumerate(predictionsByModelName.keys()):
  #     row, col = grid_positions[i]
  #     ax = fig.add_subplot(gs[row, col])
  #     ax.hist(
  #         calibration_displays[modelName].y_prob,
  #         range=(0, 1),
  #         bins=10,
  #         label=modelName,
  #         color=colors(i),
  #     )
  #     ax.set(title=modelName, xlabel="Mean predicted probability", ylabel="Count")

  plt.tight_layout()
  return fig

@task()
def plotAUC(title, labelsPredictionsByInstance):
  # trace AUC for each set of predictions
  tprs = []
  aucs = []
  mean_fpr = np.linspace(0, 1, 100)

  fig, ax = plt.subplots(figsize=(10, 10))
  for name, (labels, predictions) in labelsPredictionsByInstance.items():
    # plot ROC curve for this fold
    viz = RocCurveDisplay.from_predictions([
                          config["clinicalTable"]["caseAlias"] if label == 1 else label 
                          for label in labels], 
                        predictions,
                        name=name,
                        pos_label=config["clinicalTable"]["caseAlias"],
                        alpha=0.6, lw=2, ax=ax)
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)
    
  # summarize ROCs per fold and plot standard deviation
  ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
      label='Chance', alpha=.8)
  mean_tpr = np.mean(tprs, axis=0)
  mean_tpr[-1] = 1.0
  mean_auc = auc(mean_fpr, mean_tpr)
  std_auc = np.std(aucs)
  ax.plot(mean_fpr, mean_tpr, color='b',
          label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
          lw=4, alpha=.8)
  std_tpr = np.std(tprs, axis=0)
  tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
  tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
  ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                  label=r'$\pm$ 1 std. dev.')

  ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
        title=title)
  ax.legend(loc="lower right")
  ax.set(title=title)
  return fig


@task()
def plotConfusionMatrix():
  pass

@task()
def plotSampleAccuracy():
  pass

@task()
def plotOptimizer(title, resultsByInstance):
  # code from https://scikit-learn.org/stable/auto_examples/calibration/plot_calibration_curve.html
  fig = plt.figure(figsize=(10, 10))
  gs = GridSpec(2, 2)
  colors = plt.cm.get_cmap("Dark2")
  ax_convergence = fig.add_subplot(gs[:2, :2])
  plot_convergence(*[
    (modelName, result) for modelName, result in resultsByInstance.items()], 
    ax=ax_convergence, color=colors)
  ax_convergence.set(title=title)
  plt.tight_layout()
  return fig

@task()
def prepareDatasets(caseGenotypes, controlGenotypes, verbose=True):
    caseIDs = caseGenotypes.columns
    controlIDs = controlGenotypes.columns
    # store number of cases & controls
    caseControlCounts = [len(caseIDs), len(controlIDs)]
    # determine which has more samples
    labeledIDs = [caseIDs, controlIDs]
    majorIDs = labeledIDs[np.argmax(caseControlCounts)]
    minorIDs = labeledIDs[np.argmin(caseControlCounts)]
    # downsample larger group to match smaller group
    majorIndex = np.random.choice(np.arange(len(majorIDs)), min(caseControlCounts), replace=False)
    
    excessMajorIDs, balancedMajorIDs = [], []
    for index, id in enumerate(majorIDs):
      if index in majorIndex:
        balancedMajorIDs.append(id)
      else:
        excessMajorIDs.append(id)
    
    allGenotypes = pd.concat([caseGenotypes, controlGenotypes], axis=1)
    
    genotypeExcessIDs, crossValGenotypeIDs = [], []
    # match IDs between genotype and clinical data; dataframe labels have label suffixes
    unmatchedTrainIDs = balancedMajorIDs + minorIDs
    for label in tqdm(allGenotypes.columns, desc="Matching IDs", unit="ID"):
      for setType in ["excess", "train"]:
        idSet = excessMajorIDs if setType == "excess" else unmatchedTrainIDs
        for i, id in enumerate(idSet):
          if (id in label) or (label in id): 
            if setType == "train":
              if label not in crossValGenotypeIDs: crossValGenotypeIDs.append(label)
            elif setType == "excess":
              if label not in genotypeExcessIDs: genotypeExcessIDs.append(label)
            idSet = np.delete(idSet, i)
            break
          
    if verbose:
      print(f"\n{len(crossValGenotypeIDs)} for training:\n{crossValGenotypeIDs}")
      print(f"\n{len(genotypeExcessIDs)} are excess:\n{genotypeExcessIDs}")
      print(f"\nVariant count: {len(allGenotypes.index)}")
      
    samples = allGenotypes.loc[:, crossValGenotypeIDs].dropna(how='any')  # drop variants with missing values
    excessMajorSamples = allGenotypes.loc[:, genotypeExcessIDs]

    variantIndex = list(samples.index)
    pass
    scaler = MinMaxScaler()
    embedding = {
        'sampleIndex': crossValGenotypeIDs,
        'labels': np.array([1 if id in caseIDs else 0 for id in crossValGenotypeIDs]),
        'samples': scaler.fit_transform(samples).transpose(), # samples are now rows (samples, variants)
        'excessMajorIndex': genotypeExcessIDs,
        'excessMajorLabels': [1 if id in caseIDs else 0 for id in genotypeExcessIDs],
        'excessMajorSamples': scaler.fit_transform(excessMajorSamples).transpose(),
        'variantIndex': variantIndex,
    }
    return embedding
  
@task()
def optimizeHyperparameters(samples, labels, model, parameterSpace, cvIterator, metricFunction, n_jobs=1):
  # hyperparameter search (inner cross-validation)
  optimizer = BayesSearchCV(
    model, parameterSpace, cv=cvIterator, n_jobs=n_jobs, n_points=4,
    return_train_score=True, n_iter=100, scoring=metricFunction)
  # train / optimize parameters
  optimizer.fit(samples, labels)
  return optimizer

def serializeDataFrame(dataframe):
  stream = StringIO()
  dataframe.to_csv(stream)
  return File.from_stream(stream, extension='csv')

@task()
def beginTracking(model, runNumber, embedding, clinicalData, deserializedIDs):
  runTracker = neptune.init_run(project=f'{config["tracking"]["entity"]}/{config["tracking"]["project"]}', api_token=config['tracking']['token'])
  runTracker['sys/tags'].add(model.__class__.__name__)
  runTracker['bootstrapIteration'] = runNumber+1
  runTracker["config"] = {key: (item if check_real(item) or isinstance(item, str) else str(item)) for key, item in config.items()}
  
  runTracker['embedding'].upload(serializeDataFrame(pd.DataFrame(data=embedding["samples"], columns=embedding["variantIndex"], index=embedding["sampleIndex"])))
  runTracker['clinicalData'].upload(serializeDataFrame(clinicalData.loc[clinicalData.index.isin(deserializedIDs)]))
  
  runTracker['nVariants'] = len(embedding["variantIndex"])
  runID = runTracker["sys/id"].fetch()
  runTracker.stop()
  return runID

@task()
def trackResults(runID, current):
  runTracker = neptune.init_run(project=f'{config["tracking"]["entity"]}/{config["tracking"]["project"]}', with_id=runID, api_token=config['tracking']['token'])
  if config['model']['hyperparameterOptimization']:
      runTracker["modelParams"] = {k+1: current["fittedOptimizers"][k].best_params_ for k in range(config['sampling']['crossValIterations'])}

  runTracker["sampleResults"].upload(serializeDataFrame(pd.DataFrame.from_dict({
      "probability": [probability[1] for foldResults in current["probabilities"] for probability in foldResults], 
      "id": [id for foldResults in current["testIDs"] for id in foldResults]}, dtype=object).set_index("id")))
  
  if config['model']['calculateShapelyExplanations']:
    runTracker['shapExplanationsPerFold'].upload(File.as_pickle(current["localExplanations"]))
    runTracker['shapExplainersPerFold'].upload(File.as_pickle(current["shapExplainers"]))
    runTracker['shapMaskersPerFold'].upload(File.as_pickle(current["shapMaskers"]))
    runTracker["featureImportance/shapelyExplanations/average"].upload(serializeDataFrame(current['averageShapelyValues']))
 
  if current["globalExplanations"][0] is not None:
    runTracker[f"featureImportance/modelCoefficients/average"].upload(
      serializeDataFrame(current["averageGlobalExplanations"]))
 
  for k in range(config['sampling']['crossValIterations']):
    runTracker[f'trainIDs/{k+1}'].upload(serializeDataFrame(pd.Series(current["trainIDs"][k])))
    runTracker[f'testIDs/{k+1}'].upload(serializeDataFrame(pd.Series(current["testIDs"][k])))
    runTracker[f'testLabels/{k+1}'].upload(serializeDataFrame(pd.Series(current["testLabels"][k])))
    runTracker[f'trainLabels/{k+1}'].upload(serializeDataFrame(pd.Series(current["trainLabels"][k])))
    if current["globalExplanations"][k] is not None:
      runTracker[f"featureImportance/modelCoefficients/{k+1}"].upload(serializeDataFrame(current["globalExplanations"][k]))
    if config['model']['calculateShapelyExplanations']:
      runTracker[f"featureImportance/shapelyExplanations/{k+1}"].upload(serializeDataFrame(pd.DataFrame.from_dict({
        "feature_name": [name for name in current['localExplanations'][0].feature_names],
        "value": [np.mean(current['localExplanations'][k].values[featureIndex])
          for featureIndex in range(len(current['localExplanations'][0].feature_names))]
        }, dtype=object).set_index("feature_name")))
  
  runTracker["meanAUC"] = np.mean(current["testAUC"])
  # average sample count across folds
  runTracker['nTrain'] = np.mean([len(idList) for idList in current["trainIDs"]])
  runTracker['nTest'] = np.mean([len(idList) for idList in current["testIDs"]])
  runTracker.stop()
      
# parallel task runner patch https://github.com/PrefectHQ/prefect/issues/7319
# TODO build task runners only 
async def build_subflow(name, args):
  
  if name == 'classify':
    @flow(
      task_runner=ConcurrentTaskRunner())
    async def classify(runNumber, innerCvIterator, outerCvIterator):
      embedding = prepareDatasets(caseGenotypes, controlGenotypes, verbose=(True if runNumber == 0 else False))
      deserializedIDs = list()
      for id in embedding['sampleIndex']:
        deserializedIDs.extend(id.split("__"))
      totalSampleCount = len(embedding['samples'])
      caseCount = np.count_nonzero(embedding['labels'])
      print(f"{totalSampleCount} samples\n")
      print(f"{caseCount} cases\n")
      print(f"{totalSampleCount - caseCount} controls\n")
      results = {}
      results['samples'] = {}
      results['labels'] = {}
      results['models'] = {}
      trainIDs = set()
      testIDs = set()
      for(model, parameterSpace) in list(config['model']['stack'].items())[:1]:
        current = {}
        # check if model is initialized
        if isclass(model):
          if model.__name__ == 'TabNetClassifier':
            model = model(verbose=False, optimizer_fn=Lion)
        print(f"Iteration {runNumber+1} with model {model.__class__.__name__}")
        runID = beginTracking(model, runNumber, embedding, clinicalData, deserializedIDs)
        # outer cross-validation
        crossValIndices = np.array([
            (cvTrainIndices, cvTestIndices) 
            for (cvTrainIndices, cvTestIndices) in outerCvIterator.split(embedding['samples'], embedding['labels'])])
        current["trainIndices"] = crossValIndices[:,0]
        current["testIndices"] = crossValIndices[:,1]
        trainIDs.update(*[np.array(embedding['sampleIndex'])[indices] for indices in current["trainIndices"]])
        testIDs.update(*[np.array(embedding['sampleIndex'])[indices] for indices in current["testIndices"]])
        outerCrossValResults = zip(*await asyncio.gather(
          *[build_subflow('evaluate', (
              trainIndices, testIndices, model, embedding['labels'], embedding['samples'], embedding['variantIndex'], embedding['sampleIndex'], parameterSpace, innerCvIterator))
            for trainIndices, testIndices in zip(current["trainIndices"], current["testIndices"])
          ]))
        resultNames = ["globalExplanations", "localExplanations", "probabilities", "predictions", "testLabels", "trainLabels", "trainIDs", "testIDs", "fittedOptimizers", "shapExplainers", "shapMaskers"]
        current = {**current, **{name: result for name, result in zip(resultNames, outerCrossValResults)}}
        current["testAUC"] = [roc_auc_score(labels, (probabilities[:,1] if len(probabilities.shape) > 1 else probabilities))
          for labels, probabilities in zip(current["testLabels"], current["probabilities"])]
        
        if config['model']['calculateShapelyExplanations']:
          current['averageShapelyValues'] = pd.DataFrame.from_dict({
            "feature_name": [name for name in current['localExplanations'][0].feature_names],
            "value": [np.mean(
              np.hstack([np.mean(localExplanations.values[:,featureIndex]) for localExplanations in current['localExplanations']])
              ) for featureIndex in range(len(current['localExplanations'][0].feature_names))]
            }, dtype=object).set_index("feature_name")
          
        if current['globalExplanations'][0] is not None:
          current['averageGlobalExplanations'] = pd.concat(current["globalExplanations"]).reset_index().groupby("features").mean()
        
        caseAccuracy = np.mean([np.divide(np.count_nonzero(labels==predictions), len(labels)) for predictions, labels in zip(current["predictions"], current["testLabels"])])
        controlAccuracy = 1 - caseAccuracy
        trackResults(runID, current)
        
        # plot AUC & hyperparameter convergence
        plotSubtitle = f"""
              {config["tracking"]["name"]}, {embedding["samples"].shape[1]} variants
              Minor allele frequency over {'{:.1%}'.format(config['vcfLike']['minAlleleFrequency'])}
              
              {np.count_nonzero(embedding['labels'])} {config["clinicalTable"]["caseAlias"]}s @ {'{:.1%}'.format(caseAccuracy)} accuracy, {len(embedding['labels']) - np.count_nonzero(embedding['labels'])} {config["clinicalTable"]["controlAlias"]}s @ {'{:.1%}'.format(controlAccuracy)} accuracy
              {int(np.around(np.mean([len(indices) for indices in current["trainIndices"]])))}±1 train, {int(np.around(np.mean([len(indices) for indices in current["testIndices"]])))}±1 test samples per x-val fold"""
        results['models'][model.__class__.__name__] = current
        
        # record sample metrics
        for fold in range(config['sampling']['crossValIterations']):
          for j, sampleID in enumerate(current['testIDs'][fold]):
            try:
              results['samples'][sampleID] += current["probabilities"][fold][j]
            except KeyError:
              results['samples'][sampleID] = [current["probabilities"][fold][j]]
            finally:
              results['labels'][sampleID] = current["testLabels"][fold][j]
            
        await build_subflow('trackVisualizations', (runID, plotSubtitle, model.__class__.__name__, current))
      
      results['testCount'] = len(trainIDs)
      results['trainCount'] = len(testIDs)
      return results
    return await classify(*args)
  
  elif name == 'evaluate':
  
    @flow()
    async def evaluate(trainIndices, testIndices, model, labels, samples, variantIndex, sampleIndex, parameterSpace, cvIterator):
      if config['model']['hyperparameterOptimization']:
        fittedOptimizer = optimizeHyperparameters(
            samples[trainIndices], labels[trainIndices], model, 
            parameterSpace, cvIterator, 'neg_mean_squared_error')
        model.set_params(**fittedOptimizer.best_params_)
      else:
        fittedOptimizer = None
      model.fit(samples[trainIndices], labels[trainIndices])
      try:
        probabilities = model.predict_proba(samples[testIndices])
      except AttributeError:
        probabilities = model.predict(samples[testIndices])
        if len(probabilities.shape) <= 1:
          probabilities = np.array([[1 - p, p] for p in probabilities])
      predictions = np.argmax(probabilities, axis=1)
      modelValues, shapValues, shapExplainer, shapMasker = getFeatureImportances(
        model, samples[testIndices], variantIndex)
      globalExplanations = modelValues
      localExplanations = shapValues
      trainLabels = np.array(labels[trainIndices])
      testLabels = np.array(labels[testIndices])
      trainIDs = np.array([sampleIndex[i] for i in trainIndices]) 
      testIDs = np.array([sampleIndex[i] for i in testIndices])
      return globalExplanations, localExplanations, probabilities, predictions, testLabels, trainLabels, trainIDs, testIDs, fittedOptimizer, shapExplainer, shapMasker
    return await evaluate(*args)
  
  elif name == 'trackVisualizations':
  
    @flow()
    async def trackVisualizations(runID, plotSubtitle, modelName, current):
      runTracker = neptune.init_run(project=f'{config["tracking"]["entity"]}/{config["tracking"]["project"]}', with_id=runID, api_token=config['tracking']['token'])
      runTracker['plots/aucPlot'] = plotAUC(f"""
          Receiver Operating Characteristic (ROC) Curve
          {modelName} with {config['sampling']['crossValIterations']}-fold cross-validation
          {plotSubtitle}
          """,
          {f"Fold {k+1}": (current["testLabels"][k], np.array(current["probabilities"][k])[:,1])
            if len(current["probabilities"][k][0].shape) >= 1 
            else (current["testLabels"][k], current["probabilities"][k]) 
            for k in range(config['sampling']['crossValIterations'])},
          )
      if config['model']['hyperparameterOptimization']:
        runTracker['plots/convergencePlot'] = plotOptimizer(f"""
          Hyperparameter convergence, mean squared error
          {modelName} with {config['sampling']['crossValIterations']}-fold cross-validation
          {plotSubtitle}
          """,
          {f"Fold {k+1}": [result for result in current["fittedOptimizers"][k].optimizer_results_]
          for k in range(config['sampling']['crossValIterations'])},
          )
      
      # plot shapely feature importance
      if config['model']['calculateShapelyExplanations']:
        for j in range(config["sampling"]["crossValIterations"]):
          localExplanations = current["localExplanations"][j]
          caseExplanations = localExplanations
          caseExplanations.values = caseExplanations.values[:,:,1] if len(caseExplanations.values.shape) > 2 else caseExplanations.values
          heatmap = plt.figure()
          plt.title(f"""
            Shapely explanations from {modelName}
            Fold {j+1}
            {plotSubtitle}
            """) 
          shap.plots.heatmap(caseExplanations, show=False)
          runTracker[f"plots/featureHeatmap/{j+1}"] = heatmap
          plt.close(heatmap)
          labelsProbabilities = ((current["testLabels"][j], np.array(current["probabilities"][j])[:,1])
            if len(current["probabilities"][j][0].shape) >= 1 
            else (current["testLabels"][j], current["probabilities"][j]))
          stdDeviation = np.std((labelsProbabilities[1] - labelsProbabilities[0])**2)
          for k in range(len(current["testIDs"][j])):
            probability = labelsProbabilities[1][k] if isinstance(labelsProbabilities[1][k], np.ndarray) else labelsProbabilities[1][k]
            label = labelsProbabilities[0][k] if isinstance(labelsProbabilities[0][k], np.ndarray) else labelsProbabilities[0][k]
            if config['tracking']['plotAllSampleImportances'] or np.absolute((probability - label)**2) <= stdDeviation:
              sampleID=current['testIDs'][j][k]
              waterfallPlot = plt.figure()
              plt.title(f"""
                {sampleID}
                Shapely explanations from {modelName}
                Fold {j+1}
                {plotSubtitle}
                """)
              # patch parameter bug: https://github.com/slundberg/shap/issues/2362
              to_pass = SimpleNamespace(**{
                              'values': localExplanations[k].values,
                              'data': localExplanations[k].data,
                              'display_data': None,
                              'feature_names': localExplanations.feature_names,
                              'base_values': localExplanations[k].base_values[current['testLabels'][j][k]] if len(localExplanations[k].base_values.shape) == 1 else localExplanations[k].base_values, 
                })
              shap.plots.waterfall(to_pass, show=False)
              try:
                runTracker[f"plots/samples/{j+1}/{sampleID}"] = waterfallPlot
              except Exception:
                runTracker[f"plots/samples/{j+1}/{sampleID}"] = f"""failed to plot: {traceback.format_exc()}"""
              plt.close(waterfallPlot)
      plt.close('all')
      runTracker.stop()
    await trackVisualizations(*args)
    
@flow(
  task_runner=RayTaskRunner(
    init_kwargs={'address': parallelRunner.address_info['address'], 'configure_logging': True, 'logging_level': logging.WARN}
  ))
async def bootstrapSampling():
  # caseGenotypes, caseIDs, controlGenotypes, controlIDs, clinicalData = await processInputFiles()
  outerCvIterator = StratifiedKFold(n_splits=config['sampling']['crossValIterations'], shuffle=False)
  innerCvIterator = outerCvIterator
  projectTracker = neptune.init_project(project=f'{config["tracking"]["entity"]}/{config["tracking"]["project"]}', api_token=config['tracking']['token'])
  
  results = await asyncio.gather(
    *[build_subflow('classify', (i, innerCvIterator, outerCvIterator)) 
      for i in range(config['sampling']['bootstrapIterations'])])
  
  labelsProbabilitiesByModelName = dict()
  variantCount = 0
  lastVariantCount = 0
  sampleResults = {}
  
  for bootstrapResult in results:
    for sampleID in bootstrapResult['samples'].keys():
      flattenedProbabilities = np.array([
        prediction[1] if len(prediction.shape) > 2 
        else prediction
        for foldResult in bootstrapResult['samples'][sampleID] 
        for prediction in foldResult]) 
        # label, probability, accuracy
      if sampleID not in sampleResults:
        sampleResults[sampleID] = [
          bootstrapResult['labels'][sampleID], 
          [], 
          np.mean([np.ceil(caseProbability) == bootstrapResult['labels'][sampleID] 
                            for caseProbability in flattenedProbabilities])
          ]
      sampleResults[sampleID][1] += bootstrapResult['samples'][sampleID][:1]
      sampleResults[sampleID][2] = np.mean([sampleResults[sampleID][2], np.mean([np.ceil(caseProbability) == bootstrapResult['labels'][sampleID] 
                            for caseProbability in flattenedProbabilities])])
    
    for modelName in bootstrapResult['models'].keys():
      if modelName not in labelsProbabilitiesByModelName: labelsProbabilitiesByModelName[modelName] = [[], []]
      # append labels
      labelsProbabilitiesByModelName[modelName][0] = np.hstack(
        (labelsProbabilitiesByModelName[modelName][0],
        *bootstrapResult['models'][modelName]["testLabels"]))
      # append probabilities
      labelsProbabilitiesByModelName[modelName][1] = np.hstack(
        [labelsProbabilitiesByModelName[modelName][1], 
        np.concatenate(bootstrapResult['models'][modelName]["probabilities"])[:,1] 
          if len(bootstrapResult['models'][modelName]["probabilities"][0].shape) >= 1 
          else np.concatenate(bootstrapResult['models'][modelName]["probabilities"])])
      variantCount = bootstrapResult['models'][modelName]["globalExplanations"][0].shape[0]
      assert lastVariantCount == variantCount or lastVariantCount == 0
      lastVariantCount = variantCount
      
  sampleResultsDataFrame = pd.DataFrame.from_dict(sampleResults, orient="index", columns=["label", "probability", "accuracy"])
  sampleResultsDataFrame.index.name = "id"
  
  projectTracker["sampleResults"].upload(serializeDataFrame(sampleResultsDataFrame))
  
  for modelName in results[0]['models'].keys():
    if 'globalExplanations' not in results[0]['models'][modelName].keys(): continue
    globalExplanationsList = []
    for bootstrapResult in results:
      globalExplanationsList += bootstrapResult['models'][modelName]["globalExplanations"]
    averageGlobalExplanationsDataFrame = pd.concat(globalExplanationsList).reset_index().groupby("features").mean()
    projectTracker[f"averageModelCoefficients/{modelName}"].upload(serializeDataFrame(averageGlobalExplanationsDataFrame))

  if config['model']['calculateShapelyExplanations']:
    averageShapelyExplanationsDataFrame = pd.concat(
      [bootstrapResult['models'][modelName]['averageShapelyValues'] 
       for bootstrapResult in results for modelName in bootstrapResult['models'].keys()]
      ).reset_index().groupby("feature_name").mean()
    projectTracker["averageShapelyExplanations"].upload(serializeDataFrame(averageShapelyExplanationsDataFrame))

  caseAccuracy = sampleResultsDataFrame[sampleResultsDataFrame["label"] == 1]["accuracy"].mean()
  controlAccuracy = 1 - caseAccuracy
  
  bootstrapTrainCount = int(np.around(np.mean([bootstrapResult['trainCount'] for bootstrapResult in results])))
  bootstrapTestCount = int(np.around(np.mean([bootstrapResult['testCount'] for bootstrapResult in results])))
 
  plotSubtitle = f"""
  {config['sampling']['crossValIterations']}x cross-validation over {config['sampling']['bootstrapIterations']} bootstrap iterations
  {config["tracking"]["name"]}, {variantCount} variants
  Minor allele frequency over {'{:.1%}'.format(config['vcfLike']['minAlleleFrequency'])}
  
  {sampleResultsDataFrame['label'].value_counts()[1]} cases @ {'{:.1%}'.format(caseAccuracy)} accuracy, {sampleResultsDataFrame['label'].value_counts()[0]} controls @ {'{:.1%}'.format(controlAccuracy)} accuracy
  {bootstrapTrainCount}±1 train, {bootstrapTestCount}±1 test samples per bootstrap iteration"""

  projectTracker["aucPlot"].upload(plotAUC(f"""
    Receiver Operating Characteristic (ROC) Curve
    {plotSubtitle}
    """,
    labelsProbabilitiesByModelName,
    ))
  
  projectTracker["calibrationPlot"].upload(File.as_image(plotCalibration(f"""
    Calibration Curve
    {plotSubtitle}
    """,
    labelsProbabilitiesByModelName,
    )))
  
  if config['model']['hyperparameterOptimization']:
    projectTracker["convergencePlot"].upload(File.as_image(plotOptimizer(f"""
      Convergence Plot
      {plotSubtitle}
      """,
      {modelName: [result
                  for k in range(config['sampling']['bootstrapIterations'])
                  for foldOptimizer in results[k][modelName]["fittedOptimizers"]
                  for result in foldOptimizer.optimizer_results_
                  ] 
        for modelName in results[0].keys() if modelName != "testLabels"
      })))
  
  projectTracker.stop()
  return results

results = await bootstrapSampling()

https://new-ui.neptune.ai/ejmockler/ALS-NUPs-NoHyperParamOptimization/e/NUPNOPARAM-333
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 12 operations to synchronize with Neptune. Do not kill this process.
All 12 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://new-ui.neptune.ai/ejmockler/ALS-NUPs-NoHyperParamOptimization/e/NUPNOPARAM-333/metadata


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


Matching IDs: 100%|██████████| 2736/2736 [00:00<00:00, 5062.02ID/s]


1368 samples

684 cases

684 controls

Iteration 7 with model LinearSVC


https://new-ui.neptune.ai/ejmockler/ALS-NUPs-NoHyperParamOptimization/e/NUPNOPARAM-334
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 12 operations to synchronize with Neptune. Do not kill this process.
All 12 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://new-ui.neptune.ai/ejmockler/ALS-NUPs-NoHyperParamOptimization/e/NUPNOPARAM-334/metadata


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


Matching IDs: 100%|██████████| 2736/2736 [00:00<00:00, 5152.98ID/s]


1368 samples

684 cases

684 controls

Iteration 10 with model LinearSVC


https://new-ui.neptune.ai/ejmockler/ALS-NUPs-NoHyperParamOptimization/e/NUPNOPARAM-335
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 12 operations to synchronize with Neptune. Do not kill this process.
All 12 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://new-ui.neptune.ai/ejmockler/ALS-NUPs-NoHyperParamOptimization/e/NUPNOPARAM-335/metadata


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


Matching IDs: 100%|██████████| 2736/2736 [00:00<00:00, 5097.73ID/s]


1368 samples

684 cases

684 controls

Iteration 48 with model LinearSVC


https://new-ui.neptune.ai/ejmockler/ALS-NUPs-NoHyperParamOptimization/e/NUPNOPARAM-336
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 12 operations to synchronize with Neptune. Do not kill this process.
All 12 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://new-ui.neptune.ai/ejmockler/ALS-NUPs-NoHyperParamOptimization/e/NUPNOPARAM-336/metadata


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


Matching IDs: 100%|██████████| 2736/2736 [00:00<00:00, 5136.80ID/s]


1368 samples

684 cases

684 controls

Iteration 14 with model LinearSVC


https://new-ui.neptune.ai/ejmockler/ALS-NUPs-NoHyperParamOptimization/e/NUPNOPARAM-337
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 12 operations to synchronize with Neptune. Do not kill this process.
All 12 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://new-ui.neptune.ai/ejmockler/ALS-NUPs-NoHyperParamOptimization/e/NUPNOPARAM-337/metadata


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


Matching IDs: 100%|██████████| 2736/2736 [00:00<00:00, 4954.33ID/s]


1368 samples

684 cases

684 controls

Iteration 11 with model LinearSVC


https://new-ui.neptune.ai/ejmockler/ALS-NUPs-NoHyperParamOptimization/e/NUPNOPARAM-338
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 12 operations to synchronize with Neptune. Do not kill this process.
All 12 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://new-ui.neptune.ai/ejmockler/ALS-NUPs-NoHyperParamOptimization/e/NUPNOPARAM-338/metadata


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


Matching IDs: 100%|██████████| 2736/2736 [00:00<00:00, 5129.80ID/s]


1368 samples

684 cases

684 controls

Iteration 40 with model LinearSVC


https://new-ui.neptune.ai/ejmockler/ALS-NUPs-NoHyperParamOptimization/e/NUPNOPARAM-339
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 12 operations to synchronize with Neptune. Do not kill this process.
All 12 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://new-ui.neptune.ai/ejmockler/ALS-NUPs-NoHyperParamOptimization/e/NUPNOPARAM-339/metadata


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


Matching IDs: 100%|██████████| 2736/2736 [00:00<00:00, 4981.90ID/s]


1368 samples

684 cases

684 controls

Iteration 30 with model LinearSVC


https://new-ui.neptune.ai/ejmockler/ALS-NUPs-NoHyperParamOptimization/e/NUPNOPARAM-340
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 12 operations to synchronize with Neptune. Do not kill this process.
All 12 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://new-ui.neptune.ai/ejmockler/ALS-NUPs-NoHyperParamOptimization/e/NUPNOPARAM-340/metadata


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


Matching IDs: 100%|██████████| 2736/2736 [00:00<00:00, 4874.26ID/s]


1368 samples

684 cases

684 controls

Iteration 44 with model LinearSVC


https://new-ui.neptune.ai/ejmockler/ALS-NUPs-NoHyperParamOptimization/e/NUPNOPARAM-341
