# Whole-genome analysis workflow

In [2]:
# ~2 minutes to install 
#%pip install -U --no-cache-dir scikit-learn scikit-optimize prefect prefect-ray ray plotly openpyxl shap lion_pytorch pytorch_tabnet xgboost neptune pyspark pyarrow dill fastnumbers

In [1]:
from prefect import task, flow
from prefect.task_runners import ConcurrentTaskRunner
from prefect_ray.task_runners import RayTaskRunner
import ray

import pandas as pd
import numpy as np

import logging

!export PREFECT_LOGGING_LEVEL="WARNING"
ray.shutdown()
parallelRunner = ray.init(
  configure_logging=True,
  logging_level=logging.ERROR,
)
parallelRunner

0,1
Python version:,3.10.10
Ray version:,2.5.1


In [5]:
from sklearn.ensemble import (
    AdaBoostClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier
from pytorch_tabnet.tab_model import TabNetClassifier
from lion_pytorch import Lion

from skopt.space import Categorical, Integer, Real

from env import neptune_api_token

RadialBasisSVC = SVC
RadialBasisSVC.__name__ = "RadialBasisSVC"

clearHistory = False

config = {
    "vcfLike": {
        "path": "../notebook/Variant_report_NUPs_fixed_2022-03-28.xlsx",  # variant call table with annotations
        "sheet": "all cases vs all controls",  # sheet name if Excel spreadsheet
        "indexColumn": [
            "chrom",
            "position",
            "Gene",
        ],  # header that indexes variants (set as list with multiple columns)
        "compoundSampleIdDelimiter": "__",  # delimiter for compound sample IDs in column names
        "compoundSampleIdStartIndex": 1,  # index of first sample ID in compound sample ID
        "binarize": True,  # binarize variants to 0/1, or sum to weigh allele frequency
        "minAlleleFrequency": 0.05,  # filter out variants with allele frequency less than this
        # 'alleleModel': ['dominant', 'recessive', 'overDominant'],  # biallelic allele models to test on gene sets
        "filters": {},
    },  # TODO handle genotypes from related individuals
    "geneSets": {},  # TODO gene sets
    "tracking": {
        "name": "Nucleoporin genes",  # name of the experiment
        "entity": "ejmockler",
        "project": "ALS-NUPS-60",
        "plotAllSampleImportances": True,  # if calculating Shapely explanations, plot each sample in Neptune
        "token": neptune_api_token,
        "remote": False,  # if True, log to Neptune
    },
    "clinicalTable": {
        "path": "../notebook/ACWM.xlsx",  # clinical data as Excel spreadsheet
        "idColumn": "ExternalSampleId",  # genotype ID header
        "subjectIdColumn": "ExternalSubjectId",  # unique ID for each patient
        "labelColumn": "Subject Group",  # header that has case/control labels
        "controlLabels": [
            "Non-Neurological Control"
        ],  # these labels include external sample IDs (like 1000 Genomes)
        "caseLabels": ["ALS Spectrum MND"],  # "ALS Spectrum MND"
        "controlAlias": "control",
        "caseAlias": "case",
        "filters": "pct_european>=0.85",  # filter out nonhomogenous samples with less than 85% European ancestry
    },
    "externalTables": {
        "path": [
            "../notebook/igsr-1000 genomes phase 3 release.tsv",
            # "../notebook/ALS-NUPS-2000__accurateSamples_>=97.5%.csv",
            "../notebook/ACWM_ethnicallyVariable.tsv",
            "../notebook/ACWM_ethnicallyVariable.tsv",
            "../notebook/igsr-1000 genomes phase 3 release.tsv",
        ],  # external sample table
        "label": [
            "control",
            # "case",
            "case",
            "control",
            "control",
        ],  # case | control
        "setType": [
            "crossval",
            # "crossval",
            "holdout",
            "holdout",
            "holdout",
        ],
        "idColumn": [
            "Sample name",
            # "id",
            "ExternalSubjectId",
            "ExternalSubjectId",
            "Sample name",
        ],  # sample ID header
        "filters": [
            "`Superpopulation code`=='EUR' & `Population name`!='Finnish'",  # remove finnish samples due to unusual homogeneity (verify w/ PCA)
            # "`testLabel`==1",
            "`Subject Group`=='ALS Spectrum MND' & `pct_european`<0.85",
            "`Subject Group`=='Non-Neurological Control' & `pct_european`<0.85",
            "`Superpopulation code`!='EUR' & `Population name`!='Finnish'",
        ],
    },
    "sampling": {
        "bootstrapIterations": 2,
        "crossValIterations": 2,  # number of validations per bootstrap iteration
        "holdoutSplit": 0.1,
        "lastIteration": 0,
        "sequesteredIDs": [],
    },
    "model": {
        "hyperparameterOptimization": True,
        "calculateShapelyExplanations": False,
    },
}

 
async def remove_all_flows():
  from prefect.client import get_client
  orion_client = get_client()
  flows = await orion_client.read_flows()
  for flow in flows:
    flow_id = flow.id
    print(f"Deleting flow: {flow.name}, {flow_id}")
    await orion_client._client.delete(f"/flows/{flow_id}")
    print(f"Flow with UUID {flow_id} deleted")

if clearHistory: await remove_all_flows()

In [6]:
from prefect import unmapped
from tqdm import tqdm

from tasks.input import processInputFiles

(caseGenotypes,
caseIDs,
holdoutCaseGenotypes,
holdoutCaseIDs,
controlGenotypes,
controlIDs,
holdoutControlGenotypes,
holdoutControlIDs,
clinicalData) = await processInputFiles(config)

print(f"\nclinical data:\n{clinicalData.head()}")

  warn(msg)


100%|██████████| 922/922 [01:44<00:00,  8.86id/s]s]
 57%|█████▋    | 1652/2904 [01:44<02:05,  9.98id/s]

100%|██████████| 2904/2904 [02:52<00:00, 16.82id/s]


100%|██████████| 319/319 [00:36<00:00,  8.75id/s]]
 10%|▉         | 429/4309 [00:36<05:23, 11.99id/s]

100%|██████████| 4309/4309 [02:27<00:00, 29.22id/s] 



clinical data:
                       Quote    Data File ID ExternalSubjectId   
ExternalSampleId                                                 
CGND-HDA-05557    CGND_14852  CGND-HDA-05557       NEUUF013XXL  \
CGND-HDA-05556    CGND_14852  CGND-HDA-05556       NEUHM496PGR   
CGND-HDA-05555    CGND_14852  CGND-HDA-05555       NEUPK599KHH   
CGND-HDA-05554    CGND_14852  CGND-HDA-05554       NEUHD589CVP   
CGND-HDA-05553    CGND_14852  CGND-HDA-05553       NEUXX223WT8   

                              Project     Site Sample Collected   
ExternalSampleId                                                  
CGND-HDA-05557    ALS Natural History  Henry Ford Health System  \
CGND-HDA-05556    ALS Natural History  Henry Ford Health System   
CGND-HDA-05555    ALS Natural History  Henry Ford Health System   
CGND-HDA-05554    ALS Natural History  Henry Ford Health System   
CGND-HDA-05553    ALS Natural History  Henry Ford Health System   

                   Site Specimen Collected     Sex 

In [17]:
def findBaselineFeature(caseGenotypes, controlGenotypes):
    # calculate the mean of each feature for cases and controls
    mean_cases = caseGenotypes.mean(axis=1)
    mean_controls = controlGenotypes.mean(axis=1)

    # calculate the absolute difference in means for each feature
    diff_means = abs(mean_cases - mean_controls)

    # get the feature with the largest difference in means
    selected_feature = diff_means.idxmax()

    print("Selected Feature for baseline perplexity: ", selected_feature)
    return selected_feature

caseGenotypes.loc[findBaselineFeature(caseGenotypes, controlGenotypes)]


Selected Feature for baseline perplexity:  ('6', '17675015', 'NUP153')


ALS__CGND-HDA-04091__NEUHF998PCY         1.0
aals-ALS__CGND-HDA-04089__NEUEU419NMF    1.0
aals-ALS__CGND-HDA-04086__NEUDH813DE6    1.0
aals-ALS__CGND-HDA-04085__NEUXZ486GG5    0.0
aals-ALS__CGND-HDA-04084__NEUHZ364FZW    0.0
                                        ... 
ALS__CGND-HDA-00013__UP-WGS-196          1.0
ALS__CGND-HDA-00012__UP-WGS-195          1.0
ALS__CGND-HDA-00008__UP-WGS-191          0.0
ALS__CGND-HDA-00004__UP-WGS-187          1.0
ALS__CGND-HDA-00001__UP-WGS-185          0.0
Name: (6, 17675015, NUP153), Length: 2052, dtype: float64

In [46]:
from joblib import Parallel, delayed
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

from models import stack as modelStack
from mlStack import bootstrap, serializeBootstrapResults

from metaconfig import metaconfig


def relativePerplexity(y_true, y_pred, y_true_baseline, y_pred_baseline, epsilon=1e-15):
    samplePerplexity = perplexity(y_true, y_pred)
    baselineSamplePerplexity = perplexity(y_true_baseline, y_pred_baseline)
    
    return pd.Series([samplePerplexity, baselineSamplePerplexity, np.divide(
            samplePerplexity, baselineSamplePerplexity + epsilon
        )])  # relative perplexity = perplexity / perplexity of model with single-most case correlated feature

def getBaselineFeatureResults(
    caseGenotypes,
    controlGenotypes,
    holdoutCaseGenotypes,
    holdoutControlGenotypes,
    clinicalData,
    config
):
    selectedFeature = findBaselineFeature(caseGenotypes, controlGenotypes)
    outerCvIterator = StratifiedKFold(
        n_splits=config["sampling"]["crossValIterations"], shuffle=False
    )
    innerCvIterator = outerCvIterator

    bootstrap_args = [
        (
            caseGenotypes.loc[[selectedFeature]],
            controlGenotypes.loc[[selectedFeature]],
            holdoutCaseGenotypes.loc[[selectedFeature]],
            holdoutControlGenotypes.loc[[selectedFeature]],
            clinicalData,
            model,
            hyperParameterSpace,
            innerCvIterator,
            outerCvIterator,
            config,
            False,  # disable tracking
        )
        for model, hyperParameterSpace in list(modelStack.items())
    ]
    results = Parallel(n_jobs=-1)(delayed(bootstrap)(*args) for args in bootstrap_args)

    baselineFeatureResults = {}
    for i in range(len(modelStack)):
        modelResults = results[i]
        baselineFeatureResults = serializeBootstrapResults(
            modelResults, baselineFeatureResults
        )
    baselineFeatureResultsDataframe = pd.DataFrame.from_dict(
        baselineFeatureResults,
        orient="index",
        columns=["label", "probability", "accuracy"],
    )
    baselineFeatureResultsDataframe.index.name = "id"

    return baselineFeatureResultsDataframe, selectedFeature


def perplexity(y_true, y_pred):
    crossEntropy = log_loss(
        y_true, y_pred, labels=[0, 1], eps=1e-15
    )  # linear predictions (exactly 0 or 1) depend on offset of 1e-15 when log is applied to avoid NaN

    # perplexity = 2 ^ crossEntropy
    return np.power(2, crossEntropy)


def findBaselineFeature(caseGenotypes, controlGenotypes):
    # calculate the mean of each feature for cases and controls
    mean_cases = caseGenotypes.mean(axis=1)
    mean_controls = controlGenotypes.mean(axis=1)

    # calculate the absolute difference in means for each feature
    diff_means = abs(mean_cases - mean_controls)

    # get the feature with the largest difference in means
    selected_feature = diff_means.idxmax()

    print("Selected feature for baseline perplexity: ", selected_feature)
    return selected_feature


currentResults = pd.read_csv(
            f"projects/{config['tracking']['project']}__1/sampleResults.csv",
            index_col="id",
        )
baselineFeatureResults, selectedFeature = getBaselineFeatureResults(
    caseGenotypes,
    controlGenotypes,
    holdoutCaseGenotypes,
    holdoutControlGenotypes,
    clinicalData,
    config
)
# serialize probability arrays from string
currentResults["probability"] = currentResults["probability"].apply(lambda x: np.array(eval(x))[:, 1])
# take intersection of bootstrapped samples
currentResults = currentResults.loc[baselineFeatureResults.index.intersection(currentResults.index)]
baselineFeatureResults = baselineFeatureResults.loc[currentResults.index]
currentResults["baselineProbability"] = baselineFeatureResults["probability"]

relativePerplexities = pd.DataFrame(index=currentResults.index)
new_cols = currentResults.apply(
    lambda row: relativePerplexity(
        [row["label"]] * len(row["probability"]),
        row["probability"],
        [row["label"]] * len(row["baselineProbability"]),
        row["baselineProbability"],
    ),
    axis=1,
    result_type='expand'
)

relativePerplexities["all features"], relativePerplexities[f"{selectedFeature}"], relativePerplexities["relative"] = new_cols[0], new_cols[1], new_cols[2]


Selected Feature for baseline perplexity:  ('6', '17675015', 'NUP153')


07:09:54.819 | [36mINFO[0m    | prefect.engine - Created flow run[35m 'hypnotic-rhino'[0m for flow[1;35m 'bootstrap'[0m
07:09:54.821 | [36mINFO[0m    | Flow run[35m 'hypnotic-rhino'[0m - View at [94mhttp://127.0.0.1:4200/flow-runs/flow-run/0ef8bffd-2ca9-40c9-a762-999698ca0e13[0m
07:09:54.821 | [36mINFO[0m    | prefect.task_runner.ray - Creating a local Ray instance
07:09:56.070 | [36mINFO[0m    | prefect.engine - Created flow run[35m 'precious-monkey'[0m for flow[1;35m 'bootstrap'[0m
07:09:56.072 | [36mINFO[0m    | Flow run[35m 'precious-monkey'[0m - View at [94mhttp://127.0.0.1:4200/flow-runs/flow-run/f90a6365-f897-402d-b100-7b49e61aca76[0m
07:09:56.072 | [36mINFO[0m    | prefect.task_runner.ray - Creating a local Ray instance
2023-07-01 07:09:56,363	INFO worker.py:1636 -- Started a local Ray instance.
07:09:57.035 | [36mINFO[0m    | prefect.task_runner.ray - Using Ray cluster with 1 nodes.
2023-07-01 07:09:57,612	INFO worker.py:1636 -- Started a local Ra


1368 for training:
['aals-ALS__CGND-HDA-04086__NEUDH813DE6', 'aals-ALS__CGND-HDA-04083__NEURN392PGA', 'aals-ALS__CGND-HDA-04072__NEUHR014RCJ', 'aals-ALS__CGND-HDA-04068__NEUAX021NPV', 'aals-ALS__CGND-HDA-04063__NEUJU951LU2', 'aals-ALS__CGND-HDA-04062__NEUKD887WR2', 'aals-ALS__CGND-HDA-04060__NEUUD295LJT', 'aals-ALS__CGND-HDA-04059__NEUJH152CX9', 'aals-ALS__CGND-HDA-04054__NEUEX525RTD', 'aals-ALS__CGND-HDA-04053__NEUYE187ALF', 'aals-ALS__CGND-HDA-04052__NEUCU166PBM', 'ALS__CGND-HDA-04039__NEUXB638VWD', 'ALS__CGND-HDA-04037__NEUXG797NG8', 'ALS__CGND-HDA-04029__NEUGK689BJ1', 'ALS__CGND-HDA-04026__NEUDC217VWH', 'ALS__CGND-HDA-04024__TD-ALS-189', 'ALS__CGND-HDA-04022__TD-ALS-187', 'ALS__CGND-HDA-04017__TD-ALS-182', 'ALS__CGND-HDA-04012__TD-ALS-177', 'ALS__CGND-HDA-04009__TD-ALS-174', 'ALS__CGND-HDA-03997__TD-ALS-162', 'ALS__CGND-HDA-03995__TD-ALS-160', 'ALS__CGND-HDA-03991__TD-ALS-156', 'ALS__CGND-HDA-03984__NEUYZ797CJH', 'ALS__CGND-HDA-03976__NEUYW510RCG', 'ALS__CGND-HDA-03973__NEUCJ782FW

Matching IDs: 100%|██████████| 4967/4967 [00:01<00:00, 3332.93ID/s]



1368 for training:
['ALS__CGND-HDA-04091__NEUHF998PCY', 'aals-ALS__CGND-HDA-04089__NEUEU419NMF', 'aals-ALS__CGND-HDA-04083__NEURN392PGA', 'aals-ALS__CGND-HDA-04079__NEUMT573TE9', 'aals-ALS__CGND-HDA-04073__NEUBZ512CWM', 'aals-ALS__CGND-HDA-04062__NEUKD887WR2', 'aals-ALS__CGND-HDA-04060__NEUUD295LJT', 'aals-ALS__CGND-HDA-04056__NEUYK661FBQ', 'aals-ALS__CGND-HDA-04054__NEUEX525RTD', 'aals-ALS__CGND-HDA-04052__NEUCU166PBM', 'ALS__CGND-HDA-04041__NEUZN534BRL', 'ALS__CGND-HDA-04040__NEUPX734FKW', 'ALS__CGND-HDA-04039__NEUXB638VWD', 'ALS__CGND-HDA-04035__NEUCD014NG4', 'ALS__CGND-HDA-04031__NEUPA903TLQ', 'ALS__CGND-HDA-04030__NEURY111ZM2', 'ALS__CGND-HDA-04026__NEUDC217VWH', 'ALS__CGND-HDA-04017__TD-ALS-182', 'ALS__CGND-HDA-04014__TD-ALS-179', 'ALS__CGND-HDA-04010__TD-ALS-175', 'ALS__CGND-HDA-04009__TD-ALS-174', 'ALS__CGND-HDA-04005__TD-ALS-170', 'ALS__CGND-HDA-04000__TD-ALS-165', 'ALS__CGND-HDA-03998__TD-ALS-163', 'ALS__CGND-HDA-03997__TD-ALS-162', 'ALS__CGND-HDA-03995__TD-ALS-160', 'ALS__C

Matching IDs: 100%|██████████| 4967/4967 [00:01<00:00, 3065.88ID/s]
Matching IDs:  11%|█         | 542/4967 [00:00<00:01, 2339.17ID/s]


1368 for training:
['aals-ALS__CGND-HDA-04086__NEUDH813DE6', 'aals-ALS__CGND-HDA-04081__NEUAD952KAZ', 'aals-ALS__CGND-HDA-04073__NEUBZ512CWM', 'aals-ALS__CGND-HDA-04072__NEUHR014RCJ', 'aals-ALS__CGND-HDA-04071__NEUCD063FGD', 'aals-ALS__CGND-HDA-04066__NEUGV456PJ3', 'aals-ALS__CGND-HDA-04062__NEUKD887WR2', 'aals-ALS__CGND-HDA-04056__NEUYK661FBQ', 'aals-ALS__CGND-HDA-04052__NEUCU166PBM', 'ALS__CGND-HDA-04048__NEUVR250XF0', 'ALS__CGND-HDA-04047__NEUKW580AKZ', 'ALS__CGND-HDA-04041__NEUZN534BRL', 'ALS__CGND-HDA-04040__NEUPX734FKW', 'ALS__CGND-HDA-04037__NEUXG797NG8', 'ALS__CGND-HDA-04035__NEUCD014NG4', 'ALS__CGND-HDA-04030__NEURY111ZM2', 'ALS__CGND-HDA-04026__NEUDC217VWH', 'ALS__CGND-HDA-04022__TD-ALS-187', 'ALS__CGND-HDA-04018__TD-ALS-183', 'ALS__CGND-HDA-04017__TD-ALS-182', 'ALS__CGND-HDA-04014__TD-ALS-179', 'ALS__CGND-HDA-04013__TD-ALS-178', 'ALS__CGND-HDA-04009__TD-ALS-174', 'ALS__CGND-HDA-04006__TD-ALS-171', 'ALS__CGND-HDA-04002__TD-ALS-167', 'ALS__CGND-HDA-03996__TD-ALS-161', 'ALS__C

Matching IDs:  25%|██▌       | 1260/4967 [00:00<00:01, 2291.06ID/s]07:10:52.334 | [36mINFO[0m    | Task run 'prepareDatasets-0' - Finished in state [32mCompleted[0m()
Matching IDs:  31%|███       | 1520/4967 [00:00<00:01, 2390.86ID/s]

1368 samples

684 cases

684 controls

460 holdout samples

230 holdout cases

230 holdout controls

Iteration 1 with model LinearSVC


Matching IDs:  45%|████▌     | 2251/4967 [00:00<00:01, 2401.63ID/s]07:10:52.725 | [36mINFO[0m    | Task run 'prepareDatasets-0' - Finished in state [32mCompleted[0m()
Matching IDs:   9%|▉         | 469/4967 [00:00<00:01, 2358.47ID/s]07:10:52.856 | [36mINFO[0m    | Task run 'prepareDatasets-0' - Finished in state [32mCompleted[0m()
Matching IDs:  58%|█████▊    | 2857/4967 [00:01<00:00, 2709.87ID/s]

1368 samples

684 cases

684 controls

460 holdout samples

230 holdout cases

230 holdout controls

Iteration 1 with model XGBClassifier
1368 samples

684 cases

684 controls

460 holdout samples

230 holdout cases

230 holdout controls

Iteration 1 with model AdaBoostClassifier


Matching IDs: 100%|██████████| 4967/4967 [00:02<00:00, 2318.44ID/s]
Matching IDs:  58%|█████▊    | 2899/4967 [00:01<00:00, 2497.23ID/s]


1368 for training:
['aals-ALS__CGND-HDA-04089__NEUEU419NMF', 'aals-ALS__CGND-HDA-04086__NEUDH813DE6', 'aals-ALS__CGND-HDA-04085__NEUXZ486GG5', 'aals-ALS__CGND-HDA-04084__NEUHZ364FZW', 'aals-ALS__CGND-HDA-04082__NEUTB997GDW', 'aals-ALS__CGND-HDA-04081__NEUAD952KAZ', 'aals-ALS__CGND-HDA-04079__NEUMT573TE9', 'aals-ALS__CGND-HDA-04076__NEUDZ810BCM', 'aals-ALS__CGND-HDA-04072__NEUHR014RCJ', 'aals-ALS__CGND-HDA-04071__NEUCD063FGD', 'aals-ALS__CGND-HDA-04064__NEUKW840TXJ', 'aals-ALS__CGND-HDA-04063__NEUJU951LU2', 'aals-ALS__CGND-HDA-04062__NEUKD887WR2', 'aals-ALS__CGND-HDA-04060__NEUUD295LJT', 'aals-ALS__CGND-HDA-04055__NEUPW567EGG', 'ALS__CGND-HDA-04047__NEUKW580AKZ', 'ALS__CGND-HDA-04045__NEUWE409BEK', 'ALS__CGND-HDA-04044__NEUEM180ZTU', 'ALS__CGND-HDA-04040__NEUPX734FKW', 'ALS__CGND-HDA-04035__NEUCD014NG4', 'ALS__CGND-HDA-04024__TD-ALS-189', 'ALS__CGND-HDA-04022__TD-ALS-187', 'ALS__CGND-HDA-04018__TD-ALS-183', 'ALS__CGND-HDA-04012__TD-ALS-177', 'ALS__CGND-HDA-04008__TD-ALS-173', 'ALS__CGN

Matching IDs:  75%|███████▍  | 3720/4967 [00:01<00:00, 2182.81ID/s]07:10:54.624 | [36mINFO[0m    | Task run 'prepareDatasets-0' - Finished in state [32mCompleted[0m()
Matching IDs:  79%|███████▉  | 3940/4967 [00:01<00:00, 2129.60ID/s]The objective has been evaluated at this point before.
Matching IDs:  84%|████████▎ | 4154/4967 [00:01<00:00, 2088.04ID/s]

1368 samples

684 cases

684 controls

460 holdout samples

230 holdout cases

230 holdout controls

Iteration 1 with model RandomForestClassifier


Matching IDs: 100%|██████████| 4967/4967 [00:02<00:00, 2164.14ID/s]
Matching IDs: 100%|██████████| 4967/4967 [00:02<00:00, 2165.44ID/s]
Matching IDs:  92%|█████████▏| 4573/4967 [00:02<00:00, 2074.28ID/s]


1368 for training:
['ALS__CGND-HDA-04091__NEUHF998PCY', 'aals-ALS__CGND-HDA-04089__NEUEU419NMF', 'aals-ALS__CGND-HDA-04085__NEUXZ486GG5', 'aals-ALS__CGND-HDA-04081__NEUAD952KAZ', 'aals-ALS__CGND-HDA-04076__NEUDZ810BCM', 'aals-ALS__CGND-HDA-04073__NEUBZ512CWM', 'aals-ALS__CGND-HDA-04072__NEUHR014RCJ', 'aals-ALS__CGND-HDA-04067__NEUJA207UUV', 'aals-ALS__CGND-HDA-04066__NEUGV456PJ3', 'aals-ALS__CGND-HDA-04063__NEUJU951LU2', 'aals-ALS__CGND-HDA-04056__NEUYK661FBQ', 'aals-ALS__CGND-HDA-04053__NEUYE187ALF', 'ALS__CGND-HDA-04051__NEUNN067PW8', 'ALS__CGND-HDA-04041__NEUZN534BRL', 'ALS__CGND-HDA-04035__NEUCD014NG4', 'ALS__CGND-HDA-04031__NEUPA903TLQ', 'ALS__CGND-HDA-04026__NEUDC217VWH', 'ALS__CGND-HDA-04023__TD-ALS-188', 'ALS__CGND-HDA-04022__TD-ALS-187', 'ALS__CGND-HDA-04017__TD-ALS-182', 'ALS__CGND-HDA-04014__TD-ALS-179', 'ALS__CGND-HDA-04013__TD-ALS-178', 'ALS__CGND-HDA-04010__TD-ALS-175', 'ALS__CGND-HDA-04006__TD-ALS-171', 'ALS__CGND-HDA-04005__TD-ALS-170', 'ALS__CGND-HDA-04002__TD-ALS-167

The objective has been evaluated at this point before.
Matching IDs: 100%|██████████| 4967/4967 [00:02<00:00, 2130.52ID/s]



1368 for training:
['aals-ALS__CGND-HDA-04086__NEUDH813DE6', 'aals-ALS__CGND-HDA-04082__NEUTB997GDW', 'aals-ALS__CGND-HDA-04080__NEUWM375BWE', 'aals-ALS__CGND-HDA-04078__NEUTE443RWG', 'aals-ALS__CGND-HDA-04074__NEUNY753VVK', 'aals-ALS__CGND-HDA-04073__NEUBZ512CWM', 'aals-ALS__CGND-HDA-04071__NEUCD063FGD', 'aals-ALS__CGND-HDA-04065__NEULP450TP2', 'aals-ALS__CGND-HDA-04063__NEUJU951LU2', 'aals-ALS__CGND-HDA-04062__NEUKD887WR2', 'aals-ALS__CGND-HDA-04059__NEUJH152CX9', 'aals-ALS__CGND-HDA-04056__NEUYK661FBQ', 'aals-ALS__CGND-HDA-04055__NEUPW567EGG', 'ALS__CGND-HDA-04044__NEUEM180ZTU', 'ALS__CGND-HDA-04037__NEUXG797NG8', 'ALS__CGND-HDA-04036__NEUMV490RGY', 'ALS__CGND-HDA-04035__NEUCD014NG4', 'ALS__CGND-HDA-04026__NEUDC217VWH', 'ALS__CGND-HDA-04021__TD-ALS-186', 'ALS__CGND-HDA-04020__TD-ALS-185', 'ALS__CGND-HDA-04017__TD-ALS-182', 'ALS__CGND-HDA-04016__TD-ALS-181', 'ALS__CGND-HDA-04014__TD-ALS-179', 'ALS__CGND-HDA-03996__TD-ALS-161', 'ALS__CGND-HDA-03993__TD-ALS-158', 'ALS__CGND-HDA-03992_

The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
07:10:55.604 | [36mINFO[0m    | Task run 'prepareDatasets-0' - Finished in state [32mCompleted[0m()
07:10:55.605 | [36mINFO[0m    | Task run 'prepareDatasets-0' - Finished in state [32mCompleted[0m()
The objective has been evaluated at this point before.


1368 samples

684 cases

684 controls

460 holdout samples

230 holdout cases

230 holdout controls

Iteration 1 with model RadialBasisSVC
1368 samples

684 cases

684 controls

460 holdout samples

230 holdout cases

230 holdout controls

Iteration 1 with model BernoulliNB


07:10:56.104 | [36mINFO[0m    | Task run 'prepareDatasets-0' - Finished in state [32mCompleted[0m()
The objective has been evaluated at this point before.


1368 samples

684 cases

684 controls

460 holdout samples

230 holdout cases

230 holdout controls

Iteration 1 with model LogisticRegression


The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The object

1368 samples

684 cases

684 controls

460 holdout samples

230 holdout cases

230 holdout controls

Iteration 2 with model LinearSVC


Matching IDs:  27%|██▋       | 1321/4967 [00:00<00:01, 2624.30ID/s]The objective has been evaluated at this point before.
Matching IDs:  88%|████████▊ | 4393/4967 [00:01<00:00, 2055.99ID/s]The objective has been evaluated at this point before.
Matching IDs: 100%|██████████| 4967/4967 [00:01<00:00, 2558.85ID/s]
Matching IDs:  80%|███████▉  | 3972/4967 [00:01<00:00, 3037.16ID/s]The objective has been evaluated at this point before.
Matching IDs: 100%|██████████| 4967/4967 [00:01<00:00, 2947.21ID/s]
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
07:11:45.231 | [36mINFO[0m    | Flow run[35m 'naughty-moose'[0m - Created task run 'prepareDatasets-0' for task 'prepareDatasets'
07:11:45.232 | [36mINFO[0m    | Flow run[35m 'naughty-moose'[0m - Executing 'prepareDatasets-0' immediately...
07:11:45.289 | [36mINFO[0m 

1368 samples

684 cases

684 controls

460 holdout samples

230 holdout cases

230 holdout controls

Iteration 2 with model BernoulliNB
1368 samples

684 cases

684 controls

460 holdout samples

230 holdout cases

230 holdout controls

Iteration 2 with model XGBClassifier


07:11:45.456 | [36mINFO[0m    | Flow run[35m 'savvy-inchworm'[0m - Created task run 'prepareDatasets-0' for task 'prepareDatasets'
07:11:45.457 | [36mINFO[0m    | Flow run[35m 'savvy-inchworm'[0m - Executing 'prepareDatasets-0' immediately...
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
07:11:47.155 | [36mINFO[0m    | 

1368 samples

684 cases

684 controls

460 holdout samples

230 holdout cases

230 holdout controls

Iteration 2 with model RadialBasisSVC
1368 samples

684 cases

684 controls

460 holdout samples

230 holdout cases

230 holdout controls

Iteration 2 with model LogisticRegression


07:11:54.475 | [36mINFO[0m    | Task run 'prepareDatasets-0' - Finished in state [32mCompleted[0m()
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
07:11:55.945 | [36mINFO[0m    | Flow run[35m 'rough-mantis'[0m - Created subflow run[35m 'uncovered-dogfish'[0m for flow[1;35m 'classify'[0m
07:11:55.946 | [36mINFO[0m    | Flow run[35m 'uncovered-dogfish'[0m - View at [94mhttp://127.0.0.1:4200/flow-runs/flow-run/48ef2eeb-6980-4038-bf26-8dc5eb1b93b4[0m
07:11:55.948 | [36mINFO[0m    | prefect.task_runner.ray - Local Ray instance is a

1368 samples

684 cases

684 controls

460 holdout samples

230 holdout cases

230 holdout controls

Iteration 2 with model AdaBoostClassifier


The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
07:12:01.374 | [36mINFO[0m    | Flow run[35m 'careful-dove'[0m - Finished in state [32mCompleted[0m()
The objectiv

1368 samples

684 cases

684 controls

460 holdout samples

230 holdout cases

230 holdout controls

Iteration 2 with model RandomForestClassifier


07:12:04.980 | [36mINFO[0m    | Flow run[35m 'hypnotic-rhino'[0m - Finished in state [32mCompleted[0m()
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
07:12:05.685 | [36mINFO[0m    | Flow run[35m 'positive-sponge'[0m - Finished in state [32mCompleted[0m()
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
07:12:06.772 | [36mINFO[0m    | Flow run[35m 'enigmatic-tortoise'[0m - Finished in state [32mCompleted[0m()
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The objective has been evaluated at this point before.
The o

In [47]:
relativePerplexities

Unnamed: 0_level_0,all features,"('6', '17675015', 'NUP153')",relative
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ALS__CGND-HDA-04091__NEUHF998PCY,1.324706,4027.917073,0.000329
aals-ALS__CGND-HDA-04089__NEUEU419NMF,1.146718,159.976571,0.007168
aals-ALS__CGND-HDA-04083__NEURN392PGA,4569.180185,42.984855,106.297443
aals-ALS__CGND-HDA-04079__NEUMT573TE9,212899.177204,569.180350,374.045199
aals-ALS__CGND-HDA-04073__NEUBZ512CWM,1.593855,571.154062,0.002791
...,...,...,...
NA19059,1.247831,1.614929,0.772685
NA19080,161.349464,1.614929,99.911170
NA19143,1.637351,1.614929,1.013884
NA20126,1.615075,1.614929,1.000091


In [26]:
currentResults.index

Index(['ALS__CGND-HDA-04091__NEUHF998PCY',
       'aals-ALS__CGND-HDA-04086__NEUDH813DE6',
       'aals-ALS__CGND-HDA-04083__NEURN392PGA',
       'aals-ALS__CGND-HDA-04082__NEUTB997GDW',
       'aals-ALS__CGND-HDA-04081__NEUAD952KAZ',
       'aals-ALS__CGND-HDA-04079__NEUMT573TE9',
       'aals-ALS__CGND-HDA-04078__NEUTE443RWG',
       'aals-ALS__CGND-HDA-04069__NEUEA668FYK',
       'aals-ALS__CGND-HDA-04068__NEUAX021NPV',
       'aals-ALS__CGND-HDA-04067__NEUJA207UUV',
       ...
       'NA20877', 'NA20906', 'NA19376', 'NA18950', 'NA18998', 'NA19908',
       'NA20126', 'NA20859', 'ALS__CGND-HDA-00651__MH-WASHU-29',
       'ALS__CGND-HDA-00314__162ALS'],
      dtype='object', name='id', length=4606)

In [27]:
baselineFeatureResults.index

Index(['aals-ALS__CGND-HDA-04082__NEUTB997GDW',
       'aals-ALS__CGND-HDA-04081__NEUAD952KAZ',
       'aals-ALS__CGND-HDA-04079__NEUMT573TE9',
       'aals-ALS__CGND-HDA-04075__NEUZA643DHA',
       'aals-ALS__CGND-HDA-04069__NEUEA668FYK',
       'aals-ALS__CGND-HDA-04064__NEUKW840TXJ',
       'ALS__CGND-HDA-04049__NEURN540KF7', 'ALS__CGND-HDA-04048__NEUVR250XF0',
       'ALS__CGND-HDA-04045__NEUWE409BEK', 'ALS__CGND-HDA-04044__NEUEM180ZTU',
       ...
       'HG03789', 'NA18614', 'HG04225', 'NA19446', 'NA19129', 'NA19131',
       'NA19782', 'NA19908', 'ALS__CGND-HDA-00925__MH-WASHU-303',
       'ALS__CGND-HDA-00795__MH-WASHU-173'],
      dtype='object', name='id', length=4615)

In [4]:
len(holdoutControlIDs)

2012

## Evaluate model stack

In [3]:
import neptune
from sklearn.model_selection import StratifiedKFold
from mlStackEntrypoint import classify
from config import config

outerCvIterator = StratifiedKFold(
    n_splits=config["sampling"]["crossValIterations"], shuffle=False
)
innerCvIterator = outerCvIterator
if config["tracking"]["remote"]:
    projectTracker = neptune.init_project(
        project=f'{config["tracking"]["entity"]}/{config["tracking"]["project"]}',
        api_token=config["tracking"]["token"],
    )

results = []
for model, hyperParameterSpace in list(config["model"]["stack"].items()):
    results.append(
        await classify(
            caseGenotypes,
            controlGenotypes,
            holdoutCaseGenotypes,
            holdoutControlGenotypes,
            clinicalData,
            model,
            hyperParameterSpace,
            innerCvIterator,
            outerCvIterator,
        ),
    )

NameError: name 'config' is not defined

In [6]:
from sklearn import datasets
X, y  = datasets.make_classification(n_samples=400)

def train_data(model, X=X, y=y):
    clf = model
    clf.fit(X, y)
    
    
from sklearn.linear_model import LogisticRegression 
from cuml.linear_model import LogisticRegression as LogisticRegression_gpu

sklearn_time_svc = %timeit -o train_data(LogisticRegression(penalty="l2", solver="saga"))

from cuml.common.device_selection import using_device_type
with using_device_type('gpu'):
    cuml_time_svc = %timeit -o train_data(LogisticRegression_gpu(penalty="l2"))

print(f"""Average time of sklearn's {LogisticRegression.__name__}""", sklearn_time_svc.average, 's')
print(f"""Average time of cuml's {LogisticRegression_gpu.__name__}""", cuml_time_svc.average, 's')

print('Ratio between sklearn and cuml is', sklearn_time_svc.average/cuml_time_svc.average)

2.29 ms ± 131 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
7.53 ms ± 1.16 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Average time of sklearn's LogisticRegression 0.002288513623003382 s
Average time of cuml's LogisticRegression 0.007530984568542668 s
Ratio between sklearn and cuml is 0.30387973872136553
