In [1]:
import os
import torch
from torch.utils.data import TensorDataset
import numpy
from numpy import hstack
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

from omegaconf import DictConfig
import hydra
import mlflow

from model.models import EnsembleLASSO
from datasets.memory import load_from_pgen, load_phenotype, load_covariates
from datasets.lightning import prepare_trainer
from sklearn.metrics import r2_score

from logging.config import dictConfig
import yaml



  data = yaml.load(f.read()) or {}
  import pandas.util.testing as tm
  defaults = yaml.load(f)


In [None]:
# @hydra.main(config_path='../configs/local', config_name='default')
def local_experiment(cfg: DictConfig):
    """Fit an XGBoost regressor on genotype + covariate features and log R^2 to MLflow.

    For each of the train/val/test splits the genotype matrix returned by
    ``load_from_pgen`` is stacked column-wise with the matrix returned by
    ``load_covariates``. A ``StandardScaler`` is fitted on the train split and
    applied to all three. ``XGBRegressor(max_depth=2)`` is then fitted on the
    train split (with the validation split passed as ``eval_set``) and the
    train/val/test R^2 scores are logged to the 'local-models' MLflow
    experiment.

    Args:
        cfg: Experiment configuration. Reads
            ``cfg.data.{genotype,covariates,phenotype}.{train,val,test}``,
            ``cfg.data.gwas``, ``cfg.experiment.snp_count``, ``cfg.split_dir``,
            ``cfg.phenotype.name`` and ``cfg.node_index``.

    Returns:
        None. Results are reported only through MLflow metrics.
    """

    def stacked_features(genotype_path, covariates_path):
        # Genotype columns come first, covariate columns are appended after them.
        genotypes = load_from_pgen(genotype_path, cfg.data.gwas,
                                   snp_count=cfg.experiment.snp_count)
        covariates = load_covariates(covariates_path)
        return hstack((genotypes, covariates))

    # The scaler is fitted on the train split only and reused for val/test.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(
        stacked_features(cfg.data.genotype.train, cfg.data.covariates.train))
    X_val = scaler.transform(
        stacked_features(cfg.data.genotype.val, cfg.data.covariates.val))
    X_test = scaler.transform(
        stacked_features(cfg.data.genotype.test, cfg.data.covariates.test))

    y_train = load_phenotype(cfg.data.phenotype.train)
    y_val = load_phenotype(cfg.data.phenotype.val)
    y_test = load_phenotype(cfg.data.phenotype.test)

    # Earlier variants of this experiment (genotype-only or covariate-only
    # features, an EnsembleLASSO model, a plain LinearRegression) are not
    # reproduced here; only the XGBoost path is live.
    mlflow.set_experiment('local-models')
    run_tags = {
        'name': 'xgboost',
        'type': 'local',
        'split': cfg.split_dir.split('/')[-1],
        'phenotype': cfg.phenotype.name,
        'node_index': str(cfg.node_index),
        'snp_count': str(cfg.experiment.snp_count),
        'gwas_path': cfg.data.gwas,
    }
    with mlflow.start_run(tags=run_tags) as run:
        model = XGBRegressor(max_depth=2)

        print("Training")
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

        # Score every split before logging anything, so a failure while
        # scoring leaves no partial metrics behind.
        evaluation_splits = (
            ('train_r2', X_train, y_train),
            ('val_r2', X_val, y_val),
            ('test_r2', X_test, y_test),
        )
        scores = {
            metric_name: r2_score(target, model.predict(features))
            for metric_name, features, target in evaluation_splits
        }
        for metric_name, value in scores.items():
            mlflow.log_metric(metric_name, value)
    

if __name__ == '__main__':
    # Entry point used when hydra's decorator is disabled: read the default
    # config by hand and run the experiment once.
    # yaml.safe_load replaces yaml.load(f): calling yaml.load without an
    # explicit Loader is deprecated since PyYAML 5.1 and raises TypeError on
    # PyYAML >= 6. The safe loader is enough for a plain configuration file.
    with open('configs/local/default.yaml', 'r') as f:
        cfg = DictConfig(yaml.safe_load(f))
    local_experiment(cfg)

# Only covariates

In [2]:
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge, SGDRegressor, LinearRegression, Lasso, ElasticNet
from sklearn.kernel_ridge import KernelRidge

In [3]:
# Load the default experiment configuration for the interactive cells below.
# yaml.safe_load replaces yaml.load(f): calling yaml.load without an explicit
# Loader is deprecated since PyYAML 5.1 and raises TypeError on PyYAML >= 6.
with open('configs/local/default.yaml', 'r') as f:
    cfg = DictConfig(yaml.safe_load(f))

  


In [4]:
# Covariates-only baseline: the genotype matrix is deliberately left out of
# the features in this cell. To include it, stack the output of
# load_from_pgen(...) with the covariates the way local_experiment does.
s = StandardScaler()


def _last_two_covariates(path):
    """Return the last two columns of the covariate matrix loaded from ``path``."""
    # Only keep age/sex (per the original note, these are the last two
    # columns -- TODO confirm against load_covariates).
    return load_covariates(path)[:, -2:]


# The scaler is fitted on the train split only and reused for val/test.
X_train = s.fit_transform(_last_two_covariates(cfg.data.covariates.train))
X_val = s.transform(_last_two_covariates(cfg.data.covariates.val))
X_test = s.transform(_last_two_covariates(cfg.data.covariates.test))

# Targets for the three splits, loaded in train/val/test order.
y_train, y_val, y_test = (
    load_phenotype(phenotype_path)
    for phenotype_path in (
        cfg.data.phenotype.train,
        cfg.data.phenotype.val,
        cfg.data.phenotype.test,
    )
)

In [29]:
# Estimator under test. Alternatives previously tried in this cell, with the
# original author's notes: ElasticNet(), Lasso(), SVR() (works),
# BayesianRidge() (no), SGDRegressor() (works), KernelRidge() (no),
# XGBRegressor(max_depth=2). A commented-out variant of the fit call also
# passed eval_set=(X_val, y_val).
model = LinearRegression()

print("Training")
model.fit(X_train, y_train)

# Predictions on the train and test splits; the validation split is not
# scored in this cell.
y_preds_train = model.predict(X_train)
y_preds_test = model.predict(X_test)

print(f"train r2: {r2_score(y_train, y_preds_train)}")
print(f"test r2: {r2_score(y_test, y_preds_test)}")

Training
train r2: -102.1524850181854
test r2: -102.19441180843478
