## LearnR

Note that metapath names containing `<` are [currently truncated](https://github.com/IRkernel/IRkernel/issues/286) in the notebook's rendered output unless they are explicitly HTML-escaped.

In [1]:
# Attach dplyr (provides %>% and the unqualified verbs used in later cells).
# The argument is spelled out in full: `warn` only worked through partial
# matching of `warn.conflicts`, and `F` is an ordinary variable that can be
# reassigned, unlike FALSE.
library(dplyr, warn.conflicts = FALSE)

## Read datasets

In [2]:
# Load the features table from its bzip2-compressed TSV file.
dwpc_mat_df <- readr::read_tsv('data/matrix/hetio-ind/features.tsv.bz2')

In [3]:
# Load the per-feature AUROC table.
auroc_df <- readr::read_tsv('data/auroc.tsv')

# Keep the names of features that either have an FDR-adjusted AUROC p-value
# of at most 0.05 or are of type 'degree'.
perm_affected <- dplyr::filter(
  auroc_df,
  fdr_pval_auroc <= 0.05 | feature_type == 'degree'
)[['feature']]

# Preview the first two rows.
head(auroc_df, 2)

Unnamed: 0,feature_type,feature,nonzero,auroc,auroc_permuted,delta_auroc,pval_auroc,fdr_pval_auroc
1,degree,CbG,0.92238,0.67508,0.67481,0.00027521,0.92609,0.94922
2,DWPC,CbG…,0.62464,0.78515,0.6887,0.096447,0.0017381,0.015367


In [4]:
# Number of feature names selected into perm_affected.
length(perm_affected)

## Weighting

Weights are currently not working due to an error thrown by `glmnet::cv.glmnet` (presumably [this line](https://github.com/cran/glmnet/blob/b8b39029eae71958e9c7c382240b7696fde3eff1/R/cv.lognet.R#L53)):

```
Error in predmat[which, seq(nlami)] = preds: replacement has length zero
```

Thus, the logistic regression models below are fit without observation weights.

In [5]:
# Row counts of the compound and disease summary tables.
n_compounds <- nrow(readr::read_tsv('../summary/compounds.tsv'))
n_diseases <- nrow(readr::read_tsv('../summary/diseases.tsv'))

# Every compound paired with every disease.
n_pairs <- n_diseases * n_compounds

# Rows of the features table with status 1 and status 0, respectively.
n_positives <- sum(dwpc_mat_df$status == 1)
n_negatives <- sum(dwpc_mat_df$status == 0)

c(n_pairs, n_positives, n_negatives)

In [6]:
# Positives keep unit weight. Each negative is weighted by the ratio of
# non-positive pairs overall to the negatives present in the features table.
positive_weight <- 1
negative_weight <- (n_pairs - n_positives) / n_negatives

# Lookup from status (as a character key) to its weight.
weight_map <- setNames(
  list(negative_weight, positive_weight),
  c('0', '1')
)
weight_map

## Dataset preparation

In [7]:
# Preview the first two rows of the features table.
head(dwpc_mat_df, 2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,status,CbG,CbG…,CbG…,CbG…,CbG…,⋯,CuGuDuGdD,CuGuDuGuD,DaG,DdG,DlA,DpC,DpS,DrD,DtC,DuG
1,DB01048,Abacavir,DOID:635,acquired immunodeficiency syndrome,1,3,4.95115e-05,0.000673428,0.000678888,0,⋯,0,0,40,250,25,0,24,2,14,250
2,DB01048,Abacavir,DOID:1459,hypothyroidism,0,3,9.89203e-05,0.0,0.0,0,⋯,0,0,44,0,24,0,34,3,4,0


In [8]:
# Every feature listed in the AUROC table.
features <- auroc_df$feature

# Predictor matrix over all features; also kept as X for the size report.
X <- as.matrix(dplyr::select(dwpc_mat_df, one_of(features)))

# One predictor matrix per feature set, keyed by the feature set's name.
X_list <- list(
  all_features = X,
  perm_affected = as.matrix(
    dplyr::select(dwpc_mat_df, one_of(perm_affected))
  )
)

# Response vector and the per-row weights looked up from weight_map.
y <- dwpc_mat_df$status
w <- as.numeric(weight_map[as.character(y)])

sprintf("%s compound–disease pairs × %s features", nrow(X), ncol(X))

## Parameter Sweep

In [9]:
# Number of seeds (one fit per seed) to run for each feature set.
n_runs = 50

In [10]:
# Fit one model per (feature set, seed) combination and keep, for each fit,
# the fitted object, its nonzero standardized coefficients, and the counts of
# positive and negative coefficients.
#
# The result list is preallocated instead of being grown on every iteration,
# and seq_len() is used so that n_runs == 0 runs no fits (1:n_runs would
# iterate over c(1, 0) in that case).
results <- vector('list', length(X_list) * n_runs)
i <- 1
for (feature_set in names(X_list)) {
  for (seed in seq_len(n_runs)) {
    elem <- list(seed = seed, feature_set = feature_set)
    # No weights are passed; see the "Weighting" section for the reason.
    elem$fit <- hetior::glmnet_train(
      X = X_list[[feature_set]],
      y = y,
      alpha = 1,
      cores = 12,
      seed = seed
    )
    # Keep only features with a nonzero zcoef and tag them with this run.
    elem$coef_df <- elem$fit$coef_df %>%
      dplyr::filter(zcoef != 0) %>%
      dplyr::mutate(seed = seed, feature_set = feature_set)
    elem$pos_coefs <- sum(elem$coef_df$zcoef > 0)
    elem$neg_coefs <- sum(elem$coef_df$zcoef < 0)
    results[[i]] <- elem
    i <- i + 1
  }
}
length(results)

Loading required package: Matrix
Loading required package: foreach
Loaded glmnet 2.0-5



In [11]:
# Reduce every sweep result to a one-row summary (seed, feature set, AUROC,
# and counts of positive/negative coefficients), then stack the rows.
sweep_summary_df <- do.call(
  rbind,
  lapply(results, function(res) {
    fit_auroc <- res$fit$vtm$auroc
    dplyr::data_frame(
      seed = res$seed,
      feature_set = res$feature_set,
      auroc = fit_auroc,
      pos_coefs = res$pos_coefs,
      neg_coefs = res$neg_coefs
    )
  })
)
sweep_summary_df

Unnamed: 0,seed,feature_set,auroc,pos_coefs,neg_coefs
1,1,all_features,0.989614929169766,35,19
2,2,all_features,0.988926362878815,32,17
3,3,all_features,0.990268847857543,36,24
4,4,all_features,0.988254023946315,28,13
5,5,all_features,0.988926362878815,32,17
6,6,all_features,0.988926362878815,32,17
7,7,all_features,0.990268847857543,36,24
8,8,all_features,0.989614929169766,35,19
9,9,all_features,0.990883294592336,36,27
10,10,all_features,0.989614929169766,35,19


In [12]:
# Mean and standard deviation of AUROC across seeds, per feature set.
dplyr::summarize(
  dplyr::group_by(sweep_summary_df, feature_set),
  mean = mean(auroc),
  sd = sd(auroc)
)

Unnamed: 0,feature_set,mean,sd
1,all_features,0.9896413,0.001066166
2,perm_affected,0.9881841,0.0007751641


In [13]:
# AUROC values of each feature set, compared with a two-sample t-test.
# The variable names are kept because t.test() echoes them in its output.
p_all_features <- dplyr::filter(
  sweep_summary_df,
  feature_set == 'all_features'
)$auroc
p_perm_affected <- dplyr::filter(
  sweep_summary_df,
  feature_set == 'perm_affected'
)$auroc
t.test(p_all_features, p_perm_affected)


	Welch Two Sample t-test

data:  p_all_features and p_perm_affected
t = 7.8172, df = 89.49, p-value = 9.992e-12
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.001086879 0.001827642
sample estimates:
mean of x mean of y 
0.9896413 0.9881841 


In [14]:
# Stack the nonzero-coefficient tables of all sweep results into one table.
sweep_coef_df <- do.call(rbind, lapply(results, `[[`, 'coef_df'))
head(sweep_coef_df, 2)

Unnamed: 0,feature,coef,zcoef,seed,feature_set
1,CbGaD,23.34766,0.7210349,1,all_features
2,CbGaDdGuD,48.43551,0.2136418,1,all_features


In [15]:
# For every feature, count the fits in which it received a nonzero
# coefficient, with one count column per feature set plus their total, and
# attach the feature's row from the AUROC table.
sweep_feature_df <- sweep_coef_df %>%
  dplyr::group_by(feature, feature_set) %>%
  dplyr::summarize(count = n()) %>%
  dplyr::ungroup() %>%
  # One column per feature set; a feature never selected in a set gets 0.
  tidyr::spread(feature_set, count, fill = 0) %>%
  dplyr::mutate(total = all_features + perm_affected) %>%
  dplyr::arrange(desc(all_features)) %>%
  # Join on an explicit key rather than on whatever columns the two tables
  # happen to share, which also avoids the "Joining by" message.
  dplyr::left_join(auroc_df, by = 'feature')
head(sweep_feature_df, 2)

Joining by: "feature"


Unnamed: 0,feature,all_features,perm_affected,total,feature_type,nonzero,auroc,auroc_permuted,delta_auroc,pval_auroc,fdr_pval_auroc
1,CbGaD,50,50,100,DWPC,0.23311,0.75173,0.64228,0.10944,8.416e-06,0.001514
2,CbGaDdGuD,50,0,50,DWPC,0.27709,0.65788,0.6515,0.0063834,0.31739,0.42561


In [16]:
# Number of unique features that received a nonzero coefficient in at least
# one fit (sweep_feature_df has one row per feature).
nrow(sweep_feature_df)

In [17]:
# Save the per-feature, per-coefficient, and per-model tables as TSV files.
readr::write_tsv(sweep_feature_df, 'selection/sweep-features.tsv')
readr::write_tsv(sweep_coef_df, 'selection/sweep-coefficients.tsv')
readr::write_tsv(sweep_summary_df, 'selection/sweep-model-summaries.tsv')