## LearnR

Note that metapath names containing `<` are [currently truncated](https://github.com/IRkernel/IRkernel/issues/286) in the notebook unless they are explicitly HTML-escaped.

In [None]:
# Attach dplyr (provides %>% and the unqualified verbs used below) without
# printing the masked-object notices. The argument is spelled out in full
# rather than relying on partial matching of `warn`, and FALSE is used
# instead of the reassignable shorthand `F`.
library(dplyr, warn.conflicts = FALSE)

In [None]:
# Fix the state of R's random number generator for this session.
set.seed(0)

## Read datasets

In [None]:
# Read the DWPC-spread table (a bzip2-compressed TSV) into dwpc_mat_df.
dwpc_mat_df <- readr::read_tsv(
  file = 'data/matrix/hetio-ind/DWPC-spread.tsv.bz2'
)

In [None]:
# Read data/auroc.tsv (its `feature` column is used later to pick the model
# features) and preview the first two rows.
auroc_df <- readr::read_tsv(file = 'data/auroc.tsv')
# Optional significance filter, currently disabled:
#auroc_df = auroc_df %>% dplyr::filter(fdr_pval_auroc <= 0.05)
auroc_df %>% head(n = 2)

## Weighting

In [None]:
# Size of the full compound x disease grid, from the row counts of the two
# summary tables.
n_compounds <- nrow(readr::read_tsv('../summary/compounds.tsv'))
n_diseases <- nrow(readr::read_tsv('../summary/diseases.tsv'))
n_pairs <- n_compounds * n_diseases

# Rows of the DWPC matrix labelled as positive (status 1) and negative
# (status 0).
n_positives <- sum(dwpc_mat_df$status == 1)
n_negatives <- sum(dwpc_mat_df$status == 0)

# Display the three counts together.
c(n_pairs, n_positives, n_negatives)

In [None]:
# Per-class observation weights, keyed by the status value as a string.
# Positives keep weight 1; each negative row is weighted by the number of
# non-positive pairs in the full grid divided by the number of negative rows.
positive_weight <- 1
negative_weight <- (n_pairs - n_positives) / n_negatives
weight_map <- setNames(
  list(negative_weight, positive_weight),
  c('0', '1')
)
#weight_map = list('0' = 5, '1' = positive_weight)
weight_map

In [None]:
# Preview the first two rows of the DWPC matrix.
dwpc_mat_df %>% head(n = 2)

In [None]:
# Feature names come from the auroc table; X keeps only those columns of the
# DWPC matrix, converted to a plain matrix.
features <- auroc_df$feature
X <- as.matrix(dplyr::select(dwpc_mat_df, one_of(features)))

# Response vector and the matching per-row weights looked up in weight_map.
y <- dwpc_mat_df$status
w <- as.numeric(weight_map[as.character(y)])

# Report the dimensions of the design matrix.
sprintf(
  "%s compound–disease pairs × %s features",
  nrow(X),
  ncol(X)
)

## Train model

Weights are currently not working due to an error thrown by `glmnet::cv.glmnet` (presumably [this line](https://github.com/cran/glmnet/blob/b8b39029eae71958e9c7c382240b7696fde3eff1/R/cv.lognet.R#L53)):

```
Error in predmat[which, seq(nlami)] = preds: replacement has length zero
```

Thus, the logistic regression model is fit without weights.

## Parameter Sweep

In [None]:
# param_df = dplyr::data_frame(seed = rep(1:10, each=2), measure = rep(c('deviance', 'auc'), 10))
# param_df %>%
#   dplyr::do()

In [None]:
# Fit one model per (measure, seed) combination: two type.measure values
# times ten seeds, giving twenty entries in `results`. Each entry stores the
# fit, its non-zero standardized coefficients tagged with the seed and
# measure, and the counts of positive and negative coefficients.
# NOTE(review): type.measure/alpha are presumably forwarded to
# glmnet::cv.glmnet by hetior::glmnet_train; confirm against hetior.
results <- list()
i <- 1
for (measure in c('deviance', 'auc')) {
  for (seed in 1:10) {
    sweep_fit <- hetior::glmnet_train(
      X = X,
      y = y,
      alpha = 1,
      cores = 12,
      type.measure = measure,
      seed = seed
    )
    # Keep only features with a non-zero coefficient and label the rows.
    nonzero_df <- sweep_fit$coef_df %>%
      dplyr::filter(zcoef != 0) %>%
      dplyr::mutate(seed = seed, measure = measure)
    results[[i]] <- list(
      seed = seed,
      measure = measure,
      fit = sweep_fit,
      coef_df = nonzero_df,
      pos_coefs = sum(nonzero_df$zcoef > 0),
      neg_coefs = sum(nonzero_df$zcoef < 0)
    )
    i <- i + 1
  }
}
length(results)

In [None]:
# One summary row per sweep fit (seed, measure, auroc and coefficient sign
# counts), stacked into a single table.
sweep_summary_df <- results %>%
  lapply(function(res) {
    dplyr::data_frame(
      seed = res$seed,
      measure = res$measure,
      auroc = res$fit$vtm$auroc,
      pos_coefs = res$pos_coefs,
      neg_coefs = res$neg_coefs
    )
  }) %>%
  do.call(rbind, .)
sweep_summary_df

In [None]:
# Mean auroc of the sweep fits within each measure.
dplyr::summarize(
  dplyr::group_by(sweep_summary_df, measure),
  mean(auroc)
)

In [None]:
# Stack the non-zero coefficient tables from every sweep fit.
sweep_coef_df <- results %>%
  lapply(function(res) res$coef_df) %>%
  do.call(rbind, .)

# Count, per feature and measure, how many fits selected the feature.
feature_counts_df <- sweep_coef_df %>%
  dplyr::group_by(feature, measure) %>%
  dplyr::summarize(count = n()) %>%
  dplyr::ungroup()

# Spread to one column per measure (0 where a feature was never selected),
# add the combined count, sort by the deviance count, and attach the
# columns of auroc_df.
sweep_feature_df <- feature_counts_df %>%
  tidyr::spread(measure, count, fill = 0) %>%
  dplyr::mutate(total = auc + deviance) %>%
  dplyr::arrange(desc(deviance)) %>%
  dplyr::left_join(auroc_df)
sweep_feature_df %>% head(n = 2)

In [None]:
# Persist the three sweep tables as TSV files under selection/.
readr::write_tsv(sweep_feature_df, 'selection/sweep-features.tsv')
readr::write_tsv(sweep_coef_df, 'selection/sweep-coefficients.tsv')
readr::write_tsv(sweep_summary_df, 'selection/sweep-model-summaries.tsv')

In [None]:
# Number of unique features (row count of sweep_feature_df)
sweep_feature_df %>% nrow()

## Fit single model

In [None]:
# Train a single model on the full feature matrix, leaving type.measure and
# seed at hetior::glmnet_train's defaults.
fit <- hetior::glmnet_train(
  X = X,
  y = y,
  alpha = 1,
  cores = 12
)

In [None]:
# coef_df = fit$coef_df %>%
#   dplyr::filter(coef != 0) %>%
#   dplyr::left_join(auroc_df)

In [None]:
# table(coef_df$zcoef %>% sign())

In [None]:
# coef_df %>%
#   dplyr::mutate(feature = htmltools::htmlEscape(feature)) %>%
#   dplyr::arrange(zcoef)

In [None]:
# Show the `auroc` value stored in fit$vtm for the single model.
fit$vtm$auroc

In [None]:
# Show the `auprc` value stored in fit$vtm for the single model.
fit$vtm$auprc

In [None]:
# Pass the fit's labels and predictions to hetior::get_tjur.
# NOTE(review): presumably Tjur's coefficient of discrimination; confirm
# against the hetior package.
hetior::get_tjur(
  y_pred = fit$y_pred,
  y_true = fit$y
)

In [None]:
# Keep the first seven columns of the DWPC matrix and append the fitted
# model's predictions as a new column.
# NOTE(review): assumes fit$y_pred is in the same row order as dwpc_mat_df;
# confirm.
pred_df <- dwpc_mat_df[, 1:7]
pred_df$prediction <- fit$y_pred
pred_df %>% head()

In [None]:
# Histogram of the predictions, using bins of width 0.01.
ggplot2::ggplot(pred_df, ggplot2::aes(x = prediction)) +
  ggplot2::geom_histogram(binwidth = 0.01)

In [None]:
# Save the prediction table as a TSV file.
readr::write_tsv(pred_df, 'data/predictions.tsv')

In [None]:
# Last 50 rows of pred_df after sorting by ascending prediction.
pred_df %>%
  dplyr::arrange(prediction) %>%
  tail(n = 50)