# Fit the final model and make predictions

In [1]:
# Attach dplyr, which supplies the unqualified %>%, one_of() and desc() used
# in this notebook. The argument is spelled out in full (no partial matching)
# and uses FALSE rather than the reassignable F.
library(dplyr, warn.conflicts = FALSE)

In [2]:
# Load the table of selected features and split the feature names by type.
selected_df <- readr::read_tsv('features/selected-features.tsv')
dwpc_rows <- dplyr::filter(selected_df, feature_type == 'dwpc')
degree_rows <- dplyr::filter(selected_df, feature_type == 'degree')
metapaths <- dwpc_rows$feature
degrees <- degree_rows$feature

# Column specification used when reading the feature file: every 'dwpc'
# feature column gets a readr::col_number() collector, keyed by column name.
col_types <- list()
col_types[metapaths] <- rep(list(readr::col_number()), length(metapaths))

In [3]:
# Read the compressed feature table using the column specification built
# above, then add prior_logit = logit(prior_prob) as a new column.
feature_df <- dplyr::mutate(
  readr::read_tsv('features/features.tsv.bz2', col_types = col_types),
  prior_logit = boot::logit(prior_prob)
)

In [4]:
# Preview the first two rows of the feature table
head(feature_df, n = 2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status,prior_prob,CbG,CcSE,CdG,ellip.h,CtDrDuGaD,CtDtCbGaD,CtDtCtD,CtDtCuGaD,CtDuGcGuD,CuGaDpCtD,CuGiGuCpD,CuGuD,CuGuDpCtD,prior_logit
1,DB01048,Abacavir,DOID:10652,Alzheimer's disease,,0,0.004753,3,136,0,⋯,0,0.002638,0,0.0007788,0.00739,0,0,0,0,-5.344215
2,DB05812,Abiraterone,DOID:10652,Alzheimer's disease,,0,0.004753,8,71,0,⋯,0,0.005065,0,0.009089,0.001119,0,0,0,0,-5.344215


In [5]:
# Apply `fn` to each named column of `df` in turn and return the updated table.
transform_columns <- function(df, columns, fn) {
  for (column in columns) {
    df[[column]] <- fn(df[[column]])
  }
  df
}

# Degree features: inverse hyperbolic sine transform.
feature_df <- transform_columns(feature_df, degrees, asinh)

# DWPC features: divide by the column mean, then inverse hyperbolic sine.
feature_df <- transform_columns(
  feature_df,
  metapaths,
  function(values) asinh(values / mean(values))
)

In [6]:
# Training set: only the rows whose prior probability is strictly positive.
train_df <- dplyr::filter(feature_df, prior_prob > 0)
# Disabled alternative filter, kept for reference:
#   dplyr::filter(!(category %in% 'SYM'))
dim(train_df)

In [7]:
# Predictor matrix: prior_logit first, followed by every selected feature.
X_train <- as.matrix(
  dplyr::select(train_df, prior_logit, one_of(selected_df$feature))
)
dim(X_train)

# Response vector and its class counts.
y_train <- train_df$status
table(y_train)

y_train
    0     1 
29044   755 

In [None]:
# Per-column penalty factor: 0 for prior_logit, 1 for every other column.
# NOTE(review): presumably forwarded to glmnet, where a factor of 0 leaves the
# coefficient unpenalized -- confirm in hetior::glmnet_train.
is_prior <- colnames(X_train) == 'prior_logit'
penalty_factor <- as.numeric(!is_prior)

# Fit the model on the training data (alpha = 0, 200 lambda values).
fit <- hetior::glmnet_train(
  X = X_train,
  y = y_train,
  alpha = 0,
  s = 'lambda.1se',
  cores = 12,
  seed = 0,
  penalty.factor = penalty_factor,
  lambda.min.ratio = 1e-6,
  nlambda = 200
)

In [None]:
# Two stacked panels: outer margins hold the shared axis labels, while the
# per-panel margins are kept small.
outer_margins <- c(5, 4, 0, 0) + 0.1
panel_margins <- c(0, 0, 1, 1) + 0.1
par(mfrow = c(2, 1), oma = outer_margins, mar = panel_margins)

cv_model <- fit$cv_model
# Top panel: the underlying glmnet fit plotted against lambda, curves labelled.
plot(cv_model$glmnet.fit, xvar = "lambda", label = TRUE)
# Bottom panel: plot of the cross-validated model object itself.
plot(cv_model)
# The lambda.1se value stored on the cross-validated model.
cv_model$lambda.1se

In [None]:
# Pair each training row's identifiers with the fitted model's prediction for
# that row. mutate() replaces the deprecated dplyr::data_frame() + bind_cols()
# combination and matches how predictions are attached to predict_df later on.
# Assumes fit$y_pred is a plain vector with one value per row of train_df, in
# the same row order -- TODO confirm against hetior::glmnet_train.
train_pred_df <- train_df %>%
  dplyr::select(compound_id, disease_id) %>%
  dplyr::mutate(training_prediction = fit$y_pred)

In [None]:
# Save the fitted model's coefficient table as TSV
readr::write_tsv(fit$coef_df, 'model/coefficient.tsv')

In [None]:
# Performance metrics stored on the fit for the training data
reported_metrics <- c('auroc', 'auprc', 'tjur')
fit$vtm[reported_metrics]

In [None]:
# Performance of the prior alone on the training rows.
# The third argument is passed positionally, as in the original call (its
# parameter name is not visible in this file); it is now the literal TRUE
# rather than the reassignable shorthand T.
vtm <- hetior::calc_vtms(
  y_true = train_df$status,
  y_pred = train_df$prior_prob,
  TRUE
)
vtm[c('auroc', 'auprc', 'tjur')]

In [None]:
# Share of positive observations over all rows, formatted as a percentage.
# y_true is reused later when building the prediction matrix.
y_true <- feature_df[['status']]
prevalence <- mean(y_true)
scales::percent(prevalence)

In [None]:
# Performance of the prior alone, evaluated on every row of feature_df
vtm <- hetior::calc_vtms(
  y_true = feature_df[['status']],
  y_pred = feature_df[['prior_prob']]
)
vtm[c('auroc', 'auprc')]

In [None]:
# Total of the prior probabilities across all rows
sum(feature_df[['prior_prob']])

In [None]:
# Predictor matrix for every row. prior_logit is overwritten with a single
# constant, the logit of the overall positive rate (y_true is defined in an
# earlier cell), and the columns are taken in the same order as in X_train.
baseline_logit <- boot::logit(mean(y_true))
X_all <- as.matrix(
  dplyr::select(
    dplyr::mutate(feature_df, prior_logit = baseline_logit),
    one_of(colnames(X_train))
  )
)

# Predictions from the cross-validated model for all rows.
y_pred <- hetior::glmnet_predict(fit$cv_model, X = X_all, s = 'lambda.1se')

In [None]:
# Assemble the prediction table: identifier/metadata columns, the observed
# status, the prediction for every row, and (where available) the prediction
# made during training. Rows absent from train_pred_df get NA for
# training_prediction because this is a left join.
# The join keys are stated explicitly so the join cannot silently change if
# the two tables ever come to share another column name.
predict_df <- feature_df %>%
  dplyr::select(compound_id:category, status) %>%
  dplyr::mutate(prediction = y_pred) %>%
  dplyr::left_join(train_pred_df, by = c('compound_id', 'disease_id'))

In [None]:
# Performance of the model predictions over all rows.
# The third argument is passed positionally, as in the original call (its
# parameter name is not visible in this file); it is now the literal TRUE
# rather than the reassignable shorthand T.
vtm <- hetior::calc_vtms(
  y_true = predict_df$status,
  y_pred = predict_df$prediction,
  TRUE
)
vtm[c('auroc', 'auprc', 'tjur')]

In [None]:
# Show the five rows with the highest prediction
top_predictions <- dplyr::arrange(predict_df, desc(prediction))
head(top_predictions, 5)

In [None]:
# Write the full prediction table as TSV; missing values become empty fields
readr::write_tsv(predict_df, 'predictions/probabilities.tsv', na = '')