In [1]:
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.1       ✔ purrr   0.3.2  
✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
✔ tidyr   0.8.3       ✔ stringr 1.4.0  
✔ readr   1.3.1       ✔ forcats 0.4.0  
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [2]:
# Load the per-node-pair feature tables for the three data sources.
# Each file is an xz-compressed TSV; read_tsv() guesses the column types and
# prints the guessed specification.
ppi_df <- read_tsv(file = '../../data/4.data/ppi.tsv.xz')
tftg_df <- read_tsv(file = '../../data/4.data/tftg.tsv.xz')
biorxiv_df <- read_tsv(file = '../../data/4.data/biorxiv.tsv.xz')

Parsed with column specification:
cols(
  id_a = col_double(),
  id_b = col_double(),
  network = col_character(),
  edge = col_double(),
  edge_prior = col_double(),
  rwr = col_double(),
  mean_rwr = col_double(),
  p_rwr = col_double(),
  jaccard = col_double(),
  mean_jaccard = col_double(),
  p_jaccard = col_double()
)
Parsed with column specification:
cols(
  id_a = col_double(),
  id_b = col_double(),
  network = col_character(),
  edge = col_double(),
  edge_prior = col_double(),
  inf = col_double(),
  mean_inf = col_double(),
  p_inf = col_double()
)
Parsed with column specification:
cols(
  id_a = col_double(),
  id_b = col_double(),
  network = col_character(),
  edge = col_double(),
  edge_prior = col_double(),
  rwr = col_double(),
  mean_rwr = col_double(),
  p_rwr = col_double(),
  jaccard = col_double(),
  mean_jaccard = col_double(),
  p_jaccard = col_double()
)


In [3]:
# Preview the first two rows of the TFTG table.
head(tftg_df, n = 2)

id_a,id_b,network,edge,edge_prior,inf,mean_inf,p_inf
0,0,train,0,0,0,0,1
0,1,train,0,0,0,0,1


In [4]:
# Preview the first two rows of the BioRxiv table.
head(biorxiv_df, n = 2)

id_a,id_b,network,edge,edge_prior,rwr,mean_rwr,p_rwr,jaccard,mean_jaccard,p_jaccard
0,2,train,0,0,0,7.536416e-05,1,0,0.001947368,1
0,3,train,0,0,0,6.602172e-05,1,0,0.0016,1


In [5]:
# Preview the first two rows of the PPI table.
head(ppi_df, n = 2)

id_a,id_b,network,edge,edge_prior,rwr,mean_rwr,p_rwr,jaccard,mean_jaccard,p_jaccard
0,0,train,0,0,0.2516038,0.2512483655,0.038,1,1.0,1
0,1,train,0,0,1.582702e-05,1.45586e-05,0.088,0,0.001491855,1


In [6]:
# Task 1: Dropped -> biased ("Reconstruction")
#
# For each data source: take the node pairs of the 'train' network, attach the
# `edge` value the same pair has in the 'test_recon' network (as `edge_test`),
# keep only pairs without an edge in 'train', and reshape the feature columns
# into long format (one row per pair x feature).

# sample_frac() draws rows at random. Fix the seed so that a fresh
# "Restart & Run All" reproduces exactly the same table (and output file).
set.seed(0)

recon_plot_df <- list(
    # sample_fraction = NULL means "keep every row" (TFTG is not subsampled).
    list(source_df = tftg_df, data_source = 'TFTG', sample_fraction = NULL),
    list(source_df = ppi_df, data_source = 'PPI', sample_fraction = 0.5),
    list(source_df = biorxiv_df, data_source = 'BioRxiv', sample_fraction = 0.5)
) %>%
    map_dfr(function(spec) {
        base_df <- spec$source_df %>%
            filter(network == 'train')

        # Subsample only the sources that ask for it, so that sources without
        # a fraction keep their original row order and consume no random numbers.
        if (!is.null(spec$sample_fraction)) {
            base_df <- base_df %>% sample_frac(spec$sample_fraction)
        }

        base_df %>%
            left_join(
                spec$source_df %>%
                    filter(network == 'test_recon') %>%
                    select(id_a, id_b, edge),
                by = c("id_a", "id_b"), suffix = c("_train", "_test")
            ) %>%
            filter(edge_train == 0) %>%
            select(-id_a, -id_b, -network, -edge_train) %>%
            # Only feature columns and edge_test are left at this point, so
            # "everything except edge_test" selects the same columns as the
            # per-source ranges edge_prior:p_inf / edge_prior:p_jaccard.
            gather('feature', 'value', -edge_test) %>%
            mutate(
                # edge_prior and mean_* columns -> 'no edge info';
                # p_* columns -> 'corrected feature'; anything else -> 'feature'.
                feature_information = ifelse(grepl('edge_prior|mean_', feature), 'no edge info',
                                  ifelse(grepl('p_', feature), 'corrected feature', 'feature')) %>%
                    factor(levels=c('feature', 'no edge info', 'corrected feature')),
                feature_group = str_extract(feature, 'prior|rwr|jaccard|inf'),
                data_source = spec$data_source
            )
    })

In [7]:
# Preview the first two rows of the long-format reconstruction table.
head(recon_plot_df, n = 2)

edge_test,feature,value,feature_information,feature_group,data_source
0,edge_prior,0,no edge info,prior,TFTG
0,edge_prior,0,no edge info,prior,TFTG


In [8]:
# Task 2: Biased -> systematic
#
# For each data source: take the node pairs of the 'test_recon' network, attach
# the `edge` value the same pair has in the 'test_new' network (as `edge_test`),
# keep only pairs without an edge in 'test_recon', and reshape the feature
# columns into long format (one row per pair x feature).

# sample_frac() draws rows at random. Fix the seed so that a fresh
# "Restart & Run All" reproduces exactly the same table (and output file).
set.seed(0)

new_plot_df <- list(
    # sample_fraction = NULL means "keep every row" (TFTG is not subsampled).
    list(source_df = tftg_df, data_source = 'TFTG', sample_fraction = NULL),
    list(source_df = ppi_df, data_source = 'PPI', sample_fraction = 0.5),
    list(source_df = biorxiv_df, data_source = 'BioRxiv', sample_fraction = 0.5)
) %>%
    map_dfr(function(spec) {
        base_df <- spec$source_df %>%
            filter(network == 'test_recon')

        # Subsample only the sources that ask for it, so that sources without
        # a fraction keep their original row order and consume no random numbers.
        if (!is.null(spec$sample_fraction)) {
            base_df <- base_df %>% sample_frac(spec$sample_fraction)
        }

        base_df %>%
            left_join(
                spec$source_df %>%
                    filter(network == 'test_new') %>%
                    select(id_a, id_b, edge),
                # The "_train" suffix here labels the 'test_recon' side of the
                # join; the suffixes are kept so the resulting column is still
                # called edge_test.
                by = c("id_a", "id_b"), suffix = c("_train", "_test")
            ) %>%
            filter(edge_train == 0) %>%
            select(-id_a, -id_b, -network, -edge_train) %>%
            # Only feature columns and edge_test are left at this point, so
            # "everything except edge_test" selects the same columns as the
            # per-source ranges edge_prior:p_inf / edge_prior:p_jaccard.
            gather('feature', 'value', -edge_test) %>%
            mutate(
                # edge_prior and mean_* columns -> 'no edge info';
                # p_* columns -> 'corrected feature'; anything else -> 'feature'.
                feature_information = ifelse(grepl('edge_prior|mean_', feature), 'no edge info',
                                  ifelse(grepl('p_', feature), 'corrected feature', 'feature')) %>%
                    factor(levels=c('feature', 'no edge info', 'corrected feature')),
                feature_group = str_extract(feature, 'prior|rwr|jaccard|inf'),
                data_source = spec$data_source
            )
    })

In [9]:
# The raw per-source tables are no longer needed; drop them from the workspace.
rm(ppi_df, biorxiv_df, tftg_df)

# Stack the two task tables, tagging every row with the task it belongs to.
plot_df <- bind_rows(
    mutate(recon_plot_df, task = 'reconstruction'),
    mutate(new_plot_df, task = 'systematic')
)

# Drop the per-task tables as well. Because the inputs are removed, this cell
# cannot be re-run without first re-running the cells that create them.
rm(recon_plot_df, new_plot_df)

head(plot_df, n = 2)

edge_test,feature,value,feature_information,feature_group,data_source,task
0,edge_prior,0,no edge info,prior,TFTG,reconstruction
0,edge_prior,0,no edge info,prior,TFTG,reconstruction


In [10]:
# Save the combined long-format table as a TSV in the working directory.
write_tsv(plot_df, 'plot_info.tsv')