# BioRxiv database from Rxivist

The full Rxivist database (doi:10.5281/zenodo.2566421) is available at https://zenodo.org/record/2566421.


I used the following query to export a copy of the Rxivist BioRxiv scrape.

```sqlite
SELECT 
    aa.article, 
    art.title,
    auth.id as author_id, 
    auth.name as author_name, 
    auth.institution, 
    art.doi, 
    art.collection, 
    art.posted
FROM prod.article_authors aa
JOIN prod.authors auth
	ON aa.author = auth.id
JOIN prod.articles art
	ON aa.article = art.id
```

In [1]:
import itertools

import networkx as nx
import numpy as np
import pandas as pd
import scipy.sparse
import tqdm
import xswap

import analysis

In [2]:
# Load the exported Rxivist query result from the xz-compressed CSV
rxivist_path = '../data/rxivist_result.csv.xz'
rxivist_df = pd.read_csv(rxivist_path, compression='xz')

# Preview the first two rows
rxivist_df.head(n=2)

Unnamed: 0,article,title,author_id,author_name,institution,doi,collection,posted
0,386,Using the Wax moth larva Galleria mellonella i...,255104,Rafael Hernandez,Stony Brook University,10.1101/327015,microbiology,2018-05-21
1,386,Using the Wax moth larva Galleria mellonella i...,255107,Nicola Coyle,"University of Bath, Bath, Somerset, United Kin...",10.1101/327015,microbiology,2018-05-21


In [3]:
# Article metadata that is not needed to build coauthor pairs is dropped once up front
article_author_df = rxivist_df.drop(columns=['title', 'doi', 'institution'])

# Self-join on the article keys: every author row is paired with every
# author row of the same article (columns get the _x / _y suffixes)
author_pairs_df = article_author_df.merge(
    article_author_df,
    how='outer',
    on=['article', 'posted', 'collection'],
)

# Keep each unordered pair once (smaller author id first); this also removes self-pairs
coauthor_df = author_pairs_df.query('author_id_x < author_id_y')

coauthor_df.head(2)

Unnamed: 0,article,author_id_x,author_name_x,collection,posted,author_id_y,author_name_y
1,386,255104,Rafael Hernandez,microbiology,2018-05-21,255107,Nicola Coyle
2,386,255104,Rafael Hernandez,microbiology,2018-05-21,255105,Elze Hesse


## Example

Article 2715 has four authors, so it should produce $\binom{4}{2} = 6$ coauthor relationships.

In [4]:
# First two author rows of the example article
rxivist_df.loc[rxivist_df['article'] == 2715].head(2)

Unnamed: 0,article,title,author_id,author_name,institution,doi,collection,posted
17338,2715,Cell Lineage and Communication Network Inferen...,258794,Shuxiong Wang,"University of California, Irvine",10.1101/168922,bioinformatics,2017-07-26
17339,2715,Cell Lineage and Communication Network Inferen...,227155,Qing Nie,"University of California, Irvine",10.1101/168922,bioinformatics,2017-07-26


In [5]:
# All coauthor pairs generated for the example article
coauthor_df.loc[coauthor_df['article'] == 2715]

Unnamed: 0,article,author_id_x,author_name_x,collection,posted,author_id_y,author_name_y
243332,2715,258794,Shuxiong Wang,bioinformatics,2017-07-26,387916,Matthew Karikomi
243334,2715,227155,Qing Nie,bioinformatics,2017-07-26,258794,Shuxiong Wang
243336,2715,227155,Qing Nie,bioinformatics,2017-07-26,387916,Matthew Karikomi
243342,2715,210887,Adam L MacLean,bioinformatics,2017-07-26,258794,Shuxiong Wang
243343,2715,210887,Adam L MacLean,bioinformatics,2017-07-26,227155,Qing Nie
243344,2715,210887,Adam L MacLean,bioinformatics,2017-07-26,387916,Matthew Karikomi


## Bioinformatics collaboration network

In [6]:
# Drop entire articles for the reconstruction task, rather than individual coauthor relationships
articles = sorted(set(rxivist_df['article']))

# Fixed seed so the 70% sample of training articles is reproducible
np.random.seed(0)
train_articles = set(np.random.choice(articles, size=int(0.7*len(articles)), replace=False))

# Feature computation and permutation can become expensive when there are many (~ >10_000) edges
# Therefore, we must pick a cutoff, 2017, and take only edges that exist <= 2017
train_cutoff = 2017

# Columns identifying an author pair, and the 0/1 membership flags computed per article row
pair_columns = ['author_id_x', 'author_id_y']
flag_columns = ['train', 'test_recon', 'test_new']

bioinformatics_df = (
    coauthor_df
    .query('collection == "bioinformatics"')
    .assign(
        # `posted` is a 'YYYY-MM-DD' string; the first four characters are the year
        year=lambda df: df['posted'].str[:4].astype(int),
        # Every observed pair is an edge of the full ("new") network
        test_new=1,
        # Edge of the reconstruction network: posted in or before the cutoff year
        test_recon=lambda df: (df['year'] <= train_cutoff).astype(int),
        # Training edge: in the reconstruction network AND from a sampled training article
        train=lambda df: (
            (df['test_recon'] == 1) & df['article'].isin(list(train_articles))
        ).astype(int),
    )
    # A pair can coauthor several articles. The pair belongs to a network if ANY of its
    # articles does, so take the per-pair maximum of each flag before de-duplicating.
    # (Keeping just the first row could label a pair train=0 / test_recon=0 merely
    # because a later, out-of-sample article happened to be listed first.)
    .pipe(lambda df: df.assign(**{
        flag: df.groupby(pair_columns)[flag].transform('max') for flag in flag_columns
    }))
    .drop_duplicates(subset=pair_columns)
    .filter(items=pair_columns + flag_columns)
)

bioinformatics_df.head(2)

Unnamed: 0,author_id_x,author_id_y,train,test_recon,test_new
213273,207955,231879,0,0,1
213274,207955,247298,0,0,1


In [7]:
# Training edges: author pairs flagged train == 1, as (author_id_x, author_id_y) tuples
edges = list(map(tuple, bioinformatics_df.query('train == 1')[['author_id_x', 'author_id_y']].values))

# Pick out the largest connected component of the graph.
# nx.connected_component_subgraphs was removed in networkx 2.4, so select the largest
# node set and build its subgraph explicitly; .copy() yields an independent graph,
# as the removed helper did.
g = nx.from_edgelist(edges)
gc = g.subgraph(max(nx.connected_components(g), key=len)).copy()
# Store every edge with its endpoints in sorted order
connected_edges = list(map(tuple, map(sorted, gc.edges)))

# Map nodes to unique integers (for XSwap)
mapped_edges, mapping, _ = xswap.preprocessing.map_str_edges(connected_edges, bipartite=False)
reversed_mapping = {v: k for k, v in mapping.items()}

# Create a matrix of train edges (for feature computation)
mat = analysis.edges_to_matrix(mapped_edges)

# Create source, target degree matrices:
# entry (i, j) is the i-th row sum of `mat` plus the j-th column sum of `mat`
degree = np.repeat(mat.sum(axis=1), mat.shape[1], axis=1) \
       + np.repeat(mat.sum(axis=0), mat.shape[0], axis=0)

# Use only those nodes that are present in the training network
nodes = sorted(set(mapping.values()))
# Every ordered (source, target) combination of those nodes, self-pairs included
mapped_source, mapped_target = zip(*itertools.product(nodes, nodes))

df = (
    pd.DataFrame()
    .assign(
        mapped_source=mapped_source,
        mapped_target=mapped_target,
        # Translate the integer node ids back to the original author ids
        source=lambda df: df['mapped_source'].map(reversed_mapping),
        target=lambda df: df['mapped_target'].map(reversed_mapping),
    )
    # Attach the train / test_recon / test_new flags of known coauthor pairs
    .merge(bioinformatics_df, left_on=['source', 'target'], right_on=['author_id_x', 'author_id_y'], how='left')
    .drop(columns=['author_id_x', 'author_id_y'])
    # Pairs without a match in bioinformatics_df get 0 for every flag
    .fillna(0)
    .assign(
        # NOTE(review): bioinformatics_df is built from coauthor_df, which keeps each pair
        # once with author_id_x < author_id_y, so these sums only count train edges in which
        # the node sits on that side of the pair -- confirm this is intended rather than
        # the node's full degree.
        source_degree=lambda df: df['mapped_source'].map(df.groupby('mapped_source')['train'].sum().to_dict()),
        target_degree=lambda df: df['mapped_target'].map(df.groupby('mapped_target')['train'].sum().to_dict()),
    )
)

df.head()

Unnamed: 0,mapped_source,mapped_target,source,target,train,test_recon,test_new,source_degree,target_degree
0,0,0,200080,200080,0.0,0.0,0.0,10.0,0.0
1,0,1,200080,200108,0.0,0.0,0.0,10.0,0.0
2,0,2,200080,200126,0.0,0.0,0.0,10.0,1.0
3,0,3,200080,200148,0.0,0.0,0.0,10.0,0.0
4,0,4,200080,200213,0.0,0.0,0.0,10.0,2.0


In [8]:
# Second argument passed to analysis.invertible_rwr for both the observed and the
# permuted networks (presumably the restart probability -- confirm in `analysis`)
rwr_param = 0.25

# Features of the unpermuted network
observed_rwr = analysis.invertible_rwr(mat.toarray(), rwr_param)
observed_A2 = mat@mat
observed_jaccard = observed_A2 / (degree - observed_A2)

# Key order here determines the order of the feature columns added to `df` below
feature_mats = {
    'prior_empirical': np.zeros(mat.shape),

    'rwr': observed_rwr,
    'mean_rwr': np.zeros(mat.shape),
    'p_rwr': np.zeros(mat.shape),

    'jaccard': observed_jaccard,
    'mean_jaccard': np.zeros(mat.shape),
    'p_jaccard': np.zeros(mat.shape),
}

# Accumulate permutation statistics; each permutation starts from the previous
# permuted edge list and uses the iteration number as its seed
n_perms = 1000
perm_edges = mapped_edges.copy()
for seed in tqdm.tnrange(n_perms):
    perm_edges, _ = xswap.permute_edge_list(
        perm_edges,
        allow_self_loops=False,
        allow_antiparallel=False,
        seed=seed,
    )
    perm_mat = analysis.edges_to_matrix(perm_edges)

    # Running count of how often each edge appears across permutations
    feature_mats['prior_empirical'] += perm_mat

    # RWR on the permuted network: running sum, and count of permutations
    # whose value is at least the observed one
    perm_rwr = analysis.invertible_rwr(perm_mat.toarray(), rwr_param)
    feature_mats['mean_rwr'] += perm_rwr
    feature_mats['p_rwr'] += (perm_rwr >= observed_rwr)

    # Same for the Jaccard feature, reusing the `degree` matrix of the observed network
    perm_common = perm_mat@perm_mat
    perm_jac = perm_common / (degree - perm_common)
    feature_mats['mean_jaccard'] += perm_jac
    feature_mats['p_jaccard'] += (perm_jac >= observed_jaccard)

# Turn the accumulated sums / counts into per-permutation averages
feature_mats.update({
    name: feature_mats[name] / n_perms
    for name in ['mean_rwr', 'p_rwr', 'mean_jaccard', 'p_jaccard', 'prior_empirical']
})

# Flatten each feature matrix (row-major) into a column of the node-pair DataFrame
for feature, matrix in feature_mats.items():
    dense = matrix.toarray() if scipy.sparse.issparse(matrix) else np.array(matrix)
    df[feature] = dense.flatten()

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [9]:
%%time

df.to_csv('biorxiv_p_vs_rank.tsv.xz', compression='xz', sep='\t', index=False)

CPU times: user 9min 43s, sys: 823 ms, total: 9min 44s
Wall time: 9min 44s
