In [1]:
import networkx as nx
import numpy as np
import pandas as pd

import analysis

# BioRxiv collaboration network

DOI: 10.1101/515643

Rxivist full database (doi:10.5281/zenodo.2566421) at https://zenodo.org/record/2566421


I used the following query to export a copy of the Rxivist BioRxiv scrape.

```sqlite
SELECT 
    aa.article, 
    art.title,
    auth.id as author_id, 
    auth.name as author_name, 
    auth.institution, 
    art.doi, 
    art.collection, 
    art.posted
FROM prod.article_authors aa
JOIN prod.authors auth
	ON aa.author = auth.id
JOIN prod.articles art
	ON aa.article = art.id
```

In [2]:
rxivist_df = (
    pd.read_csv('../data/1.raw/citations.csv')
    .dropna()
    .assign(
        year=lambda df: df['posted'].apply(lambda x: int(x[:4])),
    )
    .query('collection == "bioinformatics"')
    .drop(columns=['title', 'author_id', 'institution', 'doi', 'posted', 'collection'])
)

rxivist_df.head(2)

Unnamed: 0,article,author_name,year
0,2715,Shuxiong Wang,2017
422,5373,Filip Ter,2015


In [3]:
train_cutoff = 2017

# Self-join to create author-author relationships from author-article relationships
biorxiv_edges_df = (
    rxivist_df
    .merge(rxivist_df, on=['article', 'year'])
    .rename(columns={'author_name_x': 'name_a', 'author_name_y': 'name_b'})
    # Remove reversed and self-edges
    .query('name_a < name_b')
    # Only take the first co-authored paper between two authors
    .groupby(['name_a', 'name_b'])[['year', 'article']].min()
    .reset_index()
    .assign(
        test_recon=lambda df: df['year'].apply(lambda x: x <= train_cutoff).astype(int),
        test_new=1,
    )
)

# Assert that no duplicate edges appear whatsoever (causes an issue with XSwap and doesn't make sense)
assert biorxiv_edges_df.groupby(['name_a', 'name_b'])['test_new'].sum().max() == 1

# Subset to largest connected component
name_edges = list(map(tuple, biorxiv_edges_df.query('test_recon == 1')[['name_a', 'name_b']].values))
G = nx.from_edgelist(name_edges)
Gc = max(nx.connected_component_subgraphs(G), key=len)
name_edges = list(map(tuple, map(sorted, Gc.edges)))

np.random.seed(0)
biorxiv_edges_df = (
    pd.DataFrame(name_edges, columns=['name_a', 'name_b'])
    .merge(
        biorxiv_edges_df, how='left', on=['name_a', 'name_b']
    )
    .assign(
        train=lambda df: df['test_recon'].apply(lambda x: x and np.random.rand() < 0.7).astype(int)
    )
)

# Create a node mapping
authors = sorted(set(biorxiv_edges_df.query('train == 1').loc[:, ['name_a', 'name_b']].values.flatten()))
biorxiv_mapping = {name: i for name, i in zip(authors, range(len(authors)))}
biorxiv_reversed = {v: k for k, v in biorxiv_mapping.items()}

# Apply node mapping and reorder edges so id_a < id_b
biorxiv_edges_df = (
    biorxiv_edges_df
    .assign(
        mapped_a=lambda df: df['name_a'].map(biorxiv_mapping),
        mapped_b=lambda df: df['name_b'].map(biorxiv_mapping),
    )
    .dropna()
    .assign(
        id_a=lambda df: df.apply(lambda row: min(row['mapped_a'], row['mapped_b']), axis=1).astype(int),
        id_b=lambda df: df.apply(lambda row: max(row['mapped_a'], row['mapped_b']), axis=1).astype(int),
        name_a=lambda df: df['id_a'].map(biorxiv_reversed),
        name_b=lambda df: df['id_b'].map(biorxiv_reversed),
    )
    .reset_index(drop=True)
    .filter(items=['name_a', 'name_b', 'id_a', 'id_b', 'train', 'test_recon', 'test_new'])
)

assert biorxiv_edges_df.groupby(['name_a', 'name_b']).size().max() == 1
assert biorxiv_edges_df.groupby(['id_a', 'id_b']).size().max() == 1

biorxiv_edges_df.to_csv('../data/2.edges/biorxiv.tsv.xz', compression='xz', index=False, sep='\t')

biorxiv_edges_df.head(2)

Unnamed: 0,name_a,name_b,id_a,id_b,train,test_recon,test_new
0,Alexander Konovalov,Timo Sachsenberg,117,4167,1,1,1
1,Christian Fufezan,Timo Sachsenberg,729,4167,0,1,1


In [4]:
%%time

biorxiv_df = analysis.process_edges_to_full_network(biorxiv_edges_df, biorxiv_mapping, allow_loop=False, directed=False)
biorxiv_df.to_csv('../data/3.all_nodes/biorxiv.tsv.xz', compression='xz', index=False, sep='\t')

biorxiv_df.head(2)

CPU times: user 8min 35s, sys: 4.44 s, total: 8min 39s
Wall time: 8min 39s
