In [1]:
import sys

import networkx as nx
import numpy as np
import pandas as pd

sys.path.insert(0, '../../')

import analysis

# BioRxiv collaboration network

DOI: 10.1101/515643

Rxivist full database (doi:10.5281/zenodo.2566421) at https://zenodo.org/record/2566421


I used the following query to export a copy of the Rxivist BioRxiv scrape.

```sqlite
SELECT 
    aa.article, 
    art.title,
    auth.id as author_id, 
    auth.name as author_name, 
    auth.institution, 
    art.doi, 
    art.collection, 
    art.posted
FROM prod.article_authors aa
JOIN prod.authors auth
	ON aa.author = auth.id
JOIN prod.articles art
	ON aa.article = art.id
```

In [2]:
rxivist_df = (
    pd.read_csv('../../../data/task3/1.raw/citations.csv')
    .dropna()
    .assign(
        year=lambda df: df['posted'].apply(lambda x: int(x[:4])),
    )
    .query('collection == "bioinformatics"')
    .drop(columns=['title', 'author_id', 'institution', 'doi', 'posted', 'collection'])
)

rxivist_df.head(2)

FileNotFoundError: [Errno 2] File b'../../../data/task3/1.raw/citations.csv' does not exist: b'../../../data/task3/1.raw/citations.csv'

In [3]:
train_cutoff = 2017

biorxiv_edges_df = (
    rxivist_df
    .merge(rxivist_df, on=['article', 'year'])
    .rename(columns={'author_name_x': 'name_a', 'author_name_y': 'name_b'})
    .query('name_a < name_b')
    .groupby(['name_a', 'name_b'])[['year']].min()
    .reset_index()
)

# Assert that no duplicate edges appear whatsoever (causes an issue with XSwap and doesn't make sense)
assert biorxiv_edges_df.groupby(['name_a', 'name_b']).size().max() == 1

# Subset to largest connected component
name_edges = list(map(tuple, biorxiv_edges_df[['name_a', 'name_b']].values))
G = nx.from_edgelist(name_edges)
Gc = max(nx.connected_component_subgraphs(G), key=len)
# name_edges = list(map(tuple, map(sorted, Gc.edges)))

subset_nodes = set(Gc.nodes)

biorxiv_edges_df = (
    biorxiv_edges_df
    .loc[biorxiv_edges_df['name_a'].isin(subset_nodes) & biorxiv_edges_df['name_b'].isin(subset_nodes), :]
)



# Create a node mapping
biorxiv_mapping = {name: i for name, i in zip(subset_nodes, range(len(subset_nodes)))}
biorxiv_reversed = {v: k for k, v in biorxiv_mapping.items()}

# Apply node mapping and reorder edges so id_a < id_b
np.random.seed(0)
biorxiv_edges_df = (
    biorxiv_edges_df
    .assign(
        test_new=1,
        test_recon=lambda df: (df['year'] <= train_cutoff).astype(int),
        train=lambda df: df['test_recon'].apply(lambda x: x == 1 and np.random.rand() < 0.7).astype(int),
        mapped_a=lambda df: df['name_a'].map(biorxiv_mapping),
        mapped_b=lambda df: df['name_b'].map(biorxiv_mapping),
    )
    .dropna()
    .assign(
        id_a=lambda df: df.apply(lambda row: min(row['mapped_a'], row['mapped_b']), axis=1).astype(int),
        id_b=lambda df: df.apply(lambda row: max(row['mapped_a'], row['mapped_b']), axis=1).astype(int),
        name_a=lambda df: df['id_a'].map(biorxiv_reversed),
        name_b=lambda df: df['id_b'].map(biorxiv_reversed),
    )
    .reset_index(drop=True)
    .filter(items=['name_a', 'name_b', 'id_a', 'id_b', 'train', 'test_recon', 'test_new'])
)

nodes_with_recon_edge = set(
    biorxiv_edges_df
    .groupby('id_a')['test_recon']
    .sum()
    .reset_index()
    .query('test_recon > 0')
    .loc[:, 'id_a']
    .values
).union(set(
    biorxiv_edges_df
    .groupby('id_b')['test_recon']
    .sum()
    .reset_index()
    .query('test_recon > 0')
    .loc[:, 'id_b']
    .values
))

biorxiv_edges_df = (
    biorxiv_edges_df
    .loc[biorxiv_edges_df['id_a'].isin(nodes_with_recon_edge) 
         & biorxiv_edges_df['id_b'].isin(nodes_with_recon_edge)]
)

assert biorxiv_edges_df.groupby(['name_a', 'name_b']).size().max() == 1
assert biorxiv_edges_df.groupby(['id_a', 'id_b']).size().max() == 1
assert biorxiv_edges_df.query('test_recon == 0 and test_new == 1').shape[0]

In [4]:
biorxiv_edges_df.to_csv('../../../data/task3/2.edges/biorxiv.tsv.xz', compression='xz', index=False, sep='\t')

biorxiv_edges_df.head(2)

Unnamed: 0,name_a,name_b,id_a,id_b,train,test_recon,test_new
0,Ahmad Al Khleifat,- The US-Venezuela Collaborative Research Group,3227,9705,1,1,1
1,Alan Pittman,- The US-Venezuela Collaborative Research Group,7137,9705,0,1,1


In [5]:
%%time

biorxiv_df = analysis.process_edges_to_full_network(biorxiv_edges_df, biorxiv_mapping, allow_loop=False, directed=False)
biorxiv_df.to_csv('../../../data/task3/3.all_nodes/biorxiv.tsv.xz', compression='xz', index=False, sep='\t')

biorxiv_df.head(2)

CPU times: user 14min 8s, sys: 8.49 s, total: 14min 17s
Wall time: 14min 17s
