## This notebook contains the data preprocessing code; it can be run directly in a Colab environment

In [1]:
from google.colab import drive

# Google Drive folder into which the repository is cloned
project_path = "/content/drive/MyDrive/UofT_MEng/MIE1517/Project/FINDER_github/"

# Attach Google Drive (GD) to the Colab runtime
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Clone the FINDER-pytorch repository into project_path.
# Change into the target folder first so the checkout is created there.
%cd {project_path}

!git clone https://github.com/faraz2023/FINDER-pytorch.git

# Enter the new checkout and list its contents (hidden files included)
%cd FINDER-pytorch
%ls -a

In [2]:
# If the repository is already cloned, just change into the existing checkout
%cd {project_path}/FINDER-pytorch/
# Print the working directory to confirm the change
!pwd

/content/drive/MyDrive/UofT_MEng/MIE1517/Project/FINDER_github/FINDER-pytorch
/content/drive/MyDrive/UofT_MEng/MIE1517/Project/FINDER_github/FINDER-pytorch


In [None]:
!pip install networkx==2.3

In [1]:
#### Local (non-Google-Drive) runs start here ##############
# Folder expected to contain the FINDER-pytorch checkout; the cells below
# read and write under {project_path}/FINDER-pytorch/data/real/.
# Replaces any project_path value assigned earlier in the session.
project_path="/FINDER_CM"

In [6]:
# IPython magic for local Jupyter notebooks: render matplotlib figures
# inline in the cell output.
%matplotlib inline

In [2]:
# Download the raw protein-interaction edge lists (.tsv) for every dataset
# listed below and store each one as an integer-labelled networkx edge list
# under {project_path}/FINDER-pytorch/data/real/.
import os.path

import pandas as pd
import networkx as nx

# The data and web portal are made available to the public under the CC BY 4.0 license.
# Users of the web portal or its data should cite the web portal and the HuRI publication.
# Data source: http://www.interactome-atlas.org/
# HuRI publication: https://pubmed.ncbi.nlm.nih.gov/25416956/

# Protein networks to fetch
datasets = ["H-I-05","HI-II-14","HuRI","HI-union","Venkatesan-09","Yang-16","Test_space_screens-19","Yu-11","Lit-BM"]

for dataset_name in datasets:
    data_url = f"http://www.interactome-atlas.org/data/{dataset_name}.tsv"
    save_dir_g = f"{project_path}/FINDER-pytorch/data/real/{dataset_name}.txt"

    # Already converted on an earlier run: skip the download entirely
    if os.path.isfile(save_dir_g):
        continue

    # Each row is read as one edge: two tab-separated protein labels.
    # NOTE(review): assumes the .tsv files have no header row — confirm.
    raw_edge_list = pd.read_csv(data_url, sep='\t', names=['node_from','node_to'])

    # Drop self-referencing edges (a protein interacting with itself)
    uni_edge_list = raw_edge_list[raw_edge_list.node_from != raw_edge_list.node_to]

    # Map protein labels [GENCODE (v27)] such as ENSG00000204889 to integers:
    # a dense rank over both columns together gives every distinct label one
    # number (1, 2, 3, ...); subtracting 1 makes the node ids start at 0.
    edge_list = uni_edge_list.stack().rank(method='dense').unstack().astype(int) - 1

    # Convert the relabelled edge table into an undirected graph
    G = nx.from_pandas_edgelist(edge_list, source='node_from', target='node_to')

    # Give every node a "weight" attribute of 0.0 (node weights, not edge weights).
    # NOTE(review): write_edgelist serialises edges only, so this attribute
    # does not appear to reach the output file — confirm it is still needed.
    nx.set_node_attributes(G, 0.0, "weight")

    # Make sure the target folder exists, then write the edge list file
    os.makedirs(os.path.dirname(save_dir_g), exist_ok=True)
    nx.write_edgelist(G, save_dir_g)

In [3]:
# Build the "strictly new" dataset HI-union-exclude-14: the edges of HI-union
# whose two endpoints both do not occur anywhere in HI-II-14.

data_url_union = "http://www.interactome-atlas.org/data/HI-union.tsv"
data_url_14 = "http://www.interactome-atlas.org/data/HI-II-14.tsv"

save_dir_g = f"{project_path}/FINDER-pytorch/data/real/HI-union-exclude-14.txt"

# Only rebuild when the output file is not there yet
if not os.path.isfile(save_dir_g):
    raw_edge_list_union = pd.read_csv(data_url_union, sep='\t', names=['node_from','node_to'])
    raw_edge_list_14 = pd.read_csv(data_url_14, sep='\t', names=['node_from','node_to'])

    # Drop self-referencing edges from both datasets first
    uni_edge_list_union = raw_edge_list_union[raw_edge_list_union.node_from != raw_edge_list_union.node_to]
    uni_edge_list_14 = raw_edge_list_14[raw_edge_list_14.node_from != raw_edge_list_14.node_to]

    # Every protein label [GENCODE (v27)] that appears in HI-II-14, on either side of an edge
    nodes_in_14 = pd.concat([uni_edge_list_14.node_from, uni_edge_list_14.node_to]).unique()

    # An HI-union edge is kept only if neither endpoint is an HI-II-14 protein
    touches_14 = (
        uni_edge_list_union.node_from.isin(nodes_in_14)
        | uni_edge_list_union.node_to.isin(nodes_in_14)
    )
    uni_edge_list = uni_edge_list_union[~touches_14]

    # Map protein labels such as ENSG00000204889 to integers via a dense rank
    # over both columns together (distinct labels become 1, 2, 3, ...)
    edge_list = uni_edge_list.stack().rank(method='dense').unstack().astype(int)
    # Show only the first rows as a sanity check rather than the whole table
    print(edge_list.sort_values(by=['node_from']).head())

    # Shift the ids down by 1 so they start at 0
    edge_list['node_from'] -= 1
    edge_list['node_to'] -= 1

    # Convert the relabelled edge table into an undirected graph
    G = nx.from_pandas_edgelist(edge_list, source='node_from', target='node_to')

    # Give every node a "weight" attribute of 0.0 (node weights, not edge weights)
    nx.set_node_attributes(G, 0.0, "weight")

    # Make sure the target folder exists, then write the edge list file
    os.makedirs(os.path.dirname(save_dir_g), exist_ok=True)
    nx.write_edgelist(G, save_dir_g)

In [4]:
# Add node weights for the ND_cost models: each node's weight is its degree
# divided by the largest degree in the graph. One .gml file is written per
# dataset under data/real/cost/.

import os.path

datasets = ["H-I-05","HI-II-14","HuRI","HI-union","Venkatesan-09","Yang-16","Test_space_screens-19","Yu-11","Lit-BM","HI-union-exclude-14"]

for dataset_name in datasets:
    save_dir_g = f"{project_path}/FINDER-pytorch/data/real/cost/{dataset_name}_degree.gml"

    # Check for an existing output first so re-runs do not re-read and
    # re-weight graphs whose result would be thrown away
    if os.path.isfile(save_dir_g):
        continue

    g = nx.read_edgelist(f"{project_path}/FINDER-pytorch/data/real/{dataset_name}.txt")

    # Degree of every node, normalised by the maximum degree
    degree_by_node = dict(nx.degree(g))
    maxDegree = max(degree_by_node.values())
    weights = {node: node_degree / maxDegree for node, node_degree in degree_by_node.items()}

    nx.set_node_attributes(g, weights, 'weight')

    # Make sure the cost/ folder exists before writing
    os.makedirs(os.path.dirname(save_dir_g), exist_ok=True)
    nx.write_gml(g, save_dir_g)