# Loading required libraries

### Install Pytorch libraries

In [1]:
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q torch-cluster -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

[K     |████████████████████████████████| 7.9 MB 35.0 MB/s 
[K     |████████████████████████████████| 3.5 MB 34.5 MB/s 
[K     |████████████████████████████████| 2.5 MB 18.0 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone


In [2]:
import warnings
warnings.filterwarnings('ignore')
import random
import numpy as np
import torch
def set_seed(seed=1):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: value handed to every generator (defaults to 1).
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    # The CUDA generator is only seeded when a GPU is actually available.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)

set_seed()

# Download dataset from Github

In [3]:
!wget https://raw.githubusercontent.com/giordamaug/BIONETdatasets/main/CSV/integratedcrispr/edges_integrated2.csv
!wget https://raw.githubusercontent.com/giordamaug/BIONETdatasets/main/CSV/integratedcrispr/nodes_integrated2_update_attr_label_avana0_wang_crispr.csv

--2022-04-30 07:19:31--  https://raw.githubusercontent.com/giordamaug/BIONETdatasets/main/CSV/integratedcrispr/edges_integrated2.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 39025931 (37M) [text/plain]
Saving to: ‘edges_integrated2.csv’


2022-04-30 07:19:32 (377 MB/s) - ‘edges_integrated2.csv’ saved [39025931/39025931]

--2022-04-30 07:19:32--  https://raw.githubusercontent.com/giordamaug/BIONETdatasets/main/CSV/integratedcrispr/nodes_integrated2_update_attr_label_avana0_wang_crispr.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200

# Load the attributes and labels from CSV

All node/gene attributes and labels are in one file. First we divide the attribute (numeric) columns from the label (non-numeric) columns.



In [4]:
import pandas as pd
from sklearn import preprocessing
node_name, label_name = 'name', 'label_CS_ACH_most_freq'
# Read the node table. `wget` saved the file into the notebook's working directory,
# so it is read with a relative path instead of the Colab-only '/content/' prefix.
df = pd.read_csv('nodes_integrated2_update_attr_label_avana0_wang_crispr.csv',
                 sep='\t', index_col=node_name)   # tab-separated; the 'name' column becomes the index
x = df.select_dtypes(include=np.number)           # attributes are the numeric columns
# Take a copy: a new label column is added to `labels` further down, and writing
# into a frame derived from `df` would otherwise raise SettingWithCopyWarning.
labels = df.select_dtypes(exclude=np.number).copy()      # labels are the non-numeric columns

# Set the label
The label is one column or a grouping of values from one label column. In the paper we considered the ten values of `label_CS_ACH_most_freq`, and we grouped them as follows:
*   `CS0` = `E`ssential class
*   `CS6`-`CS9` = `NE` non essential class

The new label name is `CS0_vs_CS6-9`





In [5]:
E_class, NE_class = ['CS0'], ['CS6', 'CS7', 'CS8', 'CS9']
new_label_name = 'CS0_vs_CS6-9'


def _group_label(value):
    """Map a raw class to 'E' / 'NE'; any other value is returned unchanged."""
    if value in E_class:
        return 'E'
    if value in NE_class:
        return 'NE'
    return value


# Derive the binary label column from the raw label column.
labels[new_label_name] = labels[label_name].map(_group_label)
# Keep only the genes that ended up as 'E' or 'NE' (drops NaN and CS1-CS5 rows).
keep_rows = labels[new_label_name].isin(['E', 'NE'])
labels = labels[keep_rows]
genes = labels.index.values
print(f'Selected {len(genes)} genes')

Selected 3814 genes


# Encode the label

In [6]:
from sklearn import preprocessing
from collections import Counter
# Encode the string classes ('E' / 'NE') as integer targets.
encoder = preprocessing.LabelEncoder()
target_names = labels[new_label_name].values
y = encoder.fit(target_names).transform(target_names)
# Class name -> integer code, printed together with the class frequencies.
class_codes = encoder.transform(encoder.classes_)
classes_mapping = {name: code for name, code in zip(encoder.classes_, class_codes)}
print(classes_mapping, Counter(y))

{'E': 0, 'NE': 1} Counter({1: 3069, 0: 745})


# Load the PPI network
The PPI network is loaded from a CSV file, where
*   `A` is the column name for edge source (gene name)
*   `B` is the column name for edge target (gene name)
*   `weight` is the column name for edge weight

Only some methods use the PPI network, for example all GCN methods and Node2Vec.



In [7]:
# `wget` saved the edge list into the notebook's working directory, so it is read
# with a relative path instead of the Colab-only '/content/' prefix.
ppi = pd.read_csv('edges_integrated2.csv', sep='\t')              # read ppi from CSV file
ppi = ppi.loc[((ppi['A'].isin(genes)) & (ppi['B'].isin(genes)))]           # reduce PPI only to selected nodes/genes
# Positional lookup table: row i of `idxlbl` is the i-th selected gene.
idxlbl = labels.reset_index(drop=True)
idxlbl[node_name] = labels.index
# Gene name -> row position in `labels` (and therefore in `y`).
map_gene_to_idx = {gene: i for i, gene in enumerate(idxlbl[node_name])}
# `otypes` fixes the output to int64 and lets the call succeed even when no edge
# survives the filter (np.vectorize cannot infer a dtype from a size-0 input).
vfunc = np.vectorize(lambda t: map_gene_to_idx[t], otypes=[np.int64])
# 2 x n_edges integer tensor: first row holds the 'A' endpoints, second row the 'B' endpoints.
edges_index = torch.from_numpy(vfunc(ppi[['A','B']].to_numpy().T))

# Select attributes to be used
We identified three sets of attributes:
1. bio attributes, related to gene information (such as, expression, etc.)
2. net attributes, derived from role of gene/node in the network (such as, degree, centrality, etc.)
3. GTEX-* attribute, additional biological information of genes 

In this code snippet the sets of attributes are defined, and you may choose to exclude some of them from the matrix of node attributes.


In [8]:
#@title Choose attributes { form-width: "20%" }
import re
bio_attr = True #@param {type:"boolean"}
net_attr = True #@param {type:"boolean"}
gtex_attr = True #@param {type:"boolean"}
# Each list is emptied when its switch above is turned off.
bio_attributes = ['gc_content', 'Gtex_kidney', 'gene_disease_ass_count', 'oncodb_expression','orth_count', 'gene_length', \
       'HPA_kidney', 'mf_coal', 'bp_coal', 'cc_coal', 'biogrid_coal', 'kegg_coal', 'reactome_coal', 'ucsc_tfbs_coal', \
       'up_tissue_coal', 'transcript_count']  if bio_attr else []
net_attributes = ['degree', 'ecc', 'clos', 'betw', 'eigen', 'hub', 'trans', 'PR', 'triangles_numb', 'motif1', \
        'motif2', 'motif3', 'motif5', 'strength'] if net_attr else []
# Start again from every numeric column of `df`, so that re-running this cell with
# different switches can bring back columns dropped by a previous run.
x = df.select_dtypes(include=np.number)
# '^GTEX' anchors the whole prefix. The former pattern '^GTEX*' meant "GTE followed
# by zero or more X" and so also accepted names that merely start with 'GTE'.
r = re.compile('^GTEX')
gtex_attributes = list(filter(r.match, x.columns)) if gtex_attr else []
# Keep only the requested columns that exist, in bio / GTEX / net order.
x = x.filter(items=bio_attributes+gtex_attributes+net_attributes)

## Normalize attributes
In this snippet of code the matrix of node attributes is corrected by filling NaN values with the mean of their column, and infinite values with the maximum.

The attribute matrix is also reduced by removing all rows corresponding to nodes not considered, i.e. deleted because no label was associated with them.

In [9]:
#@title Normalization modes { form-width: "30%" }
normalize_node = "zscore" #@param ["", "zscore", "minmax"]
print(f'Fixing NaN and infinity in X matrix...', end='')
print(f'Found {x.isnull().sum().sum()} NaN values and {np.isinf(x).values.sum()} Inf values')
# Largest column maximum among the columns whose maximum is finite.
highest_non_inf = x.max().loc[lambda v: v<np.inf].max()    # fix infinity (replace with max)
# `replace` returns a new frame, so the result has to be assigned back.
x = x.replace(np.inf, highest_non_inf)
for col in x.columns[x.isna().any()].tolist():
  mean_value=x[col].mean()          # Replace NaNs in column with the  mean of values in the same column
  # A computed NaN is not the `np.nan` object, so it must be tested with isna(),
  # not with an identity check.
  if not pd.isna(mean_value):
    x[col] = x[col].fillna(mean_value)
  else:
    # The column holds no usable value at all: drop it.
    x = x.drop(columns=col)
if normalize_node == 'minmax':
  print("X attributes normalization (minmax)...")
  # A constant column has range 0; dividing by 1 instead maps it to 0 rather than NaN.
  value_range = (x.max()-x.min()).replace(0, 1)
  x = (x-x.min())/value_range
elif normalize_node == 'zscore':
  print("X attributes normalization (zscore)...")
  # A constant column has std 0; dividing by 1 instead maps it to 0 rather than NaN.
  x = (x-x.mean())/x.std().replace(0, 1)
# The statistics above use every gene in the file; only now are the rows reduced
# to the labelled genes, in the same order as `labels`.
x = x.loc[genes]
print(f'New attribute matrix x{x.shape}')

Fixing NaN and infinity in X matrix...Found 15919 NaN values and 0 Inf values
X attributes normalization (zscore)...
New attribute matrix x(3814, 119)


# k-fold cross validation with Node2Vec

### MLP model

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Loss():
    """Class-balanced binary cross-entropy computed on logits.

    The value is the mean BCE over the positive samples (target == 1) plus the
    mean BCE over the negative samples (target == 0), so each class weighs the
    same whatever its frequency.
    """

    def __init__(self, y):
        """Store the targets and precompute one boolean mask per class.

        Args:
            y: 1-D float tensor of 0/1 targets, one entry per sample.
        """
        self.y = y
        self.pos_mask = y == 1
        self.neg_mask = y == 0

    def _class_loss(self, out, mask):
        """Mean BCE-with-logits over the rows of ``out`` selected by ``mask``."""
        if not bool(mask.any()):
            # No sample of this class: contribute 0 instead of the NaN that the
            # mean over an empty selection would produce.
            return out.new_zeros(())
        # reshape(-1) always yields a 1-D tensor; a bare squeeze() would turn a
        # single selected row into a 0-d tensor that no longer matches the target.
        logits = out[mask].reshape(-1)
        # Targets follow the logits' device, so no global device is needed here.
        targets = self.y[mask].to(out.device)
        return F.binary_cross_entropy_with_logits(logits, targets)

    def __call__(self, out):
        """Return the balanced loss for ``out``, the model logits (one row per sample)."""
        loss_p = self._class_loss(out, self.pos_mask)
        loss_n = self._class_loss(out, self.neg_mask)
        return loss_p + loss_n

def mlp_fit_predict(train_x, train_y, test_x, val=None, return_val_probs=False, log=False,
                    epochs=1000, patience=3):
    """Train a one-hidden-layer MLP on ``train_x`` and score ``test_x``.

    The network is Linear(in_feats, 32) -> ReLU -> Dropout(0.2) -> Linear(32, 1),
    trained full-batch with Adam (default settings) on ``Loss``. When a validation
    set is given, its loss is evaluated every 10 epochs and training stops once it
    has risen ``patience`` evaluations in a row.

    Args:
        train_x: 2-D float tensor of training features (samples x features).
        train_y: float tensor of 0/1 training targets.
        test_x: 2-D float tensor of the features to score.
        val: optional ``(val_x, val_y)`` tuple used for early stopping.
        return_val_probs: also return the probabilities computed for ``val_x``.
        log: print the train/validation loss at every validation check.
        epochs: maximum number of training epochs.
        patience: consecutive validation-loss increases that stop the training.

    Returns:
        ``probs``, a NumPy array with the sigmoid outputs for ``test_x``, or the
        tuple ``(probs, val_probs)`` when ``return_val_probs`` is true.

    Raises:
        ValueError: if ``return_val_probs`` is true while ``val`` is None.
    """
    if return_val_probs and val is None:
        raise ValueError('return_val_probs=True requires val=(val_x, val_y)')

    # Inputs are moved to the model's device (a no-op when they already live there).
    train_x = train_x.to(DEVICE)
    test_x = test_x.to(DEVICE)

    in_feats = train_x.shape[1]
    # The model is moved to the device before the optimizer is created.
    model = nn.Sequential(
        nn.Linear(in_feats, 32),
        nn.ReLU(),
        nn.Dropout(0.2),
        nn.Linear(32, 1)).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters())

    lossf = Loss(train_y)

    if val is not None:
        val_x, val_y = val
        val_x = val_x.to(DEVICE)
        lossf_val = Loss(val_y)

    model.train()

    cur_es = 0                      # consecutive checks with a rising validation loss
    val_loss_old = float('inf')     # validation loss seen at the previous check

    for i in range(epochs):
        out = model(train_x)
        loss = lossf(out)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if val is None or (i % 10) != 0:
            continue

        # Validation check: dropout is switched off while the loss is measured.
        model.eval()
        with torch.no_grad():
            loss_val = lossf_val(model(val_x))
        if log: print(f'{i}. Train loss:', loss.detach().cpu().numpy(), ' |  Val Loss:', loss_val.detach().cpu().numpy())
        model.train()

        # The comparison is against the previous check, not against the best loss so far.
        if val_loss_old < loss_val:
            cur_es += 1
        else:
            cur_es = 0
        val_loss_old = loss_val

        if cur_es == patience:
            break

    model.eval()
    with torch.no_grad():
        out = model(test_x).cpu()
    probs = torch.sigmoid(out).numpy()

    if return_val_probs:
        with torch.no_grad():
            out = model(val_x).cpu()
        val_probs = torch.sigmoid(out).numpy()

        return probs, val_probs

    return probs

### Node2vec model

In [None]:
from sklearn.svm import SVC
from torch_geometric.nn.models import Node2Vec
import torch.optim as optim
import torch_cluster
# Keyword arguments unpacked into the Node2Vec constructor
# (`Node2Vec(edge_index, **PARAMS)` in n2v_fit_predict).
# NOTE(review): the meaning of each key is defined by torch_geometric's Node2Vec;
# check its documentation before changing a value.
PARAMS = {
    'embedding_dim': 128,
    'walk_length': 64,
    'context_size': 64,
    'walks_per_node': 64,
    'num_negative_samples': 1,
}
# Use the GPU when one is available; same expression as the DEVICE definition in the MLP cell.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_epoch(n2v, n2v_loader, n2v_optimizer, X, train_y, train_mask, val_y, val_mask, test_mask, epochs=100, log=False):
    """Train Node2Vec, then fit the MLP on its embeddings and score the test nodes.

    Despite its name the function runs all ``epochs`` Node2Vec epochs, takes the
    final embedding of every node, optionally appends the node attributes ``X``
    and hands the result to ``mlp_fit_predict``.

    Args:
        n2v: Node2Vec model to train.
        n2v_loader: loader yielding ``(pos_rw, neg_rw)`` random-walk batches.
        n2v_optimizer: optimizer over the Node2Vec parameters.
        X: node attribute tensor (one row per node), or None to use the
            embeddings alone.
        train_y: float tensor of 0/1 targets of the training nodes.
        train_mask: index selecting the training nodes.
        val_y: float tensor of 0/1 targets of the validation nodes.
        val_mask: index selecting the validation nodes.
        test_mask: index selecting the test nodes.
        epochs: number of Node2Vec training epochs.
        log: print the per-epoch Node2Vec loss, feature shape and validation AUC.

    Returns:
        ``(probs, val_roc_auc)``: the MLP probabilities for the test nodes and the
        ROC AUC measured on the validation nodes.
    """
    # Imported here so the function does not depend on the wildcard sklearn.metrics
    # import that only happens in a later cell.
    from sklearn.metrics import roc_auc_score

    # X is optional, so it is only moved to the device when it is present.
    if X is not None:
        X = X.to(DEVICE)
    train_y = train_y.to(DEVICE)
    val_y = val_y.to(DEVICE)

    n2v.train()
    for i in range(epochs):
        n2v_train_loss = 0

        for (pos_rw, neg_rw) in n2v_loader:
            n2v_optimizer.zero_grad()
            loss = n2v.loss(pos_rw.to(DEVICE), neg_rw.to(DEVICE))
            loss.backward()
            n2v_optimizer.step()
            n2v_train_loss += loss.item()
        if log: print(f'Epoch {i}. N2V Train_Loss:', n2v_train_loss)
    n2v.eval()
    # Calling the model without arguments returns the embeddings; detach them so
    # the MLP training does not backpropagate into Node2Vec.
    Z = n2v().detach()

    def _features(mask):
        """Embeddings of the selected nodes, followed by their attributes if given."""
        if X is None:
            return Z[mask]
        return torch.cat([Z[mask], X[mask]], dim=1)

    train_x = _features(train_mask)
    val_x = _features(val_mask)
    test_x = _features(test_mask)
    if log: print('train_X.shape', train_x.shape)

    probs, val_probs = mlp_fit_predict(
        train_x, train_y, test_x, val=(val_x, val_y), return_val_probs=True)
    val_roc_auc = roc_auc_score(val_y.cpu().numpy(), val_probs)

    if log: print('Validation ROC_AUC:', val_roc_auc)
    return probs, val_roc_auc


def n2v_fit_predict(edge_index, X, train_y, train_mask, val_y, val_mask, test_mask, epochs=100, log=False):
    """Build a Node2Vec model on ``edge_index``, train it and score the test nodes.

    The model is configured from the module-level ``PARAMS`` and optimized with
    Adam at the module-level learning rate ``LR``; the training itself is done by
    ``train_epoch``, whose validation AUC is discarded here.

    Returns:
        The test-node probabilities produced by ``train_epoch``.
    """
    embedder = Node2Vec(edge_index, **PARAMS).to(DEVICE)
    walk_loader = embedder.loader(batch_size=128, shuffle=True, num_workers=0)
    adam = optim.Adam(embedder.parameters(), lr=LR)

    test_probs, _val_auc = train_epoch(
        embedder, walk_loader, adam,
        X, train_y, train_mask, val_y, val_mask, test_mask,
        epochs=epochs, log=log)
    return test_probs


### Validate

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from tqdm import tqdm
from sklearn.metrics import *

set_seed(1)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
NFOLDS = 5
LR = 1e-2             # Adam learning rate read by n2v_fit_predict
WEIGHT_DECAY = 5e-4   # defined but not used by any code in this notebook
EPOCHS = 50           # Node2Vec training epochs per fold

X = torch.tensor(x.to_numpy(), dtype=torch.float)
# NOTE(review): the folds are neither shuffled nor stratified although
# StratifiedKFold is imported - confirm that this is intended.
kf = KFold(n_splits=NFOLDS)
cma = np.array([[0,0],[0,0]])                 # confusion matrix summed over the folds
columns_names = ["Accuracy","BA", "Sensitivity", "Specificity","MCC", 'CM']
fold_scores = []                              # one single-row frame per fold
mm = np.array([], dtype=int)                  # test indices, in the order they were predicted
predictions = np.array([])                    # predicted classes, aligned with `mm`
for fold, (train_index, test_idx) in enumerate(tqdm(kf.split(np.arange(len(X))), total=kf.get_n_splits(), desc=f"{NFOLDS}-fold")):
    # 5% of the training part is held out (stratified) for early stopping.
    train_idx, val_idx = train_test_split(train_index, test_size=0.05, stratify=y[train_index])
    mm = np.concatenate((mm, test_idx))
    train_y = torch.tensor(y[train_idx], dtype=torch.float)
    val_y = torch.tensor(y[val_idx], dtype=torch.float)
    test_y = y[test_idx]
    # The model receives the full attribute matrix plus the index sets of each split.
    probs = n2v_fit_predict(edges_index, X, train_y, train_idx, val_y, val_idx, test_idx, epochs=EPOCHS, log=False)
    preds = ((probs > 0.5) * 1).ravel()
    predictions = np.concatenate((predictions, preds))
    # labels=[0, 1] keeps the matrix 2x2 even if a fold misses one class.
    cm = confusion_matrix(test_y, preds, labels=[0, 1])
    cma += cm
    fold_scores.append(pd.DataFrame([[accuracy_score(test_y, preds), balanced_accuracy_score(test_y, preds),
        cm[0,0]/(cm[0,0]+cm[0,1]), cm[1,1]/(cm[1,0]+cm[1,1]),
        matthews_corrcoef(test_y, preds), cm]], columns=columns_names, index=[fold]))
# DataFrame.append no longer exists in pandas 2, so the rows are concatenated once.
scores = pd.concat(fold_scores)
# Average only the numeric metrics; the 'CM' column holds arrays and is set below.
df_scores = pd.DataFrame(scores[columns_names[:-1]].astype(float).mean(axis=0)).T
df_scores.index=[f'N2V']
df_scores['CM'] = [cma]
print(df_scores.to_latex())
# Put the out-of-fold predictions back into the original gene order.
p = np.zeros(len(y))
p[mm] = predictions

# Print predictions

In [None]:
# y + p is 0 when label and prediction are both class 0, 2 when both are class 1,
# and 1 when they disagree.
print(np.unique(y+p, return_counts=True))
labels['predictions'] = p
# Genes labelled 'E' that were also predicted as class 0 (the code printed for 'E').
e_genes = labels[(labels['predictions'] == 0 ) & ( labels['CS0_vs_CS6-9'] == 'E')].index
# The context manager closes the file even if the write fails.
with open("N2V+MLP_Egenes.csv", "w") as f:
    f.write('\n'.join([str(e) for e in e_genes]))
labels[['CS0_vs_CS6-9', 'predictions']].to_csv('N2V+MLP_Predictions.csv')