[![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/giordamaug/EG-identification---Data-Science-in-App-Springer/blob/main/notebook/EssentialGenes_Karateclub.ipynb)
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/giordamaug/EG-identification---Data-Science-in-App-Springer/main?filepath=notebook%2FEssentialGenes_Karateclub.ipynb)

# Loading required libraries

### Install libraries

In [1]:
import sys
IN_COLAB = 'google.colab' in sys.modules
if not IN_COLAB:
    !pip install -q karateclub
    !pip install -q pandas
    !pip install -q sklearn
    !pip install -q imblearn
    !pip install -q xgboost
    !pip install -q tqdm

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
kfac 0.2.3 requires tensorflow-probability==0.8, but you have tensorflow-probability 0.7.0 which is incompatible.
bamboolib 1.26.0 requires cryptography<3.0.0,>=2.6.1, but you have cryptography 36.0.0 which is incompatible.
bamboolib 1.26.0 requires scikit-learn<1.0.0,>=0.20.2, but you have scikit-learn 1.0.2 which is incompatible.
bamboolib 1.26.0 requires seaborn<0.11,>=0.10, but you have seaborn 0.11.2 which is incompatible.[0m


In [2]:
import warnings
warnings.filterwarnings('ignore')
import random
import numpy as np
import pandas as pd
def set_seed(seed=1):
    random.seed(seed)
    np.random.seed(seed)

# Download dataset from Github

In [3]:
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/ppi.csv
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/labels.csv
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/bio_attributes.csv
#!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/net_attributes.csv   you don't need it
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/gtex_attributes.csv

zsh:1: command not found: wget
zsh:1: command not found: wget
zsh:1: command not found: wget
zsh:1: command not found: wget


# Load the label
Only a subset of genes are selected for classification:
+ genes belonging to CS0 group, that are labeled as Essential (E);
+ genes belonging to CS6, CS7, ..., CS9 groups, that are labeled as Not-Essential (NE).

All remaining genes belong to intermediate groups (CS1-CS5) and are considered undetermined (label ND) 

In [7]:
labels = pd.read_csv("labels.csv", index_col='name')
labels = labels[labels["CS0_vs_CS6-9"].isin(['E', 'NE']) == True]       # drop any gene with undefined (ND) label
genes = labels.index.values                                             # get genes with defined labels (E or NE)
print(f'Selected {len(genes)} genes')

Selected 3814 genes


## Encode the labels
String labels E and Ne are respectively encoded to 0 and 1.
The array `y` containes numeric labels of genes.

In [8]:
from sklearn import preprocessing
from collections import Counter
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(labels['CS0_vs_CS6-9'].values)  
classes_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print(classes_mapping, Counter(y))

{'E': 0, 'NE': 1} Counter({1: 3069, 0: 745})


# Load attributes to be used
We identified three sets of attributes:
1. bio attributes, related to gene information (such as, expression, etc.)
2. net attributes, derived from role of gene/node in the network (such as, degree, centrality, etc.)
3. GTEX-* attribute, additional biological information of genes 
Based on user selection, the node attributes are appended in a single matrix of attributes (`x`)

In the attribute matrix `x` there can be NaN or Infinite values. They are corrected as it follow:
+ NaN is replaced by the mean in the attribute range, 
+ Infinte value is replaced by the maximum in the range.

After Nan and Infinite values fixing, the attributes are normalized with Z-score or MinMax normalization functions.

At the end, only nodes (genes) with E or NE labels are selected for the classification

In [9]:
#@title Choose attributes { form-width: "20%" }
normalize_node = "zscore" #@param ["", "zscore", "minmax"]
bio = True #@param {type:"boolean"}
gtex = True #@param {type:"boolean"}
net = True #@param {type:"boolean"}
variable_name = "bio"
bio_df = pd.read_csv("bio_attributes.csv", index_col='name') if bio else pd.DataFrame()
gtex_df = pd.read_csv("gtex_attributes.csv", index_col='name') if gtex else pd.DataFrame()
net_df = pd.read_csv("net_attributes.csv", index_col='name') if net else pd.DataFrame()
x = pd.concat([bio_df, gtex_df, net_df], axis=1)
print(f'Found {x.isnull().sum().sum()} NaN values and {np.isinf(x).values.sum()} Infinite values')
for col in x.columns[x.isna().any()].tolist():
  mean_value=x[col].mean()          # Replace NaNs in column with the mean of values in the same column
  if mean_value is not np.nan:
    x[col].fillna(value=mean_value, inplace=True)
  else:                             # otherwise, if the mean is NaN, remove the column
    x = x.drop(col, 1)
if normalize_node == 'minmax':
  print("X attributes normalization (minmax)...")
  x = (x-x.min())/(x.max()-x.min())
elif normalize_node == 'zscore':
  print("X attributes normalization (zscore)...")
  x = (x-x.mean())/x.std()
x = x.loc[genes]
print(f'New attribute matrix x{x.shape}')

Found 15919 NaN values and 0 Infinite values
X attributes normalization (zscore)...
New attribute matrix x(3814, 119)


# Load the PPI+MET network
The PPI networks is loaded from a CSV file, where
*   `A` is the column name for edge source (gene name)
*   `B` is the column name for edge target (gene name)
*   `weight` is the column name for edge weight
Only some method use the PPI netoworks, as an example all GCN methods, and Node2Vec.

The PPI+MET network is reduced by removing genes with undetermined labels

In [12]:
ppi = pd.read_csv('ppi.csv')                                               # read PPI+MET network from CSV file
ppi = ppi.loc[((ppi['A'].isin(genes)) & (ppi['B'].isin(genes)))]           # reduce network only to selected nodes/genes
idxlbl = labels.reset_index(drop=True)
idxlbl['name'] = labels.index
map_gene_to_idx = { v['name']: i  for i,v in idxlbl.to_dict('Index').items() }
vfunc = np.vectorize(lambda t: map_gene_to_idx[t])
edges_index = torch.from_numpy(vfunc(ppi[['A','B']].to_numpy().T)) 

# Network embedding with Karateclub

# k-fold cross validation

### Validate

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from tqdm import tqdm
from sklearn.metrics import *

set_seed(1)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
NFOLDS = 5
LR = 1e-2
WEIGHT_DECAY = 5e-4
EPOCHS = 50

X = torch.tensor(x.to_numpy(), dtype=torch.float)
kf = KFold(n_splits=NFOLDS)
cma = np.array([[0,0],[0,0]])
columns_names = ["Accuracy","BA", "Sensitivity", "Specificity","MCC", 'CM']
scores = pd.DataFrame(columns=columns_names)
mm = np.array([], dtype=np.int)
predictions = np.array([])
for fold, (train_index, test_idx) in enumerate(tqdm(kf.split(np.arange(len(X))), total=kf.get_n_splits(), desc=f"{NFOLDS}-fold")):
    train_idx, val_idx = train_test_split(train_index, test_size=0.05, stratify=y[train_index])
    mm = np.concatenate((mm, test_idx))
    train_y = torch.tensor(y[train_idx], dtype=torch.float)
    val_y = torch.tensor(y[val_idx], dtype=torch.float)
    test_y = torch.tensor(y[test_idx], dtype=torch.float).to(DEVICE)
    train_x = torch.tensor(X[train_idx], dtype=torch.float).to(DEVICE)
    val_x = torch.tensor(X[val_idx], dtype=torch.float).to(DEVICE)
    test_x = torch.tensor(X[test_idx], dtype=torch.float).to(DEVICE)
    probs = n2v_fit_predict(edges_index, X, train_y, train_idx, val_y, val_idx, test_idx, epochs=EPOCHS, log=False)
    preds = (probs > 0.5) * 1
    predictions = np.concatenate((predictions, preds.ravel()))
    cm = confusion_matrix(test_y.cpu().numpy(),preds)
    cma += cm
    scores = scores.append(pd.DataFrame([[accuracy_score(test_y.cpu().numpy(), preds), balanced_accuracy_score(test_y.cpu().numpy(), preds), 
        cm[0,0]/(cm[0,0]+cm[0,1]), cm[1,1]/(cm[1,0]+cm[1,1]), 
        matthews_corrcoef(test_y.cpu().numpy(), preds), cm]], columns=columns_names, index=[fold]))
df_scores = pd.DataFrame(scores.mean(axis=0)).T
df_scores.index=[f'N2V']
df_scores['CM'] = [cma]
df_scores

# Print predictions

In [None]:
p = np.zeros(len(y))
p[mm] = predictions
labels['predictions'] = ['NE' if x>0 else 'E' for x in p]
labels