[![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/giordamaug/EG-identification---Data-Science-in-App-Springer/blob/main/notebook/EssentialGenes_Karateclub.ipynb)
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/giordamaug/EG-identification---Data-Science-in-App-Springer/main?filepath=notebook%2FEssentialGenes_Karateclub.ipynb)

# Loading required libraries

### Install libraries

In [2]:
import sys
IN_COLAB = 'google.colab' in sys.modules
!pip install -q karateclub
if not IN_COLAB:
    !pip install -q pandas
    !pip install -q sklearn
    !pip install -q imblearn
    !pip install -q xgboost
    !pip install -q tqdm

In [3]:
import warnings
warnings.filterwarnings('ignore')
import random
import numpy as np
import pandas as pd
def set_seed(seed=1):
    """Seed the stdlib and NumPy global random generators with one value.

    Args:
        seed: Value handed to both ``random.seed`` and ``np.random.seed``.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Download dataset from Github

In [4]:
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/ppi.csv
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/labels.csv
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/bio_attributes.csv
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/net_attributes.csv
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/gtex_attributes.csv

--2022-05-12 13:51:22--  https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/ppi.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3966521 (3.8M) [text/plain]
Saving to: ‘ppi.csv’


2022-05-12 13:51:23 (43.8 MB/s) - ‘ppi.csv’ saved [3966521/3966521]

--2022-05-12 13:51:23--  https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/labels.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 237495 (232K) [text/plain]
Saving to: ‘labe

# Load the label
Only a subset of genes are selected for classification:
+ genes belonging to CS0 group, that are labeled as Essential (E);
+ genes belonging to CS6, CS7, ..., CS9 groups, that are labeled as Not-Essential (NE).

All remaining genes belong to intermediate groups (CS1-CS5) and are considered undetermined (label ND) 

In [5]:
# Read the label table; the 'name' column becomes the row index.
labels = pd.read_csv("labels.csv", index_col='name')
# Keep only rows whose label is defined (E or NE); anything else (ND) is dropped.
labels = labels.loc[labels["CS0_vs_CS6-9"].isin(['E', 'NE'])]
# Index values of the retained rows, i.e. the genes used for classification.
genes = labels.index.values
print(f'Selected {len(genes)} genes')

Selected 3814 genes


## Encode the labels
String labels E and NE are respectively encoded to 0 and 1.
The array `y` contains the numeric labels of the genes.

In [6]:
from collections import Counter
from sklearn import preprocessing
# Fit the encoder on the string labels and get the numeric label vector in one step.
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(labels['CS0_vs_CS6-9'].values)
# Class name -> numeric code, as assigned by the fitted encoder.
classes_mapping = {
    name: code
    for name, code in zip(encoder.classes_, encoder.transform(encoder.classes_))
}
print(classes_mapping, Counter(y))

{'E': 0, 'NE': 1} Counter({1: 3069, 0: 745})


# Load attributes to be used
We identified three sets of attributes:
1. bio attributes, related to gene information (such as, expression, etc.)
2. net attributes, derived from role of gene/node in the network (such as, degree, centrality, etc.)
3. GTEX-* attributes, additional biological information about genes

Based on user selection, the node attributes are appended in a single matrix of attributes (`x`)

In the attribute matrix `x` there can be NaN or Infinite values. They are corrected as follows:
+ a NaN value is replaced by the mean of the attribute's values,
+ an Infinite value is replaced by the maximum of the attribute's values.

After fixing NaN and Infinite values, the attributes are normalized with Z-score or MinMax normalization functions.

At the end, only nodes (genes) with E or NE labels are selected for the classification

In [7]:
#@title Choose attributes { form-width: "20%" }
normalize_node = "zscore" #@param ["", "zscore", "minmax"]
bio = True #@param {type:"boolean"}
gtex = True #@param {type:"boolean"}
net = False #@param {type:"boolean"}
variable_name = "bio"
# Each selected attribute table is read with 'name' as index; a deselected one
# contributes an empty frame, so the concatenation below always works.
bio_df = pd.read_csv("bio_attributes.csv", index_col='name') if bio else pd.DataFrame()
gtex_df = pd.read_csv("gtex_attributes.csv", index_col='name') if gtex else pd.DataFrame()
net_df = pd.read_csv("net_attributes.csv", index_col='name') if net else pd.DataFrame()
x = pd.concat([bio_df, gtex_df, net_df], axis=1)
# NOTE(review): infinite values are only counted here, never replaced, although
# the accompanying text says they are replaced by the column maximum -- confirm.
print(f'Found {x.isnull().sum().sum()} NaN values and {np.isinf(x).values.sum()} Infinite values')
for col in x.columns[x.isna().any()].tolist():
  mean_value = x[col].mean()        # mean of the non-NaN values in the column
  # pd.isna is required here: `mean_value is not np.nan` compares object
  # identity and is true even when the computed mean is NaN.
  if pd.isna(mean_value):           # the mean is NaN: nothing to impute with, remove the column
    x = x.drop(columns=col)
  else:                             # otherwise replace NaNs in the column with its mean
    x[col] = x[col].fillna(mean_value)
if normalize_node == 'minmax':
  print("X attributes normalization (minmax)...")
  x = (x-x.min())/(x.max()-x.min())
elif normalize_node == 'zscore':
  print("X attributes normalization (zscore)...")
  x = (x-x.mean())/x.std()
# Keep only the rows of the genes selected in `genes`, in that order.
x = x.loc[genes]
print(f'New attribute matrix x{x.shape}')

Found 15705 NaN values and 0 Infinite values
X attributes normalization (zscore)...
New attribute matrix x(3814, 105)


# Load the PPI+MET network
The PPI network is loaded from a CSV file, where
*   `A` is the column name for edge source (gene name)
*   `B` is the column name for edge target (gene name)
*   `weight` is the column name for edge weight
Only some methods use the PPI network, for example all GCN methods and Node2Vec.

The PPI+MET network is reduced by removing genes with undetermined labels

In [25]:
import networkx as nx
ppi = pd.read_csv('ppi.csv')                                               # read PPI+MET network from CSV file
# Keep an edge only when both of its endpoints are among the selected genes.
ppi = ppi.loc[ppi['A'].isin(genes) & ppi['B'].isin(genes)]
G = nx.from_pandas_edgelist(ppi, source='A', target='B', edge_attr='weight')
# Relabel nodes with consecutive integers starting at 0. The original gene name
# is kept in the node attribute 'name'; without it the integer ids (and any
# embedding row computed from them) could not be traced back to a gene.
G = nx.convert_node_labels_to_integers(G, first_label=0, ordering='default', label_attribute='name')
# nx.info() is deprecated and not available in recent networkx releases;
# str(G) yields the same one-line "Graph with N nodes and M edges" summary.
str(G)

'Graph with 3690 nodes and 107513 edges'

# Network embedding with Karateclub

In [21]:
import karateclub as kc
# Keyword arguments forwarded unchanged to kc.Node2Vec.
params = dict(
    walk_number=10,
    walk_length=80,
    p=1.0,
    q=1.0,
    dimensions=128,
    workers=4,
    window_size=5,
    epochs=1,
    learning_rate=0.05,
    min_count=1,
    seed=42,
)
n2v = kc.Node2Vec(**params)
# Fit on the integer-labelled graph G, then show the resulting embedding.
n2v.fit(G)
n2v.get_embedding()

AssertionError: ignored

# k-fold cross validation

### Validate

In [None]:
#@title Choose classifier { run: "auto", form-width: "20%" }
method = "XGB" #@param ["SVM", "XGB", "RF", "MLP", "RUS", "DUMMY"]
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector as selector
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.neural_network import MLPClassifier
import sys
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from tqdm import tqdm
from sklearn.metrics import *
from sklearn.ensemble import AdaBoostClassifier
from imblearn.ensemble import RUSBoostClassifier
from sklearn.dummy import DummyClassifier
seed=1
set_seed(seed)
nfolds = 5
# NOTE(review): KFold is used without shuffling or stratification although the
# classes are imbalanced and StratifiedKFold is imported -- confirm this is intended.
kf = KFold(n_splits=nfolds)
accuracies, mccs = [], []
X = x.to_numpy()

# Build the classifier chosen in the form above; every choice is seeded.
if method == 'SVM':
  clf = SVC(random_state=seed, probability=True)
elif method == 'MLP':
  clf = MLPClassifier(random_state=seed)
elif method == 'RF':
  clf = RandomForestClassifier(random_state=seed)
elif method == 'XGB':
  clf = XGBClassifier(random_state=seed)
elif method == 'RUS':
  clf = RUSBoostClassifier(random_state=seed)
elif method == 'DUMMY':
  # "DUMMY" is offered by the form selector but previously had no branch.
  clf = DummyClassifier(random_state=seed)
else:
  raise ValueError("Method not implemented!")

cma = np.array([[0,0],[0,0]])          # confusion matrix accumulated over all folds
mm = np.array([], dtype=int)           # test row positions, in the order they were predicted
                                       # (builtin int: the np.int alias was removed from NumPy)
predictions = np.array([])             # predicted labels, aligned with `mm`
columns_names = ["Accuracy","BA", "Sensitivity", "Specificity","MCC", 'CM']
fold_records = []                      # one row of metrics per fold
for fold, (train_idx, test_idx) in enumerate(tqdm(kf.split(np.arange(len(X))), total=kf.get_n_splits(), desc=f"{nfolds}-fold")):
    train_x, train_y, test_x, test_y = X[train_idx], y[train_idx], X[test_idx], y[test_idx]
    mm = np.concatenate((mm, test_idx))
    # predict() already returns hard class labels, so no thresholding is needed.
    preds = clf.fit(train_x, train_y).predict(test_x).astype(int)
    # labels=[0, 1] keeps the matrix 2x2 even if a fold contains a single class.
    cm = confusion_matrix(test_y, preds, labels=[0, 1])
    cma += cm
    predictions = np.concatenate((predictions, preds))
    fold_records.append([accuracy_score(test_y, preds), balanced_accuracy_score(test_y, preds),
        cm[0,0]/(cm[0,0]+cm[0,1]), cm[1,1]/(cm[1,0]+cm[1,1]),
        matthews_corrcoef(test_y, preds), cm])
# Build the per-fold table once (DataFrame.append no longer exists in pandas 2).
scores = pd.DataFrame(fold_records, columns=columns_names)
# Average the numeric metrics only; the per-fold 'CM' column holds arrays.
df_scores = pd.DataFrame(scores.drop(columns='CM').mean(axis=0)).T
df_scores.index=[f'{method}']
df_scores['CM'] = [cma]
df_scores

# Print predictions

In [None]:
# Place every prediction at the row position recorded for it in `mm`
# (both arrays come from the validation cell).
p = np.zeros(len(y))
p[mm] = predictions
# Values above 0 are written as 'NE', everything else as 'E'.
labels['predictions'] = np.where(p > 0, 'NE', 'E').tolist()
labels