[![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/giordamaug/EG-identification---Data-Science-in-App-Springer/blob/main/notebook/EssentialGenes_ML.ipynb)
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/giordamaug/EG-identification---Data-Science-in-App-Springer/main/notebook/EssentialGenes_ML.ipynb)
<a href="https://kaggle.com/kernels/welcome?src=https://github.com/giordamaug/EG-identification---Data-Science-in-App-Springer/blob/main/notebook/EssentialGenes_ML.ipynb" target="_parent"><img src="https://kaggle.com/static/images/open-in-kaggle.svg" alt="Open In Kaggle"/></a>

# Loading required libraries

In [35]:
import warnings
warnings.filterwarnings('ignore')
import random
import numpy as np
import pandas as pd
def set_seed(seed=1):
    """Seed Python's `random` module and NumPy's global random generator.

    Args:
        seed: Value handed to both `random.seed` and `np.random.seed`
            (defaults to 1).
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

# Download the dataset from GitHub

In [36]:
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/ppi.csv
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/labels.csv
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/bio_attributes.csv
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/net_attributes.csv
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/gtex_attributes.csv

--2022-05-03 22:06:40--  https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/ppi.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3966521 (3.8M) [text/plain]
Saving to: ‘ppi.csv.1’


2022-05-03 22:06:40 (48.6 MB/s) - ‘ppi.csv.1’ saved [3966521/3966521]

--2022-05-03 22:06:40--  https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/labels.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 71739 (70K) [text/plain]
Saving to: ‘la

# Load the labels (and encode them)

In [37]:
from collections import Counter
from sklearn import preprocessing

# Label table indexed by the 'name' column of labels.csv.
labels = pd.read_csv("labels.csv", index_col='name')

# Turn the string classes of the 'CS0_vs_CS6-9' column into integer codes.
encoder = preprocessing.LabelEncoder()
target_values = labels['CS0_vs_CS6-9'].values
y = encoder.fit_transform(target_values)

# Show which integer code was assigned to each class, and the class balance.
class_codes = encoder.transform(encoder.classes_)
classes_mapping = {class_name: code for class_name, code in zip(encoder.classes_, class_codes)}
print(classes_mapping, Counter(y))

{'E': 0, 'NE': 1} Counter({1: 3069, 0: 745})


# Load the PPI network
The PPI network is loaded from a CSV file, where
*   `A` is the column name for edge source (gene name)
*   `B` is the column name for edge target (gene name)
*   `weight` is the column name for edge weight

Only some methods use the PPI network, for example all GCN methods and Node2Vec.



In [38]:
# Edge list of the PPI network. Per the notebook text, the CSV has the columns
# `A` (edge source gene), `B` (edge target gene) and `weight` (edge weight).
ppi = pd.read_csv('ppi.csv')                   # read ppi from CSV file

# Load attributes to be used
We identified three sets of attributes:
1. bio attributes, related to gene information (such as, expression, etc.)
2. net attributes, derived from role of gene/node in the network (such as, degree, centrality, etc.)
3. GTEX-* attributes, additional biological information about genes

In this code snippet you choose:
+ the attributes to include,
+ the normalization function for node attributes (Z-score or MinMax)

After loading, the values of each attribute are corrected by replacing NaN with the mean of the attribute's values, and infinite values with the maximum of the attribute's values.


In [48]:
#@title Choose attributes { form-width: "20%" }
normalize_node = "zscore" #@param ["", "zscore", "minmax"]
bio = False #@param {type:"boolean"}
gtex = False #@param {type:"boolean"}
net = True #@param {type:"boolean"}
variable_name = "bio"
# Each attribute table is read only when its flag is set; an empty frame
# contributes no columns to the concatenation below.
bio_df = pd.read_csv("bio_attributes.csv", index_col='name') if bio else pd.DataFrame()
gtex_df = pd.read_csv("gtex_attributes.csv", index_col='name') if gtex else pd.DataFrame()
net_df = pd.read_csv("net_attributes.csv", index_col='name') if net else pd.DataFrame()
x = pd.concat([bio_df, gtex_df, net_df], axis=1)
print(f'Found {x.isnull().sum().sum()} NaN values and {np.isinf(x).values.sum()} Infinite values')
# Replace infinite entries before computing means, otherwise one inf makes the
# column mean (and every z-score) inf/NaN. +inf becomes the largest finite value
# of its column; -inf is mapped symmetrically to the smallest finite value.
for col in x.columns[np.isinf(x).any()].tolist():
  finite_values = x.loc[np.isfinite(x[col]), col]
  x[col] = x[col].replace({np.inf: finite_values.max(), -np.inf: finite_values.min()})
for col in x.columns[x.isna().any()].tolist():
  mean_value = x[col].mean()        # Replace NaNs in column with the mean of values in the same column
  # pd.notna is required here: `mean_value is not np.nan` is an identity check
  # that is always True for the float64 NaN returned by .mean().
  if pd.notna(mean_value):
    # Assign the result back instead of a chained in-place fillna, which may
    # operate on a temporary copy and leave `x` unchanged.
    x[col] = x[col].fillna(mean_value)
  else:                             # otherwise, if the mean is NaN, remove the column
    x = x.drop(columns=col)
if normalize_node == 'minmax':
  print("X attributes normalization (minmax)...")
  x = (x-x.min())/(x.max()-x.min())
elif normalize_node == 'zscore':
  print("X attributes normalization (zscore)...")
  x = (x-x.mean())/x.std()
x

Found 67 NaN values and 0 Infinite values
X attributes normalization (zscore)...


Unnamed: 0_level_0,degree,ecc,clos,betw,eigen,hub,trans,PR,triangles_numb,motif1,motif2,motif3,motif5,strength
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ENSG00000001036,-0.449888,0.734545,0.516306,-0.111520,0.635709,0.635709,1.689553,-0.198491,-0.323159,-0.262870,-0.366587,-0.413753,-0.334908,-0.377416
ENSG00000001461,-0.686831,0.734545,-1.913563,-0.117495,-0.241634,-0.241634,-0.604062,-0.388473,-0.352559,1.871153,1.518046,2.046828,0.867106,2.974460
ENSG00000001561,-0.670767,0.734545,0.726833,-0.117495,-0.241682,-0.241682,-0.341207,-0.374996,-0.352430,0.571236,-0.039539,-0.289543,-0.207759,-0.320851
ENSG00000001630,3.188598,-1.253783,0.272544,-0.117453,1.776662,1.776662,0.295472,0.939715,3.648337,-0.382918,-0.206605,-0.036322,-0.049724,0.770654
ENSG00000001631,-0.417760,-1.253783,-2.014077,-0.117495,-0.231378,-0.231378,-0.915414,-0.224040,-0.345494,-0.356584,-0.388387,-0.292171,-0.337920,-0.060934
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000288283,-0.554303,0.734545,0.791766,0.441148,-0.241729,-0.241729,-0.852707,-0.395741,-0.350489,-0.433221,-0.458901,-0.411040,-0.345450,-0.369035
ENSG00000288359,-0.686831,0.734545,0.346552,-0.115473,-0.241801,-0.241801,-1.524051,-0.399182,-0.352585,0.980115,0.458642,0.744192,0.289282,0.047911
ENSG00000288407,-0.381616,0.734545,0.639412,1.718065,-0.241748,-0.241748,-1.190409,-0.389895,-0.347642,-0.255933,-0.349041,-0.432552,-0.337944,-0.461633
ENSG00000288478,-0.690847,0.734545,0.400465,-0.115944,-0.241824,-0.241824,-1.524051,-0.398743,-0.352585,-0.434282,-0.501666,-0.472163,-0.348585,-0.419407


# k-fold cross validation with: SVM, RF, XGB, MLP, RUS

In [53]:
#@title Choose classifier { run: "auto", form-width: "20%" }
method = "XGB" #@param ["SVM", "XGB", "RF", "MLP", "RUS", "DUMMY"]
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector as selector
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.neural_network import MLPClassifier
import sys
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from tqdm import tqdm
from sklearn.metrics import *
from sklearn.ensemble import AdaBoostClassifier
from imblearn.ensemble import RUSBoostClassifier
from sklearn.dummy import DummyClassifier
seed = 123
set_seed(seed)
nfolds = 5
kf = KFold(n_splits=nfolds)
accuracies, mccs = [], []
cm = [[0,0],[0,0]]
X = x.to_numpy()

# Instantiate the classifier selected in the form above.
if method == 'SVM':
  clf = SVC(random_state=seed, probability=True)
elif method == 'MLP':
  clf = MLPClassifier(random_state=seed)
elif method == 'RF':
  clf = RandomForestClassifier(random_state=seed)
elif method == 'XGB':
  clf = XGBClassifier(random_state=seed)
elif method == 'RUS':
  clf = RUSBoostClassifier(random_state=seed)
elif method == 'DUMMY':
  # The form offers "DUMMY", so give it a branch instead of raising.
  clf = DummyClassifier(random_state=seed)
else:
  raise ValueError("Method not implemented!")

cma = np.array([[0,0],[0,0]])           # confusion matrix summed over all folds
mm = np.array([], dtype=int)            # test indices, in the order folds are visited (np.int no longer exists)
predictions = np.array([])              # predicted class codes, aligned with `mm`
columns_names = ["Accuracy","BA", "Sensitivity", "Specificity","MCC", 'CM']
fold_rows = []                          # one row of metrics per fold; DataFrame.append no longer exists
for fold, (train_idx, test_idx) in enumerate(tqdm(kf.split(np.arange(len(X))), total=kf.get_n_splits(), desc=f"{nfolds}-fold")):
    train_x, train_y, test_x, test_y = X[train_idx], y[train_idx], X[test_idx], y[test_idx]
    mm = np.concatenate((mm, test_idx))
    # predict() already returns class codes (0/1), so no thresholding is needed.
    preds = clf.fit(train_x, train_y).predict(test_x)
    # Fix the label order so the matrix is always 2x2, even when a fold
    # contains (or predicts) a single class only.
    cm = confusion_matrix(test_y, preds, labels=[0, 1])
    cma += cm
    predictions = np.concatenate((predictions, preds))
    # Sensitivity/specificity treat class code 0 as the positive class.
    fold_rows.append([accuracy_score(test_y, preds), balanced_accuracy_score(test_y, preds),
        cm[0,0]/(cm[0,0]+cm[0,1]), cm[1,1]/(cm[1,0]+cm[1,1]),
        matthews_corrcoef(test_y, preds), cm])
scores = pd.DataFrame(fold_rows, columns=columns_names)
# Average the numeric metrics only; the per-fold 'CM' column holds arrays and
# is replaced below by the confusion matrix summed over folds.
df_scores = pd.DataFrame(scores.drop(columns='CM').mean(axis=0)).T
df_scores.index=[f'{method}']
df_scores['CM'] = [cma]
df_scores

5-fold: 100%|██████████| 5/5 [00:01<00:00,  3.14it/s]


Unnamed: 0,Accuracy,BA,Sensitivity,Specificity,MCC,CM
XGB,0.857109,0.697006,0.436824,0.957187,0.484608,"[[330, 415], [130, 2939]]"


In [46]:
# Put the out-of-fold predictions back in the original row order: `mm` holds the
# test indices in the order the folds were visited and `predictions` holds the
# matching predicted class codes.
p = np.zeros(len(y))
p[mm] = predictions
# Codes greater than zero are reported as 'NE', all others as 'E'.
labels['predictions'] = np.where(p > 0, 'NE', 'E')
labels

Unnamed: 0_level_0,CS0_vs_CS6-9,predictions
name,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000001036,NE,NE
ENSG00000001461,NE,NE
ENSG00000001561,NE,NE
ENSG00000001630,NE,NE
ENSG00000001631,NE,NE
...,...,...
ENSG00000288283,NE,NE
ENSG00000288359,NE,NE
ENSG00000288407,NE,NE
ENSG00000288478,NE,NE
