[![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/giordamaug/EG-identification---Data-Science-in-App-Springer/blob/main/notebook/EssentialGenes_Regression.ipynb)
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/giordamaug/EG-identification---Data-Science-in-App-Springer/main?filepath=notebook%2FEssentialGenes_Regression.ipynb)

# Loading required libraries

In [30]:
import sys
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    !pip install -q karateclub
    !pip install -q pandas
    !pip install -q sklearn
    !pip install -q imblearn
    !pip install -q xgboost
    !pip install -q tqdm

In [2]:
import warnings
warnings.filterwarnings('ignore')
import random
import numpy as np
import pandas as pd
def set_seed(seed=1):
    random.seed(seed)
    np.random.seed(seed)

# Download dataset from Github

In [3]:
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/ppi.csv
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/labels.csv
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/bio_attributes.csv
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/net_attributes.csv
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/gtex_attributes.csv
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/ppi+met.csv
!wget https://raw.githubusercontent.com/giordamaug/EG-identification---Data-Science-in-App-Springer/main/data/scores.csv

zsh:1: command not found: wget
zsh:1: command not found: wget
zsh:1: command not found: wget
zsh:1: command not found: wget
zsh:1: command not found: wget
zsh:1: command not found: wget
zsh:1: command not found: wget


# Load the label
Only a subset of genes are selected for classification:
+ genes belonging to CS0 group, that are labeled as Essential (E);
+ genes belonging to CS6, CS7, ..., CS9 groups, that are labeled as Not-Essential (NE).

All remaining genes belong to intermediate groups (CS1-CS5) and are considered undetermined (label ND) 

In [4]:
datapath='.'
scores = pd.read_csv(f"{datapath}/scores.csv", index_col='name')
genes = scores.index.values                                             # get genes with defined labels (E or NE)
scores = scores.reset_index()                                           # reindex genes by consecutive integers
scores['index'] = scores.index
gene2idx_mapping = { v[1] : v[0]  for v in scores[['index', 'name']].values }           # create mapping index by gene name
idx2gene_mapping = { v[0] : v[1]  for v in scores[['index', 'name']].values }           # create mapping index by gene name
y = scores.drop(columns=['name','index'])
print(f'Selected {len(genes)} genes')

Selected 11696 genes


# Load attributes to be used
We identified three sets of attributes:
1. bio attributes, related to gene information (such as, expression, etc.)
2. net attributes, derived from role of gene/node in the network (such as, degree, centrality, etc.)
3. GTEX-* attribute, additional biological information of genes 
Based on user selection, the node attributes are appended in a single matrix of attributes (`x`)

In the attribute matrix `x` there can be NaN or Infinite values. They are corrected as it follow:
+ NaN is replaced by the mean in the attribute range, 
+ Infinte value is replaced by the maximum in the range.

After Nan and Infinite values fixing, the attributes are normalized with Z-score or MinMax normalization functions.

At the end, only nodes (genes) with E or NE labels are selected for the classification

In [5]:
#@title Choose attributes { form-width: "20%" }
normalize_node = "zscore" #@param ["", "zscore", "minmax"]
bio = True #@param {type:"boolean"}
gtex = True #@param {type:"boolean"}
net = False #@param {type:"boolean"}
variable_name = "bio"
bio_df = pd.read_csv(f"{datapath}/bio_attributes.csv", index_col='name') if bio else pd.DataFrame()
gtex_df = pd.read_csv(f"{datapath}/gtex_attributes.csv", index_col='name') if gtex else pd.DataFrame()
net_df = pd.read_csv(f"{datapath}/net_attributes.csv", index_col='name') if net else pd.DataFrame()
x = pd.concat([bio_df, gtex_df, net_df], axis=1)
print(f'Found {x.isnull().sum().sum()} NaN values and {np.isinf(x).values.sum()} Infinite values')
for col in x.columns[x.isna().any()].tolist():
  mean_value=x[col].mean()          # Replace NaNs in column with the mean of values in the same column
  if mean_value is not np.nan:
    x[col].fillna(value=mean_value, inplace=True)
  else:                             # otherwise, if the mean is NaN, remove the column
    x = x.drop(col, 1)
if normalize_node == 'minmax':
  print("X attributes normalization (minmax)...")
  x = (x-x.min())/(x.max()-x.min())
elif normalize_node == 'zscore':
  print("X attributes normalization (zscore)...")
  x = (x-x.mean())/x.std()
x = x.loc[genes]
x["index"] = x.index
x = x.replace({"index": gene2idx_mapping})
x = x.set_index('index')
print(f'New attribute matrix x{x.shape}')


Found 15705 NaN values and 0 Infinite values
X attributes normalization (zscore)...
New attribute matrix x(11696, 105)


# Load the PPI+MET network
The PPI networks is loaded from a CSV file, where
*   `A` is the column name for edge source (gene name)
*   `B` is the column name for edge target (gene name)
*   `weight` is the column name for edge weight
Only some method use the PPI netoworks, as an example all GCN methods, and Node2Vec.

In [6]:
import networkx as nx
ppi = pd.read_csv(f'{datapath}/ppi+met.csv')                               # read PPI+MET network from CSV file
ppi = ppi.loc[((ppi['A'].isin(genes)) & (ppi['B'].isin(genes)))]           # reduce network only to selected nodes/genes
ppi = ppi.replace({"A": gene2idx_mapping, "B": gene2idx_mapping})          # replace gene name in ppi

In [7]:
edge_list = [list(v) for v in list(ppi[['A','B', 'weight']].values)]      # get the edge list (with weights)
G = nx.Graph()
G.add_nodes_from(range(len(genes)))                                       # add all nodes (genes, also isolated ones)
G.add_weighted_edges_from(edge_list)                                      # add all edges
print(nx.info(G))
print(f"There are {len(list(nx.isolates(G)))} isolated genes")

Graph with 11696 nodes and 107513 edges
There are 8006 isolated genes


## Node2vec embedding

In [8]:
import karateclub as kc
method = 'Node2Vec'
params = {"walk_number": 10, 
          "walk_length": 80, 
          "p": 1.0, 
          "q": 1.0, 
          "dimensions": 128, 
          "workers": 4, 
          "window_size": 5, 
          "epochs": 1, 
          "learning_rate": 0.05, 
          "min_count": 1, 
          "seed": 42}
n2v = kc.Node2Vec(**params)    # choose here your embedding method
n2v.fit(G)
embedding = n2v.get_embedding()

## Convert embedding to dataframe and append it to attributes

In [13]:
embedding_df = pd.DataFrame(embedding, columns = [f'{method}_' + str(i + 1)  for i in range(embedding.shape[1])])
z = pd.concat([embedding_df, x], axis=1)
z

Unnamed: 0,XGB_1,XGB_2,XGB_3,XGB_4,XGB_5,XGB_6,XGB_7,XGB_8,XGB_9,XGB_10,...,GTEX-1497J-0826-SM-5NQAJ,GTEX-1A3MW-2226-SM-73KUX,GTEX-1K9T9-1826-SM-CXZK2,GTEX-REY6-1826-SM-EAZAT,GTEX-1JMQK-1926-SM-CJI3B,GTEX-QLQW-1626-SM-CMKFE,GTEX-T5JC-1626-SM-EZ6KW,GTEX-RU72-1926-SM-EAZ3F,GTEX-R55E-2026-SM-EZ6L1,GTEX-TKQ2-0626-SM-EZ6LB
0,0.831916,-0.875206,1.562986,1.344833,0.083874,-1.569096,1.226757,1.730765,1.047042,0.415607,...,-0.047211,-0.078238,-0.037539,-0.033513,-0.048795,-0.058642,-0.057140,-0.043311,-0.058015,-0.046366
1,1.586684,-0.625365,1.501325,1.664786,-0.195935,-2.343094,0.944421,2.131194,1.560687,0.696708,...,0.092198,0.638042,0.013663,-0.020666,0.438626,0.439555,0.410866,0.039279,0.229778,0.080912
2,1.512006,-0.633814,1.623401,1.752944,-0.225951,-2.370446,1.012689,2.129643,1.627544,0.669405,...,-0.009839,0.044456,-0.017723,-0.024342,-0.006470,-0.008081,0.000324,-0.024102,-0.035819,-0.025119
3,1.557038,-0.682262,1.655204,1.768889,-0.183884,-2.410138,1.042857,2.182709,1.632284,0.694196,...,-0.051698,-0.077213,-0.042488,-0.035035,-0.050080,-0.056835,-0.057243,-0.044777,-0.051481,-0.045628
4,2.530112,-0.654009,0.960374,0.225985,-1.025861,-3.003521,-0.791186,1.031723,0.886711,2.031643,...,-0.027831,-0.021116,-0.021738,-0.028349,-0.024094,-0.026113,-0.026702,-0.024048,0.018506,-0.018342
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11691,-0.098806,0.098325,-0.135349,0.006235,0.164256,-0.142482,-0.110952,0.007192,0.183228,0.123566,...,-0.050178,-0.078022,-0.042844,-0.035514,-0.050040,-0.057313,-0.057023,-0.044749,-0.057915,-0.046715
11692,0.118535,-0.084889,0.057141,-0.158630,-0.264094,0.053916,0.011378,0.079411,-0.085551,0.033139,...,-0.045890,-0.067169,-0.038599,-0.034448,-0.046173,-0.053460,-0.053283,-0.041398,-0.049632,-0.042863
11693,0.109180,-0.096923,0.073025,-0.160119,-0.279483,0.068111,0.016523,0.082999,-0.093696,0.036240,...,0.009073,0.106478,-0.021237,-0.026583,0.022160,0.040422,0.072483,-0.003135,-0.003578,0.000643
11694,0.257060,0.038508,0.030175,-0.007967,0.152779,-0.065031,-0.114082,-0.039099,0.097577,-0.140452,...,-0.042815,-0.062011,-0.038395,-0.033848,-0.044726,-0.049332,-0.050442,-0.041550,-0.043314,-0.042168


# Regression


In [28]:
#@title Choose classifier { run: "auto", form-width: "20%" }
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor, RegressorChain
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RepeatedKFold, cross_val_score
from tqdm import tqdm
from sklearn.metrics import *
import sys
sys.path.append('.')
from multiscorer import MultiScorer
from numpy import absolute,average
seed=1
set_seed(seed)

scorer = MultiScorer({
    'neg_mean_absolute_error'  : (mean_absolute_error , {}),
    'R2' : (r2_score, {}),
})


X = z.to_numpy()
Y = y.values
cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)
# define model
model = LinearRegression()
#model = MultiOutputRegressor(Ridge(random_state=123))

# evaluate model
cross_val_score(model, X, Y, scoring=scorer, cv=cv)
results = scorer.get_results()
# force scores to be positive
for metric in results.keys():                                        # Iterate and use the results
  print("%s: %.3f" % (metric, average(results[metric])))


neg_mean_absolute_error: 0.266
R2: 0.077
