<a target="_blank" href="https://colab.research.google.com/github/giordamaug/HELP/blob/main/HELPpy/notebooks/prediction.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
<a target="_blank" href="https://www.kaggle.com/notebooks/welcome?src=https://github.com/giordamaug/HELP/blob/main/HELPpy/notebooks/prediction.ipynb">
  <img src="https://kaggle.com/static/images/open-in-kaggle.svg" alt="Open In Kaggle"/>
</a>

# Install HELP from GitHub
Skip this cell if you have already installed HELP.

In [None]:
!pip install git+https://github.com/giordamaug/HELP.git

# Download the input files
In this cell we download the label file and the attribute files from the GitHub repository. Skip this step if you already have these input files locally.

In [None]:
tissue='Kidney'
!wget https://raw.githubusercontent.com/giordamaug/HELP/main/data/{tissue}_HELP.csv
!wget https://raw.githubusercontent.com/giordamaug/HELP/main/data/{tissue}_BIO.csv
for i in range(5):
  !wget https://raw.githubusercontent.com/giordamaug/HELP/main/data/{tissue}_CCcfs_{i}.csv
!wget https://raw.githubusercontent.com/giordamaug/HELP/main/data/{tissue}_EmbN2V_128.csv

In [1]:
# Change the working directory to `data`, two levels above the current
# directory, so the file names used in the following cells resolve there.
# NOTE(review): presumably only needed when running from a local checkout of
# the repository; if the input files were downloaded into the current
# directory by the cell above, this step looks unnecessary — confirm.
%cd ../../data

/Users/maurizio/HELP/data


# Process the tissue attributes
In this cell we load the tissue gene attributes from several data files. The `BIO` and `CCcfs` attributes are scaled with `sklearn.preprocessing.StandardScaler` (`'normalize': 'std'`), while no normalization is applied to the embedding attributes (`EmbN2V_128`). Missing values are counted and reported for each file but are not fixed, because `fixna` is set to `False` for every file. All the attributes are then merged into one matrix by the `feature_assemble_df` function, and this matrix is the input for building the prediction model.

In [3]:
import os
import pandas as pd
from HELPpy.preprocess.loaders import feature_assemble_df

tissue = 'Kidney'

# Read the label file (genes as index) and collapse the 'aE' and 'sNE' classes
# into 'NE', which turns the task into the binary E vs NE problem.
df_y = (
    pd.read_csv(f"{tissue}_HELP.csv", index_col=0)
      .replace({'aE': 'NE', 'sNE': 'NE'})
)
# Alternative problem definitions (apply one of these to the raw labels
# instead of the replace above):
#df_y = df_y[~df_y['label'].isin(['aE'])]    # E vs sNE problem
#df_y = df_y[~df_y['label'].isin(['E'])]     # aE vs sNE problem
#df_y = df_y[~df_y['label'].isin(['sNE'])]   # E vs aE problem
print(df_y.value_counts())

# One specification per attribute file: BIO and CCcfs are normalized with
# 'std', the Node2Vec embedding is used as is; CCcfs is loaded in 5 chunks.
features = [
    dict(fname=f'{tissue}_BIO.csv', fixna=False, normalize='std'),
    dict(fname=f'{tissue}_CCcfs.csv', fixna=False, normalize='std', nchunks=5),
    dict(fname=f'{tissue}_EmbN2V_128.csv', fixna=False, normalize=None),
]
df_X, df_y = feature_assemble_df(df_y, features=features, verbose=True)
print(df_y.value_counts())

# Last expression of the cell: show attributes and labels side by side.
pd.merge(df_X, df_y, how='outer', left_index=True, right_index=True)

label
NE       16678
E         1253
dtype: int64
Majority NE 16678 minority E 1253
[Kidney_BIO.csv] found 52532 Nan...
[Kidney_BIO.csv] Normalization with std ...


Loading file in chunks: 100%|██████████| 5/5 [00:08<00:00,  1.77s/it]


[Kidney_CCcfs.csv] found 6676644 Nan...
[Kidney_CCcfs.csv] Normalization with std ...
[Kidney_EmbN2V_128.csv] found 0 Nan...
[Kidney_EmbN2V_128.csv] No normalization...
17236 labeled genes over a total of 17931
(17236, 3456) data input
label
NE       15994
E         1242
dtype: int64


Unnamed: 0,Gene length,Transcripts count,GC content,GTEX_kidney,Gene-Disease association,OncoDB_expression,HPA_kidney,GO-MF,GO-BP,GO-CC,...,Node2Vec_119,Node2Vec_120,Node2Vec_121,Node2Vec_122,Node2Vec_123,Node2Vec_124,Node2Vec_125,Node2Vec_126,Node2Vec_127,label
A1BG,0.003351,0.020942,0.501832,2.044542e-05,0.002950,,0.000002,,,0.115385,...,0.120922,-0.352630,0.580697,-0.659300,-1.320486,1.019308,-0.469064,0.123211,0.557266,NE
A1CF,0.034865,0.047120,0.160530,1.980884e-05,,0.556939,0.000232,0.069767,0.041026,0.096154,...,-1.162494,0.155702,-1.162071,0.534082,0.798872,0.149595,-0.360515,-1.060540,-0.408493,NE
A2M,0.019624,0.062827,0.176932,3.377232e-03,0.073746,0.584540,0.005382,0.302326,0.056410,0.076923,...,0.150766,1.492019,0.209449,-1.034729,-0.064318,0.029690,0.138344,0.806095,-0.496128,NE
A2ML1,0.026017,0.041885,0.299948,5.123403e-07,0.017699,,0.000000,0.069767,0.005128,0.038462,...,0.191344,-0.542462,0.746510,0.082089,-1.109212,0.406936,-1.332319,-0.363864,0.443284,NE
A3GALT2,0.005784,0.000000,0.473739,1.421472e-06,,0.663540,0.000000,0.069767,0.015385,0.057692,...,0.483003,-0.197605,0.164332,0.040729,-0.552362,0.242761,0.223486,0.017539,-0.526580,NE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,0.021209,0.010471,0.288257,7.073108e-06,,0.634761,0.000055,,,0.000000,...,-0.717935,-0.072597,0.585837,0.172081,-0.278010,0.170799,0.267462,-0.211294,-0.940943,NE
ZYG11B,0.040775,0.005236,0.248648,7.271294e-05,,0.646090,0.000238,0.000000,0.005128,0.000000,...,0.372134,0.007040,-0.278071,-1.309595,-0.352476,0.732887,0.156505,0.516706,-0.412953,NE
ZYX,0.003958,0.047120,0.539522,8.282866e-04,,0.672638,0.000177,0.046512,0.035897,0.153846,...,-0.316321,-0.382132,0.400354,0.322564,0.400369,0.188850,0.593201,-0.093008,-0.508902,NE
ZZEF1,0.056017,0.052356,0.304484,9.626291e-05,,,0.000121,0.093023,,,...,-0.520060,-0.000595,-0.101278,-0.468345,0.240905,-0.124018,0.568793,-0.422793,-0.701705,NE


# Prediction
We run k-fold cross-validation (`n_splits=5`) of a LightGBM classifier, then store the predictions and print the metrics.

In [4]:
from HELPpy.models.prediction import predict_cv_sv

# Settings of the cross-validated prediction, gathered in one place.
# NOTE(review): `n_voters=3` presumably splits the majority class among three
# classifiers (the log shows 5332 NE vs 1242 E per voter) — confirm in HELPpy.
cv_options = dict(n_voters=3, n_splits=5, balanced=True, verbose=True)
df_scores, predictions = predict_cv_sv(df_X, df_y, **cv_options)

# Last expression of the cell: display the returned scores table.
df_scores

Majority NE 15994, minority E 1242
{'E': 0, 'NE': 1}
label
NE       5332
E        1242
dtype: int64
Classification with LGBM...


5-fold:   0%|          | 0/5 [00:00<?, ?it/s]

In [42]:
import numpy as np

# Load the context-specific essential genes (csEGs) right here, so that this
# cell also works on a freshly restarted kernel run from top to bottom. The
# first column of the header-less file is read as the gene identifiers.
csEGs = pd.read_csv("csEG_Kidney.txt", index_col=0, header=None).index.values

# Show the predictions of the csEGs that are present among the predicted genes.
predictions.loc[np.intersect1d(csEGs, predictions.index.values)]

Unnamed: 0_level_0,label,prediction,probabilities
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ACTG1,0,0,0.857604
ACTR6,0,0,0.733652
ARF4,0,0,0.582781
ARFRP1,0,1,0.033001
ARPC4,0,0,0.74311
CDK6,0,0,0.72835
CFLAR,0,1,0.15138
CHMP7,0,0,0.970504
COPS3,0,0,0.972604
DCTN3,0,0,0.980152


## True Positive rates of context-specific EGs

In [48]:
import numpy as np

# Context-specific essential genes (csEGs): the first column of the
# header-less file is read as the gene identifiers.
csEGs = pd.read_csv("csEG_Kidney.txt", index_col=0, header=None).index.values
# Keep only the csEGs that actually received a prediction.
indices = np.intersect1d(csEGs, predictions.index.values)
# NOTE(review): this rebinds `predictions` to the csEG subset, so the full
# prediction table is no longer available after this cell — consider a
# separate name if later cells need every gene.
predictions = predictions.loc[indices]
# A csEG counts as a hit when its predicted class equals its label.
# NOTE(review): this equals a true positive rate only if every csEG is
# labelled as essential — confirm.
num = int((predictions['label'] == predictions['prediction']).sum())
den = len(predictions)
if den:
    print(f"csEG Kidney TPR = {num /den:.3f} ({num}/{den})")
else:
    # No csEG among the predicted genes: report it instead of failing with a
    # ZeroDivisionError.
    print("csEG Kidney TPR = n/a (0/0)")

csEG Kidney TPR = 0.780 (46/59)
