<a target="_blank" href="https://colab.research.google.com/github/giordamaug/HELP/blob/main/help/notebooks/prediction.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
<a target="_blank" href="https://www.kaggle.com/notebooks/welcome?src=https://github.com/giordamaug/HELP/blob/main/help/notebooks/prediction.ipynb">
  <img src="https://kaggle.com/static/images/open-in-kaggle.svg" alt="Open In Colab"/>
</a>

# Install HELP from GitHub
Skip this cell if you already have installed HELP.

In [None]:
!pip install git+https://github.com/giordamaug/HELP.git

# Download the input files
In this cell we download from GitHub repository the label file and the attribute files. Skip this step if you already have these input files locally.

In [None]:
tissue='Kidney'
!wget https://raw.githubusercontent.com/giordamaug/HELP/main/help/datafinal/{tissue}_HELP.csv
!wget https://raw.githubusercontent.com/giordamaug/HELP/main/help/datafinal/{tissue}_BIO.csv
for i in range(5):
  !wget https://raw.githubusercontent.com/giordamaug/HELP/main/help/datafinal/{tissue}_CCcfs_{i}.csv
!wget https://raw.githubusercontent.com/giordamaug/HELP/main/help/datafinal/{tissue}_EmbN2V_128.csv

In [6]:
%cd ../../data

/Users/maurizio/HELP/data


# Process the tissue attributes
In this code we load tissue gene attributes by several datafiles. We apply missing values fixing and data scaling with `sklearn.preprocessing.StandardScaler` on the `BIO` and `CCcfs` attributes, while no normalization and fixing on embedding attributes (`EmbN2V_128`). The attributes are all merged in one matrix by the `feature_assemble` function as input for the prediction model building.

In [1]:
%cd ../../data 
tissue='Kidney'
import pandas as pd
from HELPpy.preprocess.loaders import feature_assemble_df
import os
df_y = pd.read_csv(f"{tissue}_HELP.csv", index_col=0)
df_y = df_y.replace({'aE': 'NE', 'sNE': 'NE'})
print(df_y.value_counts(normalize=False))
features = [{'fname': f'{tissue}_BIO.csv', 'fixna' : False, 'normalize': 'std'},
            {'fname': f'{tissue}_CCcfs.csv', 'fixna' : False, 'normalize': 'std', 'nchunks' : 5},
            {'fname': f'{tissue}_EmbN2V_128.csv', 'fixna' : False, 'normalize': None}]
df_X, df_y = feature_assemble_df(df_y, features=features, saveflag=False, verbose=True)

/Users/maurizio/HELP/data
label
NE       16678
E         1253
dtype: int64
Majority NE 16678 minority E 1253
[Kidney_BIO.csv] found 52532 Nan...
[Kidney_BIO.csv] Normalization with std ...


Loading file in chunks: 100%|██████████| 5/5 [00:10<00:00,  2.04s/it]


[Kidney_CCcfs.csv] found 6676644 Nan...
[Kidney_CCcfs.csv] Normalization with std ...
[Kidney_EmbN2V_128.csv] found 0 Nan...
[Kidney_EmbN2V_128.csv] No normalization...
17236 labeled genes over a total of 17931
(17236, 3456) data input


In [1]:
from HELPpy.models.prediction import VotingSplitClassifier, k_fold_cv
clf = VotingSplitClassifier()
#df_scores, scores, predictions = k_fold_cv(df_X, df_y, clf, n_splits=5, verbose=True)
#df_scores

In [2]:
clf

AttributeError: 'VotingSplitClassifier' object has no attribute 'voting'

AttributeError: 'VotingSplitClassifier' object has no attribute 'voting'

In [21]:
predictions

Unnamed: 0_level_0,label,prediction,probabilities
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A2M,1,1,0.016435
A2ML1,1,1,0.001649
AAGAB,1,1,0.230005
AANAT,1,1,0.002823
AARS2,1,0,0.529173
...,...,...,...
ZSCAN9,1,1,0.004752
ZSWIM6,1,1,0.007049
ZUP1,1,0,0.532555
ZYG11A,1,1,0.005995


In [34]:
import numpy as np
csEGs = pd.read_csv("csEG_Kidney.txt", index_col=0, header=None).index.values
indices = np.intersect1d(csEGs, predictions.index.values)
predictions = predictions.loc[indices]
num = len(predictions[predictions['label'] == predictions['prediction']])
den = len(predictions)
print(f"csEG Kidney TPR = {num /den:.3f} ({num}/{den})")

csEG Kidney TPR = 0.644 (38/59)


# Prediction with Soft Voting

In [6]:
import numpy as np
from help.models.prediction import predict_cv
seed=42
df_y_ne = df_y[df_y['label']=='NE']
df_y_e = df_y[df_y['label']=='E']
#df_y_ne = df_y_ne.sample(frac=1, random_state=seed)
n_voters = 7
splits = np.array_split(df_y_ne, n_voters) 
predictions_ne = pd.DataFrame()
predictions_e = pd.DataFrame(index=df_y_e.index)
d=np.empty((len(df_y_e.index),),object)
d[...]=[list() for _ in range(len(df_y_e.index))]
predictions_e['probabilities'] = d
predictions_e['label'] = np.array([0 for idx in df_y_e.index])
predictions_e['prediction'] = np.array([np.nan for idx in df_y_e.index])
for df_index_ne in splits:
    df_x = pd.concat([df_X.loc[df_index_ne.index], df_X.loc[df_y_e.index]])
    df_yy = pd.concat([df_y.loc[df_index_ne.index], df_y_e])
    _, _, preds = predict_cv(df_x, df_yy, n_splits=5, method='LGBM', balanced=True, verbose=True, seed=seed)
    predictions_ne = pd.concat([predictions_ne, preds.loc[df_index_ne.index]])
    r = np.empty((len(df_y_e.index),),object)
    r[...]=[predictions_e.loc[idx]['probabilities'] + [preds.loc[idx]['probabilities']]  for idx in df_y_e.index]
    predictions_e['probabilities'] = r
predictions_e['prediction'] = predictions_e['probabilities'].map(lambda x: 0 if sum(x)/n_voters > 0.5 else 1)
predictions_e['probabilities'] = predictions_e['probabilities'].map(lambda x: sum(x)/n_voters)
predictions = pd.concat([predictions_ne, predictions_e])
predictions.to_csv(f"pred_Kidney_SV_{n_voters}.csv", index=True)

  return bound(*args, **kwds)


{'E': 0, 'NE': 1}
label
NE       1600
E        1242
Name: count, dtype: int64
Classification with LGBM...


5-fold: 100%|██████████| 5/5 [00:14<00:00,  2.98s/it]


{'E': 0, 'NE': 1}
label
NE       1600
E        1242
Name: count, dtype: int64
Classification with LGBM...


5-fold: 100%|██████████| 5/5 [00:16<00:00,  3.21s/it]


{'E': 0, 'NE': 1}
label
NE       1600
E        1242
Name: count, dtype: int64
Classification with LGBM...


5-fold: 100%|██████████| 5/5 [00:15<00:00,  3.12s/it]


{'E': 0, 'NE': 1}
label
NE       1600
E        1242
Name: count, dtype: int64
Classification with LGBM...


5-fold: 100%|██████████| 5/5 [00:15<00:00,  3.18s/it]


{'E': 0, 'NE': 1}
label
NE       1599
E        1242
Name: count, dtype: int64
Classification with LGBM...


5-fold: 100%|██████████| 5/5 [00:14<00:00,  2.91s/it]


{'E': 0, 'NE': 1}
label
NE       1599
E        1242
Name: count, dtype: int64
Classification with LGBM...


5-fold: 100%|██████████| 5/5 [00:14<00:00,  2.96s/it]


{'E': 0, 'NE': 1}
label
NE       1599
E        1242
Name: count, dtype: int64
Classification with LGBM...


5-fold: 100%|██████████| 5/5 [00:15<00:00,  3.07s/it]


{'E': 0, 'NE': 1}
label
NE       1599
E        1242
Name: count, dtype: int64
Classification with LGBM...


5-fold: 100%|██████████| 5/5 [00:15<00:00,  3.03s/it]


{'E': 0, 'NE': 1}
label
NE       1599
E        1242
Name: count, dtype: int64
Classification with LGBM...


5-fold: 100%|██████████| 5/5 [00:13<00:00,  2.74s/it]


{'E': 0, 'NE': 1}
label
NE       1599
E        1242
Name: count, dtype: int64
Classification with LGBM...


5-fold: 100%|██████████| 5/5 [00:13<00:00,  2.63s/it]


In [113]:
predictions_e.to_csv("pred_Kidney_SV.csv", index=True)