In [13]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_recall_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from joblib import Parallel, delayed
from tqdm import tqdm
import gzip
from HELPpy.utility.utils import pandas_readcsv

# Set working directory
#os.chdir("/to/CLEARER_directory/")
EsInfo = pd.read_csv("Class_labels/Hs.csv", sep=",", header=0)
print(EsInfo.head())
print(EsInfo['Essential CEG'].value_counts())

# Generate class labels suitable for Python
EsInfo['Essential CEG'] = EsInfo['Essential CEG'].dropna().astype('category').cat.codes
print(EsInfo['Essential CEG'].value_counts())

# Load combined features
Data = pd.read_csv("Features/Hs_features.csv.gz", sep=",", header=0, compression='gzip')
Data.set_index('genes', inplace=True)

# Assign class labels
Data['label'] = EsInfo.set_index('Gene').loc[Data.index, 'Essential CEG']

# Randomize Data
np.random.seed(69)
Data = Data.sample(frac=1).reset_index()

# Split Data
N = 5
seq = np.round(np.linspace(0, len(Data), N+1)).astype(int)
val_sets = [Data.iloc[seq[i]:seq[i+1]].set_index('genes') for i in range(N)]
train_sets = [Data.set_index('genes').drop(val.index) for val in val_sets]

              Gene Essential CEG  Essential OEG
0  ENSG00000107581     Essential      Essential
1  ENSG00000068654     Essential            NaN
2  ENSG00000088325     Essential            NaN
3  ENSG00000148835           NaN      Essential
4  ENSG00000165732     Essential  Non-essential
Essential CEG
Non-essential    13743
Essential          833
Name: count, dtype: int64
Essential CEG
1.0    13743
0.0      833
Name: count, dtype: int64


In [7]:
X_train = train_sets[0].iloc[:, :-1]
y_train = train_sets[0].iloc[:, -1]
from HELPpy.models.prediction import VotingEnsembleLGBM, k_fold_cv
clf = VotingEnsembleLGBM(n_voters=10, learning_rate=0.5, boosting_type='gbdt', n_jobs=-1, random_state=42)
df_scores, scores, predictions = k_fold_cv(X_train, y_train, clf, n_splits=5, seed=0, show_progress=True, verbose=True)
df_scores

{-1: 0, 0: 1, 1: 2}
label
 1    11043
-1     4230
 0      658
Name: count, dtype: int64
Classification with VotingEnsembleLGBM...


5-fold:   0%|          | 0/5 [00:00<?, ?it/s]

ValueError: Number of classes in y_true not equal to the number of columns in 'y_score'

In [4]:
Data

Unnamed: 0,genes,T3s,C3s,A3s,G3s,CAI.x,CBI,Fop,Nc,GC3s,...,Pc2.Hydrophilicity.26,Pc2.Hydrophobicity.27,Pc2.Hydrophilicity.27,Pc2.Hydrophobicity.28,Pc2.Hydrophilicity.28,Pc2.Hydrophobicity.29,Pc2.Hydrophilicity.29,Pc2.Hydrophobicity.30,Pc2.Hydrophilicity.30,label
0,ENSG00000197976,1,8,1,10,1,9,9,10,10,...,2,2,2,2,2,2,2,2,2,1
1,ENSG00000006756,4,7,5,6,3,5,5,10,6,...,7,7,7,7,7,7,7,7,7,1
2,ENSG00000137310,3,7,5,7,2,7,8,9,7,...,7,7,7,7,7,7,7,7,7,1
3,ENSG00000230430,2,10,3,9,10,10,10,6,9,...,5,5,5,5,5,5,5,5,5,-1
4,ENSG00000162614,7,3,10,3,7,3,4,4,3,...,10,10,10,9,10,10,10,10,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19909,ENSG00000169752,9,2,10,2,5,2,2,2,2,...,2,9,2,2,2,2,1,2,1,1
19910,ENSG00000197674,7,4,10,1,10,5,5,2,3,...,6,6,6,6,6,6,6,6,6,-1
19911,ENSG00000235588,3,9,5,4,1,3,2,3,7,...,7,7,7,7,7,7,7,7,7,-1
19912,ENSG00000129932,2,9,2,9,2,9,9,10,9,...,2,9,9,2,2,10,10,1,1,0


# Feature selection

In [42]:
# Feature selection
for i in tqdm(range(N), desc="Feature selection step", total=N):
    train_set = train_sets[i]
    X_train = train_set.iloc[:, :-1]
    y_train = train_set.iloc[:, -1]

    # Lasso feature selection using LogisticRegressionCV
    clf = LogisticRegressionCV(cv=5, penalty='l1', solver='saga', scoring='roc_auc', max_iter=1000).fit(X_train, y_train)
    model = SelectFromModel(clf, prefit=True)
    X_train_selected = model.transform(X_train)

    # Remove highly correlated features
    corr_matrix = pd.DataFrame(X_train_selected).corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]
    X_train_selected = pd.DataFrame(X_train_selected).drop(to_drop, axis=1)

    train_sets[i] = pd.concat([X_train_selected, y_train.reset_index(drop=True)], axis=1)
    val_sets[i] = val_sets[i][train_sets[i].columns]



In [3]:
# Machine learning
def train_rf(train_data):
    X = train_data.iloc[:, :-1]
    y = train_data.iloc[:, -1]
    
    smote = SMOTE()
    X_resampled, y_resampled = smote.fit_resample(X, y)
    
    rf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
    scores = cross_val_score(rf, X_resampled, y_resampled, cv=kf, scoring='roc_auc')
    rf.fit(X_resampled, y_resampled)
    return rf, scores.mean()

results = Parallel(n_jobs=N)(delayed(train_rf)(train_sets[i]) for i in range(N))
rf_list, auc_scores = zip(*results)

# Performance evaluation on test set
def evaluate_model(rf, val_data):
    X_val = val_data.iloc[:, :-1]
    y_val = val_data.iloc[:, -1]
    
    y_pred = rf.predict(X_val)
    y_prob = rf.predict_proba(X_val)[:, 1]
    
    cm = confusion_matrix(y_val, y_pred)
    roc_auc = roc_auc_score(y_val, y_prob)
    precision, recall, _ = precision_recall_curve(y_val, y_prob)
    pr_auc = auc(recall, precision)
    
    return cm, roc_auc, pr_auc

eval_results = [evaluate_model(rf_list[i], val_sets[i]) for i in range(N)]
cm_list, roc_auc_list, pr_auc_list = zip(*eval_results)

metrics = pd.DataFrame({
    'roc_auc': roc_auc_list,
    'pr_auc': pr_auc_list
})

metrics.loc['mean'] = metrics.mean()
metrics.loc['std'] = metrics.std()

metrics.to_csv("test_rf.csv", index=True)

# Performance evaluation on training set
def get_best_kappa(rf):
    results = pd.DataFrame(rf.cv_results_)
    return results.loc[results['mean_test_score'].idxmax()]

train_metrics = pd.concat([get_best_kappa(rf_list[i]) for i in range(5)])
train_metrics.to_csv("train_rf.csv", index=False)