In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn import metrics

def get_x(df):
    """Return every column of ``df`` except the one named 'class'."""
    # Boolean mask, one entry per column, False only for the label column.
    is_feature = [column != 'class' for column in df.columns]
    return df.loc[:, is_feature]
    
def get_y(df):
    """Return the values of the 'class' column as a one-dimensional array."""
    # Select as a one-column frame, then take that column from the 2-D array.
    label_matrix = df.loc[:, ['class']].to_numpy()
    return label_matrix[:, 0]

def get_probs_positive_class(pred):
    """Collect the second entry (index 1) of every row in ``pred`` into a list."""
    return [row[1] for row in pred]
    

def load_dataframes(train_path, test_path):
    """Read two CSV files, using the first column of each as the index.

    Returns:
        A ``(train_df, test_df)`` tuple, in the same order as the arguments.
    """
    return tuple(
        pd.read_csv(csv_path, index_col=0)
        for csv_path in (train_path, test_path)
    )


def cross_testing_accuracy(train_df, test_df, genes_ranking_path, thresholds,
                           random_state=None):
    """Train on one dataset and evaluate on another using the top-ranked genes.

    For every value in ``thresholds`` a fresh gradient boosting classifier is
    fitted on ``train_df`` restricted to the first ``threshold`` genes of the
    ranking, then evaluated on the same columns of ``test_df``. Accuracy (from
    ``.score`` and recomputed from ``.predict_proba``) and ROC AUC are printed.

    Args:
        train_df: DataFrame containing the ranked gene columns and a 'class'
            column; used for fitting.
        test_df: DataFrame containing the same columns; used for evaluation.
        genes_ranking_path: Path to a ranking CSV whose first column is read as
            the index (genes in rank order), or an already loaded DataFrame
            indexed the same way.
        thresholds: Iterable of ints; each is the number of top genes to keep.
        random_state: Forwarded to the classifier. The default ``None`` keeps
            the unseeded behaviour.

    Returns:
        None. Results are printed.
    """
    # Accept both a CSV path and a loaded ranking so the function works no
    # matter which call style a notebook cell uses.
    if isinstance(genes_ranking_path, pd.DataFrame):
        gdf = genes_ranking_path
    else:
        gdf = pd.read_csv(genes_ranking_path, index_col=0)
    genes = list(gdf.index)

    for threshold in thresholds:
        print("Cross-testing with threshold:", threshold)
        clf = GBC(random_state=random_state)

        selected_columns = genes[0:threshold] + ['class']
        tr_top = train_df.loc[:, selected_columns]
        ts_top = test_df.loc[:, selected_columns]

        X_train = get_x(tr_top)
        y_train = get_y(tr_top)

        clf.fit(X_train, y_train)

        X_test = get_x(ts_top)
        y_test = get_y(ts_top)

        acc_score = clf.score(X_test, y_test)

        proba = clf.predict_proba(X_test)
        # argmax yields column indices; translate them to the actual labels so
        # the comparison with y_test is valid even if labels are not 0/1.
        y_pred = clf.classes_[np.argmax(proba, axis=1)]
        positive_probs = get_probs_positive_class(proba)

        # Labels are cast to int before scoring, as roc_auc_score is fed the
        # probability of the second class column.
        roc_auc = metrics.roc_auc_score(np.array(y_test, dtype=int), positive_probs)
        pred_accuracy = metrics.accuracy_score(y_test, y_pred)

        print("Accuracy with .score:", acc_score)
        print("Accuracy with .predict_proba:", pred_accuracy)
        print("ROC AUC:", roc_auc)
        print("\n")

In [2]:
# Numbers of top-ranked genes to evaluate, one classifier fit per value.
ths = [1, 5, 10, 15, 20, 50, 100]

# Gene ranking CSV and the dataset used for training.
# NOTE(review): these are absolute paths on one machine; consider deriving
# them from a configurable base directory.
genes_rank = "/home/colombelli/Documents/experiments/Experiments22_ago/luad/hyb_borda_borda/selection/agg_ranking_th1.csv"
train_df_path = "/home/colombelli/Documents/datasets/assembler/luad/intersect.csv"

# Three datasets used only for testing.
test_df1_path = "/home/colombelli/Documents/datasets/cumida/lung/GSE18842/intersect.csv"
test_df2_path = "/home/colombelli/Documents/datasets/cumida/lung/GSE19804/intersect.csv"
test_df3_path = "/home/colombelli/Documents/datasets/cumida/lung/GSE27262/intersect.csv"

In [3]:
# Read the training frame first, then the three testing frames, each with its
# first CSV column as the index.
tr_df = pd.read_csv(train_df_path, index_col=0)
ts_df, ts_df2, ts_df3 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (test_df1_path, test_df2_path, test_df3_path)
)

In [4]:
# Evaluate the same training frame against each testing frame in turn.
for banner, test_frame in (
    ("Using GSE18842 as testing dataframe...\n", ts_df),
    ("Using GSE19804 as testing dataframe...\n", ts_df2),
    ("Using GSE27262 as testing dataframe...\n", ts_df3),
):
    print(banner)
    cross_testing_accuracy(tr_df, test_frame, genes_rank, ths)

Using GSE18842 as testing dataframe...

Cross-testing with threshold: 1
Accuracy with .score: 0.5222222222222223
Accuracy with .predict_proba: 0.5222222222222223
ROC AUC: 0.5343379446640316


Cross-testing with threshold: 5
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5113636363636364


Cross-testing with threshold: 10
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 15
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 20
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 50
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 100
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba:

In [7]:
# Rebind the ranking path to the "hom_gr" experiment; cells run after this
# one pick up this ranking instead of the earlier one.
genes_rank = "/home/colombelli/Documents/experiments/Experiments22_ago/luad/hom_gr/selection/agg_ranking_th1.csv"

In [8]:
# Evaluate the same training frame against each testing frame in turn.
for banner, test_frame in (
    ("Using GSE18842 as testing dataframe...\n", ts_df),
    ("Using GSE19804 as testing dataframe...\n", ts_df2),
    ("Using GSE27262 as testing dataframe...\n", ts_df3),
):
    print(banner)
    cross_testing_accuracy(tr_df, test_frame, genes_rank, ths)

Using GSE18842 as testing dataframe...

Cross-testing with threshold: 1
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 5
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 10
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 15
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 20
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 50
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 100
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0

In [9]:
# Same evaluation as before, but with much larger numbers of top genes.
for banner, test_frame in (
    ("Using GSE18842 as testing dataframe...\n", ts_df),
    ("Using GSE19804 as testing dataframe...\n", ts_df2),
    ("Using GSE27262 as testing dataframe...\n", ts_df3),
):
    print(banner)
    cross_testing_accuracy(tr_df, test_frame, genes_rank, [1000, 5000, 10000])

Using GSE18842 as testing dataframe...

Cross-testing with threshold: 1000
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 5000
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 10000
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Using GSE19804 as testing dataframe...

Cross-testing with threshold: 1000
Accuracy with .score: 0.49122807017543857
Accuracy with .predict_proba: 0.49122807017543857
ROC AUC: 0.5


Cross-testing with threshold: 5000
Accuracy with .score: 0.49122807017543857
Accuracy with .predict_proba: 0.49122807017543857
ROC AUC: 0.5


Cross-testing with threshold: 10000
Accuracy with .score: 0.49122807017543857
Accuracy with .predict_proba: 0.49122807017543857
ROC AUC: 0.5


Using GSE27262 as testing dataframe...

Cross-testing with threshold: 10

In [10]:
# Template for the per-fold ranking CSVs; %s is replaced by the fold number.
path_ranking_fold = "/home/colombelli/Documents/experiments/Experiments22_ago/luad/hom_gr/fold_%s/agg_ranking_th1.csv"
# Collect one ranking DataFrame per fold, in fold order.
rankings = []
# Folds are numbered 1 through 5.
for f in range(1,6):
    p = path_ranking_fold % str(f)
    # The first CSV column becomes the index of the ranking frame.
    gdf = pd.read_csv(p, index_col=0)
    rankings.append(gdf)
    

In [12]:
# Rank-sum aggregation: add up each gene's 1-based position across all
# per-fold rankings, so a lower total means a better overall rank.
# The totals are seeded from the first fold's genes only; a gene that appears
# in a later fold but not in the first raises KeyError in the loop below.
aggregated_ranking = dict.fromkeys(rankings[0].index, 0)

for ranking in rankings:
    # enumerate gives the row position directly. Index.get_loc returns a slice
    # or boolean mask for a duplicated label, and is an extra lookup per gene.
    for position, gene in enumerate(ranking.index, start=1):
        aggregated_ranking[gene] += position


# One row per gene with its summed position, ordered from best to worst.
final_ranking = pd.DataFrame.from_dict(aggregated_ranking, orient='index')
final_ranking.columns = ['rank']
r = final_ranking.sort_values(by='rank')

In [14]:
def cross_testing_accuracy(train_df, test_df, gdf, thresholds, random_state=None):
    """Train on one dataset and evaluate on another using the top-ranked genes.

    For every value in ``thresholds`` a fresh gradient boosting classifier is
    fitted on ``train_df`` restricted to the first ``threshold`` genes of the
    ranking, then evaluated on the same columns of ``test_df``. Accuracy (from
    ``.score`` and recomputed from ``.predict_proba``) and ROC AUC are printed.

    Args:
        train_df: DataFrame containing the ranked gene columns and a 'class'
            column; used for fitting.
        test_df: DataFrame containing the same columns; used for evaluation.
        gdf: Ranking DataFrame whose index lists the genes in rank order, or a
            path to a CSV whose first column is read as that index.
        thresholds: Iterable of ints; each is the number of top genes to keep.
        random_state: Forwarded to the classifier. The default ``None`` keeps
            the unseeded behaviour.

    Returns:
        None. Results are printed.
    """
    # Cells in this notebook call the function with either a loaded ranking
    # or a CSV path; handle both so execution order does not matter.
    if not isinstance(gdf, pd.DataFrame):
        gdf = pd.read_csv(gdf, index_col=0)
    genes = list(gdf.index)

    for threshold in thresholds:
        print("Cross-testing with threshold:", threshold)
        clf = GBC(random_state=random_state)

        selected_columns = genes[0:threshold] + ['class']
        tr_top = train_df.loc[:, selected_columns]
        ts_top = test_df.loc[:, selected_columns]

        X_train = get_x(tr_top)
        y_train = get_y(tr_top)

        clf.fit(X_train, y_train)

        X_test = get_x(ts_top)
        y_test = get_y(ts_top)

        acc_score = clf.score(X_test, y_test)

        proba = clf.predict_proba(X_test)
        # argmax yields column indices; translate them to the actual labels so
        # the comparison with y_test is valid even if labels are not 0/1.
        y_pred = clf.classes_[np.argmax(proba, axis=1)]
        positive_probs = get_probs_positive_class(proba)

        # Labels are cast to int before scoring, as roc_auc_score is fed the
        # probability of the second class column.
        roc_auc = metrics.roc_auc_score(np.array(y_test, dtype=int), positive_probs)
        pred_accuracy = metrics.accuracy_score(y_test, y_pred)

        print("Accuracy with .score:", acc_score)
        print("Accuracy with .predict_proba:", pred_accuracy)
        print("ROC AUC:", roc_auc)
        print("\n")

In [15]:
# Evaluate using the ranking aggregated across folds (`r`).
for banner, test_frame in (
    ("Using GSE18842 as testing dataframe...\n", ts_df),
    ("Using GSE19804 as testing dataframe...\n", ts_df2),
    ("Using GSE27262 as testing dataframe...\n", ts_df3),
):
    print(banner)
    cross_testing_accuracy(tr_df, test_frame, r, ths)

Using GSE18842 as testing dataframe...

Cross-testing with threshold: 1
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 5
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 10
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 15
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 20
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 50
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 100
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0