In [27]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn import metrics

import seaborn as sns

def get_x(df):
    """Return the feature part of ``df``: every column whose name is not 'class'."""
    is_feature = df.columns != 'class'
    return df.loc[:, is_feature]
    
def get_y(df):
    """Return the values of the 'class' column of ``df`` as a 1-D array."""
    class_frame = df.loc[:, ['class']]
    # The single-column frame has shape (n_rows, 1); take that one column.
    return class_frame.values[:, 0]

def get_probs_positive_class(pred):
    """Collect element 1 of every row of ``pred`` into a list.

    ``pred`` is iterated row by row (e.g. the output of ``predict_proba``);
    index 1 of each row is taken as the positive-class probability.
    """
    return [row[1] for row in pred]
    

def load_dataframes(train_path, test_path):
    """Read two CSV files into DataFrames, using the first column as the index.

    Returns the pair ``(train_df, test_df)``, in the same order as the arguments.
    """
    return tuple(
        pd.read_csv(csv_path, index_col=0)
        for csv_path in (train_path, test_path)
    )


def cross_testing_accuracy(train_df, test_df, genes_ranking_path, thresholds):
    """Fit on ``train_df`` and evaluate on ``test_df`` using the top-ranked genes.

    For each threshold a new GradientBoostingClassifier is trained on the
    selected gene columns of ``train_df`` and evaluated on the same columns of
    ``test_df``. Accuracy (via ``.score`` and via ``predict_proba``), ROC AUC
    and the predicted labels are printed; nothing is returned.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames holding a 'class' column plus one column per selected gene
        (a missing gene column makes the ``.loc`` selection fail).
    genes_ranking_path : str or pandas.DataFrame
        CSV path of the gene ranking (first column is read as the index) or an
        already loaded ranking whose index lists the genes in rank order.
    thresholds : iterable of int or None
        How many genes to take from the start of the ranking. ``None`` or a
        negative value selects every ranked gene.
    """
    # Accept an in-memory ranking as well as a path, so the function works with
    # both kinds of argument used by the calling cells.
    if isinstance(genes_ranking_path, pd.DataFrame):
        gdf = genes_ranking_path
    else:
        gdf = pd.read_csv(genes_ranking_path, index_col=0)
    genes = list(gdf.index)

    for threshold in thresholds:
        print("Cross-testing with threshold:", threshold)
        # Alternative classifier tried earlier: SVC(gamma='auto', probability=True)
        clf = GBC()

        # A negative stop index would silently cut genes off the END of the
        # ranking (genes[0:-1] drops the last one), so treat it as "all genes".
        if threshold is None or threshold < 0:
            selected_genes = genes
        else:
            selected_genes = genes[:threshold]
        selected_columns = selected_genes + ['class']

        tr_top = train_df.loc[:, selected_columns]
        ts_top = test_df.loc[:, selected_columns]

        X_train = get_x(tr_top)
        y_train = get_y(tr_top)

        clf.fit(X_train, y_train)

        X_test = get_x(ts_top)
        y_test = get_y(ts_top)

        acc_score = clf.score(X_test, y_test)

        pred = clf.predict_proba(X_test)
        # predict_proba columns are ordered like clf.classes_; map the winning
        # column back to its label instead of assuming the labels are 0 and 1.
        y_pred = clf.classes_[np.argmax(pred, axis=1)]
        # Column 1 is the probability of the larger class label, which is the
        # score roc_auc_score expects for a binary problem.
        pred = get_probs_positive_class(pred)

        roc_auc = metrics.roc_auc_score(np.array(y_test, dtype=int), pred)
        pred_accuracy = metrics.accuracy_score(y_test, y_pred)

        print("Accuracy with .score:", acc_score)
        print("Accuracy with .predict_proba:", pred_accuracy)
        print("ROC AUC:", roc_auc)
        print("Predictions:", y_pred)
        print("\n")

In [2]:
# Numbers of top-ranked genes to evaluate (passed as `thresholds`).
ths = [1, 5, 10, 15, 20, 50, 100]

# NOTE(review): the paths below are absolute and machine-specific; the notebook
# only runs where this directory layout exists.
# Gene-ranking CSV from the hyb_borda_borda experiment directory.
genes_rank = "/home/colombelli/Documents/experiments/Experiments22_ago/luad/hyb_borda_borda/selection/agg_ranking_th1.csv"
# CSV used as the training set.
train_df_path = "/home/colombelli/Documents/datasets/assembler/luad/intersect.csv"

# CSVs used as test sets: GSE18842, GSE19804 and GSE27262.
test_df1_path = "/home/colombelli/Documents/datasets/cumida/lung/GSE18842/intersect.csv"
test_df2_path = "/home/colombelli/Documents/datasets/cumida/lung/GSE19804/intersect.csv"
test_df3_path = "/home/colombelli/Documents/datasets/cumida/lung/GSE27262/intersect.csv"

In [3]:
# Training frame plus the first test frame (GSE18842).
tr_df, ts_df = load_dataframes(train_df_path, test_df1_path)
# load_dataframes just reads two CSVs, so it is reused here to load the two
# remaining test frames (GSE19804 and GSE27262).
ts_df2, ts_df3 = load_dataframes(test_df2_path, test_df3_path)

In [22]:
# Subsample the training frame: keep every class-0 row plus the first 60
# class-1 rows, then shuffle all rows.
# random_state pins the shuffle so a re-run produces the same row order.
tr_df = pd.concat([tr_df.loc[tr_df['class']==0],
            tr_df.loc[tr_df['class']==1][0:60]]).sample(frac = 1, random_state = 0)
tr_df

Unnamed: 0,HOTAIR,C9orf43,INSC,CYP11B1,TGFA,RASL11A,MTR,RORB,TOB2,DAGLA,...,TCTN3,SDHA,TMEM60,NEO1,ASAP2,WDTC1,NOTCH2,MPP5,C12orf50,class
TCGA-91-6847-11A-01R-1949-07,-1.669273,0.283067,1.533626,-0.334852,-1.147406,1.966083,0.438933,1.966083,1.360216,1.520973,...,-0.470315,-2.866181,-1.276238,2.009960,0.822394,4.527235,-0.888177,4.190568,-1.021897,0
TCGA-44-3396-11A-01R-1758-07,-0.056484,0.965807,3.271308,-0.334852,-1.112869,1.783238,1.147406,1.336051,4.190568,0.724884,...,-0.190178,-1.147406,-3.469926,2.424554,2.161528,2.568030,1.729107,1.193841,2.192916,0
TCGA-05-4417-01A-22R-1858-07,0.681949,-1.922873,-0.066756,-0.334852,-2.721650,-0.293412,0.407629,3.069411,-0.077030,2.372999,...,0.778894,-0.639237,-0.988176,1.951607,-1.824457,-1.571850,0.005134,-1.756057,2.372999,1
TCGA-05-4405-01A-21R-1858-07,2.531286,0.097581,-3.662115,-0.334852,-3.662115,2.115146,5.325413,-0.128423,1.170567,1.675868,...,1.182189,0.768060,1.636491,-1.433586,-2.866181,0.822394,1.193841,-0.210789,-1.021897,1
TCGA-44-2668-01A-01R-0946-07,0.639237,0.910256,-0.345228,-0.334852,2.192916,-3.662115,1.445948,-0.778894,1.396733,-0.899207,...,0.221101,0.855203,-4.663069,-3.069411,1.922873,-1.980633,4.663069,0.800610,-1.021897,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-44-6146-11A-01R-1858-07,-1.669273,-1.276238,2.844879,-0.334852,-1.995258,0.575555,2.459574,2.256885,3.271308,0.355610,...,-1.182189,-2.240739,-3.531415,2.477288,2.115146,2.130515,2.701821,4.006018,-1.021897,0
TCGA-05-4425-01A-01R-1755-07,-1.669273,1.240741,3.768034,-0.334852,-2.866181,4.593257,-0.501780,-0.272727,-1.559063,-0.800610,...,1.756057,-1.360216,0.015403,-2.099866,3.218613,-3.731891,-0.943524,1.922873,-1.021897,1
TCGA-05-4420-01A-01R-1206-07,3.731891,-1.951607,-1.044493,-0.334852,4.464568,-1.894420,-4.816311,0.087305,-2.782319,3.045706,...,6.056651,-5.816639,3.531415,3.353539,1.470791,1.433586,-1.769618,-0.768060,1.852247,1
TCGA-49-6761-11A-01R-1949-07,-1.669273,-1.021897,4.049876,-0.334852,-2.054541,2.024741,-0.649895,1.021897,3.142447,-1.067186,...,-3.022303,-1.571850,0.822394,2.976366,1.205521,1.675868,-1.336051,2.531286,-1.021897,0


In [28]:
# Microarray-to-microarray check: ts_df2 (GSE19804) is passed as the training
# frame and ts_df (GSE18842) as the test frame.
print("Using GSE18842 as testing dataframe...\n")
print("Training and testing cross microarrays:")
cross_testing_accuracy(ts_df2, ts_df, genes_rank, ths)

Using GSE18842 as testing dataframe...

Training and testing cross microarrays:
Cross-testing with threshold: 1
Accuracy with .score: 0.8888888888888888
Accuracy with .predict_proba: 0.8888888888888888
ROC AUC: 0.9802371541501976
Predictions: [1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


Cross-testing with threshold: 5
Accuracy with .score: 0.9666666666666667
Accuracy with .predict_proba: 0.9666666666666667
ROC AUC: 0.9661561264822134
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]


Cross-testing with threshold: 10
Accuracy with .score: 0.9666666666666667
Accuracy with .predict_proba: 0.9666666666666667
ROC AUC: 0.9775197628458498
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1

In [21]:
# Train on tr_df and evaluate on each of the three test frames in turn, using
# the gene counts in `ths` and the ranking currently stored in `genes_rank`.
print("Using GSE18842 as testing dataframe...\n")
cross_testing_accuracy(tr_df, ts_df, genes_rank, ths)

print("Using GSE19804 as testing dataframe...\n")
cross_testing_accuracy(tr_df, ts_df2, genes_rank, ths)

print("Using GSE27262 as testing dataframe...\n")
cross_testing_accuracy(tr_df, ts_df3, genes_rank, ths)

Using GSE18842 as testing dataframe...

Cross-testing with threshold: 1
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


Cross-testing with threshold: 5
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


Cross-testing with threshold: 10
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

In [8]:
# Point genes_rank at the ranking CSV under the hom_gr experiment directory;
# cells run after this one pick up the new value.
genes_rank = "/home/colombelli/Documents/experiments/Experiments22_ago/luad/hom_gr/selection/agg_ranking_th1.csv"

In [9]:
# Train on tr_df and evaluate on the three test frames with the gene counts in
# `ths`; the ranking used is whatever `genes_rank` holds when this cell runs.
print("Using GSE18842 as testing dataframe...\n")
cross_testing_accuracy(tr_df, ts_df, genes_rank, ths)

print("Using GSE19804 as testing dataframe...\n")
cross_testing_accuracy(tr_df, ts_df2, genes_rank, ths)

print("Using GSE27262 as testing dataframe...\n")
cross_testing_accuracy(tr_df, ts_df3, genes_rank, ths)

Using GSE18842 as testing dataframe...

Cross-testing with threshold: 1
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


Cross-testing with threshold: 5
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


Cross-testing with threshold: 10
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

In [24]:
# Same three train/test pairs, but with much larger gene counts
# (1000, 5000 and 10000 top-ranked genes) instead of `ths`.
print("Using GSE18842 as testing dataframe...\n")
cross_testing_accuracy(tr_df, ts_df, genes_rank, [1000, 5000, 10000])

print("Using GSE19804 as testing dataframe...\n")
cross_testing_accuracy(tr_df, ts_df2, genes_rank, [1000, 5000, 10000])

print("Using GSE27262 as testing dataframe...\n")
cross_testing_accuracy(tr_df, ts_df3, genes_rank, [1000, 5000, 10000])

Using GSE18842 as testing dataframe...

Cross-testing with threshold: 1000
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


Cross-testing with threshold: 5000
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


Cross-testing with threshold: 10000
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

In [25]:
# Same three train/test pairs with a single threshold of -1.
# NOTE(review): -1 is presumably meant as "use every ranked gene" -- confirm
# that cross_testing_accuracy interprets a negative threshold that way.
print("Using GSE18842 as testing dataframe...\n")
cross_testing_accuracy(tr_df, ts_df, genes_rank, [-1])

print("Using GSE19804 as testing dataframe...\n")
cross_testing_accuracy(tr_df, ts_df2, genes_rank, [-1])

print("Using GSE27262 as testing dataframe...\n")
cross_testing_accuracy(tr_df, ts_df3, genes_rank, [-1])

Using GSE18842 as testing dataframe...

Cross-testing with threshold: -1
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


Using GSE19804 as testing dataframe...

Cross-testing with threshold: -1
Accuracy with .score: 0.49122807017543857
Accuracy with .predict_proba: 0.49122807017543857
ROC AUC: 0.5
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1]


Using GSE27262 as testing dataframe...

Cross-testing with threshold: -1
Accuracy with .score: 0.5
Accuracy with .predict_proba: 0.5
ROC AUC: 0.5
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

In [10]:
# Template for the per-fold ranking CSVs; %s is replaced by the fold number.
path_ranking_fold = "/home/colombelli/Documents/experiments/Experiments22_ago/luad/hom_gr/fold_%s/agg_ranking_th1.csv"
# One ranking DataFrame per fold, in fold order.
rankings = []
# Folds are numbered 1 to 5.
for f in range(1,6):
    p = path_ranking_fold % str(f)
    # The first CSV column becomes the index of the ranking frame.
    gdf = pd.read_csv(p, index_col=0)
    rankings.append(gdf)
    

In [12]:
# Rank-sum aggregation across folds: every gene accumulates its 1-based
# position in each fold's ranking, so a smaller total means a better gene.
# Genes are seeded from the first fold; every fold is expected to rank the
# same genes (an unseen gene raises KeyError below).
aggregated_ranking = dict.fromkeys(rankings[0].index, 0)

for ranking in rankings:
    # enumerate gives each gene's 1-based position directly, without a
    # per-gene index lookup.
    for position, gene in enumerate(ranking.index, start=1):
        aggregated_ranking[gene] += position


final_ranking = pd.DataFrame.from_dict(aggregated_ranking, orient='index')
final_ranking.columns = ['rank']
# Stable sort: genes with the same summed rank keep the order they had in the
# first fold's ranking, so the top-k selection does not depend on how an
# unstable sort happens to arrange ties.
r = final_ranking.sort_values(by='rank', kind='mergesort')

In [14]:
def cross_testing_accuracy(train_df, test_df, gdf, thresholds):
    """Fit on ``train_df`` and evaluate on ``test_df`` using the top-ranked genes.

    For each threshold a new GradientBoostingClassifier is trained on the
    selected gene columns of ``train_df`` and evaluated on the same columns of
    ``test_df``. Accuracy (via ``.score`` and via ``predict_proba``) and ROC
    AUC are printed; nothing is returned.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames holding a 'class' column plus one column per selected gene
        (a missing gene column makes the ``.loc`` selection fail).
    gdf : pandas.DataFrame or str
        Gene ranking whose index lists the genes in rank order, or the path of
        a CSV to load it from (first column is read as the index).
    thresholds : iterable of int or None
        How many genes to take from the start of the ranking. ``None`` or a
        negative value selects every ranked gene.
    """
    # Also accept a CSV path, so calls written for the path-based variant of
    # this function keep working after this definition has been executed.
    if not isinstance(gdf, pd.DataFrame):
        gdf = pd.read_csv(gdf, index_col=0)
    genes = list(gdf.index)

    for threshold in thresholds:
        print("Cross-testing with threshold:", threshold)
        clf = GBC()

        # A negative stop index would silently cut genes off the END of the
        # ranking (genes[0:-1] drops the last one), so treat it as "all genes".
        if threshold is None or threshold < 0:
            selected_genes = genes
        else:
            selected_genes = genes[:threshold]
        selected_columns = selected_genes + ['class']

        tr_top = train_df.loc[:, selected_columns]
        ts_top = test_df.loc[:, selected_columns]

        X_train = get_x(tr_top)
        y_train = get_y(tr_top)

        clf.fit(X_train, y_train)

        X_test = get_x(ts_top)
        y_test = get_y(ts_top)

        acc_score = clf.score(X_test, y_test)

        pred = clf.predict_proba(X_test)
        # predict_proba columns are ordered like clf.classes_; map the winning
        # column back to its label instead of assuming the labels are 0 and 1.
        y_pred = clf.classes_[np.argmax(pred, axis=1)]
        # Column 1 is the probability of the larger class label, which is the
        # score roc_auc_score expects for a binary problem.
        pred = get_probs_positive_class(pred)

        roc_auc = metrics.roc_auc_score(np.array(y_test, dtype=int), pred)
        pred_accuracy = metrics.accuracy_score(y_test, y_pred)

        print("Accuracy with .score:", acc_score)
        print("Accuracy with .predict_proba:", pred_accuracy)
        print("ROC AUC:", roc_auc)
        print("\n")

In [15]:
# Train on tr_df and evaluate on the three test frames with the gene counts in
# `ths`, this time passing `r`, the DataFrame built from the per-fold rankings,
# as the ranking argument.
print("Using GSE18842 as testing dataframe...\n")
cross_testing_accuracy(tr_df, ts_df, r, ths)

print("Using GSE19804 as testing dataframe...\n")
cross_testing_accuracy(tr_df, ts_df2, r, ths)

print("Using GSE27262 as testing dataframe...\n")
cross_testing_accuracy(tr_df, ts_df3, r, ths)

Using GSE18842 as testing dataframe...

Cross-testing with threshold: 1
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 5
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 10
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 15
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 20
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 50
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0.5


Cross-testing with threshold: 100
Accuracy with .score: 0.5111111111111111
Accuracy with .predict_proba: 0.5111111111111111
ROC AUC: 0