In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn import metrics

import seaborn as sns

def get_x(df):
    """Return the feature columns of ``df``.

    Every column whose label is not ``'class'`` is kept, in the original
    column order. If no ``'class'`` column exists, all columns are returned.
    """
    is_label_column = df.columns.isin(['class'])
    return df.loc[:, ~is_label_column]
    
def get_y(df):
    """Return the values of the ``'class'`` column as a 1-D NumPy array."""
    label_matrix = df[['class']].to_numpy()  # shape (n_rows, 1)
    return label_matrix[:, 0]

def get_probs_positive_class(pred):
    """Collect the second entry (index 1) of every row of ``pred`` into a list.

    ``pred`` is iterated row by row; each row must be indexable at position 1.
    In this notebook it is the output of ``predict_proba``, so index 1 is the
    probability assigned to the second class.
    """
    return [row[1] for row in pred]
    

def load_dataframes(train_path, test_path):
    """Read the train and test CSV files into DataFrames.

    Both files are parsed with their first column as the index.

    Returns:
        A ``(train_df, test_df)`` tuple.
    """
    frames = [pd.read_csv(csv_path, index_col=0)
              for csv_path in (train_path, test_path)]
    return tuple(frames)


def cross_testing_accuracy(train_df, test_df, genes_ranking_path, thresholds,
                           random_state=None):
    """Train on one dataset and evaluate on another using top-ranked genes.

    For every threshold a fresh GradientBoostingClassifier is fitted on
    ``train_df`` restricted to the first ``threshold`` genes of the ranking
    (plus the ``'class'`` column) and evaluated on the same columns of
    ``test_df``. Accuracy (computed two ways), ROC AUC and the predicted
    labels are printed; nothing is returned.

    Parameters:
        train_df: DataFrame with one column per gene and a ``'class'`` column.
        test_df: DataFrame with the same gene columns and a ``'class'`` column.
        genes_ranking_path: CSV whose index (first column) lists gene names;
            they are used as column labels and are assumed to be ordered
            best-first (verify against the file that produces the ranking).
        thresholds: iterable of how many top genes to use. ``None`` or any
            negative value (e.g. ``-1``) selects every gene in the ranking.
        random_state: forwarded to GradientBoostingClassifier so runs can be
            made reproducible; the default ``None`` keeps the unseeded
            behaviour.

    Raises:
        KeyError: if a selected gene or ``'class'`` is missing from either
            DataFrame (raised by the ``.loc`` column selection).
    """
    ranking_df = pd.read_csv(genes_ranking_path, index_col=0)
    genes = list(ranking_df.index)

    for threshold in thresholds:
        print("Cross-testing with threshold:", threshold)
        clf = GBC(random_state=random_state)

        # A plain slice genes[0:-1] would silently drop the last ranked gene,
        # so a negative/None threshold is treated explicitly as "all genes".
        if threshold is None or threshold < 0:
            top_genes = genes
        else:
            top_genes = genes[:threshold]
        selected_columns = top_genes + ['class']

        tr_top = train_df.loc[:, selected_columns]
        ts_top = test_df.loc[:, selected_columns]

        X_train = get_x(tr_top)
        y_train = get_y(tr_top)

        clf.fit(X_train, y_train)

        X_test = get_x(ts_top)
        y_test = get_y(ts_top)

        acc_score = clf.score(X_test, y_test)

        proba = clf.predict_proba(X_test)
        # predict_proba columns follow clf.classes_, so map the winning column
        # index back to its label instead of assuming labels are exactly 0/1.
        y_pred = clf.classes_[np.argmax(proba, axis=1)]
        positive_probs = get_probs_positive_class(proba)

        # Column 1 scores clf.classes_[1]; build the matching binary target so
        # ROC AUC also works when labels are not integer-coded.
        y_true_positive = (np.asarray(y_test) == clf.classes_[1]).astype(int)

        roc_auc = metrics.roc_auc_score(y_true_positive, positive_probs)
        pred_accuracy = metrics.accuracy_score(y_test, y_pred)

        print("Accuracy with .score:", acc_score)
        print("Accuracy with .predict_proba:", pred_accuracy)
        print("ROC AUC:", roc_auc)
        print("Predictions:", y_pred)
        print("\n")

In [2]:
# Gene-count thresholds handed to cross_testing_accuracy, in evaluation order.
ths = [1, 5, 10, 15, 20, 50, 100, 1000, 5000, -1]
# Aggregated gene ranking; its index is read as the ordered list of gene names.
genes_rank = "/home/colombelli/Documents/experiments/Experiments22_ago/luad/hyb_borda_borda/selection/agg_ranking_th1.csv"

# Single place to repoint the notebook at another copy of the datasets.
DATA_DIR = "/home/colombelli/Documents/datasets/LUNG_TEST_NPN_TRUNC"

p1 = DATA_DIR + "/tcga_npn_trunc.csv"
p2 = DATA_DIR + "/cumida1_npn_trunc.csv"
p3 = DATA_DIR + "/cumida2_npn_trunc.csv"
p4 = DATA_DIR + "/cumida3_npn_trunc.csv"

# Each CSV is read with its first column as the index, in the order p1..p4.
df1, df2, df3, df4 = (pd.read_csv(path, index_col=0) for path in (p1, p2, p3, p4))

In [8]:
# Train on df1 (tcga_npn_trunc.csv), test on df2 (cumida1_npn_trunc.csv).
cross_testing_accuracy(df1, df2, genes_rank, ths)

Cross-testing with threshold: 1
Accuracy with .score: 0.5444444444444444
Accuracy with .predict_proba: 0.5444444444444444
ROC AUC: 0.6025197628458497
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1]


Cross-testing with threshold: 5
Accuracy with .score: 0.5555555555555556
Accuracy with .predict_proba: 0.5555555555555556
ROC AUC: 0.7737154150197628
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1]


Cross-testing with threshold: 10
Accuracy with .score: 0.5777777777777777
Accuracy with .predict_proba: 0.5777777777777777
ROC AUC: 0.6136363636363636
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 

In [9]:
# Train on df1 (tcga_npn_trunc.csv), test on df3 (cumida2_npn_trunc.csv).
cross_testing_accuracy(df1, df3, genes_rank, ths)

Cross-testing with threshold: 1
Accuracy with .score: 0.5964912280701754
Accuracy with .predict_proba: 0.5964912280701754
ROC AUC: 0.7227524630541872
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1
 0 1 0 1 1 1 1 1 0 1 0 1 0 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 0
 1 0 1]


Cross-testing with threshold: 5
Accuracy with .score: 0.5350877192982456
Accuracy with .predict_proba: 0.5350877192982456
ROC AUC: 0.7107450738916256
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1
 1 0 1]


Cross-testing with threshold: 10
Accuracy with .score: 0.5526315789473685
Accuracy with .predict_proba: 0.5526315789473685
ROC AUC: 0.6102216748768473
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

In [10]:
# Train on df1 (tcga_npn_trunc.csv), test on df4 (cumida3_npn_trunc.csv).
cross_testing_accuracy(df1, df4, genes_rank, ths)

Cross-testing with threshold: 1
Accuracy with .score: 0.5833333333333334
Accuracy with .predict_proba: 0.5833333333333334
ROC AUC: 0.8333333333333334
Predictions: [1 1 0 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1]


Cross-testing with threshold: 5
Accuracy with .score: 0.5
Accuracy with .predict_proba: 0.5
ROC AUC: 0.8645833333333333
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1]


Cross-testing with threshold: 10
Accuracy with .score: 0.5625
Accuracy with .predict_proba: 0.5625
ROC AUC: 0.5625
Predictions: [1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1]


Cross-testing with threshold: 15
Accuracy with .score: 0.5625
Accuracy with .predict_proba: 0.5625
ROC AUC: 0.6875
Predictions: [1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1]


Cross-testing with threshold: 20
Accuracy wi

# Training and testing with microarrays

In [11]:
# Train on df2 (cumida1_npn_trunc.csv), test on df3 (cumida2_npn_trunc.csv).
cross_testing_accuracy(df2, df3, genes_rank, ths)

Cross-testing with threshold: 1
Accuracy with .score: 0.5701754385964912
Accuracy with .predict_proba: 0.5701754385964912
ROC AUC: 0.6045258620689655
Predictions: [0 1 1 1 1 1 0 0 0 1 1 0 0 1 1 1 0 0 1 1 1 0 0 1 0 0 1 0 1 0 1 1 0 1 1 1 1
 0 0 1 0 1 1 1 1 0 1 1 1 0 0 1 1 0 1 0 0 1 0 0 1 1 0 1 0 0 0 1 0 1 1 0 0 0
 1 0 0 0 1 1 1 1 1 1 0 0 0 0 0 1 0 1 1 0 0 1 1 0 1 1 1 1 0 0 0 1 0 0 1 0 1
 0 0 0]


Cross-testing with threshold: 5
Accuracy with .score: 0.9035087719298246
Accuracy with .predict_proba: 0.9035087719298246
ROC AUC: 0.9082512315270935
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1
 1 0 0]


Cross-testing with threshold: 10
Accuracy with .score: 0.9035087719298246
Accuracy with .predict_proba: 0.9035087719298246
ROC AUC: 0.9105603448275863
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1

In [14]:
# Train on df2 (cumida1_npn_trunc.csv), test on df4 (cumida3_npn_trunc.csv).
cross_testing_accuracy(df2, df4, genes_rank, ths)

Cross-testing with threshold: 1
Accuracy with .score: 0.7291666666666666
Accuracy with .predict_proba: 0.7291666666666666
ROC AUC: 0.8046875000000001
Predictions: [1 0 0 1 0 1 0 1 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 1 0 1 1 1 1 0 1 0
 0 1 1 1 1 1 0 0 1 1 1]


Cross-testing with threshold: 5
Accuracy with .score: 0.9375
Accuracy with .predict_proba: 0.9375
ROC AUC: 0.935763888888889
Predictions: [0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1]


Cross-testing with threshold: 10
Accuracy with .score: 0.9166666666666666
Accuracy with .predict_proba: 0.9166666666666666
ROC AUC: 0.9513888888888888
Predictions: [0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1]


Cross-testing with threshold: 15
Accuracy with .score: 0.9166666666666666
Accuracy with .predict_proba: 0.9166666666666666
ROC AUC: 0.954861111111111
Predictions: [0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 1 1 1 

In [13]:
# Train on df3 (cumida2_npn_trunc.csv), test on df2 (cumida1_npn_trunc.csv).
cross_testing_accuracy(df3, df2, genes_rank, ths)

Cross-testing with threshold: 1
Accuracy with .score: 0.5333333333333333
Accuracy with .predict_proba: 0.5333333333333333
ROC AUC: 0.6356225296442688
Predictions: [0 1 1 0 0 1 1 0 1 1 1 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 1 1 1 1 1 0 0 1 0 1
 0 1 1 0 0 1 0 1 0 0 1 1 1 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 1 1 0 1 0 1 1 0 0
 1 0 0 0 1 1 1 0 0 1 0 1 1 0 0 0]


Cross-testing with threshold: 5
Accuracy with .score: 0.9444444444444444
Accuracy with .predict_proba: 0.9444444444444444
ROC AUC: 0.9500988142292491
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


Cross-testing with threshold: 10
Accuracy with .score: 0.9666666666666667
Accuracy with .predict_proba: 0.9666666666666667
ROC AUC: 0.9985177865612648
Predictions: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [12]:
# Train on df3 (cumida2_npn_trunc.csv), test on df4 (cumida3_npn_trunc.csv).
cross_testing_accuracy(df3, df4, genes_rank, ths)

Cross-testing with threshold: 1
Accuracy with .score: 0.75
Accuracy with .predict_proba: 0.75
ROC AUC: 0.859375
Predictions: [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1
 1 0 0 1 1 1 0 0 0 1 1]


Cross-testing with threshold: 5
Accuracy with .score: 0.9791666666666666
Accuracy with .predict_proba: 0.9791666666666666
ROC AUC: 0.9965277777777778
Predictions: [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1]


Cross-testing with threshold: 10
Accuracy with .score: 0.9375
Accuracy with .predict_proba: 0.9375
ROC AUC: 0.9895833333333333
Predictions: [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1]


Cross-testing with threshold: 15
Accuracy with .score: 0.9166666666666666
Accuracy with .predict_proba: 0.9166666666666666
ROC AUC: 0.9930555555555555
Predictions: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 1 1]


Cros

In [15]:
# Train on df4 (cumida3_npn_trunc.csv), test on df2 (cumida1_npn_trunc.csv).
cross_testing_accuracy(df4, df2, genes_rank, ths)

Cross-testing with threshold: 1
Accuracy with .score: 0.6555555555555556
Accuracy with .predict_proba: 0.6555555555555556
ROC AUC: 0.7294960474308301
Predictions: [0 1 1 0 0 0 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 0 0 0 1 1 1 1 1 0 1 0 1
 0 1 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0 1 0 0 0 1 0 1 0 1 1 0 1
 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0]


Cross-testing with threshold: 5
Accuracy with .score: 0.8555555555555555
Accuracy with .predict_proba: 0.8555555555555555
ROC AUC: 0.9194664031620553
Predictions: [1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 0 0 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]


Cross-testing with threshold: 10
Accuracy with .score: 0.9222222222222223
Accuracy with .predict_proba: 0.9222222222222223
ROC AUC: 0.9893774703557312
Predictions: [1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [16]:
# Train on df4 (cumida3_npn_trunc.csv), test on df3 (cumida2_npn_trunc.csv).
cross_testing_accuracy(df4, df3, genes_rank, ths)

Cross-testing with threshold: 1
Accuracy with .score: 0.7017543859649122
Accuracy with .predict_proba: 0.7017543859649122
ROC AUC: 0.7189039408866995
Predictions: [1 0 0 1 0 1 0 1 1 1 1 0 1 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1 0 0 1 1 1 1 1 1 1
 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 0 0 0 1 0 1 0 1 1 1 0 1 0 1 1 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0
 0 0 1]


Cross-testing with threshold: 5
Accuracy with .score: 0.8859649122807017
Accuracy with .predict_proba: 0.8859649122807017
ROC AUC: 0.9048645320197043
Predictions: [1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 0 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1
 0 0 0]


Cross-testing with threshold: 10
Accuracy with .score: 0.9122807017543859
Accuracy with .predict_proba: 0.9122807017543859
ROC AUC: 0.9659790640394088
Predictions: [1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1