In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn import metrics

def get_x(df):
    """Return the feature part of ``df``: every column not labelled 'class'.

    Only the column axis is filtered; rows and their index are kept as they are.
    """
    is_feature = df.columns != 'class'
    return df.loc[:, is_feature]
    
def get_y(df):
    """Return the values of the 'class' column of ``df`` as a one-dimensional array."""
    # Selecting with a list keeps a 2-D (rows x 1) block; taking column 0 flattens it.
    class_block = df[['class']].values
    return class_block[:, 0]

def get_probs_positive_class(pred):
    """Collect element 1 of every row of ``pred`` into a new list.

    In this notebook ``pred`` is the output of ``predict_proba``, so element 1
    is the second probability column of each prediction.
    """
    return [row[1] for row in pred]

In [3]:
def load_dataframes(train_path, test_path):
    """Read two CSV files into DataFrames, using each file's first column as the index.

    Returns:
        tuple: ``(train_df, test_df)``, in the same order as the arguments.
    """
    return tuple(
        pd.read_csv(csv_path, index_col=0)
        for csv_path in (train_path, test_path)
    )


def cross_testing_accuracy(train_df, test_df, genes_ranking_path, thresholds,
                           clf_factory=None):
    """Fit on one dataframe and evaluate on another, using the top-ranked genes.

    For every value in ``thresholds`` a fresh classifier is fitted on
    ``train_df`` restricted to the first ``threshold`` genes of the ranking
    (plus the 'class' column) and evaluated on the same columns of ``test_df``.
    The accuracy from ``.score``, the accuracy derived from ``predict_proba``,
    the ROC AUC and the predicted labels are printed. Nothing is returned.

    Args:
        train_df: DataFrame holding the ranked gene columns and a 'class' column.
        test_df: DataFrame holding the same gene columns and a 'class' column.
        genes_ranking_path: path of a CSV whose first column (read as the
            index) lists the gene names; assumed to be ordered best-first.
        thresholds: iterable of ints, each the number of leading genes to keep.
        clf_factory: optional zero-argument callable returning a new, unfitted
            classifier that provides ``fit``, ``score``, ``predict_proba`` and
            ``classes_``. When omitted, ``GBC()`` is used. Example:
            ``lambda: SVC(gamma='auto', probability=True)``.

    Raises:
        KeyError: if a selected gene or 'class' is missing from either frame.
    """
    gdf = pd.read_csv(genes_ranking_path, index_col=0)
    genes = list(gdf.index)

    for threshold in thresholds:
        print("Cross-testing with threshold:", threshold)
        # A new estimator per threshold so no fitted state carries over.
        clf = GBC() if clf_factory is None else clf_factory()

        # Both frames are cut down to exactly the same columns.
        selected_columns = genes[0:threshold] + ['class']
        tr_top = train_df.loc[:, selected_columns]
        ts_top = test_df.loc[:, selected_columns]

        X_train = get_x(tr_top)
        y_train = get_y(tr_top)

        clf.fit(X_train, y_train)

        X_test = get_x(ts_top)
        y_test = get_y(ts_top)

        acc_score = clf.score(X_test, y_test)

        pred = clf.predict_proba(X_test)
        # predict_proba columns are ordered like clf.classes_, so the argmax is
        # a column index and must be mapped back to the actual class label.
        y_pred = clf.classes_[np.argmax(pred, axis=1)]
        pred = get_probs_positive_class(pred)

        roc_auc = metrics.roc_auc_score(np.array(y_test, dtype=int), pred)
        pred_accuracy = metrics.accuracy_score(y_test, y_pred)

        print("Accuracy with .score:", acc_score)
        print("Accuracy with .predict_proba:", pred_accuracy)
        print("ROC AUC:", roc_auc)
        print("Predictions:", y_pred)
        print("\n")

In [8]:
# Numbers of leading genes from the ranking to evaluate, one run per value.
ths = [1, 5, 10, 15, 20, 50, 100]

# Base directories. Every path below is derived from these, so pointing the
# notebook at another machine or data location only requires editing them.
EXPERIMENTS_DIR = "/home/colombelli/Documents/experiments"
DATASETS_DIR = "/home/colombelli/Documents/datasets"
CUMIDA_DIR = DATASETS_DIR + "/cumida/intersect_genes"

# Gene ranking CSV and the training dataset.
genes_rank = EXPERIMENTS_DIR + "/Experiments20_ago/hyb_borda_borda/selection/agg_ranking_th1.csv"
train_df_path = DATASETS_DIR + "/brca_rnaseq_genes_intersec.csv"

# Datasets used only for testing.
test_df1_path = CUMIDA_DIR + "/GSE42568.csv"
test_df2_path = CUMIDA_DIR + "/GSE22820.csv"
test_df3_path = CUMIDA_DIR + "/GSE70947.csv"

In [9]:
# Training dataframe plus the first test dataframe.
tr_df, ts_df = load_dataframes(train_df_path, test_df1_path)

# The two remaining test dataframes, read the same way (first column as index).
ts_df2, ts_df3 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (test_df2_path, test_df3_path)
)

In [10]:
# Rebuild the training set from every class-0 row plus a random subset of
# class-1 rows, then shuffle the result.
SEED = 42         # fixes both shuffles so a fresh run selects the same rows
N_POSITIVE = 113  # class-1 rows kept; presumably the class-0 count -- TODO confirm

shuffled = tr_df.sample(frac=1, random_state=SEED)
negatives = shuffled.loc[shuffled['class'] == 0]
# Taking the head of an already shuffled frame is a random draw of class-1 rows.
positives = shuffled.loc[shuffled['class'] == 1].head(N_POSITIVE)

tr_df = pd.concat([negatives, positives]).sample(frac=1, random_state=SEED)
tr_df

Unnamed: 0,GLIS1,PXMP2,ATP5SL,RHBDF1,ARL11,ZFP37,BTN3A2,LGI3,TYRO3,SDF4,...,SLC4A4,FOXC2,TNFSF11,SPRED2,LMNA,ACAD8,EPHA10,OPLAH,C1S,class
TCGA-E9-A1NA-11A-33R-A144-07,0.909886,-0.596612,-0.680262,-0.873044,-0.559948,0.599084,-0.894389,0.639151,2.123649,0.294095,...,1.281010,0.631563,-0.831493,-0.557531,1.001034,0.444903,-0.569650,-0.018641,1.471254,0
TCGA-E9-A1N3-01A-12R-A157-07,-0.951231,-0.741448,-0.101655,-0.467864,0.208595,-1.974860,-0.651880,0.796979,-0.858036,0.512187,...,2.342268,-1.359574,-0.719864,-0.555116,-1.349297,1.659026,2.035536,-0.703892,-1.418824,1
TCGA-EW-A1OW-01A-21R-A144-07,0.255393,0.424434,-1.859871,-0.257533,0.200137,0.419910,1.582945,-1.302306,0.931952,-1.001034,...,-0.375113,1.597439,0.352997,-0.794145,1.105055,0.390703,-1.430141,-1.151750,1.299911,1
TCGA-BH-A0B3-11B-21R-A089-07,0.114153,-0.168543,-1.035650,-0.342003,-0.270396,2.035536,1.196855,0.672468,0.791318,-0.703892,...,1.235638,0.941548,-1.042722,-0.291934,-0.576962,0.870027,-0.080858,-0.846166,0.891315,0
TCGA-EW-A1P4-01A-21R-A144-07,-0.545491,0.900562,-0.746898,-1.683815,-0.062171,-0.507476,2.374835,0.500430,0.961002,1.258003,...,0.545491,1.575820,0.746898,-1.514990,0.352997,-0.225555,-0.802665,-1.453336,2.592532,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-BH-A2L8-01A-11R-A18M-07,1.304709,-0.097493,0.143388,-1.477332,0.097493,0.664715,1.597439,0.451766,-0.876069,0.141296,...,0.581854,0.240447,0.994253,-0.261816,-0.080858,0.903661,0.024855,-0.268249,0.521644,1
TCGA-S3-AA17-01A-11R-A41B-07,-0.867018,-0.041433,0.709196,1.078885,0.840276,-0.101655,1.231238,-1.302306,-0.068398,-0.076704,...,-0.481759,-0.335427,0.221309,-0.093332,0.276844,-0.064247,1.837363,0.766147,0.018641,1
TCGA-A7-A0CH-11A-32R-A089-07,0.206479,0.089172,0.328865,-0.524015,-0.331050,0.095412,0.763380,1.495899,1.974860,1.139845,...,0.897471,0.634088,-0.521644,-0.375113,1.128098,0.780070,-0.757863,0.221309,1.528029,0
TCGA-E9-A1R7-11A-42R-A14M-07,-0.322316,-1.231238,-1.534645,-1.794927,-0.147574,1.281010,-0.183264,0.799819,2.068891,-0.834414,...,1.650983,0.574521,-0.512187,-0.189585,-0.130846,0.618994,-1.521478,-1.049846,2.051931,0


In [11]:
# Evaluate the classifier trained on tr_df against each test dataframe in turn.
_runs = (
    ("Using GSE42568 as testing dataframe...\n", ts_df),
    ("Using GSE22820 as testing dataframe...\n", ts_df2),
    ("Using GSE70947 as testing dataframe...\n", ts_df3),
)
for _banner, _test_frame in _runs:
    print(_banner)
    cross_testing_accuracy(tr_df, _test_frame, genes_rank, ths)

Using GSE42568 as testing dataframe...

Cross-testing with threshold: 1
Accuracy with .score: 0.7931034482758621
Accuracy with .predict_proba: 0.7931034482758621
ROC AUC: 0.8323432343234324
Predictions: [0 0 1 0 1 0 1 1 0 1 1 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 0 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1
 1 1 1 1 1]


Cross-testing with threshold: 5
Accuracy with .score: 0.8362068965517241
Accuracy with .predict_proba: 0.8362068965517241
ROC AUC: 0.7844884488448844
Predictions: [1 0 1 0 1 0 1 1 0 1 1 0 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1
 1 1 1 1 1]


Cross-testing with threshold: 10
Accuracy with .score: 0.8448275862068966
Accuracy with .predict_proba: 0.8448275862068966
ROC AUC: 0.8739273927392739
Predi

Accuracy with .score: 0.5882352941176471
Accuracy with .predict_proba: 0.5882352941176471
ROC AUC: 0.5962017434620175
Predictions: [0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 0 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1
 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 0 0 1 1 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1
 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 0 1 1 0 0 1 1 1 1 0 1 1 1 1 1 0 1 0 1 0 1 1 1 0 0 1 0 0 1 1 1 1 0 1
 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


Cross-testing with threshold: 20
Accuracy with .score: 0.5951557093425606
Accuracy with .predict_proba: 0.5951557093425606
ROC AUC: 0.598620557524667
Predictions: [0 1 1 1 1 1 1 0 0 0 1 1 0 1 0 0 1 1 0 1 0 1 1 1 1 1 1 1 0 0 1 0 1 1 0 1 1
 0 0 0 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1