In [35]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn import metrics

def get_x(df):
    """Return the feature part of ``df``: every column whose name is not 'class'."""
    is_feature = df.columns != 'class'
    return df.loc[:, is_feature]
    
def get_y(df):
    """Return the values of the 'class' column of ``df`` as a one-dimensional array."""
    label_frame = df.loc[:, ['class']]
    # Transposing the single-column frame gives one row holding every label.
    label_rows = label_frame.T.values
    return label_rows[0]

def get_probs_positive_class(pred):
    """Collect the element at index 1 of every row in ``pred`` into a list.

    ``pred`` is iterated row by row; the caller in this notebook passes the
    output of ``predict_proba`` and treats index 1 as the positive class.
    """
    return [row[1] for row in pred]

In [37]:
def load_dataframes(train_path, test_path):
    """Read two CSV files and return them as a ``(train_df, test_df)`` tuple.

    Both files are read with their first column as the index; ``train_path``
    is read before ``test_path``.
    """
    return tuple(
        pd.read_csv(csv_path, index_col=0)
        for csv_path in (train_path, test_path)
    )


def cross_testing_accuracy(train_df, test_df, genes_ranking_path, thresholds,
                           random_state=None):
    """Fit on ``train_df`` and evaluate on ``test_df`` for several gene counts.

    For every value in ``thresholds`` a new gradient boosting classifier is
    fitted on the first ``threshold`` genes of the ranking and evaluated on the
    same columns of ``test_df``. Three metrics are printed per threshold:
    accuracy from ``clf.score``, accuracy derived from ``predict_proba``, and
    ROC AUC. Nothing is returned.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Must contain a 'class' column and a column for every selected gene.
    genes_ranking_path : str
        CSV file whose first column (read as the index) holds the gene names.
        The first ``threshold`` index entries are selected, so the file is
        presumably ordered best-first (confirm against the ranking producer).
    thresholds : iterable of int
        Numbers of leading genes to evaluate.
    random_state : int or None, optional
        Forwarded to the classifier. The default ``None`` keeps the original
        unseeded behaviour; pass an int for reproducible results.

    Raises
    ------
    KeyError
        Propagated from ``.loc`` if a selected gene or 'class' is missing from
        either dataframe.
    """
    gdf = pd.read_csv(genes_ranking_path, index_col=0)
    genes = list(gdf.index)

    for threshold in thresholds:
        print("Cross-testing with threshold:", threshold)
        # Alternative classifier (disabled): clf = SVC(gamma='auto', probability=True)
        clf = GBC(random_state=random_state)

        # Same column selection for both dataframes, built once per threshold.
        selected_columns = genes[0:threshold] + ['class']
        tr_top = train_df.loc[:, selected_columns]
        ts_top = test_df.loc[:, selected_columns]

        X_train = get_x(tr_top)
        y_train = get_y(tr_top)

        clf.fit(X_train, y_train)

        X_test = get_x(ts_top)
        y_test = get_y(ts_top)

        acc_score = clf.score(X_test, y_test)

        pred = clf.predict_proba(X_test)
        # predict_proba columns follow clf.classes_, so the argmax is a column
        # index; translate it back to the actual class label before comparing
        # with y_test (a bare argmax is only right for labels 0..k-1).
        y_pred = clf.classes_[np.argmax(pred, axis=1)]
        # Column 1 is used as the positive-class score for ROC AUC.
        pred = get_probs_positive_class(pred)

        roc_auc = metrics.roc_auc_score(np.array(y_test, dtype=int), pred)
        pred_accuracy = metrics.accuracy_score(y_test, y_pred)

        print("Accuracy with .score:", acc_score)
        print("Accuracy with .predict_proba:", pred_accuracy)
        print("ROC AUC:", roc_auc)
        print("\n")

In [15]:
# Absolute paths to the aggregated gene ranking CSV and the training dataset CSV.
# NOTE(review): these are hard-coded to one user's home directory, so the
# notebook only runs where that layout exists; consider a configurable base dir.
genes_rank = "/home/colombelli/Documents/Experiments05_ago/intersect/hyb_borda_borda/selection/agg_ranking_th1.csv"
train_df_path = "/home/colombelli/Documents/datasets/brca_rnaseq_genes_intersec.csv"

# Paths to the three test CSVs (GSE42568, GSE22820, GSE70947) that are loaded
# below and passed to cross_testing_accuracy as test dataframes.
test_df1_path = "/home/colombelli/Documents/datasets/cumida/intersect_genes/GSE42568.csv"
test_df2_path = "/home/colombelli/Documents/datasets/cumida/intersect_genes/GSE22820.csv"
test_df3_path = "/home/colombelli/Documents/datasets/cumida/intersect_genes/GSE70947.csv"

In [12]:
# Load the training dataframe and the first test dataframe (GSE42568 file).
tr_df, ts_df = load_dataframes(train_df_path, test_df1_path)

In [16]:
# Load the other two test dataframes. load_dataframes reads both of its
# arguments the same way, so its train/test parameter names do not matter here.
ts_df2, ts_df3 = load_dataframes(test_df2_path, test_df3_path)

In [31]:
# Numbers of leading genes from the ranking to evaluate; passed as `thresholds`
# to cross_testing_accuracy.
ths = [1, 5, 10, 15, 20, 50, 100]

In [36]:
# Evaluate the classifier trained on tr_df against each test dataframe in turn.
# The banner strings and the call order match the previous copy-pasted calls.
for banner, test_frame in (
    ("Using GSE42568 as testing dataframe...\n", ts_df),
    ("Using GSE22820 as testing dataframe...\n", ts_df2),
    ("Using GSE70947 as testing dataframe...\n", ts_df3),
):
    print(banner)
    cross_testing_accuracy(tr_df, test_frame, genes_rank, ths)

Using GSE42568 as testing dataframe...

Cross-testing with threshold: 1
Accuracy with .score: 0.8448275862068966
Accuracy with .predict_proba: 0.853448275862069
ROC AUC: 0.5438943894389439


Cross-testing with threshold: 5
Accuracy with .score: 0.8879310344827587
Accuracy with .predict_proba: 0.8879310344827587
ROC AUC: 0.9544554455445545


Cross-testing with threshold: 10
Accuracy with .score: 0.896551724137931
Accuracy with .predict_proba: 0.8879310344827587
ROC AUC: 0.9036303630363036


Cross-testing with threshold: 15
Accuracy with .score: 0.8879310344827587
Accuracy with .predict_proba: 0.8793103448275862
ROC AUC: 0.9841584158415841


Cross-testing with threshold: 20
Accuracy with .score: 0.896551724137931
Accuracy with .predict_proba: 0.8879310344827587
ROC AUC: 0.9828382838283828


Cross-testing with threshold: 50
Accuracy with .score: 0.8879310344827587
Accuracy with .predict_proba: 0.8706896551724138
ROC AUC: 1.0


Cross-testing with threshold: 100
Accuracy with .score: 0.8879

In [38]:
# Evaluate the classifier trained on tr_df against each test dataframe in turn.
# The banner strings and the call order match the previous copy-pasted calls.
for banner, test_frame in (
    ("Using GSE42568 as testing dataframe...\n", ts_df),
    ("Using GSE22820 as testing dataframe...\n", ts_df2),
    ("Using GSE70947 as testing dataframe...\n", ts_df3),
):
    print(banner)
    cross_testing_accuracy(tr_df, test_frame, genes_rank, ths)

Using GSE42568 as testing dataframe...

Cross-testing with threshold: 1
Accuracy with .score: 0.853448275862069
Accuracy with .predict_proba: 0.853448275862069
ROC AUC: 0.8554455445544555


Cross-testing with threshold: 5
Accuracy with .score: 0.9051724137931034
Accuracy with .predict_proba: 0.9051724137931034
ROC AUC: 0.961056105610561


Cross-testing with threshold: 10
Accuracy with .score: 0.8706896551724138
Accuracy with .predict_proba: 0.8706896551724138
ROC AUC: 0.9590759075907591


Cross-testing with threshold: 15
Accuracy with .score: 0.9137931034482759
Accuracy with .predict_proba: 0.9137931034482759
ROC AUC: 0.9696369636963696


Cross-testing with threshold: 20
Accuracy with .score: 0.9396551724137931
Accuracy with .predict_proba: 0.9396551724137931
ROC AUC: 0.9590759075907591


Cross-testing with threshold: 50
Accuracy with .score: 0.9396551724137931
Accuracy with .predict_proba: 0.9396551724137931
ROC AUC: 0.9405940594059407


Cross-testing with threshold: 100
Accuracy with