In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn import metrics

def get_x(df):
    """Return the feature part of `df`: every column whose name is not 'class'."""
    # Boolean mask over the column labels; True marks a feature column.
    is_feature = df.columns != 'class'
    return df.loc[:, is_feature]
    
def get_y(df):
    """Return the values of the 'class' column of `df` as a 1-D array."""
    class_frame = df.loc[:, ['class']]
    # The single-column frame yields an (n, 1) array; transpose it and take
    # the only row to obtain the flat label vector.
    return class_frame.values.T[0]

def get_probs_positive_class(pred):
    """Collect the second entry (index 1) of every row in `pred` into a list.

    `pred` is iterated row by row; in this file it is the output of
    ``predict_proba``, so index 1 is the score of the second class.
    """
    return [row[1] for row in pred]
    

def load_dataframes(train_path, test_path):
    """Read the training and testing CSV files into DataFrames.

    Both files are parsed with their first column as the index.
    Returns a ``(train_df, test_df)`` tuple.
    """
    return tuple(
        pd.read_csv(csv_path, index_col=0)
        for csv_path in (train_path, test_path)
    )


def cross_testing_accuracy(train_df, test_df, genes_ranking_path, thresholds):
    """Train on one dataset and evaluate on another using the top-ranked genes.

    For every value in `thresholds`, a fresh GradientBoostingClassifier is
    fitted on `train_df` restricted to the first `threshold` genes of the
    ranking, then evaluated on the same columns of `test_df`. Accuracy
    (computed two ways) and ROC AUC are printed; nothing is returned.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with one column per gene plus a 'class' label column. The
        selected genes are expected to be columns of both frames.
    genes_ranking_path : str
        CSV file whose first column (read as the index) lists the genes.
        The row order is used as the ranking (assumed best-first -- TODO
        confirm against the file that produces it).
    thresholds : iterable of int
        Numbers of top genes to evaluate, one model per value.
    """
    gdf = pd.read_csv(genes_ranking_path, index_col=0)
    genes = list(gdf.index)

    for threshold in thresholds:
        print("Cross-testing with threshold:", threshold)
        # Alternative classifier: SVC(gamma='auto', probability=True)
        clf = GBC()

        # Same column selection for both frames, built once per threshold.
        selected_columns = genes[0:threshold] + ['class']
        tr_top = train_df.loc[:, selected_columns]
        ts_top = test_df.loc[:, selected_columns]

        X_train = get_x(tr_top)
        y_train = get_y(tr_top)

        clf.fit(X_train, y_train)

        X_test = get_x(ts_top)
        y_test = get_y(ts_top)

        acc_score = clf.score(X_test, y_test)

        pred = clf.predict_proba(X_test)
        # The columns of predict_proba follow clf.classes_, so the arg-max
        # column index is mapped back to the real label. Using the raw index
        # is only correct when the labels happen to be exactly 0 and 1.
        y_pred = clf.classes_[np.argmax(pred, axis=1)]
        # Column 1 (the score of clf.classes_[1]) is used as the ROC score.
        pred = get_probs_positive_class(pred)

        roc_auc = metrics.roc_auc_score(np.array(y_test, dtype=int), pred)
        pred_accuracy = metrics.accuracy_score(y_test, y_pred)

        print("Accuracy with .score:", acc_score)
        print("Accuracy with .predict_proba:", pred_accuracy)
        print("ROC AUC:", roc_auc)
        print("\n")

In [2]:
# Numbers of top-ranked genes evaluated in each cross-testing run.
ths = [1, 5, 10, 15, 20, 50, 100]

# Base directories: edit these two lines to run the notebook on another machine.
EXPERIMENTS_DIR = "/home/colombelli/Documents/experiments/Experiments22_ago"
DATASETS_DIR = "/home/colombelli/Documents/datasets"

# Gene ranking used for feature selection, and the training dataset.
genes_rank = EXPERIMENTS_DIR + "/prad/hyb_borda_borda/selection/agg_ranking_th1.csv"
train_df_path = DATASETS_DIR + "/assembler/prad/intersect.csv"

# The two datasets used only for testing.
test_df1_path = DATASETS_DIR + "/cumida/prostate/GSE11682/intersect.csv"
test_df2_path = DATASETS_DIR + "/cumida/prostate/GSE46602/intersect.csv"

In [3]:
# Training frame and first test frame; the first CSV column becomes the index.
tr_df, ts_df = load_dataframes(train_df_path, test_df1_path)
# Read the second test set directly with the same index setting. Passing the
# same path twice to load_dataframes would parse the file twice and discard
# one copy.
ts_df2 = pd.read_csv(test_df2_path, index_col=0)

In [4]:
# Evaluate the model trained on tr_df against each test frame in turn.
for banner, test_frame in (
    ("Using GSE11682 as testing dataframe...\n", ts_df),
    ("Using GSE46602 as testing dataframe...\n", ts_df2),
):
    print(banner)
    cross_testing_accuracy(tr_df, test_frame, genes_rank, ths)

Using GSE11682 as testing dataframe...

Cross-testing with threshold: 1
Accuracy with .score: 0.5161290322580645
Accuracy with .predict_proba: 0.5161290322580645
ROC AUC: 0.49583333333333335


Cross-testing with threshold: 5
Accuracy with .score: 0.5161290322580645
Accuracy with .predict_proba: 0.5161290322580645
ROC AUC: 0.4


Cross-testing with threshold: 10
Accuracy with .score: 0.5161290322580645
Accuracy with .predict_proba: 0.5161290322580645
ROC AUC: 0.4979166666666667


Cross-testing with threshold: 15
Accuracy with .score: 0.5161290322580645
Accuracy with .predict_proba: 0.5161290322580645
ROC AUC: 0.49375


Cross-testing with threshold: 20
Accuracy with .score: 0.5161290322580645
Accuracy with .predict_proba: 0.5161290322580645
ROC AUC: 0.5979166666666667


Cross-testing with threshold: 50
Accuracy with .score: 0.5161290322580645
Accuracy with .predict_proba: 0.5161290322580645
ROC AUC: 0.4666666666666667


Cross-testing with threshold: 100
Accuracy with .score: 0.51612903225

In [7]:
# Point genes_rank at the ranking file under the hom_gr directory. This rebinds
# the name assigned in an earlier cell, so later calls that pass genes_rank
# use this ranking instead.
genes_rank = "/home/colombelli/Documents/experiments/Experiments22_ago/prad/hom_gr/selection/agg_ranking_th1.csv"

In [8]:
# Repeat the cross-testing on both test frames with the current genes_rank.
for banner, test_frame in (
    ("Using GSE11682 as testing dataframe...\n", ts_df),
    ("Using GSE46602 as testing dataframe...\n", ts_df2),
):
    print(banner)
    cross_testing_accuracy(tr_df, test_frame, genes_rank, ths)

Using GSE11682 as testing dataframe...

Cross-testing with threshold: 1
Accuracy with .score: 0.5161290322580645
Accuracy with .predict_proba: 0.5161290322580645
ROC AUC: 0.5


Cross-testing with threshold: 5
Accuracy with .score: 0.5161290322580645
Accuracy with .predict_proba: 0.5161290322580645
ROC AUC: 0.5041666666666667


Cross-testing with threshold: 10
Accuracy with .score: 0.5161290322580645
Accuracy with .predict_proba: 0.5161290322580645
ROC AUC: 0.63125


Cross-testing with threshold: 15
Accuracy with .score: 0.5161290322580645
Accuracy with .predict_proba: 0.5161290322580645
ROC AUC: 0.6166666666666667


Cross-testing with threshold: 20
Accuracy with .score: 0.5161290322580645
Accuracy with .predict_proba: 0.5161290322580645
ROC AUC: 0.48125


Cross-testing with threshold: 50
Accuracy with .score: 0.5161290322580645
Accuracy with .predict_proba: 0.5161290322580645
ROC AUC: 0.5125


Cross-testing with threshold: 100
Accuracy with .score: 0.5161290322580645
Accuracy with .pre

In [9]:
# Same evaluation with much larger gene counts, applied to both test frames.
large_thresholds = [1000, 5000, 10000]
for banner, test_frame in (
    ("Using GSE11682 as testing dataframe...\n", ts_df),
    ("Using GSE46602 as testing dataframe...\n", ts_df2),
):
    print(banner)
    cross_testing_accuracy(tr_df, test_frame, genes_rank, large_thresholds)

Using GSE11682 as testing dataframe...

Cross-testing with threshold: 1000
Accuracy with .score: 0.5161290322580645
Accuracy with .predict_proba: 0.5161290322580645
ROC AUC: 0.46875


Cross-testing with threshold: 5000
Accuracy with .score: 0.5161290322580645
Accuracy with .predict_proba: 0.5161290322580645
ROC AUC: 0.4375


Cross-testing with threshold: 10000
Accuracy with .score: 0.5161290322580645
Accuracy with .predict_proba: 0.5161290322580645
ROC AUC: 0.5625


Using GSE46602 as testing dataframe...

Cross-testing with threshold: 1000
Accuracy with .score: 0.7142857142857143
Accuracy with .predict_proba: 0.7142857142857143
ROC AUC: 0.5581632653061225


Cross-testing with threshold: 5000
Accuracy with .score: 0.7142857142857143
Accuracy with .predict_proba: 0.7142857142857143
ROC AUC: 0.5714285714285714


Cross-testing with threshold: 10000
Accuracy with .score: 0.7142857142857143
Accuracy with .predict_proba: 0.7142857142857143
ROC AUC: 0.6428571428571428


