In [9]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn import metrics

def get_x(df):
    """Return the feature part of ``df``: every column whose name is not 'class'."""
    # Boolean mask over the column index; True marks a feature column.
    is_feature = df.columns != 'class'
    return df.loc[:, is_feature]
    
def get_y(df):
    """Return the values of the 'class' column of ``df`` as a 1-D numpy array."""
    label_frame = df.loc[:, ['class']]
    # Transposing turns the single label column into a single row, so the
    # first row of the underlying array holds one label per sample.
    label_rows = label_frame.T.values
    return label_rows[0]

def get_probs_positive_class(pred):
    """Collect element 1 of every entry in ``pred`` into a list.

    ``pred`` is iterated row by row; for a two-column probability matrix this
    yields the second column (the caller treats it as the positive class).
    """
    return [row[1] for row in pred]

In [10]:
def load_dataframes(train_path, test_path):
    """Read two CSV files and return them as a ``(train_df, test_df)`` tuple.

    Both files are parsed with their first column as the DataFrame index.
    """
    return tuple(
        pd.read_csv(csv_path, index_col=0)
        for csv_path in (train_path, test_path)
    )


def cross_testing_accuracy(train_df, test_df, genes_ranking_path, threshold):
    """Train an SVC on one dataset and evaluate it on another, using top-ranked genes.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Datasets with one column per gene plus a 'class' label column. Both
        must contain every selected gene.
    genes_ranking_path : str
        CSV file whose index (first column) lists the gene names; presumably
        ordered best-first -- confirm against the ranking producer.
    threshold : int
        Number of genes, taken from the start of the ranking, used as features.

    Prints the accuracy reported by ``clf.score``, the accuracy derived from
    ``predict_proba`` and the ROC AUC. Returns None.

    Note: the ROC AUC uses column 1 of ``predict_proba`` as the positive-class
    score, so the labels are expected to be binary.
    """
    gdf = pd.read_csv(genes_ranking_path, index_col=0)
    selected_columns = list(gdf.index)[:threshold] + ['class']

    tr_top = train_df.loc[:, selected_columns]
    ts_top = test_df.loc[:, selected_columns]

    X_train, y_train = get_x(tr_top), get_y(tr_top)
    X_test, y_test = get_x(ts_top), get_y(ts_top)

    clf = SVC(gamma='auto', probability=True)
    clf.fit(X_train, y_train)

    acc_score = clf.score(X_test, y_test)

    pred = clf.predict_proba(X_test)
    # Columns of predict_proba follow clf.classes_, so map the winning column
    # index back to the actual label instead of assuming the labels are 0/1.
    y_pred = clf.classes_[np.argmax(pred, axis=1)]
    # Module-level helper: there is no `self` in this function.
    positive_probs = get_probs_positive_class(pred)

    # Both metrics compare against the true labels (y_test), not the features.
    roc_auc = metrics.roc_auc_score(y_test, positive_probs)
    pred_accuracy = metrics.accuracy_score(y_test, y_pred)

    # The two accuracies can differ: SVC's probability estimates come from a
    # separate calibration step and may disagree with clf.predict / clf.score.
    print("Accuracy with .score:", acc_score)
    print("Accuracy with .predict_proba:", pred_accuracy)
    print("ROC AUC:", roc_auc)

In [15]:
# Input locations. These are absolute paths under one user's home directory,
# so they must be adapted before running the notebook on another machine.

# Aggregated gene ranking CSV (presumably the `genes_ranking_path` argument of
# cross_testing_accuracy -- no call is visible in this part of the notebook).
genes_rank = "/home/colombelli/Documents/Experiments05_ago/intersect/hyb_borda_borda/selection/agg_ranking_th1.csv"
# Dataset loaded as the training set.
train_df_path = "/home/colombelli/Documents/datasets/brca_rnaseq_genes_intersec.csv"

# Three CuMiDa datasets (GSE42568, GSE22820, GSE70947) loaded as test sets.
test_df1_path = "/home/colombelli/Documents/datasets/cumida/intersect_genes/GSE42568.csv"
test_df2_path = "/home/colombelli/Documents/datasets/cumida/intersect_genes/GSE22820.csv"
test_df3_path = "/home/colombelli/Documents/datasets/cumida/intersect_genes/GSE70947.csv"

In [12]:
# Load the training set and the first test set (first CSV column becomes the index).
# NOTE(review): this cell's execution count (12) is lower than that of the cell
# defining the paths (15); re-run top to bottom on a fresh kernel to confirm
# the notebook does not depend on stale state.
tr_df, ts_df = load_dataframes(train_df_path, test_df1_path)

In [16]:
# load_dataframes just reads two CSVs, so it is reused here to load the two
# remaining test sets; its (train, test) naming carries no meaning in this call.
ts_df2, ts_df3 = load_dataframes(test_df2_path, test_df3_path)