### Imports and definition of a function used to calculate AUC score

In [1]:
import numpy as np
import pandas as pd

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

import pickle
import sys

from sklearn import metrics
from sklearn.svm import SVC

def get_probs_positive_class(pred):
    """Return the second entry of every row of ``pred`` as a list.

    Args:
        pred: An iterable of indexable rows. In this notebook it is the
            output of ``predict_proba``, so entry 1 of each row is
            presumably the probability of the positive class (assumes a
            binary problem whose positive label is the second class —
            confirm against ``clf.classes_``).

    Returns:
        A list holding ``row[1]`` for each row, in the input order.
    """
    return [row[1] for row in pred]

### Loading the full dataset (without the IQR pre-processing step) and the pickled fold sampling

In [2]:
# Read the .rds dataset by calling R's readRDS through rpy2.
# NOTE(review): absolute, machine-specific path — this cell only runs on a
# machine where that exact file exists.
dataset_path = "/home/colombelli/Documents/research_proc_ds/BRCA/processedBRCA.rds"
read_RDS = robjects.r['readRDS']
df = read_RDS(dataset_path)

# Convert the R object to its Python counterpart using the default converter
# extended with the pandas one (presumably yields a pandas DataFrame — confirm).
with localconverter(robjects.default_converter + pandas2ri.converter):
    pydf = robjects.conversion.rpy2py(df)

    
# Load the pickled fold sampling.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# use it on files from a trusted source. The path is also machine-specific.
fold_path = "/home/colombelli/Documents/brca_new_sampling/fold_sampling.pkl"
with open(fold_path, 'rb') as file:
    folds_sampling = pickle.load(file)

### Selecting fold2 (an arbitrary choice) to evaluate its predictive performance

In [3]:
# Index 1 is the second entry of folds_sampling, hence the name "fold2".
fold2 = folds_sampling[1]

In [4]:
# Label-based row selection on the full dataset: fold2[0] picks the training
# rows and fold2[1] the testing rows.
# (presumably both hold sample identifiers present in pydf's index — confirm)
training = pydf.loc[fold2[0]]
testing = pydf.loc[fold2[1]]

### Loading the ranking obtained without the IQR pre-processing step (also no OneR in the Hyb Ens)

In [58]:
# Aggregated ranking file, read with R's readRDS through rpy2.
# NOTE(review): the path points to a "fold_1" directory while the data split
# uses folds_sampling[1]; confirm both refer to the same fold (a ranking built
# on a different fold's training data would not match this train/test split).
# NOTE(review): absolute, machine-specific path.
ranking_output_path = "/home/colombelli/Desktop/BRCA_no_IQR/fold_1/agg_ranking.rds"

read_RDS = robjects.r['readRDS']
ranking = read_RDS(ranking_output_path)

# Convert the R object to its Python counterpart; later cells read the gene
# names from pyrnk.index (presumably a pandas DataFrame — confirm).
with localconverter(robjects.default_converter + pandas2ri.converter):
    pyrnk = robjects.conversion.rpy2py(ranking)

## ----------------------------------------------------------------------------------------

# First Experiment (10 runs): 5 random genes

In [59]:
import random

In [73]:
# Train and evaluate an SVM on 5 randomly sampled genes, 10 times over.

# The class labels do not depend on the sampled genes, so extract them once
# instead of on every iteration. They stay at cell scope, as before.
training_y = training.loc[:, ['class']].T.values[0]
testing_y = testing.loc[:, ['class']].T.values[0]

# Pool of gene names to sample from: the index of the loaded ranking.
candidate_genes = list(pyrnk.index.values)

# range(1, 11) gives the 10 runs announced in the section title;
# the previous range(1, 10) stopped after 9.
for i in range(1, 11):
    print("Iteration:", i)

    # NOTE(review): `random` is not seeded, so the sampled genes (and the
    # scores) change on every execution of this cell.
    genes = random.sample(candidate_genes, 5)
    print("Sampled Genes:", genes)

    training_x = training.loc[:, genes]
    testing_x = testing.loc[:, genes]

    # probability=True is required for predict_proba below.
    clf = SVC(gamma='auto', probability=True)
    clf.fit(training_x, training_y)

    print("Accuracy:")
    print(clf.score(testing_x, testing_y))
    print("ROC curve's AUC score:")
    pred = clf.predict_proba(testing_x)
    pred = get_probs_positive_class(pred)
    print(metrics.roc_auc_score(np.array(testing_y, dtype=int), pred))
    print("\n\n")

Iteration: 1
Sampled Genes: ['SOX13', 'OR2T27', 'OR5AR1', 'NKIRAS1', 'KIAA0664L3']
Accuracy:
0.7004048582995951
ROC curve's AUC score:
0.8732531055900621



Iteration: 2
Sampled Genes: ['BEX5', 'LOC728190', 'TTTY18', 'FAM183A', 'OR4F5']
Accuracy:
0.7530364372469636
ROC curve's AUC score:
0.8992624223602484



Iteration: 3
Sampled Genes: ['ROBO1', 'FAM82A1', 'UBR7', 'SCRIB', 'OR5T2']
Accuracy:
0.8785425101214575
ROC curve's AUC score:
0.9431288819875777



Iteration: 4
Sampled Genes: ['CXorf40B', 'RPL35A', 'TRPV4', 'MBL1P', 'MIR1276']
Accuracy:
0.7894736842105263
ROC curve's AUC score:
0.9138198757763976



Iteration: 5
Sampled Genes: ['PLGLB1', 'STAU2', 'TBCB', 'PRAMEF21', 'PHPT1']
Accuracy:
0.8582995951417004
ROC curve's AUC score:
0.9196428571428572



Iteration: 6
Sampled Genes: ['MCF2', 'RTN2', 'WSB2', 'COG5', 'HSD11B1']
Accuracy:
0.7651821862348178
ROC curve's AUC score:
0.9281832298136645



Iteration: 7
Sampled Genes: ['SERPINA1', 'ANKFY1', 'MRPL49', 'QKI', 'PSMB6']
Accuracy:
0.

# Second Experiment: Select another seed

This was already done: the fold sampling binary loaded above was generated with another seed.

# Third Experiment: classification with Random Forest algorithm

In [70]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Random Forest on the first five genes of the ranking
# (presumably the ranking index is ordered best-first — confirm).
best_genes = list(pyrnk.index.values)[0:5]
print("Selected Genes:", best_genes)

training_x = training.loc[:, best_genes]
testing_x = testing.loc[:, best_genes]

# Extract the labels here rather than relying on the values a previously
# executed loop cell happened to leave in the kernel.
training_y = training.loc[:, ['class']].T.values[0]
testing_y = testing.loc[:, ['class']].T.values[0]

# random_state=0 makes the forest reproducible across executions.
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(training_x, training_y)

print("Accuracy:", clf.score(testing_x, testing_y))
pred = clf.predict_proba(testing_x)
pred = get_probs_positive_class(pred)
print("AUC score:", metrics.roc_auc_score(np.array(testing_y, dtype=int), pred))

Selected Genes: ['MIR4508', 'FIGF', 'IBSP', 'CA4', 'LRRC3B']
Accuracy: 0.9919028340080972
AUC score: 1.0


### 3.1 Random Forest + Random Genes (10 runs)

In [81]:
# Train and evaluate a Random Forest on 5 randomly sampled genes, 10 times over.

# The class labels do not depend on the sampled genes, so extract them once
# instead of on every iteration. They stay at cell scope, as before.
training_y = training.loc[:, ['class']].T.values[0]
testing_y = testing.loc[:, ['class']].T.values[0]

# Pool of gene names to sample from: the index of the loaded ranking.
candidate_genes = list(pyrnk.index.values)

# range(1, 11) gives the 10 runs announced in the section title;
# the previous range(1, 10) stopped after 9.
for i in range(1, 11):
    print("Iteration:", i)

    # NOTE(review): neither `random` nor the classifier is seeded here, so
    # results change on every execution of this cell.
    genes = random.sample(candidate_genes, 5)
    print("Sampled Genes:", genes)

    training_x = training.loc[:, genes]
    testing_x = testing.loc[:, genes]

    clf = RandomForestClassifier(max_depth=2)
    clf.fit(training_x, training_y)

    print("Accuracy:")
    print(clf.score(testing_x, testing_y))
    print("ROC curve's AUC score:")
    pred = clf.predict_proba(testing_x)
    pred = get_probs_positive_class(pred)
    print(metrics.roc_auc_score(np.array(testing_y, dtype=int), pred))
    print("\n\n")

Iteration: 1
Sampled Genes: ['CTSA', 'IGFN1', 'ENKUR', 'MIR591', 'FAM136A']
Accuracy:
0.7408906882591093
ROC curve's AUC score:
0.9104231366459627



Iteration: 2
Sampled Genes: ['WDR5', 'LOC283104', 'ZNF44', 'LOC647859', 'FOXI3']
Accuracy:
0.8016194331983806
ROC curve's AUC score:
0.9153726708074534



Iteration: 3
Sampled Genes: ['LOC400238', 'SAMD11', 'IGFBPL1', 'FMO3', 'ZGLP1']
Accuracy:
0.728744939271255
ROC curve's AUC score:
0.923233695652174



Iteration: 4
Sampled Genes: ['YME1L1', 'ZNF815', 'TIMM8A', 'TM7SF3', 'RBM15']
Accuracy:
0.6072874493927125
ROC curve's AUC score:
0.7943517080745341



Iteration: 5
Sampled Genes: ['CHD7', 'C17orf78', 'USP33', 'NEGR1-IT1', 'FMO4']
Accuracy:
0.8421052631578947
ROC curve's AUC score:
0.8693711180124224



Iteration: 6
Sampled Genes: ['ZCCHC16', 'TPT1-AS1', 'SNORA79', 'GUSBP11', 'TTTY17B']
Accuracy:
0.8137651821862348
ROC curve's AUC score:
0.8402562111801243



Iteration: 7
Sampled Genes: ['LOC440434', 'CTU2', 'HR', 'LACC1', 'PGC']
Accurac

# Fourth Experiment: classification with Naive Bayes algorithm

In [79]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes on the first five genes of the ranking
# (presumably the ranking index is ordered best-first — confirm).
best_genes = list(pyrnk.index.values)[0:5]
print("Selected Genes:", best_genes)

training_x = training.loc[:, best_genes]
testing_x = testing.loc[:, best_genes]

# Extract the labels here rather than relying on the values a previously
# executed loop cell happened to leave in the kernel.
training_y = training.loc[:, ['class']].T.values[0]
testing_y = testing.loc[:, ['class']].T.values[0]

gnb = GaussianNB()
gnb.fit(training_x, training_y)

print("Accuracy:", gnb.score(testing_x, testing_y))
pred = gnb.predict_proba(testing_x)
pred = get_probs_positive_class(pred)
print("AUC score:", metrics.roc_auc_score(np.array(testing_y, dtype=int), pred))

Selected Genes: ['MIR4508', 'FIGF', 'IBSP', 'CA4', 'LRRC3B']
Accuracy: 0.9838056680161943
AUC score: 0.9965062111801243


### 4.1 Naive Bayes + Random Genes (10 runs)

In [83]:
# Train and evaluate Gaussian Naive Bayes on 5 randomly sampled genes, 10 times over.

# The class labels do not depend on the sampled genes, so extract them once
# instead of on every iteration. They stay at cell scope, as before.
training_y = training.loc[:, ['class']].T.values[0]
testing_y = testing.loc[:, ['class']].T.values[0]

# Pool of gene names to sample from: the index of the loaded ranking.
candidate_genes = list(pyrnk.index.values)

# range(1, 11) gives the 10 runs announced in the section title;
# the previous range(1, 10) stopped after 9.
for i in range(1, 11):
    print("Iteration:", i)

    # NOTE(review): `random` is not seeded, so the sampled genes (and the
    # scores) change on every execution of this cell.
    genes = random.sample(candidate_genes, 5)
    print("Sampled Genes:", genes)

    training_x = training.loc[:, genes]
    testing_x = testing.loc[:, genes]

    gnb = GaussianNB()
    gnb.fit(training_x, training_y)

    print("Accuracy:", gnb.score(testing_x, testing_y))
    pred = gnb.predict_proba(testing_x)
    pred = get_probs_positive_class(pred)
    print("AUC score:", metrics.roc_auc_score(np.array(testing_y, dtype=int), pred))
    print("\n\n")

Iteration: 1
Sampled Genes: ['NUP85', 'LOC284648', 'THBD', 'OR10R2', 'MIR190B']
Accuracy: 0.8866396761133604
AUC score: 0.7979425465838508



Iteration: 2
Sampled Genes: ['TSSK2', 'LOC100506650', 'HDHD1', 'LOC645752', 'HIST1H3E']
Accuracy: 0.7692307692307693
AUC score: 0.7688276397515528



Iteration: 3
Sampled Genes: ['MIR548AN', 'SCG3', 'ALX4', 'ZNF395', 'TGFBR1']
Accuracy: 0.8259109311740891
AUC score: 0.8934394409937888



Iteration: 4
Sampled Genes: ['DKK1', 'MDM1', 'SNAR-A7', 'SNHG6', 'PCDH15']
Accuracy: 0.6153846153846154
AUC score: 0.8049301242236025



Iteration: 5
Sampled Genes: ['MRPL53', 'ASB14', 'PCDHA10', 'COLEC11', 'GPR78']
Accuracy: 0.7449392712550608
AUC score: 0.8786878881987576



Iteration: 6
Sampled Genes: ['IER5L', 'RAB26', 'TNNI1', 'POGLUT1', 'PHC1']
Accuracy: 0.8663967611336032
AUC score: 0.9144021739130435



Iteration: 7
Sampled Genes: ['C3orf62', 'SLCO1B1', 'CLEC18A', 'PDGFD', 'TXNDC2']
Accuracy: 0.9271255060728745
AUC score: 0.9180900621118012



Iteration: 

### As a reminder and for comparison: the performance of the SVM on the 5 best genes

In [80]:
# SVM on the first five genes of the ranking
# (presumably the ranking index is ordered best-first — confirm).
best_genes = list(pyrnk.index.values)[0:5]
print("Selected Genes:", best_genes)

training_x = training.loc[:, best_genes]
testing_x = testing.loc[:, best_genes]

# Extract the labels here rather than relying on the values a previously
# executed loop cell happened to leave in the kernel.
training_y = training.loc[:, ['class']].T.values[0]
testing_y = testing.loc[:, ['class']].T.values[0]

# probability=True is required for predict_proba below.
clf = SVC(gamma='auto', probability=True)
clf.fit(training_x, training_y)

print("Accuracy", clf.score(testing_x, testing_y))
pred = clf.predict_proba(testing_x)
pred = get_probs_positive_class(pred)
print("AUC score:", metrics.roc_auc_score(np.array(testing_y, dtype=int), pred))

Selected Genes: ['MIR4508', 'FIGF', 'IBSP', 'CA4', 'LRRC3B']
Accuracy 1.0
AUC score: 1.0
