# Cancer experiment

In [1]:
def wmean_qc_coef(wmean, threshold, extended_list, true_genes):
    """Score a probability cut-off by how well it recovers the held-out positives.

    A row of ``wmean`` counts as a *held-out positive* when its ``gene_symbol``
    occurs in ``extended_list['gene_symbol']`` but not in
    ``true_genes['gene_symbol']``. A row is *predicted positive* when its
    ``Probability`` is ``>= threshold``.

    The score is ``recall ** 2 / Pr`` where ``recall`` is the share of held-out
    positives that are predicted positive and ``Pr`` is the share of all rows
    that are predicted positive.

    Parameters
    ----------
    wmean : DataFrame with ``gene_symbol`` and ``Probability`` columns.
    threshold : cut-off compared against ``Probability`` (inclusive).
    extended_list : DataFrame with a ``gene_symbol`` column.
    true_genes : DataFrame with a ``gene_symbol`` column.

    Returns
    -------
    The score, or ``0`` when nothing is predicted positive or when there are
    no held-out positives at all (recall would be 0/0 in that case).
    """
    wmean_names = wmean['gene_symbol'].values
    pred_y = np.asarray(wmean['Probability'].values >= threshold)

    n_predicted = np.sum(pred_y)
    if n_predicted == 0:
        return 0

    # Vectorised membership tests instead of one `in ndarray` scan per name.
    in_extended = np.isin(wmean_names, extended_list['gene_symbol'].values)
    in_known = np.isin(wmean_names, true_genes['gene_symbol'].values)
    new_y = in_extended & ~in_known

    # TP + FN is simply the number of held-out positives; guard the 0/0 case
    # that previously produced nan (and a RuntimeWarning).
    n_hidden = np.sum(new_y)
    if n_hidden == 0:
        return 0

    predicted_share = n_predicted / pred_y.size
    true_positives = np.sum(new_y & pred_y)
    recall = true_positives / n_hidden
    return recall**2 / predicted_share
    
def perform_qc(wmean, extended_list, true_genes):
    """Evaluate ``wmean_qc_coef`` at every observed probability and report the best.

    Each value of ``wmean.Probability`` is used once as a candidate threshold.
    The first threshold that attains the highest coefficient is printed
    together with that coefficient.

    Parameters
    ----------
    wmean : DataFrame with ``gene_symbol`` and ``Probability`` columns.
    extended_list, true_genes : passed through unchanged to ``wmean_qc_coef``.

    Returns
    -------
    list
        The coefficient for every candidate threshold, in the row order of
        ``wmean``.

    Raises
    ------
    ValueError
        If ``wmean`` has no rows, because there is no threshold to evaluate.
    """
    threshold_range = wmean.Probability.values
    if len(threshold_range) == 0:
        raise ValueError('perform_qc: wmean has no rows, so there is no threshold to evaluate')

    qc_range = [wmean_qc_coef(wmean, threshold, extended_list, true_genes) for threshold in threshold_range]

    # Locate the first maximum in a single pass; the score at that position is
    # already in qc_range, so it is not recomputed.
    best_idx = max(range(len(qc_range)), key=qc_range.__getitem__)
    optimal_thr = threshold_range[best_idx]
    qc_coef = qc_range[best_idx]

    print('qc_coef:', qc_coef)
    print('opt_threshols:', optimal_thr)
    return qc_range

In [2]:
from sklearn.cluster import FeatureAgglomeration
from sklearn.preprocessing import RobustScaler
import pandas as pd
from ensemble import *

In [10]:
# Labelled-positive budgets: 2 %, 4 %, ..., 18 % of 212, rounded to whole samples.
# NOTE(review): 212 is presumably the number of positive (malignant) rows in the data set - confirm.
ntrain = list(map(lambda pct: round(212 * pct / 100), range(2, 20, 2)))
ntrain

[4, 8, 13, 17, 21, 25, 30, 34, 38]

In [6]:
# Inspection cell: sum of the label vector `y`. In the visible notebook `y` is only
# assigned inside the experiment loop further down, so this cell relies on kernel
# state from an earlier run and fails after Restart & Run All.
y.sum()

2

In [12]:
import math
from sklearn.preprocessing import StandardScaler

# Experiment: for every labelled-positive budget in `ntrain`, run N_REPEATS random
# hide-and-recover rounds and keep the best QC coefficient of each round in
# pu_dict[str(ntr)].
# NOTE(review): no RNG seed is set, so np.random.choice draws differ between runs.
roc_dict = dict()
f_dict = dict()
pu_dict = dict()

DATA_PATH = '/home/nikita/Downloads/cancer.csv'
N_REPEATS = 10

# Loading, column clean-up, scaling and label encoding do not depend on the loop
# variables, so they are done once here instead of on every iteration.
raw = pd.read_csv(DATA_PATH)
labels_full = raw.diagnosis.map(lambda x: 1 if x == 'M' else 0)
features_raw = raw.drop(columns=['diagnosis', 'Unnamed: 32', 'id'])

scaler = StandardScaler()
scaler.fit(features_raw)
features_scaled = pd.DataFrame(scaler.transform(features_raw))


for ntr in ntrain:

    roc_store = list()
    f1_store = list()
    pu_store = list()

    print(f'NTR ==== {ntr}')
    for i in range(N_REPEATS):
        # Fresh copies: both X and y are modified below.
        X = features_scaled.copy()
        y = labels_full.copy()

        hidden_size = y.sum() - ntr
        y_orig = y.copy()

        # Step 1: relabel all but `ntr` randomly chosen positives as 0.
        hidden_idx = np.random.choice(
            y[y == 1].index,
            replace = False,
            size = hidden_size
        )
        y.loc[hidden_idx] = 0
        val_y = y[y == 1].copy()

        # Step 2: relabel a further third of the remaining positives as 0;
        # they stay in val_y but are no longer marked positive in y.
        held_back_idx = np.random.choice(
            y[y == 1].index,
            replace = False,
            size = round(ntr*1/3)
        )
        y.loc[held_back_idx] = 0

        # NOTE(review): y_orig and y share the same index, and `-` between two
        # indexes is element-wise arithmetic on current pandas, so this yields
        # the single value 0. If a set difference of row labels was intended,
        # Index.difference is the call - confirm what EnsembleClassifier
        # expects for this argument before changing it.
        als = pd.DataFrame({'gene_symbol':list(set(y_orig.index-y.index))})
        X['gene_symbol'] = X.index
        ens = EnsembleClassifier(X, y, MODELS, als, n_bootstrap=60)
        ens.run_estimators()
        wmean = ens.best_scored_proba()
        # NOTE(review): -1 looks like a sentinel returned by best_scored_proba;
        # it is mapped to 100 here - confirm its meaning.
        wmean['Probability'] = wmean.Probability.apply(lambda x: 100 if x==-1 else x)

        # extended_list: every true positive; true_genes: the positives left after step 1.
        extended_list = pd.DataFrame({'gene_symbol':y_orig[y_orig==1].index.values})
        true_genes = pd.DataFrame({'gene_symbol':val_y.index.values})
        qc = max(perform_qc(wmean, extended_list, true_genes))
        pu_store.append(qc)
    pu_dict[str(ntr)] = pu_store

NTR ==== 4


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 21.884615384615383

Support Vector Machine	
PU-score: 1.2262931034482758

ADABoosting	
PU-score: 94.83333333333333

Random Forest	
PU-score: 40.642857142857146

Decision Tree	
PU-score: 142.25

Finding best combination	
Best combination is: ['Logistic regression' 'ADABoosting' 'Decision Tree']
PU-score: 142.25
qc_coef: 2.359581969182711
opt_threshols: 29.1011257468666


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 7.586666666666667

Support Vector Machine	
PU-score: 1.0053003533568905

ADABoosting	
PU-score: 12.369565217391305

Random Forest	
PU-score: 10.537037037037038

Decision Tree	
PU-score: 10.735849056603774

Finding best combination	
Best combination is: ['ADABoosting' 'Random Forest']
PU-score: 12.931818181818182
qc_coef: 2.267500721604849
opt_threshols: 45.35169491525423


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 16.257142857142856

Support Vector Machine	
PU-score: 1.7836990595611286

ADABoosting	
PU-score: 17.78125

Random Forest	
PU-score: 142.25

Decision Tree	
PU-score: 47.416666666666664

Finding best combination	
Best combination is: ['Random Forest']
PU-score: 142.25
qc_coef: 1.7678482351726983
opt_threshols: 46.10169491525423


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 18.966666666666665

Support Vector Machine	
PU-score: 1.6028169014084506

ADABoosting	
PU-score: 47.416666666666664

Random Forest	
PU-score: 35.5625

Decision Tree	
PU-score: 81.28571428571429

Finding best combination	
Best combination is: ['Logistic regression' 'Support Vector Machine' 'ADABoosting'
 'Decision Tree']
PU-score: 142.25
qc_coef: 2.0499160309767728
opt_threshols: 38.1662410576944


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 35.5625

Support Vector Machine	
PU-score: 1.182952182952183

ADABoosting	
PU-score: 47.416666666666664

Random Forest	
PU-score: 284.5

Decision Tree	
PU-score: 113.79999999999998

Finding best combination	
Best combination is: ['Random Forest']
PU-score: 284.5
qc_coef: 2.117573931974964
opt_threshols: 44.16666666666665


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 7.794520547945206

Support Vector Machine	
PU-score: 1.6884272997032643

ADABoosting	
PU-score: 8.128571428571428

Random Forest	
PU-score: 14.58974358974359

Decision Tree	
PU-score: 9.483333333333333

Finding best combination	
Best combination is: ['Logistic regression' 'Support Vector Machine' 'Random Forest']
PU-score: 14.58974358974359
qc_coef: 2.2095613721214677
opt_threshols: 42.75449004206667


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 4.905172413793103

Support Vector Machine	
PU-score: 1.6637426900584795

ADABoosting	
PU-score: 9.17741935483871

Random Forest	
PU-score: 9.17741935483871

Decision Tree	
PU-score: 3.5123456790123457

Finding best combination	
Best combination is: ['Support Vector Machine' 'ADABoosting' 'Random Forest']
PU-score: 11.379999999999999
qc_coef: 2.0403591589938177
opt_threshols: 44.05307664744655


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 5.317757009345795

Support Vector Machine	
PU-score: 2.495614035087719

ADABoosting	
PU-score: 8.014084507042254

Random Forest	
PU-score: 5.927083333333333

Decision Tree	
PU-score: 7.586666666666667

Finding best combination	
Best combination is: ['Support Vector Machine' 'ADABoosting']
PU-score: 8.246376811594203
qc_coef: 2.203022028099498
opt_threshols: 40.42156704104475


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 5.126126126126126

Support Vector Machine	
PU-score: 1.5761772853185596

ADABoosting	
PU-score: 3.7434210526315788

Random Forest	
PU-score: 8.492537313432836

Decision Tree	
PU-score: 5.317757009345795

Finding best combination	
Best combination is: ['Random Forest']
PU-score: 8.492537313432836
qc_coef: 2.357055244435052
opt_threshols: 47.83333333333334


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 71.125

Support Vector Machine	
PU-score: 1.841423948220065

ADABoosting	
PU-score: 21.074074074074076

Random Forest	
PU-score: 51.72727272727273

Decision Tree	
PU-score: 63.22222222222222

Finding best combination	
Best combination is: ['Logistic regression' 'Random Forest']
PU-score: 284.5
qc_coef: 1.8567008737928437
opt_threshols: 43.9330437778418
NTR ==== 8


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 11.612244897959183

Support Vector Machine	
PU-score: 1.841423948220065

ADABoosting	
PU-score: 5.633663366336634

Random Forest	
PU-score: 5.633663366336634

Decision Tree	
PU-score: 5.989473684210527

Finding best combination	
Best combination is: ['Logistic regression' 'Support Vector Machine']
PU-score: 13.232558139534884
qc_coef: 2.376975708994801
opt_threshols: 42.0248173048091


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 2.6966824644549763

Support Vector Machine	
PU-score: 1.6985074626865673

ADABoosting	
PU-score: 2.266932270916335

Random Forest	
PU-score: 3.897260273972603

Decision Tree	
PU-score: 1.9353741496598638

Finding best combination	
Best combination is: ['Random Forest']
PU-score: 3.897260273972603
qc_coef: 1.9923241731713206
opt_threshols: 50.16949152542373


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 2.1390977443609023

Support Vector Machine	
PU-score: 1.2902494331065761

ADABoosting	
PU-score: 2.076642335766423

Random Forest	
PU-score: 3.5341614906832297

Decision Tree	
PU-score: 2.2760000000000002

Finding best combination	
Best combination is: ['Random Forest']
PU-score: 3.5341614906832297
qc_coef: 1.9996215397923878
opt_threshols: 49.33333333333332


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 1.9030100334448161

Support Vector Machine	
PU-score: 1.9222972972972971

ADABoosting	
PU-score: 1.6540697674418605

Random Forest	
PU-score: 2.2760000000000002

Decision Tree	
PU-score: 1.9825783972125435

Finding best combination	
Best combination is: ['Support Vector Machine' 'Random Forest']
PU-score: 2.3907563025210083
qc_coef: 2.117933859185673
opt_threshols: 46.57243487602551


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 2.574660633484163

Support Vector Machine	
PU-score: 1.0252252252252252

ADABoosting	
PU-score: 2.7756097560975608

Random Forest	
PU-score: 2.610091743119266

Decision Tree	
PU-score: 2.463203463203463

Finding best combination	
Best combination is: ['ADABoosting']
PU-score: 2.7756097560975608
qc_coef: 1.6492128372323853
opt_threshols: 35.59322033898305


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 17.78125

Support Vector Machine	
PU-score: 2.9790575916230364

ADABoosting	
PU-score: 43.76923076923077

Random Forest	
PU-score: 51.72727272727273

Decision Tree	
PU-score: 25.863636363636363

Finding best combination	
Best combination is: ['Logistic regression' 'Support Vector Machine' 'ADABoosting'
 'Random Forest']
PU-score: 113.79999999999998
qc_coef: 2.076230364369519
opt_threshols: 32.09759584252236


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 29.94736842105263

Support Vector Machine	
PU-score: 2.722488038277512

ADABoosting	
PU-score: 5.471153846153846

Random Forest	
PU-score: 5.126126126126126

Decision Tree	
PU-score: 3.6709677419354843

Finding best combination	
Best combination is: ['Logistic regression']
PU-score: 29.94736842105263
qc_coef: 1.7791653084933885
opt_threshols: 59.591070621740165


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 14.973684210526315

Support Vector Machine	
PU-score: 2.4008438818565403

ADABoosting	
PU-score: 81.28571428571429

Random Forest	
PU-score: 16.735294117647058

Decision Tree	
PU-score: 25.863636363636363

Finding best combination	
Best combination is: ['ADABoosting']
PU-score: 81.28571428571429
qc_coef: 2.2341965212876427
opt_threshols: 15.254237288135604


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 5.268518518518519

Support Vector Machine	
PU-score: 2.3036437246963564

ADABoosting	
PU-score: 4.741666666666666

Random Forest	
PU-score: 8.890625

Decision Tree	
PU-score: 5.6899999999999995

Finding best combination	
Best combination is: ['Support Vector Machine' 'Random Forest']
PU-score: 11.156862745098039
qc_coef: 2.1878963607625663
opt_threshols: 45.903261566059506


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	
PU-score: 3.897260273972603

Support Vector Machine	
PU-score: 2.1471698113207545

ADABoosting	
PU-score: 2.762135922330097

Random Forest	
PU-score: 3.161111111111111

Decision Tree	
PU-score: 3.2514285714285713

Finding best combination	
Best combination is: ['Logistic regression' 'Support Vector Machine']
PU-score: 4.1838235294117645
qc_coef: 1.9521209514908062
opt_threshols: 49.63003168415709
NTR ==== 13


StandardScaler(copy=True, with_mean=True, with_std=True)

Logistic regression	


  recall = tp/(tp+fn)


PU-score: nan

Support Vector Machine	


  recall = tp/(tp+fn)


PU-score: nan

ADABoosting	


  recall = tp/(tp+fn)


PU-score: nan

Random Forest	


  recall = tp/(tp+fn)


PU-score: nan

Decision Tree	
PU-score: nan

Finding best combination	


  recall = tp/(tp+fn)


UnboundLocalError: local variable 'index_max' referenced before assignment

In [10]:
# Inspection cell: budgets for which results were stored.
# NOTE(review): the recorded keys ('3', '5', '8', ...) do not match the current
# `ntrain` values (4, 8, 13, ...), so this output appears to come from an earlier
# run with different settings - re-run after the experiment cell.
pu_dict.keys()

dict_keys(['3', '5', '8', '11', '14', '17'])

In [27]:
# Debug check: is row label 3 among the `gene_symbol` values of `extended_list`?
# Uses the `extended_list` left in the kernel by the last experiment iteration.
3 in extended_list['gene_symbol'].values

True

In [25]:
# Debug view of the `gene_symbol` column of the last `wmean` frame left in the kernel.
wmean['gene_symbol']

3        3
9        9
78      78
5        5
352    352
      ... 
351    351
215    215
487    487
214    214
7        7
Name: gene_symbol, Length: 569, dtype: int64

In [36]:
# Debug view of `val_y` (entries of `y` equal to 1 after the first relabelling step)
# as left in the kernel by the last experiment iteration.
val_y

30     1
54     1
100    1
138    1
215    1
282    1
414    1
441    1
533    1
535    1
Name: diagnosis, dtype: int64