In [56]:
import pandas as pd
import numpy as np

def wmean_qc_coef(wmean, threshold, extended_list):
    """Score how well one probability cut-off recovers the validation genes.

    Genes with ``Probability >= threshold`` count as predicted positives and
    genes whose symbol occurs in ``extended_list`` count as known positives.
    The score is ``recall ** 2 / Pr``, where ``recall`` is the share of known
    positives that were predicted and ``Pr`` is the share of all genes that
    were predicted positive.

    Parameters
    ----------
    wmean : pandas.DataFrame
        Prediction table with 'gene_symbol' and 'Probability' columns.
    threshold : float
        Cut-off applied to the 'Probability' column (inclusive).
    extended_list : pandas.DataFrame
        Validation table with a 'gene_symbol' column.

    Returns
    -------
    float or int
        The coefficient. 0 is returned when no gene passes the threshold or
        when none of the genes in ``wmean`` appears in ``extended_list``
        (recall would otherwise be 0/0 and propagate NaN to the caller).
    """
    pred_y = wmean['Probability'].values >= threshold
    n_predicted = np.sum(pred_y)
    if n_predicted == 0:
        return 0

    # Hash-based membership test: one pass instead of scanning the whole
    # validation array once per gene.
    new_y = wmean['gene_symbol'].isin(extended_list['gene_symbol']).values
    n_known = np.sum(new_y)
    if n_known == 0:
        # No known positives among the scored genes: recall is undefined.
        return 0

    Pr = n_predicted / pred_y.size
    TP = np.sum(new_y & pred_y)
    # TP + FN is simply the number of known positives.
    recall = TP / n_known
    return recall ** 2 / Pr
    
def perform_qc(wmean, extended_list):
    """Try every observed probability as a threshold and report the best one.

    Each value of ``wmean.Probability`` is used in turn as the cut-off for
    ``wmean_qc_coef``. The highest coefficient and the first threshold that
    reaches it are printed.

    Parameters
    ----------
    wmean : pandas.DataFrame
        Prediction table with 'gene_symbol' and 'Probability' columns.
    extended_list : pandas.DataFrame
        Validation table with a 'gene_symbol' column.

    Returns
    -------
    list
        One coefficient per row of ``wmean``, in row order.

    Raises
    ------
    IndexError
        If ``wmean`` has no rows, so there is no threshold to choose from.
    """
    threshold_range = wmean.Probability.values
    if len(threshold_range) == 0:
        raise IndexError('perform_qc: wmean has no rows, no threshold can be selected')

    qc_range = [wmean_qc_coef(wmean, threshold, extended_list) for threshold in threshold_range]

    # Compute the maximum once; list.index returns the first occurrence, so
    # ties resolve to the earliest row.
    best_idx = qc_range.index(max(qc_range))
    optimal_thr = threshold_range[best_idx]
    # The coefficient for this threshold is already in qc_range.
    qc_coef = qc_range[best_idx]

    print('qc_coef:', qc_coef)
    print('opt_threshols:', optimal_thr)
    return qc_range

In [59]:
# Load the per-method prediction tables. LR, SVM and GPR are read as
# tab-separated files; TOPP is read with pandas' default comma separator.
# NOTE(review): the LR/SVM/GPR paths point at scz2 locations while the TOPP
# file name ends in IBD - confirm all four tables describe the same phenotype.
# NOTE(review): absolute home-directory paths make this cell machine-specific.
LR = pd.read_csv('/home/nikita/Desktop/illustration/scz2/LR_predictions.tsv', sep='\t')
SVM = pd.read_csv('/home/nikita/Desktop/illustration/scz2/SVM_predictions.tsv', sep='\t')
TOPP = pd.read_csv('/home/nikita/Desktop/TOPPGENE/ToppGeneDataIBD.csv')
GPR = pd.read_csv('/home/nikita/Desktop/final_predictions/scz2.3.tsv', sep='\t')

In [60]:
# Validation gene list (tab-separated). It is passed as `extended_list` to
# perform_qc, which reads its 'gene_symbol' column.
# NOTE(review): the path is under an IBD directory - confirm it matches the
# phenotype of the prediction tables it is compared against.
val = pd.read_csv('/home/nikita/Desktop/triple_phe/IBD/validation_list.tsv', sep='\t')

In [24]:
# Rescale the LR probabilities by a factor of 100.
# NOTE: this overwrites the column it reads from, so every re-run of the cell
# multiplies by 100 again - reload LR before re-running.
# perform_qc takes its thresholds from this same column, so (up to
# floating-point rounding) the scaling should only change the scale of the
# printed threshold, not the coefficients.
LR['Probability'] = LR.Probability *100

In [25]:
# Threshold sweep for the LR table against `val`. perform_qc prints the best
# coefficient and threshold; because the call is the cell's last expression,
# the returned per-threshold list is echoed in full as the cell output.
perform_qc(LR, val)

qc_coef: 1.0574783219926274
opt_threshols: 0.1


[1.0386518605908752,
 1.0574783219926274,
 0.9452793192718306,
 0.9624083090648633,
 0.9015220132997022,
 0.9234927755034618,
 0.9357765686540404,
 0.9268297444369594,
 0.8934790708222642,
 0.8879086484202531,
 0.8534222145517825,
 0.8298388302612729,
 0.81542128950178,
 0.7963284658418132,
 0.8050492526953567,
 0.7683939545785847,
 0.7486427769234315,
 0.756861063464837,
 0.7256720228770672,
 0.7175928011532571,
 0.7248008959681433,
 0.7383065027253137,
 0.7243513778667106,
 0.7053384363652222,
 0.723107093579242,
 0.7244603222864094,
 0.7226744704044945,
 0.7242517469379215,
 0.7234398299464221,
 0.6991991231950562,
 0.6783559423108887,
 0.6203279785580786,
 0.6257154423917096,
 0.6163706495387646,
 0.5888797958946121,
 0.5926741244764073,
 0.5868379832131457,
 0.5953541017030463,
 0.6017045454545453,
 0.5942598149693068,
 0.5884154197525276,
 0.5825710834016482,
 0.5603034859195076,
 0.556030603060306,
 0.5607897808325768,
 0.5664461619072569,
 0.573051948051948,
 0.5781082887700535

In [26]:
# Threshold sweep for the SVM table against `val`. perform_qc prints the best
# coefficient and threshold; because the call is the cell's last expression,
# the returned per-threshold list is echoed in full as the cell output.
perform_qc(SVM, val)

qc_coef: 1.410963718707008
opt_threshols: 32


[1.0051506567087303,
 1.0051506567087303,
 1.0051506567087303,
 1.0054095826893354,
 1.0583660416118035,
 1.0552546889593462,
 1.0499692553199278,
 1.0440717889797226,
 1.0378775570084262,
 1.0381847122487364,
 1.0391072697536967,
 1.0418848062632848,
 1.036299735121604,
 1.0251532523032092,
 1.0063821282101357,
 1.0106131641845928,
 1.0154924069808973,
 1.0148342304423583,
 1.0086043917145664,
 1.0064129862135645,
 1.0128465642455329,
 1.012511455932845,
 1.0160341222762033,
 0.998827443199024,
 1.031686135692252,
 1.0954968607403874,
 1.082261313335521,
 1.0645519216503703,
 1.056118252051684,
 1.0499776932515752,
 1.1654821865107714,
 1.2228689508147856,
 1.2840560291017502,
 1.410963718707008,
 1.4041089498379284,
 1.3930446727437682,
 1.3536855039321731,
 1.3592740338347673,
 1.2962651137417776,
 1.3108432253623001,
 1.2929527878576357,
 1.2090015634494486,
 1.2050581651799324,
 1.207517629247685,
 1.1871148897915738,
 1.1523068514765342,
 1.1403613617754227,
 1.1097668240850058,


In [27]:
# Threshold sweep for the GPR table against `val`. perform_qc prints the best
# coefficient and threshold; because the call is the cell's last expression,
# the returned per-threshold list is echoed in full as the cell output.
perform_qc(GPR, val)

qc_coef: 1.5629219604720264
opt_threshols: 38


[1.0051506567087303,
 1.0051506567087303,
 1.006965944272446,
 1.0085037678195008,
 1.0344156271510168,
 1.059107032712386,
 1.0755586554524885,
 1.0625583493260229,
 1.0712322977020445,
 1.0604797900755394,
 1.0396863931523022,
 1.0481338091310626,
 1.0557737713628637,
 1.0954780934836494,
 1.1118039744474821,
 1.1400560818662955,
 1.1377940094144343,
 1.1950818996185932,
 1.2284798606214125,
 1.2529122620843898,
 1.2787999778499468,
 1.3007817169255471,
 1.3224225120511783,
 1.3324604447046209,
 1.3738162569676802,
 1.3890096372720844,
 1.3943554036941597,
 1.44992285722414,
 1.4889058587516382,
 1.48902956437888,
 1.4838388619067122,
 1.4775686720101828,
 1.5107821440187554,
 1.4993208355791963,
 1.5217314708063683,
 1.500550055262005,
 1.5208221713506402,
 1.5485406799414783,
 1.5536470381428908,
 1.5629219604720264,
 1.496927858104341,
 1.508082000643984,
 1.531956649730696,
 1.4672903264694863,
 1.4268024243673596,
 1.3824660434016987,
 1.3902436301437617,
 1.414110473579706,
 1.

In [62]:
# Give TOPP the two columns that wmean_qc_coef reads.
# 'Probability' is Rank / 100 with the row order flipped, i.e. the first row
# receives the value computed from the last row's Rank.
# NOTE(review): presumably the rows are sorted by ascending Rank, so the flip
# turns a rank into a higher-is-better score - confirm against the file.
TOPP['Probability'] = (TOPP['Rank'].values / 100)[::-1].tolist()
# Alias the symbol column under the name used by the other prediction tables.
TOPP['gene_symbol'] = TOPP['GeneSymbol']

In [63]:
# Threshold sweep for the TOPP table, using the rank-derived 'Probability'
# column, against `val`. The call is the cell's last expression, so the
# returned per-threshold list is echoed in full as the cell output.
perform_qc(TOPP, val)

qc_coef: 1.6741043481830404
opt_threshols: 16.37


[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0776177411979881,
 0.0698559670781893,
 0.06350542461653573,
 0.05821330589849108,
 0.053735359290914844,
 0.04989711934156378,
 0.04657064471879287,
 0.04365997942386831,
 0.04109174534011135,
 0.03880887059899405,
 0.036766298462204895,
 0.03492798353909465,
 0.03326474622770919,
 0.031752712308267865,
 0.030372159599212738,
 0.11642661179698216,
 0.11176954732510287,
 0.10747071858182969,
 0.10349032159731748,
 0.09979423868312756,
 0.09635305803888179,
 0.09314128943758573,
 0.09013673171379265,
 0.08731995884773662,
 0.08467389948871429,
 0.0821834906802227,
 0.07983539094650205,
 0.0776177411979881,
 0.07551996440885328,
 0.07353259692440979,
 0.07164714572121979,
 0.1571759259259259,
 0.1533423667570009,
 0.14969135802469136,
 0.1462101636520241,
 0.2540216984661429,
 0.24837677183356194,
 0.2429772767937019,
 0.23780754750021887,
 0.23285322359396432,
 0.22810111699000585,
 0.22353909465020574,
 0.2191559751472605,
 0.214941437