In [28]:
from scipy import stats
from statsmodels.sandbox.stats import multicomp
import pandas as pd

In [46]:
def run(study, pop, gene_set, adjust='fdr_bh'):
    '''
    Run an over-representation analysis (ORA) of a study gene list against gene sets.

    For every gene set a one-sided hypergeometric test P(X >= k) is computed,
    where k is the number of study genes found in the set, and the resulting
    p-values are then adjusted for multiple testing.

    :param study: the significant gene set (iterable of gene ids, compared as str)
    :param pop:  the background gene set (iterable of gene ids, compared as str)
    :param gene_set: the function set, either a dict {set name: [gene ids]}
        or the path of a local GMT file
    :param adjust: the adjust method in the multiple tests,
        details in http://www.statsmodels.org/dev/generated/statsmodels.sandbox.stats.multicomp.multipletests.html
    :return: a tuple (df, study, pop, adjust); df is a DataFrame with the columns
        'name', 'mapped', 'number in study', 'p-value' and 'fdr', one row per gene set
    '''
    # isinstance also accepts dict subclasses; anything else is treated as a GMT path.
    gene_sets = gene_set if isinstance(gene_set, dict) else parse_gmt_file(gene_set)

    # Normalise the ids once (not once per gene set) and drop duplicates, so the
    # population / draw sizes are counted the same way as the overlaps below.
    pop_ids = {str(x) for x in pop}
    study_ids = {str(x) for x in study}
    # NOTE(review): study genes that are absent from pop are still counted here,
    # as before -- confirm that callers pass a study that is a subset of pop.
    pop_size = len(pop_ids)
    study_size = len(study_ids)

    names, mapped_counts, study_counts, p_values = [], [], [], []
    for name, genes in gene_sets.items():
        members = set(genes)
        n_mapped = len(members & pop_ids)
        n_study = len(members & study_ids)
        names.append(name)
        mapped_counts.append(n_mapped)
        study_counts.append(n_study)
        # sf(k - 1) is P(X > k - 1) = P(X >= k): the upper tail including k itself.
        p_values.append(stats.hypergeom.sf(n_study - 1, pop_size, n_mapped, study_size))

    # Skip the correction when there is nothing to correct (empty gene-set collection).
    if p_values:
        _, fdr, _, _ = multicomp.multipletests(p_values, method=adjust)
    else:
        fdr = []

    columns = ['name', 'mapped', 'number in study', 'p-value', 'fdr']
    df = pd.DataFrame(
        {
            'name': names,
            'mapped': mapped_counts,
            'number in study': study_counts,
            'p-value': p_values,
            'fdr': fdr,
        },
        columns=columns,
    )
    return df, study, pop, adjust
    
def parse_gmt_file(file):
    '''
    Read a local, tab-separated GMT file into a dict.

    Each non-empty line is expected to look like:
    setName\tsource\tgene1\tgene2...

    The first column becomes the key, the second column is always skipped and
    every remaining non-empty column is kept as a gene of that set.

    :param file: the file path
    :return: dict mapping each set name to its list of gene strings
    '''
    with open(file) as handle:
        content = handle.read()

    parsed = {}
    for line in content.split('\n'):
        if not line:
            # blank lines (e.g. after the trailing newline) carry no gene set
            continue
        fields = line.split('\t')
        parsed[fields[0]] = [gene for gene in fields[2:] if gene]
    return parsed

In [3]:
# Prefix prepended to the input file names; empty means the working directory.
data_path = ""

# Read both input files completely into memory.
with open(data_path + 'pypathway_all') as fp:
    bg = fp.read()
with open(data_path + 'pypathway_de') as fp:
    de = fp.read()

# Background ids: second space-separated field of every non-empty line, with its
# first and last character dropped (presumably surrounding quotes -- verify
# against the file) before conversion to int.
background = {int(row.split(" ")[1][1:-1]) for row in bg.split('\n') if row}

# deg maps the first field (same first/last character trimming, as int) to the
# second field parsed as float.
deg = {
    int(fields[0][1:-1]): float(fields[1])
    for fields in (row.split(" ") for row in de.split('\n') if row)
}
deg_list = list(deg)
idtype = 'ENTREZID'

In [42]:
# Parse the gene-set collection from the local GMT file into {set name: [genes]}.
gmt = parse_gmt_file("pypathway.gmt")
# Number of gene sets that were parsed.
len(gmt)

50

In [37]:
# Convert the background ids to str: parse_gmt_file yields gene ids as strings,
# so the ids are kept in the same type for the set intersections in run().
background = {str(gene_id) for gene_id in background}
background

{'25886',
 '1033',
 '170370',
 '26020',
 '151278',
 '2863',
 '3038',
 '491',
 '2294',
 '1741',
 '259286',
 '151473',
 '6197',
 '10058',
 '158696',
 '6835',
 '79674',
 '8721',
 '23022',
 '10475',
 '57701',
 '2030',
 '84266',
 '5799',
 '1641',
 '127703',
 '163590',
 '6872',
 '9247',
 '400954',
 '271',
 '308',
 '440584',
 '10786',
 '5931',
 '1181',
 '495',
 '116988',
 '58189',
 '79725',
 '64900',
 '83590',
 '157769',
 '131034',
 '152926',
 '56649',
 '7419',
 '10232',
 '83593',
 '883',
 '54496',
 '954',
 '1303',
 '26339',
 '219293',
 '133688',
 '60491',
 '29095',
 '51706',
 '3161',
 '1358',
 '205251',
 '282809',
 '285313',
 '284161',
 '79961',
 '5118',
 '144402',
 '4313',
 '1305',
 '83999',
 '25894',
 '4133',
 '151742',
 '221184',
 '57695',
 '6390',
 '494514',
 '196264',
 '8556',
 '753',
 '3748',
 '81704',
 '84876',
 '25943',
 '128611',
 '165829',
 '124045',
 '51214',
 '221035',
 '10558',
 '79174',
 '5972',
 '51063',
 '23214',
 '54729',
 '23414',
 '59339',
 '23505',
 '220963',
 '53342',
 '

In [4]:
# Inspect the mapping parsed from the 'pypathway_de' file (int id -> float value).
deg

{3491: 5.96020626631229,
 2353: 5.14350174656242,
 1958: 4.1480811895306,
 1843: 2.42988865594274,
 3725: 1.53112566839074,
 23645: 1.42926930199389,
 9510: 3.9376626046503,
 84869: -1.14707712495062,
 7432: 4.71576732377728,
 1490: 3.44760390387296,
 4256: 3.5773362635042,
 8406: 2.62419966350463,
 2354: 3.46760002323969,
 151887: 2.80897066093399,
 5145: -1.4834346334758,
 3726: 1.22023453120884,
 9592: 0.943232448330878,
 7538: 1.4563435295322,
 9971: -2.4462079610495,
 1831: 2.27285845326819,
 84870: 2.26551146930371,
 1555: -1.20182218642364,
 51091: -1.26416021425668,
 5996: 3.46551782158953,
 710: 1.76671493785752,
 388: 2.49381539190454,
 56892: 1.9367019068624,
 1805: 2.93860965333385,
 7345: 2.6384198763819,
 3488: 2.89121076305059,
 4162: 1.94190697404642,
 200931: -3.92110912468378,
 1051: 0.799534717178696,
 125: 3.24858994652715,
 2006: 1.02721759504585,
 8975: 1.07567683959426,
 2192: 2.06296877929945,
 10894: 1.97978748027101,
 51523: 0.785461138791093,
 5376: 2.0653050

In [40]:
# Convert the study ids to str so they are compared as strings, like the GMT genes.
deg_list = [str(gene_id) for gene_id in deg_list]
deg_list

['3491',
 '2353',
 '1958',
 '1843',
 '3725',
 '23645',
 '9510',
 '84869',
 '7432',
 '1490',
 '4256',
 '8406',
 '2354',
 '151887',
 '5145',
 '3726',
 '9592',
 '7538',
 '9971',
 '1831',
 '84870',
 '1555',
 '51091',
 '5996',
 '710',
 '388',
 '56892',
 '1805',
 '7345',
 '3488',
 '4162',
 '200931',
 '1051',
 '125',
 '2006',
 '8975',
 '2192',
 '10894',
 '51523',
 '5376',
 '59',
 '83643',
 '23408',
 '594',
 '2260',
 '3164',
 '6423',
 '57493',
 '79819',
 '4969',
 '6347',
 '5350',
 '9847',
 '5740',
 '1277',
 '23362',
 '5176',
 '146556',
 '6566',
 '11167',
 '51673',
 '91851',
 '85477',
 '387763',
 '220972',
 '1728',
 '8522',
 '79567',
 '2669',
 '81831',
 '5348',
 '7078',
 '2200',
 '6376',
 '4854',
 '55500',
 '1278',
 '6876',
 '10365',
 '55089',
 '11170',
 '26585',
 '51361',
 '55357',
 '79717',
 '2954',
 '5654',
 '730',
 '9064',
 '126859',
 '54507',
 '25960',
 '1457',
 '7077',
 '23017',
 '26577',
 '10590',
 '9963',
 '54583',
 '51063',
 '51099',
 '1634',
 '145270',
 '171425',
 '7145',
 '4628',
 '2

In [29]:
# run() returns (df, study, pop, adjust); display only the result table.
run(deg_list,background,gmt)[0]

Unnamed: 0,name,mapped,number in study,p-value,fdr
0,HALLMARK_TNFA_SIGNALING_VIA_NFKB,177,88,1.187314e-07,3.957714e-07
1,HALLMARK_HYPOXIA,180,98,3.951669e-11,2.82262e-10
2,HALLMARK_CHOLESTEROL_HOMEOSTASIS,64,37,7.346322e-06,1.933243e-05
3,HALLMARK_MITOTIC_SPINDLE,182,77,0.0007125914,0.001370368
4,HALLMARK_WNT_BETA_CATENIN_SIGNALING,35,12,0.3927967,0.4790204
5,HALLMARK_TGF_BETA_SIGNALING,46,26,0.0002732707,0.0005940668
6,HALLMARK_IL6_JAK_STAT3_SIGNALING,78,29,0.1405347,0.1899117
7,HALLMARK_DNA_REPAIR,134,35,0.9041719,0.961885
8,HALLMARK_G2M_CHECKPOINT,177,50,0.8013938,0.8904376
9,HALLMARK_APOPTOSIS,146,74,4.48256e-07,1.4008e-06
