In [1]:
import pandas as pd

In [2]:
# load in annovar annotated GTEx eQTLs
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which is what triggers the mixed-type
# DtypeWarning and can leave a column holding a mix of str and numeric values.
gtex = pd.read_csv('gtex_eqtls_annotated_gts.tsv', sep='\t', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# load in original gtex with gene id
gtex_orig = pd.read_csv('/data5/16p12_RNA/GTEx/data/GTEx_Analysis_v7_eQTL/Cells_EBV-transformed_lymphocytes.v7.signif_variant_gene_pairs.txt.gz', sep='\t')

# The gene ids are copied over by row position, so the two tables must have
# the same number of rows; otherwise ids would be silently dropped or NaN-filled.
# NOTE(review): this also assumes both files list the eQTLs in the same order -- confirm.
if len(gtex_orig) != len(gtex):
    raise ValueError(
        'row count mismatch: annotated eQTL table has %d rows, original GTEx table has %d'
        % (len(gtex), len(gtex_orig))
    )

# add eGENE to gtex df
gtex['gene_id'] = gtex_orig['gene_id'].values

In [4]:
# load in gene name map, indexed by the 'Name' column (kept as a column too)
mapp = pd.read_csv('../16p12.2_rnaseq_analysis/data/gene_names_mapping_new.tsv', sep='\t')
mapp = mapp.set_index('Name', drop=False)

# keep only some columns
cols = ['Name', 'Description']
mapp = mapp[cols].copy()

# add hgnc symbol to gtex df
# A single label-based lookup replaces one .loc call per eQTL row.
# .values drops the gene-id index so the result is assigned by row position.
gtex['gene'] = mapp.loc[gtex['gene_id'], 'Description'].values

In [5]:
# keep only some columns
cols = ['#HGNCsymbol', 'pLI', 'pLI_%', 'BrainExpressed_Kang2011', 'Developmental_delay_DDD',
        'DBD', 'DDDG2P', 'SFARI_Gene', 'Purcell_Schiz', 'NDD_all', 'OMIM_phenotype']

# These columns are compared against the strings '1' and '.' further down in
# the notebook, so read them as text; if pandas inferred a numeric dtype those
# comparisons would silently evaluate to False.
string_flag_cols = ['pLI', 'BrainExpressed_Kang2011', 'Developmental_delay_DDD',
                    'DBD', 'DDDG2P', 'SFARI_Gene', 'Purcell_Schiz']

# load in annotations
anno = pd.read_csv('../16p12.2_rnaseq_analysis/outlier_expression_analysis/gene_annotations.txt',
                   sep='\t', dtype={name: str for name in string_flag_cols})

anno = anno[cols].copy()

# set index (the symbol is kept as a regular column as well)
anno = anno.set_index('#HGNCsymbol', drop=False)

In [6]:
# collect every gene symbol that appears in the annotation index
hgnc_symbols = [symbol for symbol in anno.index]

In [10]:
def get_pli(gene):
    """Return True when the 'pLI' value recorded for `gene` in `anno` is above 0.9.

    Returns False when the gene is not in the annotation index, when its
    'pLI' entry is the placeholder '.', or when the value is 0.9 or lower.
    """
    # Membership on the index is a hash lookup; scanning the hgnc_symbols
    # list would be linear in the number of annotated genes for every call.
    if gene not in anno.index:
        return False
    pli = anno.loc[gene, 'pLI']
    if pli == '.':
        return False
    return float(pli) > 0.9

mapp['pLI'] = mapp['Description'].apply(get_pli)

In [12]:
def is_brain_expressed(gene):
    """Return True when `gene` has the value '1' in anno's 'BrainExpressed_Kang2011' column.

    Genes missing from the annotation index, or carrying any other value,
    give False.
    """
    # Membership on the index is a hash lookup; scanning the hgnc_symbols
    # list would be linear in the number of annotated genes for every call.
    if gene not in anno.index:
        return False
    return bool(anno.loc[gene, 'BrainExpressed_Kang2011'] == '1')

mapp['BrainExpressed_Kang2011'] = mapp['Description'].apply(is_brain_expressed)

In [17]:
def is_disorder_gene(gene):
    """Return True when `gene` has the value '1' in anno's 'Developmental_delay_DDD' column.

    Genes missing from the annotation index, or carrying any other value,
    give False.

    NOTE: the name `is_disorder_gene` is redefined by several cells in this
    notebook, each reading a different annotation column; run this cell as a
    whole so the apply below uses this definition.
    """
    # Membership on the index is a hash lookup; scanning the hgnc_symbols
    # list would be linear in the number of annotated genes for every call.
    if gene not in anno.index:
        return False
    return bool(anno.loc[gene, 'Developmental_delay_DDD'] == '1')

mapp['Developmental_delay_DDD'] = mapp['Description'].apply(is_disorder_gene)

In [20]:
def is_disorder_gene(gene):
    """Return True when `gene` has any value other than '.' in anno's 'DBD' column.

    Genes missing from the annotation index give False.

    NOTE: the name `is_disorder_gene` is redefined by several cells in this
    notebook, each reading a different annotation column; run this cell as a
    whole so the apply below uses this definition.
    """
    # Membership on the index is a hash lookup; scanning the hgnc_symbols
    # list would be linear in the number of annotated genes for every call.
    if gene not in anno.index:
        return False
    return bool(anno.loc[gene, 'DBD'] != '.')

mapp['DBD'] = mapp['Description'].apply(is_disorder_gene)

In [27]:
def is_disorder_gene(gene):
    """Return True when `gene` has the value '1' in anno's 'DDDG2P' column.

    Genes missing from the annotation index, or carrying any other value,
    give False.

    NOTE: the name `is_disorder_gene` is redefined by several cells in this
    notebook, each reading a different annotation column; run this cell as a
    whole so the apply below uses this definition.
    """
    # Membership on the index is a hash lookup; scanning the hgnc_symbols
    # list would be linear in the number of annotated genes for every call.
    if gene not in anno.index:
        return False
    return bool(anno.loc[gene, 'DDDG2P'] == '1')

mapp['DDDG2P'] = mapp['Description'].apply(is_disorder_gene)

In [26]:
def is_disorder_gene(gene):
    """Return True when `gene` has any value other than '.' in anno's 'SFARI_Gene' column.

    Genes missing from the annotation index give False.

    NOTE: the name `is_disorder_gene` is redefined by several cells in this
    notebook, each reading a different annotation column; run this cell as a
    whole so the apply below uses this definition.
    """
    # Membership on the index is a hash lookup; scanning the hgnc_symbols
    # list would be linear in the number of annotated genes for every call.
    if gene not in anno.index:
        return False
    return bool(anno.loc[gene, 'SFARI_Gene'] != '.')

mapp['SFARI_Gene'] = mapp['Description'].apply(is_disorder_gene)

In [31]:
def is_disorder_gene(gene):
    """Return True when `gene` has the value '1' in anno's 'Purcell_Schiz' column.

    Genes missing from the annotation index, or carrying any other value,
    give False.

    NOTE: the name `is_disorder_gene` is redefined by several cells in this
    notebook, each reading a different annotation column; run this cell as a
    whole so the apply below uses this definition.
    """
    # Membership on the index is a hash lookup; scanning the hgnc_symbols
    # list would be linear in the number of annotated genes for every call.
    if gene not in anno.index:
        return False
    return bool(anno.loc[gene, 'Purcell_Schiz'] == '1')

mapp['Purcell_Schiz'] = mapp['Description'].apply(is_disorder_gene)

In [40]:

# gene-set flag columns to copy from the gene map onto every eQTL row
cols = ['pLI', 'BrainExpressed_Kang2011',
        'Developmental_delay_DDD', 'DBD', 'DDDG2P', 'SFARI_Gene',
        'Purcell_Schiz']

# One label-based lookup per column replaces one .loc call per eQTL row.
# .values drops the gene-id index so the flags are assigned by row position.
gene_ids = gtex['gene_id']
for col in cols:
    gtex[col] = mapp.loc[gene_ids, col].values

In [45]:
# genotype columns are the ones whose name starts with 'SG'
samples = [s for s in gtex.columns if s.startswith('SG')]


def _alt_allele_number(alt, vcf_alt):
    """Return the 1-based position of `alt` in the comma-separated `vcf_alt` string.

    Returns 0 when `alt` is not listed; if it is listed more than once the
    last position is returned.
    """
    number = 0
    for position, allele in enumerate(vcf_alt.split(','), start=1):
        if allele == alt:
            number = position
    return number


def _called_alleles(gt):
    """Split a genotype string into its individual allele codes.

    NOTE(review): assumes VCF-style calls such as '0/1' or '0|1', optionally
    followed by ':'-separated fields -- confirm against the input file.
    """
    return gt.split(':', 1)[0].replace('|', '/').split('/')


# table of per-sample counts: one row per sample, 'all' plus one column per gene set
d = pd.DataFrame(0, index=samples, columns=['all'] + cols)

# iterate through all gtex eqtls
for _, row in gtex.iterrows():
    # allele number of the GTEx alt allele among the alt alleles listed in 'vcf_alt'
    alt_number = _alt_allele_number(row['Alt'], row['vcf_alt'])

    # if alt allele not in vcf then skip
    if alt_number == 0:
        continue

    # frequency of the alternate allele; '.' is treated as 0
    freq_alt = row['gnomAD_genome_ALL']
    freq_alt = 0 if freq_alt == '.' else float(freq_alt)

    # is alt or ref the minor allele? ('0' is the reference allele code)
    maf_number = str(alt_number) if freq_alt < 0.5 else '0'

    # gene sets this eQTL's gene belongs to; the same for every sample in the row
    flagged = [col for col in cols if row[col]]

    # for each sample, does it have the minor allele
    for samp in samples:
        # Compare against whole allele codes: a substring test on the raw
        # genotype string would let '1' match '0/10' or digits in extra fields.
        if maf_number in _called_alleles(row[samp]):
            d.at[samp, 'all'] += 1

            # also add to disease gene sets
            for col in flagged:
                d.at[samp, col] += 1
        

In [49]:
# label the index so the sample ids get a header in the written table
d.index = d.index.rename('sample')

# write the per-sample counts as a tab-separated file
d.to_csv(path_or_buf='minor_allele_summary_disease.tsv', sep='\t')

In [51]:
# display the per-sample count table ('all' plus one column per gene set)
d

Unnamed: 0_level_0,all,pLI,BrainExpressed_Kang2011,Developmental_delay_DDD,DBD,DDDG2P,SFARI_Gene,Purcell_Schiz
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SG001,104471,4936,52433,2418,1439,8117,1417,6513
SG003,104984,5015,50592,2438,1453,7552,1385,7423
SG006,101937,5146,51030,2224,1348,7097,1311,6416
SG007,102677,5147,50787,2198,1981,7592,2056,6582
SG002,106468,5611,50768,2477,1731,8283,1488,6810
SG011,108101,4314,51752,2452,1651,7965,1281,6294
SG025,101238,5150,48417,2149,1332,7526,2034,7260
SG022,105410,5202,49680,2325,2407,7899,1697,6479
SG024,104920,5292,49855,2192,2656,8041,1328,6883
SG021,104531,5821,50708,2038,2508,7224,2260,7574
