In [1]:
import pandas as pd
from statsmodels.stats.contingency_tables import Table2x2

In [2]:
# Load the tab-separated rare-variant table.
supp_raw = pd.read_csv('expression_rare_variant_list.txt', sep='\t')

# Keep only rows that have a value in the 'Sample' column; the copy makes
# `supp` an independent frame rather than a view of the raw table.
sample_missing = supp_raw['Sample'].isna()
supp = supp_raw.loc[~sample_missing].copy()

In [3]:
# Sample IDs of the 13 carrier offspring of trios (one ID per line).
samples = [
    'FC_01',
    'P1C_01',
    'P1C_04',
    'P2C_04',
    'MC_05',
    'P1C_05',
    'P2C_05',
    'M2C_07',
    'M1C_07',
    'P2C_07',
    'P3C_07',
    'P1C_07',
    'P2C_52',
]

In [4]:
# Variant categories are all columns of `supp` after the first nine.
# NOTE(review): assumes exactly nine leading non-variant columns -- confirm
# against the header of the input file.
variants = list(supp.columns[9:])
print(variants)

['LOF', 'Splicing', 'Missense', 'Promoter', 'Enhancer', 'Silencer', "5' UTR", "3' UTR", 'Upstream', 'Downstream', 'Intron', 'Dup. interstitial', 'Dup. encapsulated', "Dup. 5' UTR", "Dup. 3' UTR", 'Del. interstitial', 'Del. encapsulated', "Del. 5' UTR", "Del. 3' UTR", 'STR intronic', 'STR upstream', "STR 5' UTR", 'STR exonic', "STR 3' UTR", 'STR downstream']


In [5]:
# Restrict the supplemental table to the 13 trio samples.
subsupp = supp[supp.Sample.isin(samples)]

# A row counts as changed when 'Alt. splicing' holds anything other than '.'.
# These masks do not depend on the variant, so build them once.
changed = subsupp['Alt. splicing'] != '.'
unchanged = subsupp['Alt. splicing'] == '.'

stats_columns = ['group', 'variant', 'variants_and_exp_change', 'variants_and_no_exp_change',
                 'no_variants_and_exp_change', 'no_variants_and_no_exp_change']

# One record per variant category holding the four cells of its 2x2 table.
# Rows where the variant column is neither > 0 nor == 0 (e.g. missing) fall
# into none of the four cells.
records = []
for variant in variants:
    carries_variant = subsupp[variant] > 0
    lacks_variant = subsupp[variant] == 0

    records.append([
        'All',
        variant,
        int((carries_variant & changed).sum()),
        int((carries_variant & unchanged).sum()),
        int((lacks_variant & changed).sum()),
        int((lacks_variant & unchanged).sum()),
    ])

# Records to dataframe.
stats = pd.DataFrame(records, columns=stats_columns)

In [6]:
# get log odds ratios
#
# For every row of `stats`, build a 2x2 table laid out as
#     [[variant & change,    no variant & change],
#      [variant & no change, no variant & no change]]
# and record the log odds ratio, its confidence interval, standard error and
# p-value. Per the statsmodels documentation, Table2x2 defaults to
# shift_zeros=True (0.5 is added to every cell when any cell is zero) and
# log_oddsratio_confint() defaults to alpha=0.05.

count_columns = ['variants_and_exp_change', 'no_variants_and_exp_change',
                 'variants_and_no_exp_change', 'no_variants_and_no_exp_change']
log_odds_columns = ['log odds ratio conf. lower', 'log odds ratio', 'log odds ratio conf. upper',
                    'log odds ratio SE', 'log odds ratio pvalue']

# Collect the results first and attach them afterwards, instead of writing
# new columns into `stats` while iterating over it.
log_odds_rows = []
for var_change, novar_change, var_nochange, novar_nochange in stats[count_columns].itertuples(index=False, name=None):
    cont = Table2x2([[var_change, novar_change],
                     [var_nochange, novar_nochange]])

    # Compute the interval once and unpack both bounds.
    conf_lower, conf_upper = cont.log_oddsratio_confint()

    log_odds_rows.append([conf_lower, cont.log_oddsratio, conf_upper,
                          cont.log_oddsratio_se, cont.log_oddsratio_pvalue()])

log_odds = pd.DataFrame(log_odds_rows, columns=log_odds_columns, dtype=float)

# Assign positionally (plain arrays) so the result does not depend on the
# current index of `stats`; re-running the cell simply overwrites the columns.
for column in log_odds_columns:
    stats[column] = log_odds[column].to_numpy()

In [7]:
# Index the table by variant name. drop=False keeps 'variant' as a regular
# column as well, so it can still be selected by name and is still written
# out when the index itself is omitted.
stats = stats.set_index(keys='variant', drop=False)

# Preferred ordering of the 25 variant categories.
# NOTE(review): this list is only defined here; nothing in this cell applies
# it to `stats` -- confirm where it is consumed.
variant_order = [
    'LOF', 'Missense', 'Splicing', "5' UTR", 'Upstream', "3' UTR", 'Downstream', 'Intron',
    'Promoter', 'Enhancer', 'Silencer',
    # 'Del.' categories
    'Del. encapsulated', 'Del. interstitial', "Del. 5' UTR", "Del. 3' UTR",
    # 'Dup.' categories
    'Dup. encapsulated', 'Dup. interstitial', "Dup. 5' UTR", "Dup. 3' UTR",
    # 'STR' categories
    'STR exonic', 'STR intronic', "STR 5' UTR", 'STR upstream', "STR 3' UTR", 'STR downstream',
]

In [8]:
# get significance
def get_sig(p, alpha=0.05):
    """Label a p-value as significant ('*') or not significant ('ns').

    Parameters
    ----------
    p : float
        The p-value to classify.
    alpha : float, default 0.05
        Significance threshold. A p-value strictly below `alpha` is
        labelled significant.

    Returns
    -------
    str
        '*' when ``p < alpha``, otherwise 'ns'. A NaN p-value compares as
        False and is therefore labelled 'ns'.
    """
    return '*' if p < alpha else 'ns'

# Label each log odds ratio p-value element-wise with get_sig.
# NOTE(review): the p-values are used as computed, with no adjustment for the
# number of variant categories tested -- confirm this is intended.
pvalues = stats['log odds ratio pvalue']
stats['significance'] = pvalues.map(get_sig)

In [9]:
# Write the full table as tab-separated text. The index is not written, so
# the variant names reach the file through the 'variant' column.
stats.to_csv(
    'splicing_log_odds.tsv',
    sep='\t',
    index=False,
)

In [10]:
# Show the p-value and significance label for every variant category.
stats.loc[:, ['variant', 'log odds ratio pvalue', 'significance']]

Unnamed: 0_level_0,variant,log odds ratio pvalue,significance
variant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LOF,LOF,0.0679572,ns
Splicing,Splicing,0.0007470814,*
Missense,Missense,0.01185734,*
Promoter,Promoter,0.469426,ns
Enhancer,Enhancer,0.006173412,*
Silencer,Silencer,0.316127,ns
5' UTR,5' UTR,0.2462985,ns
3' UTR,3' UTR,0.2887841,ns
Upstream,Upstream,0.05958046,ns
Downstream,Downstream,0.2380287,ns
