In [1]:
import pandas as pd
import numpy as np
from statsmodels.stats.contingency_tables import Table2x2

import matplotlib.pyplot as plt
import seaborn as sns

# Raise pandas' display limits so large tables are shown in full instead of
# being truncated (up to 500 rows and 500 columns).
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 500)

In [2]:
# Load the tab-separated rare-variant / expression table and drop the empty
# rows, which are recognised by a missing `Sample` value.
supp = (
    pd.read_csv('expression_rare_variant_list.txt', sep='\t')
    .dropna(subset=['Sample'])
    .copy()
)

In [3]:
# Relationships: read the family summaries, exclude subject SG011
# (NOTE(review): the reason for this exclusion is not recorded here), and
# index by subject while keeping `subject` available as a regular column.
rel = (
    pd.read_csv('../data/family_summaries.tsv', sep='\t')
    .loc[lambda frame: frame['subject'] != 'SG011']
    .set_index('subject', drop=False)
)

# Phenotypes: keep a single record per subject (the first one encountered),
# then index by subject.
pheno = (
    pd.read_csv('../data/pheno_final.tsv', sep='\t')
    .drop_duplicates(subset='subject')
    .set_index('subject')
)

In [4]:
# The 13 carrier offspring of trios, one line per shared ID suffix.
samples = [
    'FC_01', 'P1C_01',
    'P1C_04', 'P2C_04',
    'MC_05', 'P1C_05', 'P2C_05',
    'M2C_07', 'M1C_07', 'P2C_07', 'P3C_07', 'P1C_07',
    'P2C_52',
]

In [5]:
# Variant-class columns: every column of `supp` from position 9 onward
# (the nine leading columns are skipped).
# NOTE(review): the offset is positional, so it silently shifts if columns are
# added to or removed from the input file.
variants = supp.columns[9:].tolist()
print(variants)

['LOF', 'Splicing', 'Missense', 'Promoter', 'Enhancer', 'Silencer', "5' UTR", "3' UTR", 'Upstream', 'Downstream', 'Intron', 'Dup. interstitial', 'Dup. encapsulated', "Dup. 5' UTR", "Dup. 3' UTR", 'Del. interstitial', 'Del. encapsulated', "Del. 5' UTR", "Del. 3' UTR", 'STR intronic', 'STR upstream', "STR 5' UTR", 'STR exonic', "STR 3' UTR", 'STR downstream']


In [6]:
# Build the per-variant 2x2 contingency counts. Each record is:
# group, variant, variant and exp. change, variant and not exp. change,
# not variant and exp. change, not variant and not exp. change

# supplemental table restricted to the 13 trio offspring
subsupp = supp[supp.Sample.isin(samples)]

# A 'Diff. expression' value of '.' is counted as "no expression change";
# anything else is counted as an expression change.
has_exp_change = subsupp['Diff. expression'] != '.'
has_no_exp_change = subsupp['Diff. expression'] == '.'

records = []
for variant in variants:
    # Rows where the variant column is neither > 0 nor == 0 (e.g. missing)
    # fall into none of the four cells.
    carries_variant = subsupp[variant] > 0
    lacks_variant = subsupp[variant] == 0

    records.append([
        'All',
        variant,
        int((carries_variant & has_exp_change).sum()),
        int((carries_variant & has_no_exp_change).sum()),
        int((lacks_variant & has_exp_change).sum()),
        int((lacks_variant & has_no_exp_change).sum()),
    ])

# one row per variant class
stats = pd.DataFrame(
    records,
    columns=[
        'group',
        'variant',
        'variants_and_exp_change',
        'variants_and_no_exp_change',
        'no_variants_and_exp_change',
        'no_variants_and_no_exp_change',
    ],
)

In [7]:
# Get log odds ratios: for every variant class, fit a 2x2 table and record the
# log odds ratio together with its confidence bounds, standard error and p-value.
# NOTE(review): statsmodels' Table2x2 defaults to shift_zeros=True, which adds
# 0.5 to every cell when any cell count is zero — confirm this is intended for
# sparse variant classes.

for i, row in stats.iterrows():
    variants_and_exp_change       = row['variants_and_exp_change']
    variants_and_no_exp_change    = row['variants_and_no_exp_change']
    no_variants_and_exp_change    = row['no_variants_and_exp_change']
    no_variants_and_no_exp_change = row['no_variants_and_no_exp_change']

    # Table rows: expression change / no expression change.
    # Table columns: variant present / variant absent.
    cont = Table2x2([[variants_and_exp_change, no_variants_and_exp_change],
                     [variants_and_no_exp_change, no_variants_and_no_exp_change]])

    # Compute the confidence interval once and unpack both bounds, instead of
    # evaluating it separately for the lower and the upper limit.
    conf_lower, conf_upper = cont.log_oddsratio_confint()

    stats.at[i, 'log odds ratio conf. lower'] = conf_lower
    stats.at[i, 'log odds ratio'] = cont.log_oddsratio
    stats.at[i, 'log odds ratio conf. upper'] = conf_upper
    stats.at[i, 'log odds ratio SE'] = cont.log_oddsratio_se
    stats.at[i, 'log odds ratio pvalue'] = cont.log_oddsratio_pvalue()

In [8]:
# Index the table by variant name, keeping `variant` as a regular column too,
# so rows can be addressed by label.
stats = stats.set_index(keys='variant', drop=False)

# Preferred ordering of the 25 variant classes.
variant_order = [
    "LOF", "Missense", "Splicing",
    "5' UTR", "Upstream", "3' UTR", "Downstream", "Intron",
    "Promoter", "Enhancer", "Silencer",
    # "Del." classes
    "Del. encapsulated", "Del. interstitial", "Del. 5' UTR", "Del. 3' UTR",
    # "Dup." classes
    "Dup. encapsulated", "Dup. interstitial", "Dup. 5' UTR", "Dup. 3' UTR",
    # "STR" classes
    "STR exonic", "STR intronic", "STR 5' UTR",
    "STR upstream", "STR 3' UTR", "STR downstream",
]

In [9]:
# get significance
def get_sig(p, alpha=0.05):
    """Label a p-value as significant ('*') or not significant ('ns').

    Parameters
    ----------
    p : float
        P-value to classify.
    alpha : float, default 0.05
        Significance threshold. `p` must be strictly below it to be
        labelled '*'.

    Returns
    -------
    str
        '*' when ``p < alpha``, otherwise 'ns'. A NaN p-value compares as
        False and is therefore labelled 'ns'.
    """
    return '*' if p < alpha else 'ns'

# Flag each variant class from its raw log-odds-ratio p-value
# (no multiple-testing correction is applied at this step).
stats['significance'] = stats['log odds ratio pvalue'].map(get_sig)

In [10]:
# Save the table as a tab-separated file. The index is omitted from the output.
stats.to_csv(path_or_buf='de_log_odds.tsv', sep='\t', index=False)

In [12]:
# Show the p-value and significance flag for each variant class.
stats.loc[:, ['variant', 'log odds ratio pvalue', 'significance']]

Unnamed: 0_level_0,variant,log odds ratio pvalue,significance
variant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LOF,LOF,0.2509886,ns
Splicing,Splicing,0.7044323,ns
Missense,Missense,1.544703e-05,*
Promoter,Promoter,0.006508953,*
Enhancer,Enhancer,0.755594,ns
Silencer,Silencer,8.92119e-08,*
5' UTR,5' UTR,0.09210643,ns
3' UTR,3' UTR,0.1727539,ns
Upstream,Upstream,0.001458442,*
Downstream,Downstream,0.9934283,ns
