In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import seaborn as sns
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests

In [22]:
with open('../ref/genbank_files/pfam_domains.json') as f:
    pfam_dict = json.load(f)

# Invert the loaded mapping: pfam_dict goes key -> iterable of Pfam IDs,
# pfam_dict_rev goes Pfam ID -> list of the keys (genes) that carry it.
pfam_dict_rev = {}
for gene, pfams in pfam_dict.items():
    for p in pfams:
        # setdefault creates the list the first time a Pfam ID is seen.
        pfam_dict_rev.setdefault(p, []).append(gene)

In [23]:
def resample(gene_list, df, resample_cols, n=10000, noisy=False):
    """Estimate one-sided resampling p-values for the median of a gene set.

    For every column in ``resample_cols`` the resampling pool is each row of
    ``df`` whose value in that column is neither ``+inf`` nor missing. The
    candidate set is the subset of that pool whose index label is in
    ``gene_list``. ``n`` random subsets of the same size as the candidate set
    are drawn from the pool (without replacement, candidates included), and
    the reported value is the fraction of draws whose median is greater than
    or equal to the candidate median.

    Parameters
    ----------
    gene_list : iterable
        Index labels of ``df`` that make up the candidate set.
    df : pandas.DataFrame
        Table whose index holds the gene labels and whose columns hold the
        statistics to test.
    resample_cols : iterable of str
        Columns of ``df`` to test, one p-value per column.
    n : int, default 10000
        Number of random draws per column. Expected to be positive.
    noisy : bool, default False
        If True, print the candidate median, pool median, sample size, pool
        size and resulting p-value for each column.

    Returns
    -------
    list of float
        One entry per column, in the order of ``resample_cols``. An entry is
        ``nan`` when no gene in ``gene_list`` has a usable value in that
        column, because the comparison is undefined in that case.

    Notes
    -----
    A result of ``0.0`` only means that none of the ``n`` draws reached the
    candidate median, i.e. the estimate is below ``1/n``.

    Draws are made with ``Series.sample`` without an explicit
    ``random_state``; seed NumPy's global generator (``np.random.seed``)
    before calling for repeatable results.
    """
    p_list = []

    for col in resample_cols:

        # .loc selects the single column directly instead of copying the
        # whole filtered frame first.
        resample_col = df.loc[df[col] != np.inf, col].dropna()
        candidate_col = resample_col[resample_col.index.isin(gene_list)]
        resample_size = candidate_col.shape[0]
        candidate_median = candidate_col.median()
        resample_pool_size = resample_col.shape[0]

        if noisy:
            print('candidate median {}: {}'.format(col, candidate_median))
            print('genomic median {}: {}'.format(col, resample_col.median()))
            print('# genes to resample: {}'.format(resample_size))
            print('resample pool size: {}'.format(resample_pool_size))

        if resample_size == 0:
            # No candidate has data for this column: the candidate median is
            # NaN and every ">=" comparison would be False, which would be
            # reported as p = 0.0 (apparently highly significant). Report the
            # result as undefined instead.
            p_value = np.nan
        else:
            p = 0
            for i in range(n):

                sample = resample_col.sample(n=resample_size)
                if sample.median() >= candidate_median:
                    p += 1

            p_value = p / n

        if noisy:
            print("p = {}".format(p_value))
            print()

        p_list.append(p_value)

    return p_list

In [24]:
# Per-gene sequence statistics. The first CSV column has no header, so pandas
# names it 'Unnamed: 0'; it holds the row labels (e.g. D8B26_000001) and
# becomes the index.
df = (
    pd.read_csv('../tables/tableS1_sequence_stats_by_gene.csv')
    .set_index('Unnamed: 0')
)
df.head()

Unnamed: 0_level_0,pi_AZ,pi_TXMXSA,DXY,PnPs
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
D8B26_000001,0.013427,,,0.90704
D8B26_000002,0.004111,,,0.430434
D8B26_000003,0.00643,,,0.524033
D8B26_000004,0.003481,,,0.270093
D8B26_000006,,,,0.652013


In [25]:
# Pfam accession under test and the genes annotated with it.
pfam = 'PFAM:PF01636'
pfam_genes = pfam_dict_rev[pfam]

# resample() draws through NumPy's global RNG, so fix the seed to make the
# reported p-values repeatable.
np.random.seed(444)
resample(
    pfam_genes,
    df,
    ['pi_AZ', 'DXY', 'pi_TXMXSA', 'PnPs'],
    noisy=True,
)


candidate median pi_AZ: 0.0017637281226261
genomic median pi_AZ: 0.0009547460527852
# genes to resample: 27
resample pool size: 7509
p = 0.0001

candidate median DXY: 0.0019837310772341
genomic median DXY: 0.0010734330587271
# genes to resample: 27
resample pool size: 7391
p = 0.0

candidate median pi_TXMXSA: 0.0012299140553721
genomic median pi_TXMXSA: 0.0007203179334326
# genes to resample: 29
resample pool size: 7699
p = 0.0026

candidate median PnPs: 0.5830364566081423
genomic median PnPs: 0.32850767233678113
# genes to resample: 20
resample pool size: 5818
p = 0.0077



[0.0001, 0.0, 0.0026, 0.0077]

In [26]:
# Repeat the DXY test with ten times more draws: with the default 10,000
# draws none reached the candidate median (p = 0.0), so more draws are needed
# to resolve the estimate. No re-seeding here, so the result depends on the
# RNG state left by the preceding cell.
resample(pfam_genes, df, ['DXY'], n=100_000, noisy=True)

candidate median DXY: 0.0019837310772341
genomic median DXY: 0.0010734330587271
# genes to resample: 27
resample pool size: 7391
p = 8e-05



[8e-05]