In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import seaborn as sns
from scipy.stats import fisher_exact

In [2]:
# Load the per-gene sequence statistics and give the CSV's unnamed first
# column an explicit name so later cells can refer to it as 'gene'.
df = (
    pd.read_csv('../tables/tableS1_sequence_stats_by_gene.csv')
      .rename(columns={'Unnamed: 0': 'gene'})
)
df.head()

Unnamed: 0,gene,pi_AZ,pi_TXMXSA,DXY,PnPs
0,D8B26_000001,0.013427,,,0.90704
1,D8B26_000002,0.004111,,,0.430434
2,D8B26_000003,0.00643,,,0.524033
3,D8B26_000004,0.003481,,,0.270093
4,D8B26_000006,,,,0.652013


In [3]:
def resample(gene_list, df, resample_cols, n=10000):
    """Print a one-sided resampling p-value for the median of each column.

    For every column in ``resample_cols`` the rows of ``df`` whose index label
    is in ``gene_list`` form the candidate set. ``n`` random subsets of the
    same size are drawn from all usable rows of that column, and the reported
    p is the fraction of draws whose median is >= the candidate median.

    Parameters
    ----------
    gene_list : list-like
        Index labels of ``df`` that make up the candidate set.
    df : pandas.DataFrame
        Table whose index carries the labels used in ``gene_list``.
    resample_cols : iterable of str
        Columns of ``df`` to test; one report is printed per entry, in order.
    n : int, default 10000
        Number of random draws per column.

    Returns
    -------
    None
        All results are printed; nothing is returned.

    Notes
    -----
    The printed p is ``hits / n``, so it is exactly 0.0 whenever no draw
    reaches the candidate median. If no candidate has a usable value in a
    column, ``p = nan`` is printed for it and no draws are made.
    """
    for col in resample_cols:
        # Usable pool for this column: drop +inf first, then missing values.
        resample_col = df.loc[df[col] != np.inf, col].dropna()
        candidate_col = resample_col[resample_col.index.isin(gene_list)]
        resample_size = candidate_col.shape[0]
        candidate_median = candidate_col.median()
        resample_pool_size = resample_col.shape[0]

        print('candidate median {}: {}'.format(col, candidate_median))
        print('genomic median {}: {}'.format(col, resample_col.median()))
        print('# genes to resample: {}'.format(resample_size))
        print('resample pool size: {}'.format(resample_pool_size))

        if resample_size == 0:
            # With no candidates the median is NaN and every ">=" comparison
            # below would be False, which would be reported as a misleading
            # "p = 0.0". Report the p-value as undefined instead.
            print("p = {}".format(np.nan))
            print()
            continue

        p = 0
        for _ in range(n):
            # Each draw has the same size as the candidate set.
            sample = resample_col.sample(n=resample_size)
            if sample.median() >= candidate_median:
                p += 1

        print("p = {}".format(p/n))
        print()


In [4]:
# Expression annotations, indexed by the CSV's unnamed first column.
exp = pd.read_csv('../ref/CpSilv_gb_annotations_and_expression_data.csv').set_index('Unnamed: 0')
# Keep only rows that carry a Beyhan_exp value.
exp = exp.loc[exp['Beyhan_exp'].notna()]
# Index labels of the rows labelled 'spherule_upreg'.
spherule_genes = exp.loc[exp['Beyhan_exp'] == 'spherule_upreg'].index.tolist()

# Restrict the sequence statistics to genes that have expression data,
# and index the result by gene.
exp_df = df.loc[df['gene'].isin(exp.index)].set_index('gene')

(len(spherule_genes), len(exp_df))

(1082, 6998)

In [5]:
# Seed NumPy's global RNG before resampling.
np.random.seed(444)

# NOTE(review): 'pi_TXMXSA' is listed twice, so that column is resampled and
# reported twice. Dropping the duplicate would presumably change the random
# draws used for the later 'PnPs' test -- confirm whether it is intentional.
resample(
    gene_list=spherule_genes,
    df=exp_df,
    resample_cols=['pi_AZ', 'pi_TXMXSA', 'DXY', 'pi_TXMXSA', 'PnPs'],
)

candidate median pi_AZ: 0.0010586606585412
genomic median pi_AZ: 0.00093545551432545
# genes to resample: 971
resample pool size: 6690
p = 0.0

candidate median pi_TXMXSA: 0.0007454959618968
genomic median pi_TXMXSA: 0.0007069479730476
# genes to resample: 995
resample pool size: 6861
p = 0.0379

candidate median DXY: 0.00115346536927795
genomic median DXY: 0.00105619392137835
# genes to resample: 952
resample pool size: 6594
p = 0.0002

candidate median pi_TXMXSA: 0.0007454959618968
genomic median pi_TXMXSA: 0.0007069479730476
# genes to resample: 995
resample pool size: 6861
p = 0.0404

candidate median PnPs: 0.34245877535913505
genomic median PnPs: 0.3245855510080486
# genes to resample: 748
resample pool size: 5278
p = 0.0099



In [6]:
# Re-seed, then repeat the pi_AZ test alone with 10,000,000 draws.
# This cell is long-running.
np.random.seed(444)

resample(
    gene_list=spherule_genes,
    df=exp_df,
    resample_cols=['pi_AZ'],
    n=10000000,
)

candidate median pi_AZ: 0.0010586606585412
genomic median pi_AZ: 0.00093545551432545
# genes to resample: 971
resample pool size: 6690
p = 3e-07

