**Calculate Dxy p values using a resampling test for S. cerevisiae populations (Peter et al., 2018) and European S. paradoxus (Bergstrom et al., 2014).**

In [1]:
import pandas as pd
import numpy as np
import csv
from resample import resample_med

In [2]:
# Essential gene list, from Winzeler et al., 1999.
essential = pd.read_csv(
    '/Users/clairedubin/sacc/external_datasets/essential.csv',
    header=None,
)
# Column 1 holds the gene entries; remove any leading/trailing tab characters.
essential[1] = essential[1].str.strip('\t')
# Keep only the text before the first space of each entry.
essential_genes = [entry.split(' ')[0] for entry in essential[1]]

In [3]:
# Candidate genes: systematic ORF name -> standard gene name.
# Insertion order is kept because it determines the order of the
# "missing" report printed by the resampling loop.
gene_dict = dict(
    YLR397C='AFG2',
    YGR098C='ESP1',
    YMR168C='CEP3',
    YKR054C='DYN1',
    YHR023W='MYO1',
    YDR180W='SCC2',
    YPL174C='NIP100',
    YCR042C='TAF2',
    YMR016C='SOK2',
    YJR135C='MCM22',
    YJL025W='RRN7',
    YDR443C='SSN2',
    YKL134C='OCT1',
    YPR164W='MMS1',
)

In [4]:
# Group S. cerevisiae strains by population (clade) and count the strains in each.

pops = pd.read_csv('/Users/clairedubin/sacc/external_datasets/1011_pops_tableS1.csv', skiprows=3)
# Keep only rows that have both a strain name and a clade assignment.
pops = pops[['Standardized name', 'Clades']].dropna()
# Build a compact population label by deleting every non-word character
# (whitespace, punctuation) from the clade name.
# The pattern is a raw string and regex=True is passed explicitly: pandas >= 2.0
# defaults Series.str.replace to regex=False, which would treat the pattern as
# literal text and silently leave the clade names unchanged.
pops = pops.assign(population=pops['Clades'].str.replace(r'[^\w]', '', regex=True))

# population label -> number of strains assigned to that population
pop_dict = pops.groupby('population')['Standardized name'].count().to_dict()
pop_dict

  pops['population'] = pops['Clades'].str.replace('[^\w]','')


{'10FrenchGuianahuman': 31,
 '11Alebeer': 18,
 '12WestAfricancocoa': 13,
 '13Africanpalmwine': 28,
 '14CHNIII': 2,
 '15CHNII': 2,
 '16CHNI': 1,
 '17Taiwanese': 3,
 '18FarEastAsia': 9,
 '19Malaysian': 6,
 '1WineEuropean': 268,
 '1WineEuropeansubclade1': 18,
 '1WineEuropeansubclade2': 13,
 '1WineEuropeansubclade3': 24,
 '1WineEuropeansubclade4': 39,
 '20CHNV': 2,
 '21Ecuadorean': 10,
 '22FarEastRussian': 4,
 '23NorthAmericanoak': 13,
 '24Asianislands': 11,
 '25Sake': 47,
 '26Asianfermentation': 39,
 '2Alpechin': 17,
 '3Brazilianbioethanol': 35,
 '4Mediterraneanoak': 8,
 '5Frenchdairy': 32,
 '6Africanbeer': 20,
 '7Mosaicbeer': 21,
 '8Mixedorigin': 72,
 '9Mexicanagave': 7,
 'M1Mosaicregion1': 17,
 'M2Mosaicregion2': 20,
 'M3Mosaicregion3': 113}

In [5]:
# Load the raw Dxy table. The file is read with header=None, so the column
# labels are attached after loading.

dxy_path = '/Users/clairedubin/sacc/carly_genes/dxy_1011pops_EuroSpar.csv'
all_dxy = pd.read_csv(dxy_path, header=None)
all_dxy.columns = [
    'population',
    'gene',
    'dxy',
    'spar_strain_count',
    'scer_strain_count',
]
all_dxy.head()

Unnamed: 0,population,gene,dxy,spar_strain_count,scer_strain_count
0,19Malaysian,YLR457C,0.131458,10,6
1,M1Mosaicregion1,YLR457C,0.134645,10,17
2,19Malaysian,YLR129W,0.09049,10,6
3,M2Mosaicregion2,YLR457C,0.134531,10,20
4,19Malaysian,YIL093C,0.072704,10,6


In [None]:
# Keep only rows with spar_strain_count of at least 8. The companion filter on
# scer_strain_count (>= 75% of the population's strains) is not applied here;
# it is applied per population inside the resampling loop.
has_enough_spar = all_dxy['spar_strain_count'] >= 8
all_dxy = all_dxy[has_enough_spar]

### Resample Dxy by Scer population

In [8]:
# Seed NumPy's global RNG before resampling
# (presumably resample_med draws from it -- confirm in resample.py).
np.random.seed(777)

# population -> [median dxy of all retained genes, median dxy of candidate genes,
#                number of retained rows, resampling p value]
p_dict = {}

for pop, size in pop_dict.items():
    # Rows for this population where at least 75% of its strains are counted.
    df = all_dxy[(all_dxy['population'] == pop)
                 & (all_dxy['scer_strain_count'] >= .75*size)]

    # Subset of those rows belonging to the candidate genes.
    candidates = df[df['gene'].isin(list(gene_dict))]

    print('')
    print('---- {} ----'.format(pop))

    # Report candidate genes (by standard name) that have no row in this population.
    present = set(candidates['gene'])
    missing = [name for orf, name in gene_dict.items() if orf not in present]
    print('missing: ', missing)

    p = resample_med(candidates, df, 'dxy', essential_genes, direction='greater_than')

    p_dict[pop] = [
        df['dxy'].median(),
        candidates['dxy'].median(),
        len(df),
        p,
    ]


---- 10FrenchGuianahuman ----
missing:  []
candidate gene median dxy: 0.12131835182103609
essential count: 6; nonessential_count: 8
resampling pool size: 4570
p = 0.004

---- 11Alebeer ----
missing:  []
candidate gene median dxy: 0.11847805744303011
essential count: 6; nonessential_count: 8
resampling pool size: 4562
p = 0.004

---- 12WestAfricancocoa ----
missing:  []
candidate gene median dxy: 0.12033658386278165
essential count: 6; nonessential_count: 8
resampling pool size: 4569
p = 0.0032

---- 13Africanpalmwine ----
missing:  []
candidate gene median dxy: 0.12097061883124041
essential count: 6; nonessential_count: 8
resampling pool size: 4562
p = 0.0031

---- 14CHNIII ----
missing:  []
candidate gene median dxy: 0.1206012060674587
essential count: 6; nonessential_count: 8
resampling pool size: 4574
p = 0.0037

---- 15CHNII ----
missing:  []
candidate gene median dxy: 0.11991804427010361
essential count: 6; nonessential_count: 8
resampling pool size: 4573
p = 0.0045

---- 16CHNI 

In [10]:
# Write one summary row per S. cerevisiae population.
# newline='' is what the csv module requires of a file handed to csv.writer:
# the writer emits its own '\r\n' line terminators, and without newline=''
# platforms that translate '\n' on write (Windows) produce '\r\r\n', which
# shows up as a blank line after every row.
with open('dxy_p_vals_by_scer_pop_Eurospar.csv', 'w', newline='') as f:
    w = csv.writer(f, delimiter=',')
    w.writerow(['S. cerevisiae population', 'Genomic median Dxy', 'Candidate genes median Dxy', 'Resampling pool size', 'Resampling p value'])

    # Each p_dict value is a 4-item list lining up with the last four header columns.
    for pop, stats in p_dict.items():
        w.writerow([pop] + stats)