**Calculate Dxy p values using a resampling test for S. cerevisiae populations (Peter et al., 2018) and North American S. paradoxus subpopulation B (Durand et al., 2019).**

In [1]:
import pandas as pd
import numpy as np
import csv
from resample import resample_med

In [2]:
# Essential genes, from Winzeler et al., 1999.
# The file is read without a header row, so columns are addressed by position.
essential = pd.read_csv(
    '/Users/clairedubin/sacc/external_datasets/essential.csv',
    header=None,
)

# Trim leading/trailing tab characters from column 1 (stored back on the frame).
essential[1] = essential[1].str.strip('\t')

# Keep only the text before the first space of each column-1 entry.
# NOTE(review): presumably that leading token is the gene identifier matched
# against the `gene` column of the Dxy table -- confirm against the file.
essential_genes = [entry.partition(' ')[0] for entry in essential[1]]

In [3]:
# Candidate genes. Keys are the identifiers matched against the `gene` column
# of the Dxy table; values are the names printed in the per-population
# "missing" report. Insertion order is the order used in that report.
gene_dict = {
    'YLR397C': 'AFG2',
    'YGR098C': 'ESP1',
    'YMR168C': 'CEP3',
    'YKR054C': 'DYN1',
    'YHR023W': 'MYO1',
    'YDR180W': 'SCC2',
    'YPL174C': 'NIP100',
    'YCR042C': 'TAF2',
    'YMR016C': 'SOK2',
    'YJR135C': 'MCM22',
    'YJL025W': 'RRN7',
    'YDR443C': 'SSN2',
    'YKL134C': 'OCT1',
    'YPR164W': 'MMS1',
}

In [4]:
#group Scer strains by population

# The first 3 lines of the table are skipped before the header row is read.
pops = pd.read_csv('/Users/clairedubin/sacc/external_datasets/1011_pops_tableS1.csv', skiprows=3)

# Keep only strains that have both a name and a clade. Take an explicit copy so
# that adding a column below operates on an independent frame.
pops = pops[['Standardized name', 'Clades']].dropna().copy()

# Build the population key by deleting every non-word character from the clade
# label (e.g. the key '10FrenchGuianahuman' seen in the output below).
# The pattern must be a raw string ('\w' is not a valid Python escape) and
# regex=True must be explicit: newer pandas versions default to a literal,
# non-regex replacement, which would leave the labels unchanged.
pops['population'] = pops['Clades'].str.replace(r'[^\w]', '', regex=True)

# population key -> number of strains with that key
pop_dict = pops.groupby('population')['Standardized name'].count().to_dict()
pop_dict

  pops['population'] = pops['Clades'].str.replace('[^\w]','')


{'10FrenchGuianahuman': 31,
 '11Alebeer': 18,
 '12WestAfricancocoa': 13,
 '13Africanpalmwine': 28,
 '14CHNIII': 2,
 '15CHNII': 2,
 '16CHNI': 1,
 '17Taiwanese': 3,
 '18FarEastAsia': 9,
 '19Malaysian': 6,
 '1WineEuropean': 268,
 '1WineEuropeansubclade1': 18,
 '1WineEuropeansubclade2': 13,
 '1WineEuropeansubclade3': 24,
 '1WineEuropeansubclade4': 39,
 '20CHNV': 2,
 '21Ecuadorean': 10,
 '22FarEastRussian': 4,
 '23NorthAmericanoak': 13,
 '24Asianislands': 11,
 '25Sake': 47,
 '26Asianfermentation': 39,
 '2Alpechin': 17,
 '3Brazilianbioethanol': 35,
 '4Mediterraneanoak': 8,
 '5Frenchdairy': 32,
 '6Africanbeer': 20,
 '7Mosaicbeer': 21,
 '8Mixedorigin': 72,
 '9Mexicanagave': 7,
 'M1Mosaicregion1': 17,
 'M2Mosaicregion2': 20,
 'M3Mosaicregion3': 113}

In [5]:
# Load the raw Dxy table. It is read without a header row, so the column
# names are assigned explicitly afterwards.
dxy_column_names = [
    'population',
    'gene',
    'dxy',
    'spar_strain_count',
    'scer_strain_count',
]

all_dxy = pd.read_csv(
    '/Users/clairedubin/sacc/carly_genes/dxy_1011pops_NASpar.csv',
    header=None,
)
all_dxy.columns = dxy_column_names

# Preview the first rows.
all_dxy.head()

Unnamed: 0,population,gene,dxy,spar_strain_count,scer_strain_count
0,17Taiwanese,YLR457C,0.127083,10,3
1,19Malaysian,YLR457C,0.127083,10,6
2,14CHNIII,YLR457C,0.128125,10,2
3,15CHNII,YLR457C,0.126562,10,2
4,18FarEastAsia,YLR457C,0.129398,10,9


In [9]:
# Mean Dxy over every row of the 1WineEuropean population, computed on
# `all_dxy` as it stands when this cell runs.
is_wine_european = all_dxy['population'] == '1WineEuropean'
all_dxy.loc[is_wine_european, 'dxy'].mean()

0.10360336791554094

In [10]:
# Drop any rows where spar_strain_count < 8.
# Only the S. paradoxus count is filtered here; the scer_strain_count >= 75%
# of-population filter is applied per population inside the resampling loop.
# NOTE(review): this cell rebinds `all_dxy`, so the resampling cell must be
# (re)run after it for the filter to take effect.
has_enough_spar_strains = all_dxy['spar_strain_count'] >= 8
all_dxy = all_dxy.loc[has_enough_spar_strains]

### Resample Dxy by Scer population

In [6]:
# Seed NumPy's global RNG before resampling.
# NOTE(review): presumably resample_med draws from np.random, which is what
# makes the p values repeatable -- confirm in resample.py.
np.random.seed(67)

# population -> [median dxy of all kept genes, median dxy of candidate genes,
#                number of kept rows, resampling p value]
p_dict = {}

for pop, size in pop_dict.items():
    # Rows of this population whose scer_strain_count is at least 75% of the
    # population's strain count from pop_dict.
    keep = (all_dxy['population'] == pop) & (all_dxy['scer_strain_count'] >= .75*size)
    df = all_dxy.loc[keep]

    # Subset of those rows that belong to the candidate genes.
    candidates = df.loc[df['gene'].isin(gene_dict.keys())]

    print('')
    print('---- {} ----'.format(pop))

    # Report candidate genes with no surviving row for this population.
    genes_present = set(candidates['gene'])
    print('missing: ', [name for gene_id, name in gene_dict.items() if gene_id not in genes_present])

    # Test whether the candidate median dxy is greater than expected from the
    # resampling pool (direction='greater_than'); see resample.resample_med.
    p = resample_med(candidates, df, 'dxy', essential_genes, direction='greater_than')
    p_dict[pop] = [df['dxy'].median(), candidates['dxy'].median(), len(df), p]

    print('p = ', p)
    


---- 10FrenchGuianahuman ----
missing:  ['CEP3']
candidate gene median dxy: 0.1258814703675919
essential count: 5; nonessential_count: 8
resampling pool size: 4776
p = 0.0007
p =  0.0007

---- 11Alebeer ----
missing:  ['CEP3']
candidate gene median dxy: 0.1225870508912903
essential count: 5; nonessential_count: 8
resampling pool size: 4765
p = 0.0022
p =  0.0022

---- 12WestAfricancocoa ----
missing:  ['CEP3']
candidate gene median dxy: 0.1258008742741567
essential count: 5; nonessential_count: 8
resampling pool size: 4777
p = 0.0008
p =  0.0008

---- 13Africanpalmwine ----
missing:  ['CEP3']
candidate gene median dxy: 0.1265896087117017
essential count: 5; nonessential_count: 8
resampling pool size: 4765
p = 0.0007
p =  0.0007

---- 14CHNIII ----
missing:  ['CEP3']
candidate gene median dxy: 0.1253229974160206
essential count: 5; nonessential_count: 8
resampling pool size: 4780
p = 0.0024
p =  0.0024

---- 15CHNII ----
missing:  ['CEP3']
candidate gene median dxy: 0.1248708010335917


In [7]:
# Write one summary row per S. cerevisiae population from p_dict.
# newline='' is required when handing a file to csv.writer: the writer emits
# its own '\r\n' line endings, and without it they are translated again on
# platforms such as Windows, producing blank lines between rows.
with open('dxy_p_vals_by_scer_pop_NAspar.csv', 'w', newline='') as f:
    w = csv.writer(f, delimiter=',')
    w.writerow(['S. cerevisiae population', 'Genomic median Dxy', 'Candidate genes median Dxy', 'Resampling pool size', 'Resampling p value'])

    # Each p_dict value is [genomic median, candidate median, pool size, p],
    # matching the four columns after the population name.
    w.writerows([pop] + stats for pop, stats in p_dict.items())