In [1]:
import pandas as pd
import numpy as np
import csv

In [2]:
# Essential-gene list: column 1 of essential.csv holds the gene identifiers,
# padded with tabs and spaces that are stripped in two passes.
essential = pd.read_csv(
    '/Users/clairedubin/sacc/external_datasets/essential.csv',
    header=None,
)
essential[1] = essential[1].str.strip('\t')
essential_genes = essential[1].str.strip(' ').tolist()

# GO-term annotations (source: http://geneontology.org/docs/download-go-annotations/).
# Only columns 2, 4, 9 and 10 are kept; they are renamed to readable labels.
go_terms = (
    pd.read_csv('/Users/clairedubin/sacc/external_datasets/go_terms.csv', header=None)
    .drop(columns=[0, 1, 3, 5, 6, 7, 8, 11, 12, 13, 14, 15])
    .rename(columns={2: 'sgd_name', 4: 'go_term', 9: 'gene_desc', 10: 'gene'})
)

# The 'gene' field is a '|'-separated list of names; keep only the first entry.
go_terms['gene'] = [names[0] for names in go_terms['gene'].str.split('|')]

# De-duplicate, discard the 3 broad GO terms, then index the table by gene.
go_terms = (
    go_terms
    .drop_duplicates()
    .loc[lambda table: ~table['go_term'].isin(['GO:0005575', 'GO:0008150', 'GO:0003674'])]
    .set_index('gene')
)

In [3]:
def resample(df1, df2, n=10000, graph=False, essential=None):
    """Essentiality-matched resampling test on median dxy.

    Draws `n` random gene sets from `df2`, each containing the same number of
    essential and non-essential genes as `df1`, and counts how often the
    resampled median dxy is at least as large as the median dxy of `df1`.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Candidate genes; must have 'gene' and 'dxy' columns.
    df2 : pandas.DataFrame
        Resampling pool; must have 'gene' and 'dxy' columns.
    n : int
        Number of resamples to draw.
    graph : bool
        If True, print the p value and plot the resampled medians with the
        candidate median marked.
        NOTE(review): this branch uses `sns` and `plt`, which are not imported
        in the visible cells -- import seaborn / matplotlib.pyplot before
        calling with graph=True.
    essential : list-like of str, optional
        Gene identifiers treated as essential. Defaults to the notebook-level
        `essential_genes` list.

    Returns
    -------
    float
        Fraction of resamples whose median dxy is >= the candidate median.

    Raises
    ------
    ValueError
        Raised by pandas if the pool has fewer essential (or non-essential)
        genes than the candidate set, since sampling is without replacement.
    """
    if essential is None:
        # Fall back to the list loaded from essential.csv earlier in the notebook.
        essential = essential_genes

    results = 0

    actual_med = df1['dxy'].median()

    # Use the same membership test (isin) for the candidates and for the pool
    # split below, so both are classified consistently.
    essential_count = int(df1['gene'].isin(essential).sum())
    nonessential_count = len(df1['gene']) - essential_count

    print('candidate gene median dxy: ', actual_med)
    print('essential count: '+str(essential_count)+', nonessential_count: '+str(nonessential_count))
    print('resampling pool size: ', df2.shape[0])

    pool_is_essential = df2['gene'].isin(essential)
    essential_df = df2[pool_is_essential]
    nonessential_df = df2[~pool_is_essential]

    sample_meds = []

    for _ in range(n):
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent. Essential genes are drawn first, then
        # non-essential ones, so the random stream is consumed in the same
        # order as before.
        sample = pd.concat([
            essential_df.sample(n=essential_count),
            nonessential_df.sample(n=nonessential_count),
        ])
        sample_med = sample['dxy'].median()
        sample_meds.append(sample_med)
        if sample_med >= actual_med:
            results += 1

    if graph:

        print('p=',results/n)
        sns.distplot(sample_meds)
        plt.axvline(x=actual_med, color='r', label = 'True median')
        plt.xlabel('Sample medians')
        plt.ylabel('Frequency')

    return results/n

In [4]:
# Candidate genes. Keys are the identifiers matched against the 'gene' column
# of the dxy table; values are the names printed when a candidate is missing.
gene_dict = {
    'YDR180W': 'SCC2',
    'YMR168C': 'CEP3',
    'YCR042C': 'TAF2',
    'YGR198W': 'YPP1',
    'YHR166C': 'CDC23',
    'YHR023W': 'MYO1',
    'YLR397C': 'AFG2',
    'YKR054C': 'DYN1',
    'YPL174C': 'NIP100',
    'YGR098C': 'ESP1',
}

In [5]:
#group Scer strains by population

pops = pd.read_csv('/Users/clairedubin/sacc/external_datasets/1011_pops_tableS1.csv', skiprows=3)
pops = pops[['Standardized name', 'Clades']].dropna()

# Build the population key by removing every non-word character (spaces,
# dots, punctuation) from the clade label. regex=True must be explicit:
# since pandas 2.0 str.replace treats the pattern as a literal string by
# default, which would leave the labels unchanged. The raw string avoids the
# invalid '\w' escape in a normal string literal.
pops['population'] = pops['Clades'].str.replace(r'[^\w]', '', regex=True)

# population key -> number of strains assigned to that population
pop_dict = pops.groupby('population')['Standardized name'].count().to_dict()
pop_dict

{'10FrenchGuianahuman': 31,
 '11Alebeer': 18,
 '12WestAfricancocoa': 13,
 '13Africanpalmwine': 28,
 '14CHNIII': 2,
 '15CHNII': 2,
 '16CHNI': 1,
 '17Taiwanese': 3,
 '18FarEastAsia': 9,
 '19Malaysian': 6,
 '1WineEuropean': 268,
 '1WineEuropeansubclade1': 18,
 '1WineEuropeansubclade2': 13,
 '1WineEuropeansubclade3': 24,
 '1WineEuropeansubclade4': 39,
 '20CHNV': 2,
 '21Ecuadorean': 10,
 '22FarEastRussian': 4,
 '23NorthAmericanoak': 13,
 '24Asianislands': 11,
 '25Sake': 47,
 '26Asianfermentation': 39,
 '2Alpechin': 17,
 '3Brazilianbioethanol': 35,
 '4Mediterraneanoak': 8,
 '5Frenchdairy': 32,
 '6Africanbeer': 20,
 '7Mosaicbeer': 21,
 '8Mixedorigin': 72,
 '9Mexicanagave': 7,
 'M1Mosaicregion1': 17,
 'M2Mosaicregion2': 20,
 'M3Mosaicregion3': 113}

In [11]:
# Load the raw dxy table. The CSV has no header row, so the column labels
# are assigned by hand after reading.

all_dxy = pd.read_csv(
    '/Users/clairedubin/sacc/carly_genes/dxy_1011pops_EuroSpar.csv',
    header=None,
)
all_dxy.columns = [
    'population',
    'gene',
    'dxy',
    'spar_strain_count',
    'scer_strain_count',
]
all_dxy.head()

Unnamed: 0,population,gene,dxy,spar_strain_count,scer_strain_count
0,19Malaysian,YLR457C,0.131458,10,6
1,M1Mosaicregion1,YLR457C,0.134645,10,17
2,19Malaysian,YLR129W,0.09049,10,6
3,M2Mosaicregion2,YLR457C,0.134531,10,20
4,19Malaysian,YIL093C,0.072704,10,6


In [13]:
# Drop rows where spar_strain_count < 8. Only this cut is applied here; the
# scer_strain_count >= 75%-of-population cut is applied per population in the
# resampling loop.

all_dxy = all_dxy.loc[all_dxy['spar_strain_count'].ge(8)]
all_dxy.shape

(150755, 5)

In [14]:
# Fixed seed so the resampling p values are reproducible.
np.random.seed(777)

# population -> [genomic median dxy, candidate median dxy, pool size, p value]
p_dict = {}

for pop, size in pop_dict.items():

    # Rows for this population whose scer_strain_count covers at least 75%
    # of the population's strains.
    df = all_dxy[(all_dxy['population'] == pop) & (all_dxy['scer_strain_count'] >= .75*size)]

    candidates = df[df['gene'].isin(list(gene_dict))]

    print('')
    print(f'---- {pop} ----')

    # Candidate genes that did not survive the filters for this population.
    print('missing: ', [name for orf, name in gene_dict.items() if orf not in candidates['gene'].tolist()])

    p = resample(candidates, df)
    p_dict[pop] = [df['dxy'].median(), candidates['dxy'].median(), len(df), p]

    print('p = ', p)
    


---- 10FrenchGuianahuman ----
missing:  []
candidate gene median dxy:  0.1161534653932147
essential count: 7, nonessential_count: 3
resampling pool size:  4570
p =  0.0149

---- 11Alebeer ----
missing:  []
candidate gene median dxy:  0.11360373645692373
essential count: 7, nonessential_count: 3
resampling pool size:  4562
p =  0.0172

---- 12WestAfricancocoa ----
missing:  []
candidate gene median dxy:  0.11350244394617938
essential count: 7, nonessential_count: 3
resampling pool size:  4569
p =  0.0217

---- 13Africanpalmwine ----
missing:  []
candidate gene median dxy:  0.11576331774430337
essential count: 7, nonessential_count: 3
resampling pool size:  4562
p =  0.0151

---- 14CHNIII ----
missing:  []
candidate gene median dxy:  0.11565000073633827
essential count: 7, nonessential_count: 3
resampling pool size:  4574
p =  0.016

---- 15CHNII ----
missing:  []
candidate gene median dxy:  0.1135875914608919
essential count: 7, nonessential_count: 3
resampling pool size:  4573
p =  0.

In [15]:
# Write one summary row per population. newline='' is required by the csv
# module: without it the writer's '\r\n' line endings are translated again on
# Windows, producing a blank line after every row.
with open('dxy_by_scer_pop_Eurospar.csv', 'w', newline='') as f:
    w = csv.writer(f, delimiter=',')
    w.writerow(['S. cerevisiae population', 'Genomic median Dxy', 'Candidate genes median Dxy', 'Resampling pool size', 'Resampling p value'])

    # Each p_dict value is [genomic median, candidate median, pool size, p value].
    for pop, stats in p_dict.items():
        w.writerow([pop] + stats)