# Merge experimental samples and analyze differences in fixed mutations

## Merge samples into a single df and store
 - To be run after generating allele frequencies using the variant_count.py and allele_freqs.py scripts

In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from random import sample as random_sample

%matplotlib inline

In [14]:
# Labels of the sequenced populations: a treatment letter (C or H) followed by a replicate number.
strains = ['C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','H1','H2','H3','H4','H5','H6','H7','H8','H9','H10']

# Folder holding one '<strain>_freq.csv' table per strain.
freq_dir = '/Users/chrisgraves/Documents/Yeast_data/Sequencing/saved_dfs/'

# Read every strain, tag it, and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop re-copies all previously loaded rows on every iteration.
strain_frames = []
for strain in strains:
    strain_df = pd.read_csv(freq_dir+strain+'_freq.csv')
    strain_df['Treatment'] = strain[0]      # leading letter of the label
    strain_df['Strain'] = int(strain[1:])   # numeric part of the label, any number of digits
    strain_frames.append(strain_df)

alleles = pd.concat(strain_frames,ignore_index=True)

# Rows are divided by 6 to report alleles rather than rows.
# NOTE(review): assumes every allele contributes exactly six rows per strain
# (one per Time value) -- confirm against allele_freqs.py.
num_alleles = alleles.shape[0]/6
print('There are %d alleles' %num_alleles)
alleles.head()

There are 64499 alleles


Unnamed: 0,Chromosome,Position,Alt,Time,Ref,Is_repeat,Read_depth,Num_ref,Num_alt,Freq_ref,Freq_alt,Qual_ref,Qual_alt,Treatment,Strain
0,I,5068,ATTTTTTTTTTTTTTC,1,ATTTTTTTTTTTTTC,1,,,0,,0,,,C,1
1,I,5068,ATTTTTTTTTTTTTTC,3,ATTTTTTTTTTTTTC,1,,,0,,0,,,C,1
2,I,5068,ATTTTTTTTTTTTTTC,5,ATTTTTTTTTTTTTC,1,,,0,,0,,,C,1
3,I,5068,ATTTTTTTTTTTTTTC,7,ATTTTTTTTTTTTTC,1,,,0,,0,,,C,1
4,I,5068,ATTTTTTTTTTTTTTC,9,ATTTTTTTTTTTTTC,1,,,0,,0,,,C,1


In [6]:
# Save the merged, still unfiltered variant table; the row index is not written.
combined_path = '/Users/chrisgraves/Documents/Yeast_data/Sequencing/saved_dfs/combined_alleles.csv'
alleles.to_csv(combined_path, index=False)

# Analyze fixed differences between strains

In [9]:
# Load the ancestor (parent) variant table and preview its first rows.
parent_path = '/Users/chrisgraves/Documents/Yeast_data/Sequencing/saved_dfs/parent_df.csv'
parent_df = pd.read_csv(parent_path)
parent_df.head()

Unnamed: 0,Chromosome,Position,Ref,Alt,Ref_count,Alt_count,Ref_freq,Alt_freq,N_calls
0,I,10,A,C,4,2,0.666667,0.333333,6
1,I,6736,CAAAAAAAAAAAAAAAAAAAT,CAAAAAAAAAAAAAAAAAAAAAT,0,10,0.0,1.0,10
2,I,12594,A,T,144,37,0.79558,0.20442,181
3,I,12690,A,T,130,66,0.663265,0.336735,196
4,I,19971,CT,CCTT,0,131,0.0,1.0,131


In [15]:
# An ancestor allele is called fixed when its alternate-allele frequency exceeds 0.9.
is_fixed = parent_df['Alt_freq'].gt(0.9)
fixed = parent_df.loc[is_fixed]

num_fixed = len(fixed.index)
print('There are %d fixed alleles in ancestor' %num_fixed)
fixed.head()

There are 200 fixed alleles in ancestor


Unnamed: 0,Chromosome,Position,Ref,Alt,Ref_count,Alt_count,Ref_freq,Alt_freq,N_calls
1,I,6736,CAAAAAAAAAAAAAAAAAAAT,CAAAAAAAAAAAAAAAAAAAAAT,0,10,0.0,1.0,10
4,I,19971,CT,CCTT,0,131,0.0,1.0,131
7,I,25340,C,A,0,96,0.0,1.0,96
8,I,25488,G,A,1,24,0.04,0.96,25
11,I,25536,G,A,0,4,0.0,1.0,4


In [16]:
#remove fixed alleles from experimental samples
# An allele is identified by its (Chromosome, Position, Alt) key.
key_cols = ['Chromosome','Position','Alt']

# Single vectorised anti-join: keep every row of `alleles` whose key does not
# occur in `fixed`. This replaces a loop that re-filtered the whole frame once
# per ancestor allele; the original row index of `alleles` is preserved.
allele_keys = pd.MultiIndex.from_frame(alleles[key_cols])
fixed_keys = pd.MultiIndex.from_frame(fixed[key_cols])
filtered_variants = alleles[~allele_keys.isin(fixed_keys)]

# Same rows-per-allele convention (6) as used for num_alleles.
new_num_alleles = filtered_variants.shape[0]/6
print('There are %d alleles, %d were removed by filter' %(new_num_alleles,num_alleles-new_num_alleles))

There are 60815 alleles, 3684 were removed by filter


In [18]:
# Preview the first rows that remain after the ancestor-fixed alleles were removed.
filtered_variants.head()

Unnamed: 0,Chromosome,Position,Alt,Time,Ref,Is_repeat,Read_depth,Num_ref,Num_alt,Freq_ref,Freq_alt,Qual_ref,Qual_alt,Treatment,Strain
0,I,5068,ATTTTTTTTTTTTTTC,1,ATTTTTTTTTTTTTC,1,,,0,,0,,,C,1
1,I,5068,ATTTTTTTTTTTTTTC,3,ATTTTTTTTTTTTTC,1,,,0,,0,,,C,1
2,I,5068,ATTTTTTTTTTTTTTC,5,ATTTTTTTTTTTTTC,1,,,0,,0,,,C,1
3,I,5068,ATTTTTTTTTTTTTTC,7,ATTTTTTTTTTTTTC,1,,,0,,0,,,C,1
4,I,5068,ATTTTTTTTTTTTTTC,9,ATTTTTTTTTTTTTC,1,,,0,,0,,,C,1


In [24]:
#Check for fixed alleles that weren't observed in ancestor
# Group keys must be passed as a list: current pandas treats a tuple as ONE
# column label and raises KeyError.
key_cols = ['Chromosome','Position','Alt']

# Keep alleles whose Freq_alt summed over all of their rows exceeds 110.
# NOTE(review): 110 presumably relates to the 120 strain x time-point samples
# checked elsewhere in this notebook -- confirm the intended cutoff.
fixed_alleles = filtered_variants.groupby(key_cols).filter(lambda x: x['Freq_alt'].sum()>110)
num_fixed = fixed_alleles.shape[0]/6
print('There are %d fixed alleles not observed in ancestor' %num_fixed)
fixed_alleles

There are 40 fixed alleles not observed in ancestor


Unnamed: 0,Chromosome,Position,Alt,Time,Ref,Is_repeat,Read_depth,Num_ref,Num_alt,Freq_ref,Freq_alt,Qual_ref,Qual_alt,Treatment,Strain
102,I,25497,A,1,G,0,6,0,6,0.000000,1.000000,0.000000,37.333333,C,1
103,I,25497,A,3,G,0,23,3,20,0.130435,0.869565,31.000000,32.350000,C,1
104,I,25497,A,5,G,0,23,1,22,0.043478,0.956522,31.000000,34.227273,C,1
105,I,25497,A,7,G,0,24,1,23,0.041667,0.958333,40.000000,30.478261,C,1
106,I,25497,A,9,G,0,16,0,16,0.000000,1.000000,0.000000,35.812500,C,1
107,I,25497,A,12,G,0,9,1,8,0.111111,0.888889,40.000000,34.000000,C,1
7074,VI,97864,CTTTTTTTTTTTTG,1,CTTTTTTTTTTG,1,18,0,12,0.000000,1.000000,0.000000,34.333333,C,1
7075,VI,97864,CTTTTTTTTTTTTG,3,CTTTTTTTTTTG,1,67,0,53,0.000000,1.000000,0.000000,33.245283,C,1
7076,VI,97864,CTTTTTTTTTTTTG,5,CTTTTTTTTTTG,1,57,0,43,0.000000,1.000000,0.000000,32.767442,C,1
7077,VI,97864,CTTTTTTTTTTTTG,7,CTTTTTTTTTTG,1,42,0,39,0.000000,1.000000,0.000000,34.307692,C,1


There are two alleles that were fixed in all experimental samples but not identified as fixed in the ancestor

In [26]:
# Ancestor record for the allele on chromosome I, position 25497, alt 'A'.
parent_df[
    parent_df['Chromosome'].eq('I')
    & parent_df['Position'].eq(25497)
    & parent_df['Alt'].eq('A')
]

Unnamed: 0,Chromosome,Position,Ref,Alt,Ref_count,Alt_count,Ref_freq,Alt_freq,N_calls
9,I,25497,G,A,4,17,0.190476,0.809524,21


In [27]:
# Ancestor record for the allele on chromosome VI, position 97864, alt 'CTTTTTTTTTTTTG'.
parent_df[
    parent_df['Chromosome'].eq('VI')
    & parent_df['Position'].eq(97864)
    & parent_df['Alt'].eq('CTTTTTTTTTTTTG')
]

Unnamed: 0,Chromosome,Position,Ref,Alt,Ref_count,Alt_count,Ref_freq,Alt_freq,N_calls
282,VI,97864,CTTTTTTTTTTG,CTTTTTTTTTTTTG,0,32,0,0.727273,44


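Both alleles are present in the ancestor at high frequency (Alt_freq of 0.81 and 0.73), which is below the 0.9 cutoff used above to call an ancestral allele fixed. They were probably not observed at 100% frequency because of sequencing errors.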

## Identify and remove alleles present at high frequencies in all samples

In [92]:
#remove all alleles observed in parent strain
# An allele is identified by its (Chromosome, Position, Alt) key.
key_cols = ['Chromosome','Position','Alt']

unfiltered = alleles.shape[0]/6

# Single vectorised anti-join: keep every row of `alleles` whose key does not
# occur anywhere in `parent_df`, whatever its ancestral frequency. This replaces
# a loop that re-filtered the whole frame once per parent allele; the original
# row index of `alleles` is preserved.
# Note: this rebinds `filtered_variants`, replacing the stricter-filter result
# computed earlier in the notebook.
allele_keys = pd.MultiIndex.from_frame(alleles[key_cols])
parent_keys = pd.MultiIndex.from_frame(parent_df[key_cols])
filtered_variants = alleles[~allele_keys.isin(parent_keys)]

after_filter1 = filtered_variants.shape[0]/6
print('There are %d alleles, %d were removed by filter' %(after_filter1,unfiltered-after_filter1))

There are 53905 alleles, 10594 were removed by filter


In [95]:
#Reduce stringency on fixation criteria
# Group keys must be passed as a list: current pandas treats a tuple as ONE
# column label and raises KeyError.
key_cols = ['Chromosome','Position','Alt']

# Keep alleles whose Freq_alt summed over all of their rows exceeds 40
# (down from 110 in the stricter check).
fixed_alleles = filtered_variants.groupby(key_cols).filter(lambda x: x['Freq_alt'].sum()>40)
num_fixed = fixed_alleles.shape[0]/6
# Message reads "not observed": every allele present in parent_df has already
# been removed from filtered_variants, and this matches the recorded output.
print('There are %d fixed alleles not observed in ancestor' %num_fixed)

There are 189 fixed alleles not observed in ancestor


In [96]:
#Check alleles not observed in all strains
# Group keys must be passed as a list: current pandas treats a tuple as ONE
# column label and raises KeyError.
key_cols = ['Chromosome','Position','Alt']

# Keep alleles whose group does not have exactly 120 rows, i.e. alleles that
# are missing from (or duplicated in) some strain/time-point sample.
not_in_all = fixed_alleles.groupby(key_cols).filter(lambda x: len(x)!=120)

# Number of distinct alleles flagged.
len(not_in_all.groupby(key_cols))

3

Only 3 of the high frequency alleles are not present in all time-points of all strains

In [97]:
#for those not present in all strains, check if they belong to only a small number of strains
# Group keys are passed as a list: current pandas treats a tuple as ONE column
# label and raises KeyError.
for name, group in not_in_all.groupby(['Chromosome','Position','Alt']):
    # Fraction of the 120 strain/time-point samples in which this allele has a row.
    print(float(len(group))/120)

0.9
0.9
0.65


In [98]:
# Display the rows of the high-frequency alleles that are missing from some samples.
not_in_all

Unnamed: 0,Chromosome,Position,Alt,Time,Ref,Is_repeat,Read_depth,Num_ref,Num_alt,Freq_ref,Freq_alt,Qual_ref,Qual_alt,Treatment,Strain
1434,II,221290,A,1,T,0,,,0,,0.000000,,,C,1
1435,II,221290,A,3,T,0,4,0,4,0.000000,1.000000,0.000000,34.250000,C,1
1436,II,221290,A,5,T,0,3,0,3,0.000000,1.000000,0.000000,35.000000,C,1
1437,II,221290,A,7,T,0,5,1,4,0.200000,0.800000,2.000000,38.250000,C,1
1438,II,221290,A,9,T,0,3,0,3,0.000000,1.000000,0.000000,36.666667,C,1
1439,II,221290,A,12,T,0,,,0,,0.000000,,,C,1
8676,VIII,21,CCA,1,ACA,0,3,0,3,0.000000,1.000000,0.000000,18.666667,C,1
8677,VIII,21,CCA,3,ACA,0,12,1,0,0.200000,0.000000,0.000000,18.666667,C,1
8678,VIII,21,CCA,5,ACA,0,17,0,0,0.000000,0.000000,0.000000,18.666667,C,1
8679,VIII,21,CCA,7,ACA,0,10,0,0,0.000000,0.000000,0.000000,18.666667,C,1


In [101]:
#check random sets of high frequency, putatively fixed alleles to confirm that they are fixed across entire time-series
# Group keys must be passed as a list: current pandas treats a tuple as ONE
# column label and raises KeyError.
key_cols = ['Chromosome','Position','Alt']

# Restrict to the alleles in fixed_alleles whose summed Freq_alt is below 100.
hf = fixed_alleles.groupby(key_cols).filter(lambda x: x['Freq_alt'].sum() < 100)
hf_groups = hf.groupby(key_cols)

# random.sample needs a sequence: a dict keys view raises TypeError on
# Python 3.11+, so materialise the keys as a list first.
# The draw is intentionally unseeded; each re-run inspects a different allele.
rand_key = random_sample(list(hf_groups.indices.keys()),1)
rand_group = hf_groups.get_group(rand_key[0])
num_samples = rand_group.shape[0]
print('Found in %d of 120 samples' %num_samples)
rand_group


Found in 120 of 120 samples


Unnamed: 0,Chromosome,Position,Alt,Time,Ref,Is_repeat,Read_depth,Num_ref,Num_alt,Freq_ref,Freq_alt,Qual_ref,Qual_alt,Treatment,Strain
14274,XIII,918120,CTTTTTTTTTTTTTTTTTTTTA,1,CTTTTTTTTTTTTTTTTTTTA,1,3,0,3,0.000000,1.000000,0.000000,21.000000,C,1
14275,XIII,918120,CTTTTTTTTTTTTTTTTTTTTA,3,CTTTTTTTTTTTTTTTTTTTA,1,9,0,4,0.000000,1.000000,0.000000,11.000000,C,1
14276,XIII,918120,CTTTTTTTTTTTTTTTTTTTTA,5,CTTTTTTTTTTTTTTTTTTTA,1,16,5,0,0.555556,0.000000,0.000000,11.000000,C,1
14277,XIII,918120,CTTTTTTTTTTTTTTTTTTTTA,7,CTTTTTTTTTTTTTTTTTTTA,1,7,0,4,0.000000,1.000000,0.000000,34.000000,C,1
14278,XIII,918120,CTTTTTTTTTTTTTTTTTTTTA,9,CTTTTTTTTTTTTTTTTTTTA,1,9,2,4,0.333333,0.666667,28.000000,13.500000,C,1
14279,XIII,918120,CTTTTTTTTTTTTTTTTTTTTA,12,CTTTTTTTTTTTTTTTTTTTA,1,5,1,2,0.333333,0.666667,38.000000,21.500000,C,1
33966,XIII,918120,CTTTTTTTTTTTTTTTTTTTTA,1,CTTTTTTTTTTTTTTTTTTTA,1,,,0,,0.000000,,,C,2
33967,XIII,918120,CTTTTTTTTTTTTTTTTTTTTA,3,CTTTTTTTTTTTTTTTTTTTA,1,6,1,3,0.250000,0.750000,39.000000,20.333333,C,2
33968,XIII,918120,CTTTTTTTTTTTTTTTTTTTTA,5,CTTTTTTTTTTTTTTTTTTTA,1,6,1,3,0.250000,0.750000,38.000000,33.000000,C,2
33969,XIII,918120,CTTTTTTTTTTTTTTTTTTTTA,7,CTTTTTTTTTTTTTTTTTTTA,1,5,1,2,0.333333,0.666667,38.000000,21.500000,C,2


## Filter out high frequency sequencing errors and fixed mutations

In [102]:
#remove all other fixed alleles and high frequency sequencing errors
# Group keys must be passed as a list: current pandas treats a tuple as ONE
# column label and raises KeyError.
key_cols = ['Chromosome','Position','Alt']

# Drop an allele only when it is BOTH high frequency (summed Freq_alt above 40)
# AND present in exactly 120 rows; everything else is kept.
fixed_removed = filtered_variants.groupby(key_cols).filter(lambda x: (x['Freq_alt'].sum()<=40) or (len(x)!=120))
after_filter2 = fixed_removed.shape[0]/6
print('There are %d alleles, %d removed by filter alleles' %(after_filter2, after_filter1-after_filter2))

# Save the fully filtered table; the row index is not written.
fixed_removed.to_csv('/Users/chrisgraves/Documents/Yeast_data/Sequencing/saved_dfs/combined_filtered.csv',index=False)

There are 53765 alleles, 140 removed by filter alleles


## Conclusions
- All alleles found at high frequency at all time-points are observed in all experimental samples and in the parent. Discrepancies appear consistent with sequencing errors causing artificially low frequencies in samples with low coverage.
- Even thresholding with relatively low values to define fixed alleles results in alleles that are present at high frequencies in all strains and can therefore be confidently labeled as fixed.
- Alleles identified at intermediate frequencies tend to fall into repeat regions or, for other reasons, look like high frequency sequencing errors.