In [67]:
from Bio import SeqIO
import csv
import os
import pandas as pd
from pyfasta import Fasta

### 1011 Genomes S. cerevisiae ORFs (Peter, et al. 2018)

Write files for each gene with only strains in a given population

In [61]:
# Group S. cerevisiae strains by population (clade).
# The strain table is read with its first three rows skipped; only rows that
# have both a standardized strain name and a clade are kept.
pops = pd.read_csv('/Users/clairedubin/sacc/external_datasets/1011_pops_tableS1.csv', skiprows=3)
pops = pops[['Standardized name', 'Clades']].dropna()

# Remove every non-word character from the clade label so it can be used as a
# directory name. regex=True is passed explicitly: pandas >= 2.0 treats the
# pattern as a literal string by default, which would leave the labels
# unchanged. .assign builds a new frame instead of writing into a slice.
pops = pops.assign(clade=pops['Clades'].str.replace(r'[^\w]', '', regex=True))

# clade -> list of strain names in that clade
pop_dict = pops.groupby('clade')['Standardized name'].apply(list).to_dict()

# strain name -> list of clades (despite its name, the keys are strains, not genes)
gene_dict = pops.groupby('Standardized name')['clade'].apply(list).to_dict()
gene_dict

{'AAA': ['1WineEuropean'],
 'AAB': ['7Mosaicbeer'],
 'AAC': ['11Alebeer'],
 'AAD': ['25Sake'],
 'AAE': ['1WineEuropeansubclade4'],
 'AAG': ['M3Mosaicregion3'],
 'AAI': ['1WineEuropean'],
 'AAK': ['1WineEuropeansubclade4'],
 'AAL': ['8Mixedorigin'],
 'AAM': ['8Mixedorigin'],
 'AAN': ['8Mixedorigin'],
 'AAP': ['8Mixedorigin'],
 'AAQ': ['7Mosaicbeer'],
 'AAR': ['7Mosaicbeer'],
 'AAS': ['M3Mosaicregion3'],
 'AAT': ['M3Mosaicregion3'],
 'AAV': ['M3Mosaicregion3'],
 'ABA': ['M3Mosaicregion3'],
 'ABB': ['26Asianfermentation'],
 'ABC': ['M3Mosaicregion3'],
 'ABD': ['M3Mosaicregion3'],
 'ABE': ['1WineEuropean'],
 'ABF': ['M3Mosaicregion3'],
 'ABG': ['M3Mosaicregion3'],
 'ABH': ['M3Mosaicregion3'],
 'ABI': ['M3Mosaicregion3'],
 'ABK': ['M3Mosaicregion3'],
 'ABM': ['2Alpechin'],
 'ABP': ['1WineEuropean'],
 'ABQ': ['1WineEuropean'],
 'ABR': ['1WineEuropean'],
 'ABS': ['1WineEuropean'],
 'ABT': ['1WineEuropean'],
 'ABV': ['8Mixedorigin'],
 'ACA': ['M3Mosaicregion3'],
 'ACB': ['M3Mosaicregion3'],
 '

In [72]:
# Make S. cerevisiae files by population: for every gene FASTA in `all_genes`,
# write one FASTA per clade containing only the strains assigned to that clade.
scer_outpath = '/Users/clairedubin/sacc/carly_genes/alignments/1011pops_unaligned/'
all_genes = '/Users/clairedubin/sacc/external_datasets/allReferenceGenesWithSNPsAndIndelsInferred/'

# One output directory per clade. The clade names come from pop_dict (the same
# mapping the merge cells iterate over) so this cell runs on a fresh kernel.
for clade in pop_dict:

    if not os.path.exists(scer_outpath+clade):
        os.mkdir(scer_outpath+clade)

for file in os.listdir(all_genes):

    # Drop the file extension to get the gene name. str.strip('.fasta') would
    # remove any leading/trailing '.', 'f', 'a', 's' or 't' characters instead
    # of the suffix, mangling names that start or end with those letters.
    gene = os.path.splitext(file)[0]

    # clade -> FASTA entries of this gene for the strains in that clade
    entries_by_clade = {}
    for record in SeqIO.parse(all_genes+file, 'fasta'):
        # The strain name is whatever precedes '_<gene>' in the record description.
        strain = record.description.split('_'+gene)[0]

        if strain not in gene_dict:
            continue

        clade = gene_dict[strain][0]
        entries_by_clade.setdefault(clade, []).append(
            '>'+record.description+'\n'+str(record.seq)+'\n')

    # Each (clade, gene) file is produced from exactly one input file, so it is
    # written once in 'w' mode; re-running the cell no longer appends duplicate
    # records to files left over from a previous run.
    for clade, entries in entries_by_clade.items():
        with open(scer_outpath+clade+'/'+gene+'.fa', 'w') as out:
            out.writelines(entries)
        

In [71]:
# Collect every per-clade gene file that holds fewer records than 75% of the
# number of strains listed for that clade in pop_dict.
to_remove = []

for clade in os.listdir(scer_outpath):
    clade_dir = scer_outpath+clade
    min_records = 0.75*len(pop_dict[clade])

    for fasta_name in os.listdir(clade_dir):
        fasta_path = clade_dir+'/'+fasta_name
        # Count the records without materialising them in a list.
        n_records = sum(1 for _ in SeqIO.parse(fasta_path, 'fasta'))
        if n_records < min_records:
            to_remove.append(fasta_path)

to_remove

[]

### S. paradoxus European population - Bergstrom, et al. 2014

Write file for each gene using annotations and assemblies

In [79]:
# Strain names of the European S. paradoxus population; annotation files for
# any other strain are ignored by the extraction cell.
spar_euro_strains = [
    'Q59.1', 'Z1.1', 'Q95.3', 'S36.7',
    'Y9.6', 'W7', 'Y8.5', 'Z1',
    'T21.4', 'Y7', 'Y6.5',
]

In [91]:
all_assemblies = '/Users/clairedubin/sacc/external_datasets/SGRP2-assemblies_Jun25/'
all_annotations = '/Users/clairedubin/sacc/external_datasets/orf-annotation-gffs-SGRP2_Jun25/'
spar_annotations = '/Users/clairedubin/sacc/external_datasets/Spar.gff.txt'
# Misspelled duplicate that the original cell defined; kept as an alias in case
# another cell still refers to it.
spar_annotartions = spar_annotations

euro_spar_output_dir = '/Users/clairedubin/sacc/carly_genes/alignments/BergstromSpar_ORFs_unaligned/'

# Literal text that follows the strain name in the annotation file names.
assembly_tag = 'SGRP2-assembly'

# Map genes to homologs: value of the 'Gene=' field -> value of the 'SGD=' field.
# Stored under its own name so the strain -> clade `gene_dict` built for the
# S. cerevisiae cells is not silently replaced by an unrelated mapping.
spar_homolog_to_gene = {}
with open(spar_annotations, 'r') as gff:
    for line in gff:
        # Lines without both fields carry no mapping (and would raise IndexError below).
        if 'SGD=' not in line or 'Gene=' not in line:
            continue
        gene = line.split('SGD=')[1].split(';')[0]
        homolog = line.split('Gene=')[1].split(';')[0]
        spar_homolog_to_gene[homolog] = gene

for file in os.listdir(all_annotations):

    # Skip annotation files whose name carries the '_Sc_' tag.
    if '_Sc_' in file:
        continue

    # File name without the 'orf-annotation_' prefix and the 4-character extension;
    # the matching assembly is all_assemblies + suffix + '.fa'.
    suffix = file.split('orf-annotation_')[1][:-4]

    # Strain name = text after '_Sp_' minus the literal assembly tag.
    # str.strip('SGRP2-assembly.gff') removed *characters* from both ends, so the
    # leading 'S' of strain 'S36.7' was lost and it never matched the strain list.
    strain = suffix.split('_Sp_')[1]
    if strain.endswith(assembly_tag):
        strain = strain[:-len(assembly_tag)]
    strain = strain.rstrip('._-')  # separator left between strain name and tag

    if strain not in spar_euro_strains:
        continue

    # gene -> [scaffold, start, stop, strand] for every ORF with a known, non-empty gene name
    annotation_dict = {}
    with open(all_annotations+file, 'r') as gff:
        for line in gff:
            if 'homolog=' not in line:
                continue
            homolog = line.split('homolog=')[1].strip('\n')
            gene = spar_homolog_to_gene.get(homolog)
            if gene:
                fields = line.split('\t')
                annotation_dict[gene] = [fields[0], fields[3], fields[4], fields[6]]

    assembly = Fasta(all_assemblies+suffix+'.fa')
    for gene, (chrom, start, stop, direction) in annotation_dict.items():
        seq = assembly.sequence({'chr': chrom, 'start': int(start), 'stop': int(stop), 'strand': direction})

        # Only sequences longer than 50 characters are written.
        if len(seq) > 50:
            # NOTE: append mode is required because several strains contribute to
            # the same gene file; clear the output directory before re-running
            # this cell, otherwise records are duplicated.
            with open(euro_spar_output_dir+gene+'.fa', 'a') as newfile:
                newfile.write('>' + gene + '_Sp_' + strain + '\n' + seq + '\n')


### S. paradoxus North American subpopulation B - Durand, et al. 2011

Write file for each gene based on assemblies and annotations for strains in subpopulation B (largest subpop)

In [92]:
# Parse populations: collect, for each North American subpopulation label, the
# second tab-separated field of every line mentioning it, with all 'S'
# characters removed from that field.
pop_info = '/Users/clairedubin/sacc/external_datasets/durand_spar_NA_raw/infos_lib.txt'

NA_spar_pops = {'SpA':[], 'SpB':[], 'SpC': []}

# The context manager closes the file (the handle was previously left open).
with open(pop_info, 'r') as info:
    for line in info:
        # Same priority as an if/elif chain: a line is assigned to the first
        # label it contains.
        for pop_name in ('SpA', 'SpB', 'SpC'):
            if pop_name in line:
                NA_spar_pops[pop_name].append(line.split('\t')[1].replace('S',''))
                break

NA_spar_pops

{'SpA': ['A01', 'B01', 'C01', 'B02', 'D01'],
 'SpB': ['B06', 'D06', 'C05', 'C04', 'D04', 'D05', 'B05', 'A05', 'C06', 'A06'],
 'SpC': ['C03', 'A04', 'B03', 'B04', 'A03', 'A02', 'D02', 'D03', 'C02']}

In [94]:
NA_spar_output_dir = '/Users/clairedubin/sacc/carly_genes/alignments/DurandSparB_ORFs_unaligned/'
# NOTE: these two names are also assigned by the European S. paradoxus cell;
# from here on they point at the North American data set.
all_assemblies = '/Users/clairedubin/sacc/external_datasets/durand_spar_NA_raw/fasta_genomes/'
all_annotations = '/Users/clairedubin/sacc/external_datasets/durand_spar_NA_raw/gff_genes/1_gff_genes_coord/'

# Literal ending of the annotation file names; what precedes it is the strain name.
annotation_suffix = '_genes_aug.gff'

# Annotation files whose leading '_'-separated token is a subpopulation B strain.
spb_annotation_files = [name for name in os.listdir(all_annotations)
                        if name.split('_')[0] in NA_spar_pops['SpB']]

for annotation_file in spb_annotation_files:

    # gene_id -> [scaffold, start, stop, strand]
    annotation_dict = {}
    with open(all_annotations+annotation_file, 'r') as gff:
        for line in gff:
            # Require the exact field that is split on below; the looser
            # "'gene' in line" test did not protect against an IndexError.
            if 'gene_id=' not in line:
                continue
            gene = line.split('gene_id=')[1].strip('\n')
            if 'unknown' in gene:
                continue
            fields = line.split('\t')
            annotation_dict[gene] = [fields[0], fields[3], fields[4], fields[6]]

    # Remove the literal suffix. str.strip('_genes_aug.gff') strips a *set of
    # characters* from both ends and only worked because the current strain
    # names contain none of them.
    strain = annotation_file
    if strain.endswith(annotation_suffix):
        strain = strain[:-len(annotation_suffix)]

    assembly = Fasta(all_assemblies+strain+'_genome_200.fasta')

    for gene, (chrom, start, stop, direction) in annotation_dict.items():
        seq = assembly.sequence({'chr': chrom, 'start': int(start), 'stop': int(stop), 'strand': direction})

        # Only sequences longer than 50 characters are written.
        if len(seq) > 50:
            # NOTE: append mode is required because several strains contribute to
            # the same gene file; clear the output directory before re-running
            # this cell, otherwise records are duplicated.
            with open(NA_spar_output_dir+gene+'.fa', "a") as newfile:
                newfile.write('>' + gene + '_Sp_' + strain + '\n' + seq + '\n')


### Merging S. paradoxus European population with each S. cerevisiae population

In [97]:
scer_eurospar_outdir = '/Users/clairedubin/sacc/carly_genes/alignments/1011pops_EuroSpar_unaligned/'

# For every clade, append each S. cerevisiae gene file followed by the European
# S. paradoxus file of the same name to a merged file in the clade's directory.
# Genes without an S. paradoxus file are skipped.
for clade in pop_dict:
    merged_dir = scer_eurospar_outdir+clade

    if not os.path.exists(merged_dir):
        os.mkdir(merged_dir)

    for file in os.listdir(scer_outpath+clade):
        spar_path = euro_spar_output_dir+file

        if not os.path.exists(spar_path):
            continue

        with open(spar_path, 'r') as sp:
            sp_data = sp.read()

        with open(scer_outpath+clade+'/'+file, 'r') as sc:
            sc_data = sc.read()

        # S. cerevisiae records first, then S. paradoxus.
        with open(merged_dir+'/'+file, 'a') as sc_sp:
            sc_sp.write(sc_data)
            sc_sp.write(sp_data)

### Merging S. paradoxus North American population with each S. cerevisiae population

In [98]:
scer_NAspar_outdir = '/Users/clairedubin/sacc/carly_genes/alignments/1011pops_NASpar_unaligned/'

# For every clade, append each S. cerevisiae gene file followed by the North
# American S. paradoxus file of the same name to a merged file in the clade's
# directory. Genes without an S. paradoxus file are skipped.
for clade in pop_dict:
    merged_dir = scer_NAspar_outdir+clade

    if not os.path.exists(merged_dir):
        os.mkdir(merged_dir)

    for file in os.listdir(scer_outpath+clade):
        spar_path = NA_spar_output_dir+file

        if not os.path.exists(spar_path):
            continue

        with open(spar_path, 'r') as sp:
            sp_data = sp.read()

        with open(scer_outpath+clade+'/'+file, 'r') as sc:
            sc_data = sc.read()

        # S. cerevisiae records first, then S. paradoxus.
        with open(merged_dir+'/'+file, 'a') as sc_sp:
            sc_sp.write(sc_data)
            sc_sp.write(sp_data)