Code adapted from Jeremy Roop

In [1]:
#imports
from pyfasta import Fasta
import os
import pandas as pd
import numpy as np

In [2]:
#dxy function for one gene
good_nucs = ['A', 'T', 'C', 'G', 'a', 't', 'c', 'g']


def _pairwise_differences(spar_seqs, scer_seqs):
    """Count differing sites summed over every (Spar, Scer) sequence pair.

    A site counts only when the two bases differ and both are listed in
    ``good_nucs`` (so gaps and ambiguity codes never contribute).
    """
    valid = frozenset(good_nucs)
    total = 0
    for spar_seq in spar_seqs:
        for scer_seq in scer_seqs:
            # Sites are compared position by position over the length of the
            # Spar sequence; a shorter Scer sequence cannot be compared.
            if len(scer_seq) < len(spar_seq):
                raise ValueError('Scer sequence is shorter than Spar sequence')
            total += sum(
                1
                for spar_base, scer_base in zip(spar_seq, scer_seq)
                if scer_base != spar_base
                and scer_base in valid
                and spar_base in valid
            )
    return total


def dxy(gene_file_name, main):
    """Compute dxy between the 'Spar' and 'Scer' records of one alignment.

    The FASTA file at ``main + gene_file_name`` is opened with pyfasta.
    Records whose name starts with 'Spar' and contains 'N-44' or 'YPS138'
    are skipped, as is any record in which '-', 'N' and 'n' together make up
    5% or more of the sequence. The remaining records are grouped by their
    'Spar' / 'Scer' name prefix (presumably S. paradoxus and S. cerevisiae
    strains -- confirm against the alignment naming scheme).

    Args:
        gene_file_name: File name of the alignment, appended to ``main``.
        main: Directory prefix; it is concatenated as-is, so it must already
            end with a path separator.

    Returns:
        float: number of differing sites summed over all Spar/Scer pairs,
            divided by the number of pairs and by the length of the first
            retained Spar sequence.
        None: when fewer than 8 Spar or fewer than 10 Scer sequences pass
            the filters.
        'NaN' (str): when anything goes wrong while reading or comparing the
            sequences. Note this is the string, not a float NaN.
    """
    spar_seqs, scer_seqs = [], []

    try:
        f = Fasta(main + gene_file_name)

        for key in f:
            is_spar = key[0:4] == 'Spar'

            #these arent in the European population
            if is_spar and ('N-44' in key or 'YPS138' in key):
                continue

            # Read the sequence once; every use below works on this copy.
            seq = f[key][:]
            gaps = seq.count('-') + seq.count('N') + seq.count('n')
            if gaps / len(seq) >= 0.05:
                continue

            if is_spar:
                spar_seqs.append(seq)
            elif key[0:4] == 'Scer':
                scer_seqs.append(seq)

        spar_strain_count = len(spar_seqs)
        scer_strain_count = len(scer_seqs)

        # Too few strains in either group: no estimate for this gene.
        if spar_strain_count < 8 or scer_strain_count < 10:
            return None

        diff_total = _pairwise_differences(spar_seqs, scer_seqs)
        mean_diffs = diff_total / (spar_strain_count * scer_strain_count)
        return mean_diffs / len(spar_seqs[0])

    except Exception:
        # Best-effort: a gene that cannot be processed is reported with the
        # 'NaN' sentinel. Unlike a bare ``except:`` this lets
        # KeyboardInterrupt / SystemExit through so a long run can be stopped.
        return 'NaN'


In [3]:
# #run dxy on directory

directory = '/Users/clairedubin/spur/publishable_data/alignments/scer_spar_MUSCLE_alignments/'

# Delete leftover *flat / *gdx files so they are not processed as alignments
# (presumably index side-files written by pyfasta on an earlier run -- confirm).
for file in os.listdir(directory):
    if file.endswith(('flat', 'gdx')):
        os.remove(directory + file)

# List again after the clean-up so only the remaining files are processed.
all_files = os.listdir(directory)

dxy_dict = {}
for file in all_files:
    # dxy() needs the directory prefix as its second argument.
    dxy_dict[file] = dxy(file, directory)
    print(file, dxy_dict[file])

# Wrap each value in a list so from_dict builds a one-column frame.
for i in dxy_dict:
    dxy_dict[i] = [dxy_dict[i]]

df = pd.DataFrame.from_dict(dxy_dict, orient='index')
df = df.rename(columns={0:'dxy'})

# dxy() returns None for genes with too few strains and the string 'NaN' for
# genes it could not process. Coerce both to a real NaN so they are dropped
# together and the column is purely numeric for sorting.
df['dxy'] = pd.to_numeric(df['dxy'], errors='coerce')
df = df[~df['dxy'].isnull()]
df = df.sort_values('dxy', ascending=False)

# Index by gene ID: the third '_'-separated token of each file name.
df.index = [file_name.split('_')[2] for file_name in df.index]

df.to_csv('/Users/clairedubin/spur/publishable_data/raw_data/dxy_raw_091719_more_filters.csv')


In [4]:
# Reload the saved per-gene dxy table. The CSV was written with an unnamed
# index, which pandas reads back as the 'Unnamed: 0' column (the gene IDs).
dxy_all = pd.read_csv(
    '/Users/clairedubin/spur/publishable_data/raw_data/dxy_raw_091719_more_filters.csv'
)
all_dxy_genes = dxy_all.loc[:, 'Unnamed: 0']

In [None]:
#drop anything not used in go term analysis

#load go term data as go_terms, remove 3 broad go terms
#source: http://geneontology.org/docs/download-go-annotations/

go_terms = pd.read_csv('/Users/clairedubin/spur/publishable_data/external_datasets/go_terms.csv', header=None)

# The file has no header row, so columns are positional. Of the columns
# listed here only 2, 4, 9 and 10 are kept, and they are renamed below.
go_terms = go_terms.drop(columns=[0, 1,3,5,6,7,8,11,12,13,14,15])
go_terms = go_terms.rename(columns={2: 'sgd_name', 4:'go_term', 9:'gene_desc', 10:'gene'})

# The 'gene' field is a '|'-separated list; keep only its first entry.
# The .str accessor passes missing values through as NaN, whereas indexing
# each element in a Python loop raises TypeError on the first missing value.
go_terms['gene'] = go_terms['gene'].str.split('|').str[0]

go_terms = go_terms.drop_duplicates()

# Remove the three broadest terms (the ontology roots: cellular_component,
# biological_process, molecular_function).
go_terms = go_terms[~go_terms['go_term'].isin(['GO:0005575', 'GO:0008150', 'GO:0003674'])]
go_terms = go_terms.set_index('gene')
go_terms.head()

In [None]:
# Keep only the rows whose gene ID appears in the go_terms index, i.e. genes
# that still have at least one GO annotation.
dxy_all = dxy_all.loc[dxy_all['Unnamed: 0'].isin(go_terms.index)]
dxy_all