In [None]:
import os 
import pandas as pd 
import numpy as np 
import glob
import time

Goal: gather the pertinent information from the HyPhy log files and combine it into a single dataframe giving the proportion of codons identified as being under positive selection. Each HyPhy method produces a slightly different log file, so a separate set of parsing functions is defined for each one.

Start with MEME analysis 

In [None]:
# Collect the per-clade MEME logs: one run over all branches and one run
# restricted to internal branches.
meme_logs = glob.glob('/global/scratch/users/chandlersutherland/e14/popgen/clades/*/hyphy_meme.log')
meme_internal_logs = glob.glob('/global/scratch/users/chandlersutherland/e14/popgen/clades/*/hyphy_meme_internal.log')
# Sanity check: number of internal-branch logs found.
len(meme_internal_logs)

In [None]:
#define a function that takes in the file path, and returns the row index of the sites df in the log file if sites are under selection 
def number_hunter_meme(file_path):
    """Find the line indices that bracket the site table in a MEME log.

    Parameters
    ----------
    file_path : str
        Path to a MEME log file.

    Returns
    -------
    tuple
        ``(start, end, codon_len)`` where

        * ``start`` is the 0-based index of the line containing
          'Episodic selection detected?' (presumably the table header),
        * ``end`` is the 0-based index of the '### ** Found' summary line,
        * ``codon_len`` is the integer between the second pair of ``**``
          on the 'Loaded a multiple' line.

        ``start`` and ``end`` are both ``0`` when the summary line reports
        ``_0_`` sites. ``start`` and ``end`` are ``np.nan`` when the log
        contains 'Empty File' or when either marker is missing (e.g. a
        truncated log); ``codon_len`` is ``np.nan`` if its line was not seen.
    """
    # Defaults so that a log lacking a marker yields NaN instead of raising
    # UnboundLocalError at the return statement.
    start = end = codon_len = np.nan
    with open(file_path, 'r') as file:
        # enumerate gives the index of the *current* line; list.index(row)
        # would rescan the file and return the first identical line instead.
        for idx, row in enumerate(file):
            if 'Empty File' in row:
                print('Error, empty file. Probably no tree file.')
                start = end = codon_len = np.nan
                # Stop here so later lines cannot overwrite the NaN sentinel.
                break
            elif 'Loaded a multiple' in row:
                codon_len = int(row.split('**')[3])
            elif '### ** Found _0_' in row:
                # Must be tested before the generic '### ** Found' marker.
                print('zero sites under positive selection')
                start = 0
                end = 0
            elif 'Episodic selection detected?' in row:
                start = idx
            elif '### ** Found' in row:
                end = idx

    # A table with only one of its two boundaries cannot be sliced; report
    # the log as unparseable rather than handing back a half-defined range.
    if np.isnan(start) or np.isnan(end):
        start = end = np.nan
    return (start, end, codon_len)

#define a function that takes in the row indices from number_hunter, the codon length from number_hunter, and the file path 
#this function 1) writes the positions to a csv in the clade directory and 2) returns that dataframe for processing 
def file_convert_meme(start, end, codon_len, file_path):
    """Turn the site table of one MEME log into a dataframe.

    Parameters
    ----------
    start, end : int or float
        Line indices as returned by ``number_hunter_meme``. ``np.nan``
        means the log could not be parsed; ``0`` means no sites reported.
    codon_len : int or float
        Codon count as returned by ``number_hunter_meme``.
    file_path : str
        Path to the log; the parent directory name is used as the clade.

    Returns
    -------
    pandas.DataFrame
        One row per table line with columns clade, codon_len, codon,
        parititon, alpha, beta, LRT and p. For an unparseable log a single
        all-NaN row is returned; for a log with no sites a single row with
        ``p == 1`` is returned so the clade is kept with a zero count.

    Side effects
    ------------
    When rows are parsed, they are also written next to the log as
    ``<log name without extension>_pos.csv``.
    """
    # NOTE: the 'parititon' key is misspelled in the original output schema;
    # it is kept unchanged so existing CSV consumers keep working.
    clade = file_path.split('/')[-2]

    if np.isnan(start):
        print('error, no file')
        # Placeholder row when the log could not be parsed.
        return pd.DataFrame([{'clade':clade, 'codon_len':np.nan, 'codon':np.nan, 'parititon':np.nan, 'alpha':np.nan, 'beta':np.nan, 'LRT':np.nan, 'p':np.nan}])

    if start == 0:
        print('no positions under selection')
        return pd.DataFrame([{'clade':clade, 'codon_len':codon_len, 'codon':np.nan, 'parititon':np.nan, 'alpha':np.nan, 'beta':np.nan, 'LRT':np.nan, 'p':1}])

    with open(file_path, 'r') as file:
        file_contents = file.readlines()

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and was quadratic when called per row.
    records = []
    # Skip the two lines at `start` and `start + 1`, and stop one line
    # before the line preceding `end`.
    for line in file_contents[start+2:end-1]:
        fields = [token for token in line.split(' ') if token != ""]
        LRT = fields[11]
        if LRT == 'Yes,':
            # Row has one column fewer (presumably a log format without the
            # extra column before LRT), so LRT sits two tokens earlier.
            LRT = fields[9]
        records.append({'clade':clade, 'codon_len':int(codon_len), 'codon':int(fields[1]), 'parititon':int(fields[3]), 'alpha':float(fields[5]), 'beta':float(fields[7]), 'LRT':float(LRT), 'p':float(fields[-4])})
    positions = pd.DataFrame(records)

    # str.strip() removes a *set of characters*, not a suffix, which mangled
    # the name for e.g. 'hyphy_meme_internal.log'. Derive the name from the
    # log's own stem so every log gets its own CSV in its own directory.
    stem = os.path.splitext(os.path.basename(file_path))[0]
    output_csv = os.path.join(os.path.dirname(file_path), stem + '_pos.csv')
    positions.to_csv(output_csv)
    return positions

#finally a function that wraps these functions
def wrapper_meme(file_path):
    """Parse one MEME log end to end and return its positions dataframe.

    Runs ``number_hunter_meme`` to find the table boundaries and codon
    count, then hands them to ``file_convert_meme`` together with the path.
    """
    start, end, codon_len = number_hunter_meme(file_path)
    return file_convert_meme(start, end, codon_len, file_path)

In [None]:
#apply to meme_logs and meme_internal logs 
def _summarise_meme(log_paths, prop_col):
    """Parse a set of MEME logs and count significant sites per clade.

    Returns ``(positions, counts)``: ``positions`` is the concatenation of
    the per-log frames from ``wrapper_meme``; ``counts`` has one row per
    (clade, codon_len) with ``meme_count_95`` (number of rows with p < 0.05)
    and ``prop_col`` (that count divided by ``codon_len``).
    """
    # pd.concat over a list replaces DataFrame.append, which was removed in
    # pandas 2.0. The empty-list fallback mirrors the original starting frame.
    frames = [wrapper_meme(path) for path in log_paths]
    positions = pd.concat(frames) if frames else pd.DataFrame()
    counts = (
        positions.groupby(['clade', 'codon_len'])['p']
        .apply(lambda x: (x < 0.05).sum())
        .reset_index()
        .rename(columns={'p': 'meme_count_95'})
    )
    counts[prop_col] = counts['meme_count_95'] / counts['codon_len']
    return positions, counts


meme, meme_counts = _summarise_meme(meme_logs, 'prop_meme_95')
meme_internal, meme_internal_counts = _summarise_meme(meme_internal_logs, 'prop_meme_internal_95')

# Inner join on clade: only clades present in both runs are kept.
meme_results = pd.merge(meme_counts[['clade', 'codon_len', 'prop_meme_95']], meme_internal_counts[['clade', 'prop_meme_internal_95']], on='clade')
meme_results.to_csv('/global/scratch/users/chandlersutherland/e14/popgen/meme_results.csv')

Repeat for FEL results 

In [None]:
# Collect the per-clade FEL logs: the internal-branch runs and the
# all-branch runs.
fel_internal_logs = glob.glob('/global/scratch/users/chandlersutherland/e14/popgen/clades/*/hyphy_fel_internal.log')
fel_logs = glob.glob('/global/scratch/users/chandlersutherland/e14/popgen/clades/*/hyphy_fel.log')

In [None]:
def number_hunter_fel(file_path):
    """Find the line indices that bracket the site table in a FEL log.

    Parameters
    ----------
    file_path : str
        Path to a FEL log file.

    Returns
    -------
    tuple
        ``(start, end, codon_len)`` where

        * ``start`` is the 0-based index of the line containing
          'these sites are significant',
        * ``end`` is the 0-based index of the summary line containing
          'under pervasive positive',
        * ``codon_len`` is the integer between the second pair of ``**``
          on the 'Loaded a multiple' line.

        ``start`` and ``end`` are both ``0`` when the summary reports ``_0_``
        positive and ``_0_`` negative sites. ``start`` and ``end`` are
        ``np.nan`` when the log contains 'Empty File' or when either marker
        is missing; ``codon_len`` is ``np.nan`` if its line was not seen.
    """
    # Defaults so that a log lacking a marker yields NaN instead of raising
    # UnboundLocalError at the return statement.
    start = end = codon_len = np.nan
    with open(file_path, 'r') as file:
        # enumerate gives the index of the *current* line; list.index(row)
        # would rescan the file and return the first identical line instead.
        for idx, row in enumerate(file):
            if 'Empty File' in row:
                print('Error, empty file. Probably no tree file.')
                start = end = codon_len = np.nan
                break
            elif 'Loaded a multiple' in row:
                codon_len = int(row.split("**")[3])
            elif 'Found _0_ sites under pervasive positive diversifying and _0_ sites under negative selection' in row:
                # Must be tested before the generic summary marker below.
                start = 0
                end = 0
            elif 'these sites are significant' in row:
                start = idx
            elif 'under pervasive positive' in row:
                end = idx

    # A table with only one of its two boundaries cannot be sliced; report
    # the log as unparseable rather than handing back a half-defined range.
    if np.isnan(start) or np.isnan(end):
        start = end = np.nan
    return (start, end, codon_len)

def file_convert_fel(start, end, codon_len, file_path):
    """Turn the site table of one FEL log into a dataframe.

    Parameters
    ----------
    start, end : int or float
        Line indices as returned by ``number_hunter_fel``. ``np.nan``
        means the log could not be parsed; ``0`` means no sites reported.
    codon_len : int or float
        Codon count as returned by ``number_hunter_fel``.
    file_path : str
        Path to the log; the parent directory name is used as the clade.

    Returns
    -------
    pandas.DataFrame
        One row per table line with columns clade, codon_len, codon,
        parititon, alpha, beta, LRT, direction and p. For an unparseable log
        a single all-NaN row is returned; for a log with no sites a single
        row with ``p == 1`` is returned. Placeholder rows carry
        ``direction = NaN`` so every returned frame has the same columns.

    Side effects
    ------------
    When rows are parsed, they are also written next to the log as
    ``<log name without extension>_pos.csv``.
    """
    # NOTE: the 'parititon' key is misspelled in the original output schema;
    # it is kept unchanged so existing CSV consumers keep working.
    clade = file_path.split('/')[-2]

    if np.isnan(start):
        # Placeholder row when the log could not be parsed.
        return pd.DataFrame([{'clade':clade, 'codon_len':np.nan, 'codon':np.nan, 'parititon':np.nan, 'alpha':np.nan, 'beta':np.nan, 'LRT':np.nan, 'direction':np.nan, 'p':np.nan}])

    if start == 0:
        return pd.DataFrame([{'clade':clade, 'codon_len':codon_len, 'codon':np.nan, 'parititon':np.nan, 'alpha':np.nan, 'beta':np.nan, 'LRT':np.nan, 'direction':np.nan, 'p':1}])

    with open(file_path, 'r') as file:
        file_contents = file.readlines()

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and was quadratic when called per row.
    records = []
    # Skip the four lines from `start` to `start + 3`, and stop one line
    # before the line preceding `end`.
    for line in file_contents[start+4:end-1]:
        fields = [token for token in line.split(' ') if token != ""]
        records.append({'clade':clade, 'codon_len':int(codon_len), 'codon':int(fields[1]), 'parititon':int(fields[3]), 'alpha':float(fields[5]), 'beta':float(fields[7]), 'LRT':float(fields[9]), 'direction':fields[11], 'p':float(fields[-2])})
    positions = pd.DataFrame(records)

    # The original used str.strip() (which removes a character set, not a
    # suffix) and a fixed 'hyphy_fel_internal_pos.csv' name, so the
    # all-branch and internal-branch runs wrote to the same file. Derive the
    # name from the log's own stem so each log gets its own CSV.
    stem = os.path.splitext(os.path.basename(file_path))[0]
    output_csv = os.path.join(os.path.dirname(file_path), stem + '_pos.csv')
    positions.to_csv(output_csv)
    return positions

def wrapper_fel(file_path):
    """Parse one FEL log end to end and return its positions dataframe.

    Runs ``number_hunter_fel`` to find the table boundaries and codon
    count, then hands them to ``file_convert_fel`` together with the path.
    """
    start, end, codon_len = number_hunter_fel(file_path)
    return file_convert_fel(start, end, codon_len, file_path)

In [None]:
def _summarise_fel(log_paths, prop_col):
    """Parse a set of FEL logs and count significant sites per clade/direction.

    Returns ``(positions, counts)``: ``positions`` is the concatenation of
    the per-log frames from ``wrapper_fel``; ``counts`` has one row per
    (clade, codon_len, direction) with ``fel_count_95`` (number of rows with
    p < 0.05) and ``prop_col`` (that count divided by ``codon_len``).
    """
    # pd.concat over a list replaces DataFrame.append, which was removed in
    # pandas 2.0. The empty-list fallback mirrors the original starting frame.
    frames = [wrapper_fel(path) for path in log_paths]
    positions = pd.concat(frames) if frames else pd.DataFrame()
    # groupby drops NaN keys by default, so placeholder rows (no direction)
    # do not contribute a group here.
    counts = (
        positions.groupby(['clade', 'codon_len', 'direction'])['p']
        .apply(lambda x: (x < 0.05).sum())
        .reset_index()
        .rename(columns={'p': 'fel_count_95'})
    )
    counts[prop_col] = counts['fel_count_95'] / counts['codon_len']
    return positions, counts


fel_internal, fel_internal_counts = _summarise_fel(fel_internal_logs, 'prop_fel_int_95')
fel_internal_counts = fel_internal_counts[['clade', 'direction', 'prop_fel_int_95']]

fel, fel_counts = _summarise_fel(fel_logs, 'prop_fel_95')

# Inner join: only (clade, direction) pairs present in both runs are kept.
fel_results = pd.merge(fel_internal_counts, fel_counts[['clade', 'direction', 'prop_fel_95']], on=['clade', 'direction'])
fel_results.to_csv('/global/scratch/users/chandlersutherland/e14/popgen/fel_results.csv')

In [None]:
# Save the per-site FEL table from the internal-branch runs.
fel_internal.to_csv('/global/scratch/users/chandlersutherland/e14/popgen/fel_internal_positions.csv')