In [1]:
import egglib
import glob 
import pandas as pd
import numpy as np
import os
import seaborn as sns

Goal: calculate nucleotide diversity per site and Tajima's D for NLR clades using the egglib python package 

In [2]:
#gather alignments in fasta format 
directory='/global/scratch/users/chandlersutherland/e14/popgen/popgenome_test'
files = glob.glob(os.path.join(directory, "*.fas"))
#import nlr annotations generated by nlr_annotation.ipynb 
annotations=pd.read_csv('/global/scratch/users/chandlersutherland/e14/popgen/nlr_aa_annotation.csv')[['clade', 'start', 'end', 'domain']]

In [3]:
#write a function that takes in protein alignment coordinates and clade and outputs the desired stats 
def sub_stat(row):
    #load alignment, and subset by domain
    fasta='/global/scratch/users/chandlersutherland/e14/popgen/popgenome_test/'+row[0]+'.pal2nal.fas'
    clade=row[0]
    domain=row[3]
    aln=egglib.io.from_fasta(fasta, alphabet=egglib.alphabets.DNA)
    codon=egglib.tools.to_codons(aln)
    subset=codon.extract(int(row[1]), int(row[2]))
    
    #compute across subset stats
    dna=egglib.tools.to_bases(subset)
    ls=egglib.stats.ComputeStats()
    ls.configure(multi_hits=True, multi=False)
    ls.add_stats('Pi', 'lseff', 'D')
    stats=ls.process_align(dna, max_missing=0.1)
    
    lseff=stats['lseff']
    D=stats['D']
    Pi=stats['Pi']
    dna_aln=dna.ls
    
    #compute per synonymous and nonsynonomous site stats 
    cs_subset = egglib.stats.CodingDiversity(subset)

    #use the process_sites function to calculate Pi over just the synonymous and non-synonymous sites 
    statsS=ls.process_sites(cs_subset.sites_S)
    statsNS=ls.process_sites(cs_subset.sites_NS) 

    #normalize by the number of synonymous and nonsynonymous sites
    #if there are no polymorphisms at each site, return NaN
    
    if cs_subset.num_pol_S == 0:
        PiS=np.nan
    else:
        PiS=statsS['Pi']/cs_subset.num_sites_S
    
    if cs_subset.num_pol_NS == 0:
        PiNS=np.nan
    else:
        PiNS=statsNS['Pi']/cs_subset.num_sites_NS
    
    return({'clade':clade, 'domain':domain, 'Pi_raw':Pi, 'lseff':lseff, 'dna_aln':dna_aln, 'PiS':PiS, 'PiN':PiNS, 'D':D})

In [None]:
stats=pd.DataFrame()
#calculate per domain stats 
for i in range(0, len(annotations)):
    row=annotations.iloc[i,:]
    result=sub_stat(row)
    result['start']=row[1]
    result['stop']=row[2]
    #print(result)
    stats=stats.append(result, ignore_index=True)

stats

In [4]:
#add Col-0 gene names and HV label
gene_table=pd.read_table('/global/scratch/users/chandlersutherland/e14/popgen/Atha_NLRome_GeneTable.txt')[['Gene', 'Clade', 'Ecotype', 'HV']]
gene_table=gene_table[gene_table['Ecotype']=='ATHALIANA']
gene_table['Gene']=gene_table['Gene'].str.split('_', expand=True)[1]
gene_table=gene_table.rename(columns={'Clade':'clade'})[['Gene', 'clade', 'HV']]
gene_table=gene_table.assign(HV=gene_table.HV.map({0:'non-hv', 1:'hv'}))

egglib_results=pd.merge(stats, gene_table)
egglib_results

In [None]:
egglib_results.to_csv('/global/scratch/users/chandlersutherland/e14/popgen/egglib_results.tsv', sep='\t')

Complement the per domain analysis with sliding window

In [6]:
cds_only=annotations[annotations['domain']=='cds']

In [7]:
#function that takes in a clade name, and evaluates Pi, PiS, PiNS, and D across sliding windows of window size 33 codons and step size 10 codons 
#outputs a dataframe with the results across the codon windows 
def my_slider(row, window, step):
    #get codon length 
    codon_length=row[2]
    clade=row[0]
    
    #generate a new annotations dataframe with sliding windows to this length 
    fake_annotations=pd.DataFrame()
    for i in range(0, int(codon_length/step)):
        fake_annotations=fake_annotations.append({'start':0+i*step, 'end':window+i*step, 'clade':clade, 'domain':'window'}, ignore_index=True)
    fake_annotations=fake_annotations[['clade', 'start', 'end', 'domain']]
    return fake_annotations

In [8]:
#create new annotations df with sliding windows
slide=[]
for i in range(0, len(cds_only)):
    row=cds_only.iloc[i,:]
    windows=my_slider(row, 100, 25)
    slide.append(windows)

slide_df=pd.concat(slide)
slide_df

Unnamed: 0,clade,start,end,domain
0,Int11629_24_35_L_12,0.0,100.0,window
1,Int11629_24_35_L_12,25.0,125.0,window
2,Int11629_24_35_L_12,50.0,150.0,window
3,Int11629_24_35_L_12,75.0,175.0,window
4,Int11629_24_35_L_12,100.0,200.0,window
...,...,...,...,...
100,Int12432_284,2500.0,2600.0,window
101,Int12432_284,2525.0,2625.0,window
102,Int12432_284,2550.0,2650.0,window
103,Int12432_284,2575.0,2675.0,window


In [9]:
window_stats=pd.DataFrame()
for i in range(0, len(slide_df)):
    row=slide_df.iloc[i,:]
    result=sub_stat(row)
    result['start']=row[1]
    result['stop']=row[2]
    #print(result)
    window_stats=window_stats.append(result, ignore_index=True)

window_stats

egglib_window_final=pd.merge(window_stats, gene_table)
egglib_window_final.to_csv('/global/scratch/users/chandlersutherland/e14/popgen/egglib_window_stats.csv')

In [12]:
vignette=slide_df[slide_df['clade'].isin(['Int8443_258_351_L_88_120_L_38','Int8443_258_351_L_88_120_R_50', "Int14387_212_253_R_34", "Int14387_212_251_L_60", "Int14387_212_343_R_44"])]
for i in range(0, len(vignette)):
    row=slide_df.iloc[i,:]
    result=sub_stat(row)
    result['start']=row[1]
    result['stop']=row[2]
    #print(result)
    window_stats=window_stats.append(result, ignore_index=True)

window_stats

Unnamed: 0,D,PiN,PiS,Pi_raw,clade,dna_aln,domain,lseff,start,stop
0,-1.18339,0.041878,0.147875,13.051515,Int11629_24_35_L_12,300.0,window,267.0,0.0,100.0
1,-0.436408,0.041878,0.147875,7.724242,Int11629_24_35_L_12,300.0,window,300.0,25.0,125.0
2,-0.565476,0.032958,0.082698,1.518182,Int11629_24_35_L_12,300.0,window,300.0,50.0,150.0
3,-1.11173,,,0.200000,Int11629_24_35_L_12,300.0,window,300.0,75.0,175.0
4,-1.11173,,,0.200000,Int11629_24_35_L_12,300.0,window,300.0,100.0,200.0
...,...,...,...,...,...,...,...,...,...,...
164,-1.1451,0.015191,0.021417,3.290914,Int9687_297_427_R_29,300.0,window,246.0,450.0,550.0
165,-0.276613,0.017414,0.022000,5.692666,Int9687_297_427_R_29,300.0,window,219.0,475.0,575.0
166,0.625087,0.020245,0.026025,5.155629,Int9687_297_427_R_29,300.0,window,219.0,500.0,600.0
167,0.814388,,,4.462963,Int9687_297_427_R_29,300.0,window,222.0,525.0,625.0


In [14]:
vignette_window_results=pd.merge(window_stats, gene_table)
vignette_window_results
#vignette_window_results.to_csv('/global/scratch/users/chandlersutherland/e14/popgen/egglib_vignette_window_stats.csv')

Unnamed: 0,D,PiN,PiS,Pi_raw,clade,dna_aln,domain,lseff,start,stop,Gene,HV
0,,,,0.000000,Int9156_236_297_R_14,300.0,window,267.0,0.0,100.0,AT5G48770,non-hv
1,,,,0.000000,Int9156_236_297_R_14,300.0,window,300.0,25.0,125.0,AT5G48770,non-hv
2,,,,0.000000,Int9156_236_297_R_14,300.0,window,300.0,50.0,150.0,AT5G48770,non-hv
3,-0.847866,0.004045,,0.934066,Int9156_236_297_R_14,300.0,window,300.0,75.0,175.0,AT5G48770,non-hv
4,-0.847866,0.004016,,0.934066,Int9156_236_297_R_14,300.0,window,300.0,100.0,200.0,AT5G48770,non-hv
...,...,...,...,...,...,...,...,...,...,...,...,...
66,-1.1451,0.015191,0.021417,3.290914,Int9687_297_427_R_29,300.0,window,246.0,450.0,550.0,AT1G63870,non-hv
67,-0.276613,0.017414,0.022000,5.692666,Int9687_297_427_R_29,300.0,window,219.0,475.0,575.0,AT1G63870,non-hv
68,0.625087,0.020245,0.026025,5.155629,Int9687_297_427_R_29,300.0,window,219.0,500.0,600.0,AT1G63870,non-hv
69,0.814388,,,4.462963,Int9687_297_427_R_29,300.0,window,222.0,525.0,625.0,AT1G63870,non-hv


In [None]:
egglib_window_final=pd.merge(egglib_window_results, gene_table)
egglib_window_final.to_csv('/global/scratch/users/chandlersutherland/e14/popgen/egglib_window_stats.csv')