In [1]:
import egglib
import glob 
import pandas as pd
import numpy as np
import os
import seaborn as sns

Goal: generate a dataframe with the majority vote NLR annotations across each clade

In [2]:
#gather alignments in fasta format 
directory='/global/scratch/users/chandlersutherland/e14/popgen/popgenome_test'
files = glob.glob(os.path.join(directory, "*.fas"))

Let's compute by domain statistics. First, read in the annotations from the pan-nlrome. Then, convert to an alignment index and take the majority vote. 

In [3]:
tbl = pd.read_table('/global/scratch/users/chandlersutherland/e14/popgen/panl-nrlome-nlr-domains.tsv', error_bad_lines=False, names=['Gene', 'read', 'length', 'source', 'code', 'domain', 'start', 'stop', 'p', 'unknown', 'date', 'interpro', 'domain_2'])

In [4]:
#what annotations are available for each clade? 
for fasta in files:
    clade=fasta.split('/')[-1].split('.')[0]
    aln=egglib.io.from_fasta(fasta, alphabet=egglib.alphabets.DNA)

    names=[]
    for i in range(0, aln.ns):
        name=aln.get_sample(i).name
        if name.startswith('ATHALIANA'):
        #using just the primary transcript
            clean=name.replace('ATHALIANA_', '6909|').replace('_1', '')+'.1'
        else:
            clean=name.replace('_', '|')
        names.append(clean)

    per_gene=tbl[tbl['Gene'].isin(names)]
    print(clade, ':', per_gene['domain'].unique())

Int11629_24_35_L_12 : ['TIR domain' 'NB-ARC domain' 'Leucine Rich Repeat']
Int8443_258_453_R_8 : ['NB-ARC domain' 'maj_vote_coils_paircoils_nlrparser'
 'Leucine rich repeat']
Int9156_236_297_R_14 : ['TIR domain' 'NB-ARC domain' 'Leucine Rich Repeat']
Int9687_297_427_R_29 : ['TIR domain' 'NB-ARC domain' 'Leucine Rich Repeat']
Int12133_61 : ['NB-ARC domain' 'TIR domain' 'Leucine Rich Repeat']
Int8443_258_360_R_43 : ['NB-ARC domain' 'maj_vote_coils_paircoils_nlrparser'
 'Leucine rich repeat']
Int9687_297_557_R_19 : ['TIR domain' 'NB-ARC domain' 'Leucine Rich Repeat']
Int11247_267_326_R_108_147_R_52_64_R_9 : ['TIR domain' 'NB-ARC domain' 'Leucine Rich Repeat'
 'Leucine rich repeats (6 copies)']
Int14642_297_369_R_46 : ['maj_vote_coils_paircoils_nlrparser' 'NB-ARC domain']
Int8443_258_351_L_88_120_R_50 : ['NB-ARC domain' 'Leucine rich repeat'
 'maj_vote_coils_paircoils_nlrparser']
Int7765_208_251_L_43 : ['NB-ARC domain' 'maj_vote_coils_paircoils_nlrparser'
 'Leucine rich repeat']
Int10637_3

Int10172_376_495_R_128 : ['TIR domain' 'NB-ARC domain' 'Leucine Rich Repeat']
Int11061_33_35_R_4 : ['TIR domain' 'NB-ARC domain' 'Leucine Rich Repeat']
Int11547_65_120_L_59 : ['TIR domain' 'NB-ARC domain' 'Leucine Rich Repeat']
Int9983_190_313_R_24 : ['Leucine Rich Repeat' 'NB-ARC domain' 'TIR domain']
Int9457_231_264_R_50 : ['NB-ARC domain' 'TIR domain' 'Leucine Rich Repeat']
Int12194_239_349_R_60 : ['NB-ARC domain' 'Leucine Rich Repeat' 'TIR domain']
Int10172_376_424_R_11 : ['TIR domain' 'NB-ARC domain' 'Leucine Rich Repeat']
Int10637_304_546_R_11 : ['TIR domain' 'NB-ARC domain' 'Leucine Rich Repeat']
Int10172_376_624_R_44 : ['TIR domain' 'NB-ARC domain' 'Leucine Rich Repeat']
Int14979_63_105_L_46 : ['NB-ARC domain']
Int12974_336_507_R_56 : ['WRKY DNA -binding domain' 'NB-ARC domain' 'Leucine Rich Repeat']
Int7973_410_712_R_35 : ['NB-ARC domain' 'maj_vote_coils_paircoils_nlrparser'
 'Leucine rich repeat']
Int8760_26_29_L_4 : ['NB-ARC domain' 'maj_vote_coils_paircoils_nlrparser'
 'Leu

Define a few functions that will 1. get the names of the genes in each clade from the protein alignment file, 2. get the annotations for each gene, 3. reassign the DNA index to the protein index, 3. convert the annotation coordinates, 4. calculate a majority vote across the protein alignment. 

In [5]:
#from the fasta file, check the alignment object and return a list of gene names 
def get_names(fasta):
    aln=egglib.io.from_fasta(fasta, alphabet=egglib.alphabets.DNA)

    names=[]
    for i in range(0, aln.ns):
        name=aln.get_sample(i).name
        if name.startswith('ATHALIANA'):
        #using just the primary transcript
            clean=name.replace('ATHALIANA_', '6909|').replace('_1', '')+'.1'
        else:
            clean=name.replace('_', '|')
        names.append(clean)

    return(names)

#with those gene names, filter the annotation table 
def get_annotation(df, names):
    per_gene=df[df['Gene'].isin(names)]
    return(per_gene)

#function to reassign the index. Takes in an amino acid sequence, and iterates through. 
def reassign_index(aa_sequence):
    #generate an array of zeros the length of the protein 
    real_index=[0]*len(aa_sequence)
    #start counting at one, not standard python index 
    num=0
    
    #iterate through protein sequence, skipping the gaps  
    for aa in enumerate(aa_sequence): 
        index=aa[0]
        if aa[1] != '-':
            num=num+1
            real_index[index]=num
        if aa[1] == '-':
            real_index[index]=num 
    
    return real_index 

#function that outputs new coordinates that work for the alignment index 
def convert_coord(new_index, coord):
    #convert to a 1 based indexing 
    return [i for i, x in enumerate(new_index) if x == coord+1]

#write a function that takes in a list of genes and a df from the domain annotation file 
#and outputs the majority vote alignment coordinates for that domain 
def majority_vote(gene_list, df, fasta):
    results=pd.DataFrame()
    aln=egglib.io.from_fasta(fasta, alphabet=egglib.alphabets.DNA)
    aln.to_codons()
    prot = egglib.tools.translate(aln)

    for seq in enumerate(gene_list):
       #load sequence 
        aa_seq=prot.get_sequence(seq[0])

        #and name from pal2nal alignment
        name=prot.get_sample(seq[0])[0]

        if name.startswith('ATHALIANA'):
                #using just the primary transcript
            clean=name.replace('ATHALIANA_', '6909|').replace('_1', '')+'.1'
        else:
            clean=name.replace('_', '|')
        
        #currently skipping when there are double annotations. For better or for worse.
        if (clean in df['Gene'].unique() and len(df.loc[df['Gene']==clean])==1):
            start=df.loc[df['Gene']==clean]['start'].item()
            stop=df.loc[df['Gene']==clean]['stop'].item()

            new_index=reassign_index(aa_seq)
            new_start=convert_coord(new_index, start)
            new_stop=convert_coord(new_index, stop)
            results=results.append({'aln_start':new_start, 'aln_end':new_stop}, ignore_index=True)
        else: 
            continue 

    results['aln_start']=results['aln_start'].str[0]
    results['aln_end']=results['aln_end'].str[0]

    aln_start=results.mode(axis=0).loc[0, 'aln_start']
    aln_end=results.mode(axis=0).loc[0, 'aln_end']
    return [aln_start, aln_end]

In [8]:
nbarc_tbl=tbl[tbl['domain']=='NB-ARC domain']

#calculate the majority vote across each alignment for the nbarc coordinates 
nbarc_aa_coords=pd.DataFrame()
for fasta in files:
    clade=fasta.split('/')[-1].split('.')[0]
    gene_list=get_names(fasta)
    df=get_annotation(nbarc_tbl, gene_list)
    
    try: 
        nbcoords=majority_vote(gene_list, df, fasta)
        nbarc_aa_coords=nbarc_aa_coords.append({'clade':clade, 'start':nbcoords[0], 'end':nbcoords[1], 'domain':'nbarc'}, ignore_index=True)
    except:
        print(clade, ' could not compute nbarc coords')

nbarc_aa_coords



Int12974_336_433_R_14  could not compute nbarc coords
Int12974_336_426_R_8  could not compute nbarc coords


Unnamed: 0,clade,domain,end,start
0,Int11629_24_35_L_12,nbarc,874.0,612.0
1,Int8443_258_453_R_8,nbarc,437.0,157.0
2,Int9156_236_297_R_14,nbarc,452.0,201.0
3,Int9687_297_427_R_29,nbarc,448.0,201.0
4,Int12133_61,nbarc,508.0,250.0
...,...,...,...,...
187,Int9156_236_291_R_7,nbarc,479.0,222.0
188,Int10172_376_423_R_43,nbarc,454.0,190.0
189,Int14939_40_64_R_9,nbarc,451.0,167.0
190,Int11247_267_434_R_35,nbarc,506.0,271.0


What are the issues with Int12974_336_433_R_14 and Int12974_336_426_R_8?

In [None]:
nbarc_tbl
a=get_names('/global/scratch/users/chandlersutherland/e14/popgen/popgenome_test/'+'Int12974_336_433_R_14'+'.pal2nal.fas')
b=get_names('/global/scratch/users/chandlersutherland/e14/popgen/popgenome_test/'+'Int12974_336_426_R_8'+'.pal2nal.fas')

get_annotation(nbarc_tbl, a) 
get_annotation(nbarc_tbl,b)

They each have double the NBARC domains. No Col-0 reps, so moving on for now. 

In [9]:
tir_tbl=tbl[tbl['domain']=='TIR domain']

tir_aa_coords=pd.DataFrame()
for fasta in files:
    clade=fasta.split('/')[-1].split('.')[0]
    gene_list=get_names(fasta)
    df=get_annotation(tir_tbl, gene_list)
    
    try: 
        tircoords=majority_vote(gene_list, df, fasta)
        tir_aa_coords=tir_aa_coords.append({'clade':clade, 'start':tircoords[0], 'end':tircoords[1], 'domain':'tir'}, ignore_index=True)
    except:
        print(clade, ' could not compute tir coords')

tir_aa_coords



Int8443_258_453_R_8  could not compute tir coords
Int8443_258_360_R_43  could not compute tir coords
Int14642_297_369_R_46  could not compute tir coords
Int8443_258_351_L_88_120_R_50  could not compute tir coords
Int7765_208_251_L_43  could not compute tir coords
Int14174_152_277_R_8  could not compute tir coords
Int8443_258_352_R_4  could not compute tir coords
Int8908_92_114_R_28  could not compute tir coords
Int14052_123_185_L_62  could not compute tir coords
Int7973_410_746_R_58  could not compute tir coords
Int14328_59  could not compute tir coords
Int14034_18  could not compute tir coords
Int9000_84_90_R_17  could not compute tir coords
Int14174_152_254_R_24  could not compute tir coords
Int8443_258_447_R_7  could not compute tir coords
Int8786_59  could not compute tir coords
Int12974_336_392_R_32  could not compute tir coords
Int14642_297_350_R_17  could not compute tir coords
Int14174_152_193_R_60  could not compute tir coords
Int8845_63  could not compute tir coords
Int14387_

Unnamed: 0,clade,domain,end,start
0,Int11629_24_35_L_12,tir,608.0,407.0
1,Int9156_236_297_R_14,tir,199.0,24.0
2,Int9687_297_427_R_29,tir,191.0,17.0
3,Int12133_61,tir,247.0,55.0
4,Int9687_297_557_R_19,tir,220.0,15.0
...,...,...,...,...
115,Int11093_76_91_R_54,tir,190.0,13.0
116,Int9156_236_291_R_7,tir,212.0,13.0
117,Int10172_376_423_R_43,tir,187.0,13.0
118,Int11247_267_434_R_35,tir,255.0,52.0


In [10]:
cc_tbl=tbl[tbl['domain']=='maj_vote_coils_paircoils_nlrparser']

cc_aa_coords=pd.DataFrame()
for fasta in files:
    clade=fasta.split('/')[-1].split('.')[0]
    gene_list=get_names(fasta)
    df=get_annotation(cc_tbl, gene_list)
    
    try: 
        cccoords=majority_vote(gene_list, df, fasta)
        cc_aa_coords=cc_aa_coords.append({'clade':clade, 'start':cccoords[0], 'end':cccoords[1], 'domain':'cc'}, ignore_index=True)
    except:
        continue
        print(clade, ' could not compute cc coords')

cc_aa_coords



Unnamed: 0,clade,domain,end,start
0,Int8443_258_453_R_8,cc,62.0,28.0
1,Int8443_258_360_R_43,cc,64.0,30.0
2,Int14642_297_369_R_46,cc,61.0,13.0
3,Int8443_258_351_L_88_120_R_50,cc,56.0,29.0
4,Int7765_208_251_L_43,cc,159.0,24.0
5,Int8443_258_352_R_4,cc,50.0,16.0
6,Int7973_410_746_R_58,cc,56.0,36.0
7,Int14328_59,cc,529.0,462.0
8,Int8443_258_447_R_7,cc,63.0,29.0
9,Int12974_336_392_R_32,cc,1157.0,1130.0


Daniil ran LRRpredictor on the panNLRome NLRs. Use this to get a better annotation across the clades than the PFAM.

In [13]:
#read in pan nlrome and convert to appropriate format for functions 
pan_lrr_pred=pd.read_csv('/global/scratch/users/chandlersutherland/e14/popgen/Athaliana_panNLRome_lrrpred.txt', sep='\t', index_col=False)
pan_lrr_pred=pan_lrr_pred[pan_lrr_pred['#Prot']!='#Prot']
pan_lrr_pred['pos'] = pd.to_numeric(pan_lrr_pred['pos'])
pan_lrr_pred.dtypes
rough=pan_lrr_pred.groupby('#Prot').pos.agg(['min', 'max']).reset_index()
to_test=rough.rename(columns={'#Prot':'Gene', 'min':'start', 'max':'stop'})
to_test['Gene']=to_test['Gene'].str.replace('_', '|')

#read in Col0 and cleanup 
lrr_pred=pd.read_csv('/global/scratch/users/chandlersutherland/e14/popgen/col-0-nlrome.txt', sep='\t')
col=lrr_pred.groupby('annotation').start.agg(['min', 'max']).reset_index()
col=col.rename(columns={'annotation':'Gene', 'min':'start', 'max':'stop'})
col['Gene']=col['Gene'].str.replace('Athaliana_', '6909|').str.replace('_1', '.1')
col0=col[col['Gene'].str.endswith('.1')]

lrr_tbl=pd.concat([to_test, col0])

In [14]:
lrr_aa_coords=pd.DataFrame()
for fasta in files:
    clade=fasta.split('/')[-1].split('.')[0]
    gene_list=get_names(fasta)
    df=get_annotation(lrr_tbl, gene_list)
    
    try: 
        lrrcoords=majority_vote(gene_list, df, fasta)
        lrr_aa_coords=lrr_aa_coords.append({'clade':clade, 'start':lrrcoords[0], 'end':lrrcoords[1], 'domain':'lrr'}, ignore_index=True)
    except:
        print(clade, ' could not compute lrr coords')
        continue

lrr_aa_coords



Int10547_34_50_L_17  could not compute lrr coords


Unnamed: 0,clade,domain,end,start
0,Int11629_24_35_L_12,lrr,1450.0,985.0
1,Int8443_258_453_R_8,lrr,831.0,512.0
2,Int9156_236_297_R_14,lrr,1000.0,537.0
3,Int9687_297_427_R_29,lrr,865.0,572.0
4,Int12133_61,lrr,1175.0,616.0
...,...,...,...,...
188,Int9156_236_291_R_7,lrr,1161.0,569.0
189,Int10172_376_423_R_43,lrr,864.0,561.0
190,Int14939_40_64_R_9,lrr,853.0,529.0
191,Int11247_267_434_R_35,lrr,1037.0,652.0


Add coordinates for the entire cds to calculate gene-wide statistics 

In [11]:
cds_coords=pd.DataFrame() 
for fasta in files: 
    clade=fasta.split('/')[-1].split('.')[0]
    aln=egglib.io.from_fasta(fasta, alphabet=egglib.alphabets.DNA)
    codon=egglib.tools.to_codons(aln)
    length=codon.ls
    cds_coords=cds_coords.append({'clade':clade, 'start':0, 'end':length, 'domain':'cds'}, ignore_index=True)
    
cds_coords

Unnamed: 0,clade,domain,end,start
0,Int11629_24_35_L_12,cds,1621.0,0.0
1,Int8443_258_453_R_8,cds,856.0,0.0
2,Int9156_236_297_R_14,cds,1201.0,0.0
3,Int9687_297_427_R_29,cds,1223.0,0.0
4,Int12133_61,cds,1293.0,0.0
...,...,...,...,...
189,Int9156_236_291_R_7,cds,1198.0,0.0
190,Int10172_376_423_R_43,cds,1126.0,0.0
191,Int14939_40_64_R_9,cds,908.0,0.0
192,Int11247_267_434_R_35,cds,1763.0,0.0


In [19]:
nlr_annotation=pd.concat([nbarc_aa_coords, cds_coords, lrr_aa_coords, tir_aa_coords, cc_aa_coords])
nlr_annotation=nlr_annotation[['clade', 'start', 'end', 'domain']]
nlr_annotation
nlr_annotation.to_csv('/global/scratch/users/chandlersutherland/e14/popgen/nlr_aa_annotation.csv')