In [None]:
import egglib
import glob 
import pandas as pd
import numpy as np
import os
import seaborn as sns

Goal: generate a dataframe with the majority vote NLR annotations across each clade

In [None]:
# Gather the alignments in FASTA format.
# glob.glob() returns matches in arbitrary order, so the list is sorted to make the
# clade processing order (and the row order of every table built from it) repeatable.
directory='/global/scratch/users/chandlersutherland/e14/popgen/popgenome_test'
files = sorted(glob.glob(os.path.join(directory, "*.fas")))

Let's compute per-domain statistics. First, read in the domain annotations from the pan-NLRome. Then convert their coordinates to alignment indices and take the majority vote.

In [None]:
# Read the pan-NLRome domain annotation table. Column names are supplied explicitly
# (so the first line of the file is read as data), and malformed lines are skipped
# instead of aborting the read.
DOMAIN_TSV = '/global/scratch/users/chandlersutherland/e14/popgen/panl-nrlome-nlr-domains.tsv'
DOMAIN_COLUMNS = ['Gene', 'read', 'length', 'source', 'code', 'domain', 'start', 'stop', 'p', 'unknown', 'date', 'interpro', 'domain_2']
try:
    # `on_bad_lines` replaces `error_bad_lines`, which was removed in pandas 2.0.
    tbl = pd.read_table(DOMAIN_TSV, on_bad_lines='skip', names=DOMAIN_COLUMNS)
except TypeError:
    # Older pandas releases do not accept `on_bad_lines`; fall back to the legacy flag.
    tbl = pd.read_table(DOMAIN_TSV, error_bad_lines=False, names=DOMAIN_COLUMNS)

In [None]:
# Survey: which domain annotations does the table hold for the genes of each clade?
for fasta in files:
    clade = fasta.split('/')[-1].split('.')[0]
    aln = egglib.io.from_fasta(fasta, alphabet=egglib.alphabets.DNA)

    # Rewrite every alignment sample name into the gene-ID format of the annotation table.
    names = [
        (
            # ATHALIANA samples: using just the primary transcript
            sample_name.replace('ATHALIANA_', '6909|').replace('_1', '') + '.1'
            if sample_name.startswith('ATHALIANA')
            else sample_name.replace('_', '|')
        )
        for sample_name in (aln.get_sample(i).name for i in range(aln.ns))
    ]

    per_gene = tbl[tbl['Gene'].isin(names)]
    print(clade, ':', per_gene['domain'].unique())

Define a few functions that will 1. get the names of the genes in each clade from the alignment file, 2. get the annotations for each gene, 3. map each alignment column to the residue index of the ungapped protein, 4. convert the annotation coordinates to alignment coordinates, 5. calculate a majority vote across the protein alignment. 

In [None]:
# Read an alignment FASTA and return its sample names as annotation-table gene IDs
def get_names(fasta):
    """Return the gene names found in the alignment stored at `fasta`.

    The file is loaded with egglib using the DNA alphabet. Each sample name is
    rewritten as follows:

    * names starting with 'ATHALIANA': 'ATHALIANA_' becomes '6909|', every '_1'
      is removed, and '.1' is appended (only the primary transcript is used);
    * every other name: each '_' becomes '|'.

    The returned list has one entry per sample, in alignment order.
    """
    aln = egglib.io.from_fasta(fasta, alphabet=egglib.alphabets.DNA)

    def _to_gene_id(sample_name):
        if not sample_name.startswith('ATHALIANA'):
            return sample_name.replace('_', '|')
        # using just the primary transcript
        return sample_name.replace('ATHALIANA_', '6909|').replace('_1', '') + '.1'

    return [_to_gene_id(aln.get_sample(i).name) for i in range(aln.ns)]

# Restrict the annotation table to a set of gene names
def get_annotation(df, names):
    """Return the rows of `df` whose 'Gene' value is contained in `names`.

    The original row labels and column layout of `df` are kept.
    """
    wanted = df['Gene'].isin(names)
    return df.loc[wanted]

# Map every column of an aligned (gapped) amino acid sequence to a residue count
def reassign_index(aa_sequence):
    """Return, for each alignment column, the number of residues seen so far.

    Walking the sequence left to right, every non-gap character increases a
    running counter by one and receives the new value, so the first residue is
    numbered 1. A gap ('-') receives the current counter unchanged: it shares
    the number of the residue before it, and gaps before the first residue
    are numbered 0.

    The result is a list with one integer per position of `aa_sequence`.
    """
    residues_seen = 0
    real_index = []
    for aa in aa_sequence:
        if aa != '-':
            residues_seen += 1
        real_index.append(residues_seen)
    return real_index

# Translate a sequence coordinate into positions of the alignment index
def convert_coord(new_index, coord):
    """Return every 0-based position of `new_index` whose value equals `coord + 1`.

    `new_index` is a per-column list such as the one built by reassign_index.
    The result is a list of positions; it is empty when no entry equals
    `coord + 1`.

    NOTE(review): the lookup targets `coord + 1` rather than `coord`; the
    original comment calls this a conversion to 1-based indexing -- confirm
    this matches the coordinate convention of the annotation source.
    """
    target = coord + 1
    positions = []
    for position, residue_number in enumerate(new_index):
        if residue_number == target:
            positions.append(position)
    return positions

# Takes a list of genes, a df from the domain annotation file and the clade's alignment,
# and outputs the majority vote alignment coordinates for that domain
def majority_vote(gene_list, df, fasta):
    """Return the most common alignment start/end of a domain across one clade.

    Parameters
    ----------
    gene_list : list
        Gene names of the clade. Only its length is used: sample ``i`` of the
        alignment is processed for every ``i`` in ``range(len(gene_list))``, so
        it should come from the same file (the cells in this notebook pass
        ``get_names(fasta)``).
    df : pandas.DataFrame
        Annotation rows with 'Gene', 'start' and 'stop' columns.
    fasta : str
        Path of the alignment; it is read with the DNA alphabet, converted to
        codons and translated.

    Returns
    -------
    list
        ``[aln_start, aln_end]``: the modes of the per-gene converted start and
        end positions (the smallest value when several are tied, as pandas
        sorts the modes).

    Raises
    ------
    ValueError
        If no gene of the alignment has exactly one annotation row in `df`.
    KeyError
        If every converted coordinate is missing, so no mode exists.

    Genes with zero or several annotation rows are skipped. A coordinate that
    convert_coord cannot place yields an empty list, which becomes NaN below and
    is ignored by ``mode()``.
    """
    aln = egglib.io.from_fasta(fasta, alphabet=egglib.alphabets.DNA)
    aln.to_codons()
    prot = egglib.tools.translate(aln)

    # Collect one record per usable gene and build the frame once afterwards:
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    records = []
    for i in range(len(gene_list)):
        # load sequence and name from the pal2nal alignment
        aa_seq = prot.get_sequence(i)
        name = prot.get_sample(i).name  # same accessor as get_names()

        if name.startswith('ATHALIANA'):
            # using just the primary transcript
            clean = name.replace('ATHALIANA_', '6909|').replace('_1', '') + '.1'
        else:
            clean = name.replace('_', '|')

        # Currently skipping genes with double annotations (and genes with none).
        rows = df.loc[df['Gene'] == clean]
        if len(rows) != 1:
            continue

        new_index = reassign_index(aa_seq)
        records.append({
            'aln_start': convert_coord(new_index, rows['start'].item()),
            'aln_end': convert_coord(new_index, rows['stop'].item()),
        })

    if not records:
        raise ValueError('no gene in ' + str(fasta) + ' has exactly one annotation row')

    results = pd.DataFrame(records, columns=['aln_start', 'aln_end'])
    # convert_coord returns every matching column; keep the first one per gene
    results['aln_start'] = results['aln_start'].str[0]
    results['aln_end'] = results['aln_end'].str[0]

    modes = results.mode(axis=0)
    return [modes.loc[0, 'aln_start'], modes.loc[0, 'aln_end']]

In [None]:
nbarc_tbl=tbl[tbl['domain']=='NB-ARC domain']

# Calculate the majority vote across each alignment for the NB-ARC coordinates.
# Rows are gathered in a list and turned into a frame once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every iteration.
nbarc_records=[]
for fasta in files:
    clade=fasta.split('/')[-1].split('.')[0]
    gene_list=get_names(fasta)
    df=get_annotation(nbarc_tbl, gene_list)

    try:
        nbcoords=majority_vote(gene_list, df, fasta)
    except Exception as err:
        # Best effort: report the clade together with the reason and carry on.
        # `Exception` (not a bare except) lets a keyboard interrupt stop the loop.
        print(clade, ' could not compute nbarc coords', repr(err))
        continue
    nbarc_records.append({'clade':clade, 'start':nbcoords[0], 'end':nbcoords[1], 'domain':'nbarc'})

nbarc_aa_coords=pd.DataFrame(nbarc_records, columns=['clade', 'start', 'end', 'domain'])
nbarc_aa_coords

What are the issues with Int12974_336_433_R_14 and Int12974_336_426_R_8?

In [None]:
# Look up the NB-ARC annotation rows for the two clades in question.
problem_clades=['Int12974_336_433_R_14', 'Int12974_336_426_R_8']
a=get_names(os.path.join(directory, problem_clades[0]+'.pal2nal.fas'))
b=get_names(os.path.join(directory, problem_clades[1]+'.pal2nal.fas'))

# A cell only renders its last expression, so both lookups are stacked into a single
# frame (labelled by clade) instead of being evaluated one after the other.
pd.concat([get_annotation(nbarc_tbl, a), get_annotation(nbarc_tbl, b)], keys=problem_clades)

They each have double the NBARC domains. No Col-0 reps, so moving on for now. 

In [None]:
tir_tbl=tbl[tbl['domain']=='TIR domain']

# Majority-vote TIR coordinates per clade. Rows are gathered in a list and turned
# into a frame once: DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every iteration.
tir_records=[]
for fasta in files:
    clade=fasta.split('/')[-1].split('.')[0]
    gene_list=get_names(fasta)
    df=get_annotation(tir_tbl, gene_list)

    try:
        tircoords=majority_vote(gene_list, df, fasta)
    except Exception as err:
        # Best effort: report the clade together with the reason and carry on.
        # `Exception` (not a bare except) lets a keyboard interrupt stop the loop.
        print(clade, ' could not compute tir coords', repr(err))
        continue
    tir_records.append({'clade':clade, 'start':tircoords[0], 'end':tircoords[1], 'domain':'tir'})

tir_aa_coords=pd.DataFrame(tir_records, columns=['clade', 'start', 'end', 'domain'])
tir_aa_coords

In [None]:
cc_tbl=tbl[tbl['domain']=='maj_vote_coils_paircoils_nlrparser']

# Majority-vote coiled-coil coordinates per clade. Rows are gathered in a list and
# turned into a frame once: DataFrame.append was removed in pandas 2.0 and copied
# the whole frame on every iteration.
cc_records=[]
for fasta in files:
    clade=fasta.split('/')[-1].split('.')[0]
    gene_list=get_names(fasta)
    df=get_annotation(cc_tbl, gene_list)

    try:
        cccoords=majority_vote(gene_list, df, fasta)
    except Exception as err:
        # Report before skipping: previously `continue` ran first, so the message
        # below could never be printed and failures went unnoticed.
        print(clade, ' could not compute cc coords', repr(err))
        continue
    cc_records.append({'clade':clade, 'start':cccoords[0], 'end':cccoords[1], 'domain':'cc'})

cc_aa_coords=pd.DataFrame(cc_records, columns=['clade', 'start', 'end', 'domain'])
cc_aa_coords

Daniil ran LRRpredictor on the panNLRome NLRs. Use this to get a better annotation across the clades than the PFAM.

In [None]:
# Read in the pan-NLRome LRRpredictor output and convert it to the Gene/start/stop
# layout used by get_annotation() and majority_vote().
pan_lrr_pred=pd.read_csv('/global/scratch/users/chandlersutherland/e14/popgen/Athaliana_panNLRome_lrrpred.txt', sep='\t', index_col=False)
# Drop rows whose '#Prot' field is the literal header text. `.copy()` makes the result
# an independent frame, so the column assignment below does not raise
# SettingWithCopyWarning.
pan_lrr_pred=pan_lrr_pred[pan_lrr_pred['#Prot']!='#Prot'].copy()
pan_lrr_pred['pos'] = pd.to_numeric(pan_lrr_pred['pos'])
# Rough LRR extent per protein: smallest and largest predicted position.
rough=pan_lrr_pred.groupby('#Prot').pos.agg(['min', 'max']).reset_index()
to_test=rough.rename(columns={'#Prot':'Gene', 'min':'start', 'max':'stop'})
# regex=False: these are literal substitutions, and the default of `regex`
# differs between pandas versions.
to_test['Gene']=to_test['Gene'].str.replace('_', '|', regex=False)

# Read in Col-0 and clean up.
lrr_pred=pd.read_csv('/global/scratch/users/chandlersutherland/e14/popgen/col-0-nlrome.txt', sep='\t')
# NOTE(review): both bounds come from the `start` column, so 'stop' is the largest
# start value rather than an end coordinate -- confirm this is intended.
col=lrr_pred.groupby('annotation').start.agg(['min', 'max']).reset_index()
col=col.rename(columns={'annotation':'Gene', 'min':'start', 'max':'stop'})
col['Gene']=col['Gene'].str.replace('Athaliana_', '6909|', regex=False).str.replace('_1', '.1', regex=False)
# keep only the gene IDs that end in '.1'
col0=col[col['Gene'].str.endswith('.1')]

lrr_tbl=pd.concat([to_test, col0])

In [None]:
# Majority-vote LRR coordinates per clade. Rows are gathered in a list and turned
# into a frame once: DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every iteration.
lrr_records=[]
for fasta in files:
    clade=fasta.split('/')[-1].split('.')[0]
    gene_list=get_names(fasta)
    df=get_annotation(lrr_tbl, gene_list)

    try:
        lrrcoords=majority_vote(gene_list, df, fasta)
    except Exception as err:
        # Best effort: report the clade together with the reason and carry on.
        # `Exception` (not a bare except) lets a keyboard interrupt stop the loop.
        print(clade, ' could not compute lrr coords', repr(err))
        continue
    lrr_records.append({'clade':clade, 'start':lrrcoords[0], 'end':lrrcoords[1], 'domain':'lrr'})

lrr_aa_coords=pd.DataFrame(lrr_records, columns=['clade', 'start', 'end', 'domain'])
lrr_aa_coords

Add coordinates for the entire cds to calculate gene-wide statistics 

In [None]:
# One row per clade spanning the whole alignment: start 0, end = the `ls` value of
# the codon-converted alignment. Rows are gathered in a list and turned into a frame
# once, because DataFrame.append was removed in pandas 2.0.
cds_records=[]
for fasta in files:
    clade=fasta.split('/')[-1].split('.')[0]
    aln=egglib.io.from_fasta(fasta, alphabet=egglib.alphabets.DNA)
    codon=egglib.tools.to_codons(aln)
    cds_records.append({'clade':clade, 'start':0, 'end':codon.ls, 'domain':'cds'})

cds_coords=pd.DataFrame(cds_records, columns=['clade', 'start', 'end', 'domain'])
cds_coords

In [None]:
# Stack the per-domain tables into one annotation table. ignore_index=True gives the
# combined frame a fresh 0..n-1 index instead of repeating each input's row labels.
nlr_annotation=pd.concat([nbarc_aa_coords, cds_coords, lrr_aa_coords, tir_aa_coords, cc_aa_coords], ignore_index=True)
nlr_annotation=nlr_annotation[['clade', 'start', 'end', 'domain']]
nlr_annotation.to_csv('/global/scratch/users/chandlersutherland/e14/popgen/nlr_aa_annotation.csv')
# Shown last so the cell actually renders the table; as a mid-cell expression it was
# silently discarded.
nlr_annotation