# Adding functional annotation from EggNOG-mapper



In [1]:
import pandas as pd
import numpy as np

In [None]:
# EggNOG-mapper annotation table: tab-separated, column names read from the
# first line (presumably one best hit per gene, going by the file name).
best_per_gene = '/Users/npapadop/Documents/data/pycnogonum/draft/out.emapper.best.annotations'
emapper = pd.read_csv(
    best_per_gene,
    header=0,
    sep='\t',
)

In [None]:
def parse_gene_id(x):
    '''Derives the gene ID from a query (transcript/protein) ID

    IDs containing 'PB' keep their first two dot-separated fields
    (e.g. 'PB.3.1.p1' -> 'PB.3'); IDs starting with 'r2', 'g' or 'at' keep
    only the part before the first dot.

    Parameters
    ----------
    x : str
        the query ID to parse

    Returns
    -------
    str
        the gene ID

    Raises
    ------
    ValueError
        if the ID matches none of the known formats
    '''
    if 'PB' in x:
        return '.'.join(x.split('.')[:2])
    if x.startswith(('r2', 'g', 'at')):
        return x.split('.')[0]
    # raise (not return) so unknown IDs cannot silently end up in the gene column
    raise ValueError('Unknown gene ID format')

Apply the parsing to each row of the EggNOG-mapper table to derive a gene ID from its query ID.

In [None]:
# Derive the gene ID of every row from its query ID
emapper['gene'] = emapper['#query'].map(parse_gene_id)

In [None]:
# Index the table by gene ID so annotations can be looked up with .loc[gene_id].
# Rebinding the result (rather than inplace=True) avoids mutating the frame
# object that earlier cells created.
emapper = emapper.set_index('gene')

In [6]:
# Sanity check: show everything stored for gene PB.3
emapper.loc['PB.3', :]

#query                                                    PB.3.1.p1
seed_ortholog                                       136037.KDR12978
evalue                                                          0.0
score                                                         570.0
eggNOG_OGs        COG0462@1|root,KOG1503@2759|Eukaryota,38F3E@33...
max_annot_lvl                                         33208|Metazoa
COG_category                                                     EF
Description       ribose phosphate diphosphokinase activity. It ...
Preferred_name                                              PRPSAP1
GOs               GO:0001501,GO:0002189,GO:0003674,GO:0004857,GO...
EC                                                                -
KEGG_ko                                                           -
KEGG_Pathway                                                      -
KEGG_Module                                                       -
KEGG_Reaction                                   

In [None]:
def parse_attributes(x):
    '''Parses a semi-colon separated string into a dictionary

    Each attribute is split on its first '=' only, so values that themselves
    contain '=' are kept whole. Empty segments (e.g. from a trailing or
    doubled ';') are ignored.

    Parameters
    ----------
    x : str
        a semicolon-separated string that holds attributes

    Returns
    -------
    dict
        maps every attribute key to its value

    Raises
    ------
    ValueError
        if a non-empty attribute has no '=' separator
    '''
    parsed = {}
    for attr in x.split(';'):
        if attr == '':
            continue
        key, separator, value = attr.partition('=')
        if not separator:
            raise ValueError(f'Attribute without "=": {attr!r}')
        parsed[key] = value
    return parsed

In [None]:
# Look up the preferred name stored for gene PB.1
emapper.loc['PB.1', 'Preferred_name']

'-'

In [None]:
def find_protein(gene_id, lookup): # expects a protein-coding gene as input
    '''Returns a display name for a gene

    Parameters
    ----------
    gene_id : str
        the gene ID to look up in the index of `lookup`
    lookup : pandas.DataFrame
        annotation table indexed by gene ID, with a 'Preferred_name' column
        in which '-' marks a missing name

    Returns
    -------
    str
        the first usable preferred name recorded for the gene, or
        'Uncharacterised protein <gene_id>' if the gene is absent or has no
        usable name
    '''
    if gene_id in lookup.index:
        # Selecting with a list always yields a Series, so a gene ID that
        # occurs several times in the index is handled like a unique one.
        candidates = lookup.loc[[gene_id], 'Preferred_name']
        for name in candidates:
            # the isinstance check skips missing values (NaN/None)
            if isinstance(name, str) and name != '-':
                return name
    return f'Uncharacterised protein {gene_id}'

In [None]:
gff_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted.gff'
named_loc = '/Volumes/scratch/pycnogonum/genome/submission/merged_sorted_named.gff'

# Copy the GFF at `gff_loc` to `named_loc`, appending a `name=` attribute to
# every gene and mRNA feature. Comment lines, lines mentioning tRNA and lines
# that already carry a `name=` attribute are written out unchanged.
# An mRNA gets the name of the most recently seen gene, so this relies on each
# gene line preceding its mRNAs (presumably guaranteed by the sorted input).
with open(gff_loc, 'r') as gff, open(named_loc, 'w') as named:
    gene = ''
    mRNA = ''
    name = None  # name of the most recent gene; None until the first gene is seen
    for line in gff:
        line = line.strip()
        conditions_skip = line.startswith('#') or 'tRNA' in line or 'name=' in line
        columns = line.split('\t')
        # Blank or otherwise malformed lines (not exactly nine columns) are
        # copied through untouched instead of breaking the column unpacking.
        if not conditions_skip and len(columns) == 9:
            feature_type = columns[2]
            # Only genes and mRNAs are renamed, so only their attributes are parsed.
            if feature_type in ('gene', 'mRNA'):
                attributes = parse_attributes(columns[8])
                # Keep the attribute column well-formed when the line does not
                # already end with the ';' separator.
                separator = '' if line.endswith(';') else ';'
                if feature_type == 'gene':
                    gene = attributes['ID']
                    name = find_protein(gene, emapper)
                    line = f'{line}{separator}name={name} (predicted)'
                else:
                    if name is None:
                        raise ValueError(f'mRNA feature found before any gene feature: {line}')
                    mRNA = attributes['ID']
                    # the isoform label is the last dot-separated field of the mRNA ID
                    isoform = mRNA.split('.')[-1]
                    line = f'{line}{separator}name={name} (predicted) isoform {isoform}'

        named.write(line + '\n')