# Yarrowia Annotations from KEGG
The data in `data/kegg_annotations.txt` was copied and pasted from the following KEGG search result pages:
[page 1](https://www.genome.jp/dbget-bin/www_bfind_sub?mode=bfind&max_hit=1000&locale=en&serv=gn&dbkey=yli&keywords=YALI0&page=1)
[page 2](https://www.genome.jp/dbget-bin/www_bfind_sub?mode=bfind&max_hit=1000&locale=en&serv=gn&dbkey=yli&keywords=YALI0&page=2)
[page 3](https://www.genome.jp/dbget-bin/www_bfind_sub?mode=bfind&max_hit=1000&locale=en&serv=gn&dbkey=yli&keywords=YALI0&page=3)
[page 4](https://www.genome.jp/dbget-bin/www_bfind_sub?mode=bfind&max_hit=1000&locale=en&serv=gn&dbkey=yli&keywords=YALI0&page=4)
[page 5](https://www.genome.jp/dbget-bin/www_bfind_sub?mode=bfind&max_hit=1000&locale=en&serv=gn&dbkey=yli&keywords=YALI0&page=5)
[page 6](https://www.genome.jp/dbget-bin/www_bfind_sub?mode=bfind&max_hit=1000&locale=en&serv=gn&dbkey=yli&keywords=YALI0&page=6)
[page 7](https://www.genome.jp/dbget-bin/www_bfind_sub?mode=bfind&max_hit=1000&locale=en&serv=gn&dbkey=yli&keywords=YALI0&page=7)
[page 8](https://www.genome.jp/dbget-bin/www_bfind_sub?mode=bfind&max_hit=1000&locale=en&serv=gn&dbkey=yli&keywords=YALI0&page=8)

### Load imports

In [1]:
import pandas as pd

### Load KEGG text file, and process it to a more usable format

In [2]:
# Load the raw text copied from the KEGG result pages. With no header and a
# tab separator, the text of each line is available in column 0.
kegg_raw = pd.read_csv('../data/kegg_annotations.txt', sep='\t', header=None)

# Keep only the odd-indexed rows (1, 3, 5, ...) and drop the even-indexed
# ones. NOTE(review): this assumes the file strictly alternates between an
# entry line and an "<annotation> | (RefSeq) <id>" line -- confirm against
# the data file.
kegg_lines = kegg_raw.loc[kegg_raw.index % 2 != 0, 0]

# Split each line at the first '|' only (n=1) so the result always has
# exactly two columns, even if the text after it contains another '|'.
kegg_df = kegg_lines.str.split('|', n=1, expand=True)
kegg_df.columns = ['kegg_annotation', 'yali0_id']

# Remove the literal '(RefSeq) ' tag, then trim surrounding whitespace.
# regex=False is spelled out because older pandas versions treat the pattern
# as a regular expression by default, in which case the parentheses form a
# group and '()' would be left behind in every ID.
# The kegg_annotation column is intentionally left untouched here.
kegg_df['yali0_id'] = (
    kegg_df['yali0_id']
    .str.replace('(RefSeq) ', '', regex=False)
    .str.strip()
)

kegg_df.head(20)


Unnamed: 0,kegg_annotation,yali0_id
1,no KO assigned,YALI0A00110p
3,K03283 heat shock 70kDa protein 1/2/6/8,YALI0A00132p
5,no KO assigned,YALI0A00154p
7,no KO assigned,YALI0A00176p
9,no KO assigned,YALI0A00198p
11,no KO assigned,YALI0A00212p
13,K14219 tRNA Arg,tRNA-Arg
15,K01870 isoleucyl-tRNA synthetase [EC:6.1.1.5],YALI0A00264p
17,K06944 developmentally-regulated GTP-binding p...,YALI0A00286p
19,K18042 serine/threonine/tyrosine-interacting p...,YALI0A00330p


### Define a function to convert a YALI0 gene ID to a KEGG annotation

In [3]:
def yali0_to_kegg_annotation(yali0_id):
    """Look up the KEGG annotation for a YALI0 identifier.

    The lookup is performed against the notebook-level ``kegg_df`` table,
    matching on its ``yali0_id`` column and reading its ``kegg_annotation``
    column.

    Parameters
    ----------
    yali0_id : str
        A gene ID ending in 'g' (e.g. 'YALI0A00110g') or a protein ID ending
        in 'p' (e.g. 'YALI0A00110p'). Gene IDs are converted to the protein
        form before the lookup; any other identifier is looked up unchanged.

    Returns
    -------
    str
        'gene not in KEGG' if the ID is missing or has no matching row,
        'unknown function' if the matching row says 'no KO assigned',
        otherwise the annotation text with surrounding whitespace removed.
        If several rows match, the first one is used.
    """
    # a missing ID (e.g. NaN coming from a DataFrame column) cannot be looked up
    if not isinstance(yali0_id, str):
        return 'gene not in KEGG'

    # convert a gene ID to a protein ID (g = gene, p = protein): swap only the
    # trailing suffix of YALI0 IDs so that other identifiers containing a 'g'
    # (e.g. 'tRNA-Arg') are not altered
    if yali0_id.startswith('YALI0') and yali0_id.endswith('g'):
        yali0_id = yali0_id[:-1] + 'p'

    # annotations of all rows whose ID matches
    matches = kegg_df.loc[kegg_df['yali0_id'] == yali0_id, 'kegg_annotation']

    # no matching row: the gene is not listed in the KEGG table
    if matches.empty:
        return 'gene not in KEGG'

    kegg_annotation = matches.iloc[0]

    # the raw text may carry leading/trailing whitespace; normalise it so the
    # comparison below does not depend on it and the returned value is clean
    if isinstance(kegg_annotation, str):
        kegg_annotation = kegg_annotation.strip()

    if kegg_annotation == 'no KO assigned':
        return 'unknown function'

    return kegg_annotation

# quick check of the lookup on two example protein IDs
for example_id in ('YALI0A00110p', 'YALI0A00264p'):
    print(yali0_to_kegg_annotation(example_id))

unknown function
K01870 isoleucyl-tRNA synthetase [EC:6.1.1.5] 


### Load Yarrowia JGI to YALI0 mapping

In [4]:
# Read the previously established annotation table (JGI IDs alongside the
# corresponding NCBI / YALI0 IDs) and display it
jgi_to_ncbi_csv = '../data/yarrowia_annotations.csv'
yarrowia_annotations = pd.read_csv(jgi_to_ncbi_csv)
yarrowia_annotations

Unnamed: 0,JGI ID,JGI start,JGI end,NCBI start,NCBI end,NCBI ID,Column1,NCBI
0,jgi.p|Yarli1|64471,2659,5277,2659,5277,YALI0_A00110g,YALI0A00110g,YALI0A00110g
1,jgi.p|Yarli1|64472,7045,8880,7045,8880,YALI0_A00132g,YALI0A00132g,YALI0A00132g
2,jgi.p|Yarli1|64473,11559,12653,11559,12653,YALI0_A00154g,YALI0A00154g,YALI0A00154g
3,jgi.p|Yarli1|64474,15861,18419,15861,18419,YALI0_A00176g,YALI0A00176g,YALI0A00176g
4,jgi.p|Yarli1|64475,20087,20857,20087,20857,YALI0_A00198g,YALI0A00198g,YALI0A00198g
...,...,...,...,...,...,...,...,...
6442,jgi.p|Yarli1|70913,3967026,3973618,3967026,3973618,YALI0_F32043g,YALI0F32043g,YALI0F32043g
6443,jgi.p|Yarli1|70914,3979479,3981272,3979479,3981272,YALI0_F32065g,YALI0F32065g,YALI0F32065g
6444,jgi.p|Yarli1|70915,3984293,3985643,3984293,3985643,YALI0_F32131g,YALI0F32131g,YALI0F32131g
6445,jgi.p|Yarli1|70916,3985832,3989074,3985832,3989074,YALI0_F32153g,YALI0F32153g,YALI0F32153g


### Add KEGG annotations to dataframe

In [5]:
# Look up the KEGG annotation of every gene from the ID in its 'NCBI' column,
# keeping the results in the same order as the rows of the table
kegg_annotations = [
    yali0_to_kegg_annotation(ncbi_id)
    for ncbi_id in yarrowia_annotations['NCBI']
]

# attach the annotations as a new column and display the table
yarrowia_annotations['kegg_annotation'] = kegg_annotations

yarrowia_annotations

Unnamed: 0,JGI ID,JGI start,JGI end,NCBI start,NCBI end,NCBI ID,Column1,NCBI,kegg_annotation
0,jgi.p|Yarli1|64471,2659,5277,2659,5277,YALI0_A00110g,YALI0A00110g,YALI0A00110g,unknown function
1,jgi.p|Yarli1|64472,7045,8880,7045,8880,YALI0_A00132g,YALI0A00132g,YALI0A00132g,K03283 heat shock 70kDa protein 1/2/6/8
2,jgi.p|Yarli1|64473,11559,12653,11559,12653,YALI0_A00154g,YALI0A00154g,YALI0A00154g,unknown function
3,jgi.p|Yarli1|64474,15861,18419,15861,18419,YALI0_A00176g,YALI0A00176g,YALI0A00176g,unknown function
4,jgi.p|Yarli1|64475,20087,20857,20087,20857,YALI0_A00198g,YALI0A00198g,YALI0A00198g,unknown function
...,...,...,...,...,...,...,...,...,...
6442,jgi.p|Yarli1|70913,3967026,3973618,3967026,3973618,YALI0_F32043g,YALI0F32043g,YALI0F32043g,K19844 GTPase-activating protein BEM2
6443,jgi.p|Yarli1|70914,3979479,3981272,3979479,3981272,YALI0_F32065g,YALI0F32065g,YALI0F32065g,unknown function
6444,jgi.p|Yarli1|70915,3984293,3985643,3984293,3985643,YALI0_F32131g,YALI0F32131g,YALI0F32131g,unknown function
6445,jgi.p|Yarli1|70916,3985832,3989074,3985832,3989074,YALI0_F32153g,YALI0F32153g,YALI0F32153g,K23358 transcription initiation factor TFIID s...


In [6]:
# Write the annotated table to the results folder as CSV, leaving out the
# DataFrame index
yarrowia_annotations.to_csv(
    path_or_buf='../results/yarrowia_annotations_kegg.csv',
    index=False,
)