# Yarrowia Annotations from UniProt

### Load imports

In [1]:
import pandas as pd

### Load uniprot data

In [2]:
# Read the tab-separated UniProt export into a DataFrame and preview the first rows
uniprot_tsv_path = '../data/uniprot_annotations.tsv'
uniprot_data = pd.read_csv(uniprot_tsv_path, sep='\t')
uniprot_data.head()

Unnamed: 0,Entry,Gene Names,Gene Ontology (molecular function)
0,P09230,XPR2 YALI0F31889g,serine-type endopeptidase activity [GO:0004252]
1,P34229,FAS1 YALI0B15059g,(3R)-hydroxymyristoyl-[acyl-carrier-protein] d...
2,Q6C093,UBC2 YALI0F26697g,ATP binding [GO:0005524]; proteasome binding [...
3,Q6C1X5,ARO1 YALI0F12639g,3-dehydroquinate dehydratase activity [GO:0003...
4,Q6C2F5,ATG9 YALI0F08349g,phospholipid scramblase activity [GO:0017128]


### Fix gene names so that they match the KEGG format

In [3]:
def extract_yali0_id(gene_names):
    """Return the YALI0 locus tag found in a UniProt 'Gene Names' entry.

    The entry is split on spaces and the last word containing 'YALI0' is kept,
    with any underscore removed (e.g. 'YALI0_A00110g' -> 'YALI0A00110g').

    Parameters
    ----------
    gene_names : str or missing value
        Raw content of one 'Gene Names' cell.

    Returns
    -------
    str or None
        The cleaned YALI0 id, or None when the cell is missing or contains no
        word with 'YALI0' in it. None (rather than an empty string) is used so
        that pandas' isna()/notna() recognise the entry as blank.
    """
    # missing cells come through as float NaN / None and cannot be split
    if not isinstance(gene_names, str):
        return None

    # keep only the words that carry a YALI0 locus tag
    yali0_words = [word for word in gene_names.split(' ') if 'YALI0' in word]
    if not yali0_words:
        return None

    # the last matching word wins; strip the underscore variant of the id
    return yali0_words[-1].replace('_', '')


# rewrite the whole column in one pass; re-running this cell is harmless
# because an already-cleaned id maps to itself
uniprot_data['Gene Names'] = uniprot_data['Gene Names'].map(extract_yali0_id)

uniprot_data.head(50)
    

Unnamed: 0,Entry,Gene Names,Gene Ontology (molecular function)
0,P09230,YALI0F31889g,serine-type endopeptidase activity [GO:0004252]
1,P34229,YALI0B15059g,(3R)-hydroxymyristoyl-[acyl-carrier-protein] d...
2,Q6C093,YALI0F26697g,ATP binding [GO:0005524]; proteasome binding [...
3,Q6C1X5,YALI0F12639g,3-dehydroquinate dehydratase activity [GO:0003...
4,Q6C2F5,YALI0F08349g,phospholipid scramblase activity [GO:0017128]
5,Q6C354,YALI0F02497g,"2 iron, 2 sulfur cluster binding [GO:0051537];..."
6,Q6C4M9,YALI0E25135g,ATP binding [GO:0005524]; MAP kinase activity ...
7,Q6C6R8,YALI0E06831g,peroxisome matrix targeting signal-1 binding [...
8,Q6C710,YALI0E04675g,histone H3K4 acetyltransferase activity [GO:00...
9,Q6C793,YALI0E02684g,2-methylcitrate synthase activity [GO:0050440]...


### Remove rows with blank 'Gene Names' or 'Gene Ontology (molecular function)'

In [4]:
# get the total number of genes in the uniprot dataset
total_genes = len(uniprot_data)

# A cell counts as blank when it is missing (NaN/None) OR holds an empty /
# whitespace-only string. notna() alone does not catch the string case, so a
# gene without a usable name would otherwise be counted as annotated.
gene_name_column = uniprot_data['Gene Names']
go_column = uniprot_data['Gene Ontology (molecular function)']

has_gene_name = gene_name_column.notna() & gene_name_column.astype(str).str.strip().ne('')
has_go_annotation = go_column.notna() & go_column.astype(str).str.strip().ne('')

# remove rows with blank 'Gene Names' or 'Gene Ontology (molecular function)'
uniprot_data = uniprot_data[has_gene_name & has_go_annotation]

# get the number of genes with annotations
uniprot_annotated_genes = len(uniprot_data)

print(f'Of the {total_genes} genes in the uniprot dataset, {uniprot_annotated_genes} have annotations')

# display the uniprot data
uniprot_data

Of the 6454 genes in the uniprot dataset, 4057 have annotations


Unnamed: 0,Entry,Gene Names,Gene Ontology (molecular function)
0,P09230,YALI0F31889g,serine-type endopeptidase activity [GO:0004252]
1,P34229,YALI0B15059g,(3R)-hydroxymyristoyl-[acyl-carrier-protein] d...
2,Q6C093,YALI0F26697g,ATP binding [GO:0005524]; proteasome binding [...
3,Q6C1X5,YALI0F12639g,3-dehydroquinate dehydratase activity [GO:0003...
4,Q6C2F5,YALI0F08349g,phospholipid scramblase activity [GO:0017128]
...,...,...,...
6448,W0TYP4,YALI0B03080g,catalytic activity [GO:0003824]
6449,W0TYP9,YALI0B09977g,protein-macromolecule adaptor activity [GO:003...
6450,W0TYQ0,YALI0B15722g,ATP binding [GO:0005524]; phosphorelay sensor ...
6451,W0TYQ3,YALI0A01602g,damaged DNA binding [GO:0003684]; single-stran...


### Make a function to convert YALI0 gene ids to uniprot annotations

In [5]:
def yali0_to_uniprot_annotation(yali0_id, annotations=None):
    """Look up the UniProt molecular-function annotation for a YALI0 gene id.

    Parameters
    ----------
    yali0_id : str
        Gene id to search for (e.g. 'YALI0A00110g'). It is matched as a
        literal substring of the 'Gene Names' column.
    annotations : pandas.DataFrame, optional
        Table with 'Gene Names' and 'Gene Ontology (molecular function)'
        columns. Defaults to the notebook-level ``uniprot_data`` frame.

    Returns
    -------
    str
        The 'Gene Ontology (molecular function)' value of the first matching
        row, or 'unknown function' when nothing matches or the id is
        empty / not a string.
    """
    if annotations is None:
        annotations = uniprot_data

    # an empty id is a substring of every name and a non-string id (e.g. NaN)
    # cannot be searched for; neither identifies a gene
    if not isinstance(yali0_id, str) or not yali0_id:
        return 'unknown function'

    # regex=False: treat the id literally instead of as a regular expression
    # na=False: rows with a missing gene name simply do not match
    is_match = annotations['Gene Names'].str.contains(yali0_id, regex=False, na=False)
    uniprot_annotations = annotations[is_match]

    # no matching row -> report the gene as unannotated
    if len(uniprot_annotations) == 0:
        return 'unknown function'

    # otherwise return the annotation of the first matching row
    return uniprot_annotations.iloc[0]['Gene Ontology (molecular function)']
    
# spot-check the lookup on a single gene id
yali0_to_uniprot_annotation('YALI0A00110g')

'oligopeptide transmembrane transporter activity [GO:0035673]'

### Load yarrowia KEGG annotations

In [6]:
# Read the KEGG annotation table (results/yarrowia_annotations_kegg.csv) and preview the first rows
kegg_annotations_path = '../results/yarrowia_annotations_kegg.csv'
yarrowia_annotations = pd.read_csv(kegg_annotations_path)
yarrowia_annotations.head()

Unnamed: 0,JGI ID,JGI start,JGI end,NCBI start,NCBI end,NCBI ID,Column1,NCBI,kegg_annotation
0,jgi.p|Yarli1|64471,2659,5277,2659,5277,YALI0_A00110g,YALI0A00110g,YALI0A00110g,unknown function
1,jgi.p|Yarli1|64472,7045,8880,7045,8880,YALI0_A00132g,YALI0A00132g,YALI0A00132g,K03283 heat shock 70kDa protein 1/2/6/8
2,jgi.p|Yarli1|64473,11559,12653,11559,12653,YALI0_A00154g,YALI0A00154g,YALI0A00154g,unknown function
3,jgi.p|Yarli1|64474,15861,18419,15861,18419,YALI0_A00176g,YALI0A00176g,YALI0A00176g,unknown function
4,jgi.p|Yarli1|64475,20087,20857,20087,20857,YALI0_A00198g,YALI0A00198g,YALI0A00198g,unknown function


### Add Uniprot annotations as a column to the KEGG annotations

In [7]:
# Look up the UniProt annotation for every gene in the KEGG table, using the
# YALI0 id stored in the 'NCBI' column and keeping the original row order
uniprot_annotation_list = [
    yali0_to_uniprot_annotation(ncbi_gene_id)
    for ncbi_gene_id in yarrowia_annotations['NCBI']
]

# attach the looked-up annotations as a new column
yarrowia_annotations['uniprot_annotation'] = uniprot_annotation_list

# show the augmented table
yarrowia_annotations

Unnamed: 0,JGI ID,JGI start,JGI end,NCBI start,NCBI end,NCBI ID,Column1,NCBI,kegg_annotation,uniprot_annotation
0,jgi.p|Yarli1|64471,2659,5277,2659,5277,YALI0_A00110g,YALI0A00110g,YALI0A00110g,unknown function,oligopeptide transmembrane transporter activit...
1,jgi.p|Yarli1|64472,7045,8880,7045,8880,YALI0_A00132g,YALI0A00132g,YALI0A00132g,K03283 heat shock 70kDa protein 1/2/6/8,ATP binding [GO:0005524]; ATP hydrolysis activ...
2,jgi.p|Yarli1|64473,11559,12653,11559,12653,YALI0_A00154g,YALI0A00154g,YALI0A00154g,unknown function,unknown function
3,jgi.p|Yarli1|64474,15861,18419,15861,18419,YALI0_A00176g,YALI0A00176g,YALI0A00176g,unknown function,unknown function
4,jgi.p|Yarli1|64475,20087,20857,20087,20857,YALI0_A00198g,YALI0A00198g,YALI0A00198g,unknown function,unknown function
...,...,...,...,...,...,...,...,...,...,...
6442,jgi.p|Yarli1|70913,3967026,3973618,3967026,3973618,YALI0_F32043g,YALI0F32043g,YALI0F32043g,K19844 GTPase-activating protein BEM2,GTPase activator activity [GO:0005096]; guanyl...
6443,jgi.p|Yarli1|70914,3979479,3981272,3979479,3981272,YALI0_F32065g,YALI0F32065g,YALI0F32065g,unknown function,unknown function
6444,jgi.p|Yarli1|70915,3984293,3985643,3984293,3985643,YALI0_F32131g,YALI0F32131g,YALI0F32131g,unknown function,triglyceride lipase activity [GO:0004806]
6445,jgi.p|Yarli1|70916,3985832,3989074,3985832,3989074,YALI0_F32153g,YALI0F32153g,YALI0F32153g,K23358 transcription initiation factor TFIID s...,histone acetyltransferase activity [GO:0004402...


In [8]:
# Write the combined KEGG + UniProt table to an Excel workbook, omitting the row index
output_xlsx_path = '../results/yarrowia_annotations_kegg_uniprot.xlsx'
yarrowia_annotations.to_excel(output_xlsx_path, index=False)

### Check annotation completeness

In [9]:
both_annotated_list = []
only_kegg_list = []
only_uniprot_list = []
neither_annotated_list = []

temp = []

# placeholder values that mean "no annotation" in each source
kegg_missing_labels = ('gene not in KEGG', 'unknown function')
uniprot_missing_label = 'unknown function'

# route every (has KEGG, has UniProt) combination to its bucket
annotation_buckets = {
    (True, True): both_annotated_list,
    (True, False): only_kegg_list,
    (False, True): only_uniprot_list,
    (False, False): neither_annotated_list,
}

# classify each gene by which sources annotate it
for _, row in yarrowia_annotations.iterrows():
    kegg_annotation = row['kegg_annotation']
    uniprot_annotation = row['uniprot_annotation']

    has_kegg_annotation = bool(kegg_annotation not in kegg_missing_labels)
    has_uniprot_annotation = bool(uniprot_annotation != uniprot_missing_label)

    annotation_buckets[(has_kegg_annotation, has_uniprot_annotation)].append(row)

# every gene lands in exactly one bucket, so the bucket sizes add up to the total
total_genes = sum(len(bucket) for bucket in annotation_buckets.values())
print(f'total number of genes: {total_genes}')

# report each bucket's size and its share of all genes
print(f'both_annotated_list: {len(both_annotated_list)} ({100 * len(both_annotated_list) / total_genes:.1f}%)')
print(f'only_kegg_list: {len(only_kegg_list)} ({100 * len(only_kegg_list) / total_genes:.1f}%)')
print(f'only_uniprot_list: {len(only_uniprot_list)} ({100 * len(only_uniprot_list) / total_genes:.1f}%)')
print(f'neither_annotated_list: {len(neither_annotated_list)} ({100 * len(neither_annotated_list) / total_genes:.1f}%)')

total number of genes: 6447
both_annotated_list: 2882 (44.7%)
only_kegg_list: 634 (9.8%)
only_uniprot_list: 1148 (17.8%)
neither_annotated_list: 1783 (27.7%)
