# Setup

In [4]:
import os
import re
import gzip
import json
from collections import Counter, defaultdict
import sqlite3

import numpy as np
import pandas as pd
import matplotlib as plt
from lxml import etree

from Bio import SeqIO

from IPython.display import display

In [5]:
# Based on: https://stackoverflow.com/questions/12160418/why-is-lxml-etree-iterparse-eating-up-all-my-memory
def etree_fast_iter(context, func, func_args = [], func_kwargs = {}, max_elements = None):
    """
    http://lxml.de/parsing.html#modifying-the-tree
    Based on Liza Daly's fast_iter
    http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
    See also http://effbot.org/zone/element-iterparse.htm
    """
    for i, (event, elem) in enumerate(context):
        func(i, event, elem, *func_args, **func_kwargs)
        # It's safe to call clear() here because no descendants will be
        # accessed
        elem.clear()
        # Also eliminate now-empty references from the root node to elem
        for ancestor in elem.xpath('ancestor-or-self::*'):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]
        if max_elements is not None and i >= max_elements - 1:
            break
    del context

# Create a dataset from Uniref90

## Parse CAFA's GO annotations meta data

> wget https://www.biofunctionprediction.org/cafa-targets/cafa4ontologies.zip

> mkdir cafa4ontologies

> unzip cafa4ontologies.zip -d cafa4ontologies/

In [6]:
with open('/cs/phd/nadavb/cafa_project/data/cafa4ontologies/go.txt', 'r') as f:
    raw_go_meta = f.read()
    
print(raw_go_meta[:2500])

format-version: 1.2
data-version: releases/2019-10-07
subsetdef: gocheck_do_not_annotate "Term not to be used for direct annotation"
subsetdef: gocheck_do_not_manually_annotate "Term not to be used for direct manual annotation"
subsetdef: goslim_agr "AGR slim"
subsetdef: goslim_aspergillus "Aspergillus GO slim"
subsetdef: goslim_candida "Candida GO slim"
subsetdef: goslim_chembl "ChEMBL protein targets summary"
subsetdef: goslim_flybase_ribbon "FlyBase Drosophila GO ribbon slim"
subsetdef: goslim_generic "Generic GO slim"
subsetdef: goslim_metagenomics "Metagenomics GO slim"
subsetdef: goslim_mouse "Mouse GO slim"
subsetdef: goslim_pir "PIR GO slim"
subsetdef: goslim_plant "Plant GO slim"
subsetdef: goslim_pombe "Fission yeast GO slim"
subsetdef: goslim_synapse "synapse GO slim"
subsetdef: goslim_yeast "Yeast GO slim"
synonymtypedef: syngo_official_label "label approved by the SynGO project"
synonymtypedef: systematic_synonym "Systematic synonym" EXACT
default-namespace: gene_ontology


In [7]:
FIELDS = ['id', 'name', 'namespace', 'def', 'is_a', 'synonym', 'alt_id', 'subset', 'is_obsolete', 'xref', \
        'relationship', 'intersection_of', 'disjoint_from', 'consider', 'comment', 'replaced_by', 'created_by', \
        'creation_date', 'property_value']
LIST_FIELDS = {'synonym', 'alt_id', 'subset', 'is_a', 'xref', 'relationship', 'disjoint_from', 'intersection_of', \
        'consider', 'property_value'}

GO_ANNOTATION_PATTERN = re.compile(r'\[Term\]\n((?:\w+\: .*\n?)+)')
FIELD_LINE_PATTERN = re.compile(r'(\w+)\: (.*)')

go_annotations_meta = []

for match in GO_ANNOTATION_PATTERN.finditer(raw_go_meta):
    
    raw_go_annotation = match.group(1)
    go_annotation = {field: [] for field in LIST_FIELDS}
    
    for line in raw_go_annotation.splitlines():
        
        (field, value), = FIELD_LINE_PATTERN.findall(line)
        assert field in FIELDS, (field, raw_go_annotation)
        
        if field in LIST_FIELDS:
            go_annotation[field].append(value)
        else:
            assert field not in go_annotation, (field, raw_go_annotation)
            go_annotation[field] = value
    
    go_annotations_meta.append(go_annotation)

go_annotations_meta = pd.DataFrame(go_annotations_meta, columns = FIELDS)
go_annotations_meta['is_obsolete'] = go_annotations_meta['is_obsolete'].fillna(False)
assert go_annotations_meta['id'].is_unique
go_annotations_meta.set_index('id', drop = True, inplace = True)
go_annotations_meta.insert(0, 'index', np.arange(len(go_annotations_meta)))
display(go_annotations_meta)

Unnamed: 0_level_0,index,name,namespace,def,is_a,synonym,alt_id,subset,is_obsolete,xref,relationship,intersection_of,disjoint_from,consider,comment,replaced_by,created_by,creation_date,property_value
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
GO:0000001,0,mitochondrion inheritance,biological_process,"""The distribution of mitochondria, including t...","[GO:0048308 ! organelle inheritance, GO:004831...","[""mitochondrial inheritance"" EXACT []]",[],[],False,[],[],[],[],[],,,,,[]
GO:0000002,1,mitochondrial genome maintenance,biological_process,"""The maintenance of the structure and integrit...",[GO:0007005 ! mitochondrion organization],[],[],[],False,[],[],[],[],[],,,,,[]
GO:0000003,2,reproduction,biological_process,"""The production of new individuals that contai...",[GO:0008150 ! biological_process],"[""reproductive physiological process"" EXACT []]","[GO:0019952, GO:0050876]","[goslim_agr, goslim_chembl, goslim_flybase_rib...",False,[Wikipedia:Reproduction],[],[],[GO:0044848 ! biological phase],[],,,,,[]
GO:0000005,3,obsolete ribosomal chaperone activity,molecular_function,"""OBSOLETE. Assists in the correct assembly of ...",[],"[""ribosomal chaperone activity"" EXACT []]",[],[],true,[],[],[],[],"[GO:0042254, GO:0044183, GO:0051082]",This term was made obsolete because it refers ...,,,,[]
GO:0000006,4,high-affinity zinc transmembrane transporter a...,molecular_function,"""Enables the transfer of zinc ions (Zn2+) from...",[GO:0005385 ! zinc ion transmembrane transport...,"[""high affinity zinc uptake transmembrane tran...",[],[],False,[],[],[],[],[],,,,,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GO:2001313,47370,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological_process,"""The chemical reactions and pathways involving...","[GO:0006040 ! amino sugar metabolic process, G...","[""UDP-4-deoxy-4-formamido-beta-L-arabinopyrano...",[],[],False,[],[],[],[],[],,,pr,2012-03-22T01:19:54Z,[]
GO:2001314,47371,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological_process,"""The chemical reactions and pathways resulting...",[GO:0009227 ! nucleotide-sugar catabolic proce...,"[""UDP-4-deoxy-4-formamido-beta-L-arabinopyrano...",[],[],False,[],[],[],[],[],,,pr,2012-03-22T01:20:01Z,[]
GO:2001315,47372,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological_process,"""The chemical reactions and pathways resulting...",[GO:0009226 ! nucleotide-sugar biosynthetic pr...,"[""UDP-4-deoxy-4-formamido-beta-L-arabinopyrano...",[],[],False,[],[],[],[],[],,,pr,2012-03-22T01:20:05Z,[]
GO:2001316,47373,kojic acid metabolic process,biological_process,"""The chemical reactions and pathways involving...",[GO:0034308 ! primary alcohol metabolic proces...,"[""5-hydroxy-2-(hydroxymethyl)-4H-pyran-4-one m...",[],[],False,[],[],[],[],[],,,rfoulger,2012-04-18T09:22:42Z,[]


In [8]:
go_annotations_meta['direct_children'] = [set() for _ in range(len(go_annotations_meta))]
go_annotations_meta['direct_parents'] = [set() for _ in range(len(go_annotations_meta))]

for go_id, go_annotation in go_annotations_meta.iterrows():
    for raw_is_a in go_annotation['is_a']:
        parent_id, parent_name = raw_is_a.split(' ! ')
        parent_go_annotation = go_annotations_meta.loc[parent_id]
        assert parent_go_annotation['name'] == parent_name
        go_annotation['direct_parents'].add(parent_id)
        parent_go_annotation['direct_children'].add(go_id)
        
display(go_annotations_meta)

Unnamed: 0_level_0,index,name,namespace,def,is_a,synonym,alt_id,subset,is_obsolete,xref,...,intersection_of,disjoint_from,consider,comment,replaced_by,created_by,creation_date,property_value,direct_children,direct_parents
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GO:0000001,0,mitochondrion inheritance,biological_process,"""The distribution of mitochondria, including t...","[GO:0048308 ! organelle inheritance, GO:004831...","[""mitochondrial inheritance"" EXACT []]",[],[],False,[],...,[],[],[],,,,,[],{},"{GO:0048308, GO:0048311}"
GO:0000002,1,mitochondrial genome maintenance,biological_process,"""The maintenance of the structure and integrit...",[GO:0007005 ! mitochondrion organization],[],[],[],False,[],...,[],[],[],,,,,[],{GO:0033955},{GO:0007005}
GO:0000003,2,reproduction,biological_process,"""The production of new individuals that contai...",[GO:0008150 ! biological_process],"[""reproductive physiological process"" EXACT []]","[GO:0019952, GO:0050876]","[goslim_agr, goslim_chembl, goslim_flybase_rib...",False,[Wikipedia:Reproduction],...,[],[GO:0044848 ! biological phase],[],,,,,[],"{GO:0032504, GO:0032505, GO:0061887, GO:001995...",{GO:0008150}
GO:0000005,3,obsolete ribosomal chaperone activity,molecular_function,"""OBSOLETE. Assists in the correct assembly of ...",[],"[""ribosomal chaperone activity"" EXACT []]",[],[],true,[],...,[],[],"[GO:0042254, GO:0044183, GO:0051082]",This term was made obsolete because it refers ...,,,,[],{},{}
GO:0000006,4,high-affinity zinc transmembrane transporter a...,molecular_function,"""Enables the transfer of zinc ions (Zn2+) from...",[GO:0005385 ! zinc ion transmembrane transport...,"[""high affinity zinc uptake transmembrane tran...",[],[],False,[],...,[],[],[],,,,,[],{},{GO:0005385}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GO:2001313,47370,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological_process,"""The chemical reactions and pathways involving...","[GO:0006040 ! amino sugar metabolic process, G...","[""UDP-4-deoxy-4-formamido-beta-L-arabinopyrano...",[],[],False,[],...,[],[],[],,,pr,2012-03-22T01:19:54Z,[],"{GO:2001315, GO:2001314}","{GO:0006040, GO:0006793, GO:0009225}"
GO:2001314,47371,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological_process,"""The chemical reactions and pathways resulting...",[GO:0009227 ! nucleotide-sugar catabolic proce...,"[""UDP-4-deoxy-4-formamido-beta-L-arabinopyrano...",[],[],False,[],...,[],[],[],,,pr,2012-03-22T01:20:01Z,[],{},"{GO:0009227, GO:0046348, GO:2001313}"
GO:2001315,47372,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological_process,"""The chemical reactions and pathways resulting...",[GO:0009226 ! nucleotide-sugar biosynthetic pr...,"[""UDP-4-deoxy-4-formamido-beta-L-arabinopyrano...",[],[],False,[],...,[],[],[],,,pr,2012-03-22T01:20:05Z,[],{},"{GO:2001313, GO:0009226, GO:0046349}"
GO:2001316,47373,kojic acid metabolic process,biological_process,"""The chemical reactions and pathways involving...",[GO:0034308 ! primary alcohol metabolic proces...,"[""5-hydroxy-2-(hydroxymethyl)-4H-pyran-4-one m...",[],[],False,[],...,[],[],[],,,rfoulger,2012-04-18T09:22:42Z,[],{GO:2001317},"{GO:0042180, GO:0046483, GO:1901360, GO:0034308}"


In [9]:
def get_index_to_all_ancestors(index_to_direct_children, root_indices):
    
    index_to_all_ancestors = {index: {index} for index in index_to_direct_children.keys()}
    indices_to_scan = set(root_indices)
    
    while indices_to_scan:
        
        scanned_child_indices = set()
        
        for index in indices_to_scan:
            for child_index in index_to_direct_children[index]:
                index_to_all_ancestors[child_index].update(index_to_all_ancestors[index])
                scanned_child_indices.add(child_index)
                
        indices_to_scan = scanned_child_indices
        
    return index_to_all_ancestors
        
go_annotations_meta['all_ancestors'] = pd.Series(get_index_to_all_ancestors(\
        go_annotations_meta['direct_children'].to_dict(), \
        go_annotations_meta[go_annotations_meta['direct_parents'].apply(bool)].index))
go_annotations_meta['all_offsprings'] = pd.Series(get_index_to_all_ancestors(\
        go_annotations_meta['direct_parents'].to_dict(), \
        go_annotations_meta[go_annotations_meta['direct_children'].apply(bool)].index))
display(go_annotations_meta)

Unnamed: 0_level_0,index,name,namespace,def,is_a,synonym,alt_id,subset,is_obsolete,xref,...,consider,comment,replaced_by,created_by,creation_date,property_value,direct_children,direct_parents,all_ancestors,all_offsprings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GO:0000001,0,mitochondrion inheritance,biological_process,"""The distribution of mitochondria, including t...","[GO:0048308 ! organelle inheritance, GO:004831...","[""mitochondrial inheritance"" EXACT []]",[],[],False,[],...,[],,,,,[],{},"{GO:0048308, GO:0048311}","{GO:0051640, GO:0071840, GO:0051179, GO:005164...",{GO:0000001}
GO:0000002,1,mitochondrial genome maintenance,biological_process,"""The maintenance of the structure and integrit...",[GO:0007005 ! mitochondrion organization],[],[],[],False,[],...,[],,,,,[],{GO:0033955},{GO:0007005},"{GO:0071840, GO:0000002, GO:0009987, GO:000699...",{GO:0000002}
GO:0000003,2,reproduction,biological_process,"""The production of new individuals that contai...",[GO:0008150 ! biological_process],"[""reproductive physiological process"" EXACT []]","[GO:0019952, GO:0050876]","[goslim_agr, goslim_chembl, goslim_flybase_rib...",False,[Wikipedia:Reproduction],...,[],,,,,[],"{GO:0032504, GO:0032505, GO:0061887, GO:001995...",{GO:0008150},{GO:0000003},"{GO:0030436, GO:0007114, GO:0043936, GO:003250..."
GO:0000005,3,obsolete ribosomal chaperone activity,molecular_function,"""OBSOLETE. Assists in the correct assembly of ...",[],"[""ribosomal chaperone activity"" EXACT []]",[],[],true,[],...,"[GO:0042254, GO:0044183, GO:0051082]",This term was made obsolete because it refers ...,,,,[],{},{},{GO:0000005},{GO:0000005}
GO:0000006,4,high-affinity zinc transmembrane transporter a...,molecular_function,"""Enables the transfer of zinc ions (Zn2+) from...",[GO:0005385 ! zinc ion transmembrane transport...,"[""high affinity zinc uptake transmembrane tran...",[],[],False,[],...,[],,,,,[],{},{GO:0005385},"{GO:0046915, GO:0005385, GO:0022857, GO:000000...",{GO:0000006}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GO:2001313,47370,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological_process,"""The chemical reactions and pathways involving...","[GO:0006040 ! amino sugar metabolic process, G...","[""UDP-4-deoxy-4-formamido-beta-L-arabinopyrano...",[],[],False,[],...,[],,,pr,2012-03-22T01:19:54Z,[],"{GO:2001315, GO:2001314}","{GO:0006040, GO:0006793, GO:0009225}","{GO:0034641, GO:0006040, GO:0044238, GO:190113...",{GO:2001313}
GO:2001314,47371,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological_process,"""The chemical reactions and pathways resulting...",[GO:0009227 ! nucleotide-sugar catabolic proce...,"[""UDP-4-deoxy-4-formamido-beta-L-arabinopyrano...",[],[],False,[],...,[],,,pr,2012-03-22T01:20:01Z,[],{},"{GO:0009227, GO:0046348, GO:2001313}","{GO:0034641, GO:0046348, GO:0009227, GO:000680...",{GO:2001314}
GO:2001315,47372,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological_process,"""The chemical reactions and pathways resulting...",[GO:0009226 ! nucleotide-sugar biosynthetic pr...,"[""UDP-4-deoxy-4-formamido-beta-L-arabinopyrano...",[],[],False,[],...,[],,,pr,2012-03-22T01:20:05Z,[],{},"{GO:2001313, GO:0009226, GO:0046349}","{GO:0034641, GO:0019438, GO:0046349, GO:004424...",{GO:2001315}
GO:2001316,47373,kojic acid metabolic process,biological_process,"""The chemical reactions and pathways involving...",[GO:0034308 ! primary alcohol metabolic proces...,"[""5-hydroxy-2-(hydroxymethyl)-4H-pyran-4-one m...",[],[],False,[],...,[],,,rfoulger,2012-04-18T09:22:42Z,[],{GO:2001317},"{GO:0042180, GO:0046483, GO:1901360, GO:0034308}","{GO:2001316, GO:0006066, GO:0071704, GO:190161...",{GO:2001316}


## Parse all Uniref90 sequences and annotations & save into sqlite

> wget ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.xml.gz

Maybe also:

> wget "https://www.uniprot.org/uniref/?query=uniprot:(goa:(evidence:manual))%20identity:0.9&format=rdf&force=true&compress=yes" -O uniref90_reviewied_annotations.xml.gz

In [10]:
with gzip.open('/cs/phd/nadavb/cafa_project/data/uniref90.xml.gz', 'rt') as f:
    print(f.read(600))

<?xml version="1.0" encoding="ISO-8859-1" ?>
<UniRef90 xmlns="http://uniprot.org/uniref" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://uniprot.org/uniref http://www.uniprot.org/support/docs/uniref.xsd" 
 releaseDate="2019-11-13" version="2019_10"> 
<entry id="UniRef90_A0A5A9P0L4" updated="2019-11-13">
<name>Cluster: Titin</name>
<property type="member count" value="1"/>
<property type="common taxon" value="Triplophysa tibetana"/>
<property type="common taxon ID" value="1572043"/>
<representativeMember>
<dbReference type="UniProtKB ID" id="A0A5A9P0L4_9TELE">


In [None]:
# Are you sure?!
# raise Exception()

CHUNK_SIZE = 100000

SQLITE_FILE_PATH = '/cs/phd/nadavb/cafa_project/data/protein_annotations.db'

NAMESPACE_PREFIX = '{http://uniprot.org/uniref}'
NAMESPACES = {'uniprot': r'http://uniprot.org/uniref'}

GO_ANNOTATION_CATEGORIES = [
    'GO Molecular Function',
    'GO Biological Process',
    'GO Cellular Component',
]

go_annotation_to_all_ancestors = go_annotations_meta['all_ancestors'].to_dict()
go_id_to_index = go_annotations_meta['index'].to_dict()
go_index_to_id = go_annotations_meta.reset_index().set_index('index')['id'].to_dict()

os.remove(SQLITE_FILE_PATH)
sqlite_conn = sqlite3.connect(SQLITE_FILE_PATH)

chunk_indices = []
chunk_records = []

go_index_record_counter = Counter()
n_records_with_any_go_annotation = 0

def extract_go_category(entry, category):
    return list({property_element.attrib['value'] for property_element in entry.xpath(r'uniprot:property[@type="%s"]' % \
            category, namespaces = NAMESPACES)})

def get_go_annotation_all_ancestors(annotation):
    
    global go_annotation_to_all_ancestors
    
    if annotation in go_annotation_to_all_ancestors:
        return go_annotation_to_all_ancestors[annotation]
    else:
        print('Unrecognized GO annotation (will be ignored): %s' % annotation)
        return set()

def get_complete_go_annotations(go_annotations):
    return set.union(set(), *[get_go_annotation_all_ancestors(annotation) for annotation in go_annotations])

def get_complete_go_annotation_indices(go_annotations):
    complete_go_annotations = get_complete_go_annotations(go_annotations)
    return list(sorted(filter(None, map(go_id_to_index.get, go_annotations))))

def save_current_chunk():
    
    global sqlite_conn, chunk_indices, chunk_records, go_index_record_counter, n_records_with_any_go_annotation
    
    print('Saving chunk...')
    
    chunk_records = pd.DataFrame(chunk_records, columns = ['tax_id', 'uniprot_name', 'go_annotations'], \
            index = chunk_indices)
    
    chunk_records['flat_go_annotations'] = chunk_records['go_annotations'].apply(\
        lambda go_annotations: list(sorted(set.union(*map(set, go_annotations.values())))))
    chunk_records['n_go_annotations'] = chunk_records['flat_go_annotations'].apply(len)
    chunk_records['complete_go_annotation_indices'] = chunk_records['flat_go_annotations'].apply(\
            get_complete_go_annotation_indices)
    chunk_records['n_complete_go_annotations'] = chunk_records['complete_go_annotation_indices'].apply(len)
    n_records_with_any_go_annotation += (chunk_records['n_complete_go_annotations'] > 0).sum()
    
    for complete_go_annotation_indices in chunk_records['complete_go_annotation_indices']:
        go_index_record_counter.update(complete_go_annotation_indices)

    chunk_records['go_annotations'] = chunk_records['go_annotations'].apply(json.dumps)
    chunk_records['flat_go_annotations'] = chunk_records['flat_go_annotations'].apply(json.dumps)
    chunk_records['complete_go_annotation_indices'] = chunk_records['complete_go_annotation_indices'].apply(json.dumps)
    chunk_records.to_sql('protein_annotations', sqlite_conn, if_exists = 'append')
    
    chunk_indices = []
    chunk_records = []

def process_entry(i, event, entry):
    
    global chunk_indices, chunk_records
    
    if i % 1000 == 0:
        print(i, end = '\r')

    repr_member, = entry.xpath(r'uniprot:representativeMember', namespaces = NAMESPACES)
    db_ref, = repr_member.xpath(r'uniprot:dbReference', namespaces = NAMESPACES)
    protein_name = db_ref.attrib['id']

    try:
        taxonomy_element, = db_ref.xpath(r'uniprot:property[@type="NCBI taxonomy"]', namespaces = NAMESPACES)
        tax_id = int(taxonomy_element.attrib['value'])
    except:
        tax_id = np.nan

    extracted_go_annotations = {category: extract_go_category(entry, category) for category in GO_ANNOTATION_CATEGORIES}
    
    chunk_indices.append(i)
    chunk_records.append((tax_id, protein_name, extracted_go_annotations))
    
    if len(chunk_records) >= CHUNK_SIZE:
        save_current_chunk()
        
with gzip.open('/cs/phd/nadavb/cafa_project/data/uniref90.xml.gz', 'rb') as f:
    context = etree.iterparse(f, tag = NAMESPACE_PREFIX + 'entry', events = ('end',))
    etree_fast_iter(context, process_entry, max_elements = None)
    
if len(chunk_records) > 0:
    save_current_chunk()
        
print('Done.')

Saving chunk...
Saving chunk...
Saving chunk...
Saving chunk...
Saving chunk...
Saving chunk...
Saving chunk...
Saving chunk...
Saving chunk...
Unrecognized GO annotation (will be ignored): GO:1990886
Saving chunk...
Saving chunk...
Saving chunk...
Saving chunk...
Saving chunk...
Saving chunk...
Saving chunk...
Saving chunk...
Saving chunk...
Saving chunk...
Saving chunk...
Saving chunk...
Saving chunk...
Saving chunk...
Unrecognized GO annotation (will be ignored): GO:1990886
Saving chunk...
Unrecognized GO annotation (will be ignored): GO:1990886
Saving chunk...
Saving chunk...
Saving chunk...
Unrecognized GO annotation (will be ignored): GO:1990886
Saving chunk...
Saving chunk...
Unrecognized GO annotation (will be ignored): GO:1990886
Unrecognized GO annotation (will be ignored): GO:1990886
Saving chunk...
Saving chunk...
Saving chunk...
Unrecognized GO annotation (will be ignored): GO:1990886
Saving chunk...
Saving chunk...
Unrecognized GO annotation (will be ignored): GO:1990886


In [None]:
print(n_records_with_any_go_annotation)

In [None]:
# Are you sure?!
raise Exception()

go_id_record_counter = pd.Series(go_index_record_counter)
go_id_record_counter.index = [go_index_to_id[index] for index in go_id_record_counter.index]

go_annotations_meta['count'] = go_id_record_counter.reindex(go_annotations_meta.index).fillna(0)
go_annotations_meta['freq'] = go_annotations_meta['count'] / n_records_with_any_go_annotation
display(go_annotations_meta)

go_annotations_meta.to_csv('/cs/labs/michall/nadavb/cafa_project/data/go_annotations.csv')