# Keywords parser

Description: Controlled vocabulary of keywords  
Name:        keywlist.txt  
Release:     2017_12 of 20-Dec-2017  
 
---------------------------------------------------------------------------

  This document lists the keywords and categories used in the UniProt
  knowledgebase (Swiss-Prot and TrEMBL). The keywords are classified into
  the following 10 categories:

    * Biological process
    * Cellular component
    * Coding sequence diversity
    * Developmental stage
    * Disease
    * Domain
    * Ligand
    * Molecular function
    * PTM
    * Technical term

  The definitions of the keywords and categories, their usage, and other
  information are provided in the following format:
```

  ---------  ---------------------------     ------------------------------
  Line code  Content                         Occurrence in an entry
  ---------  ---------------------------     ------------------------------
  ID         Identifier (keyword)            Once; starts a keyword entry
  IC         Identifier (category)           Once; starts a category entry
  AC         Accession (KW-xxxx)             Once
  DE         Definition                      Once or more
  SY         Synonyms                        Optional; once or more
  GO         Gene ontology (GO) mapping      Optional; once or more
  HI         Hierarchy                       Optional; once or more
  WW         Relevant WWW site               Optional; once or more
  CA         Category                        Once per keyword entry;
                                             absent in category entries
  //         Terminator                      Once; ends an entry
 ```


In [34]:
import re
import mmap

In [52]:
class Keyword:
    """One entry of the keyword list: either a keyword or a category.

    The mapping passed to the constructor is stored verbatim as instance
    attributes, so the line codes of the entry (``ID``/``IC``, ``AC``,
    ``DE``, ``SY``, ``GO``, ``HI``, ``WW``, ``CA``) are reachable as
    ``rec.AC``, ``rec.DE`` and so on.  An entry is a category when it
    carries an ``IC`` attribute and a keyword otherwise.

    Reading an attribute that was never set yields ``None`` instead of
    raising ``AttributeError``.
    """

    def __init__(self, kw):
        """Store every key/value pair of the mapping *kw* as an attribute."""
        self.__dict__.update(kw)

    def get_name(self):
        """Return the ``ID`` attribute if present, otherwise ``IC``."""
        return self.ID if 'ID' in self.__dict__ else self.IC

    def get_category(self):
        """Return the ``CA`` attribute, or ``None`` for a category entry."""
        return None if self.is_category() else self.CA

    def get_category_name(self):
        """Return ``category_name``, or ``None`` for a category entry."""
        return None if self.is_category() else self.category_name

    def is_category(self):
        """Return True when the entry was created with an ``IC`` identifier."""
        return 'IC' in self.__dict__

    def is_keyword(self):
        """Return True when the entry is not a category."""
        return not self.is_category()

    def __getattr__(self, attr):
        """Return ``None`` for unknown attributes instead of raising.

        Only called when normal lookup fails.  Dunder names are excluded:
        answering ``None`` for protocol hooks such as ``__setstate__`` or
        ``__deepcopy__`` makes ``hasattr`` report them as present, which
        breaks ``copy.copy``, ``copy.deepcopy`` and unpickling.
        """
        if attr.startswith('__') and attr.endswith('__'):
            raise AttributeError(attr)
        return self.__dict__.get(attr)

    def __repr__(self):
        """Return the repr of the underlying attribute dictionary."""
        return repr(self.__dict__)

    def __str__(self):
        """Return the same text as ``repr``."""
        return repr(self)

In [51]:
def parse_keywlist(path):
    """Parse a keyword list file into a list of ``Keyword`` records.

    Every entry starts with an ``ID`` (keyword) or ``IC`` (category) line,
    followed by an ``AC`` line, and runs up to the ``//`` terminator.  The
    remaining lines are dispatched on their two-letter line code.

    Args:
        path: Path of the keyword list file.  It is only read.

    Returns:
        A list of ``Keyword`` objects in file order, each carrying:

        * ``ID`` or ``IC`` -- the entry name, without the trailing period;
        * ``AC`` -- the accession (``KW-`` followed by four digits);
        * ``DE`` -- the definition lines joined with single spaces;
        * ``SY`` -- list of synonyms collected from all ``SY`` lines;
        * ``GO`` -- list holding the first 10 characters of each ``GO`` line;
        * ``WW`` -- list of ``WW`` line values;
        * ``HI`` -- sorted list of accessions of the direct predecessors
          named in the ``HI`` lines;
        * ``category_name`` -- the value of the ``CA`` line (``None`` when
          there is none);
        * ``CA`` -- accession of that category, ``None`` for categories.

    Raises:
        KeyError: if an ``HI`` predecessor or a ``CA`` value does not match
            the name of any entry in the file, or a keyword entry has no
            ``CA`` line.
    """
    # Verbose-mode pattern: whitespace inside the pattern is ignored.
    # Groups: line code of the identifier, name (up to the first period),
    # accession, and all remaining lines of the entry up to the terminator.
    kw_re = re.compile(
        rb'^(ID|IC) \s+ ([^.]+) \. \s+  AC \s+ (KW-\d{4}) \s+ (.+?)^//',
        re.X | re.M | re.S,
    )
    kw_split_re = re.compile(r'\s*[;:]\s*')

    # Plain read-only access: no write permission is needed, nothing is left
    # open afterwards, and an empty file simply yields no entries.
    with open(path, 'rb') as file:
        data = file.read()

    kw_list = []
    for m in kw_re.finditer(data):
        id_type, id_value, AC, lines = [x.decode() for x in m.groups()]
        rec = Keyword({
            id_type: id_value,
            'AC': AC,
            'GO': [],
            'HI': set(),
            'WW': [],
            'SY': [],
            'DE': '',
        })
        de_parts = []

        for line in lines.splitlines():
            code, text = line[:2], line[5:].strip()
            value = text.strip('.')

            if code == 'DE':
                # Keep periods inside the definition; only the outer ones
                # are removed after the lines have been joined.
                de_parts.append(text)
            elif code == 'SY':
                # SY may span several lines; a continued line ends with ';',
                # which would otherwise leave an empty synonym behind.
                names = (name.strip() for name in value.split(';'))
                rec.SY.extend(name for name in names if name)
            elif code == 'GO':
                rec.GO.append(value[:10])
            elif code == 'HI':
                # The last element is the entry itself, the one before it
                # is its direct predecessor.
                path_items = kw_split_re.split(value)
                if len(path_items) >= 2:
                    rec.HI.add(path_items[-2])
            elif code == 'CA':
                rec.CA = value
            elif code == 'WW':
                rec.WW.append(value)

        rec.DE = ' '.join(de_parts).strip('.')
        kw_list.append(rec)

    # Second pass: replace names by accessions now that all entries are known.
    kw_name = {rec.get_name(): rec for rec in kw_list}
    for rec in kw_list:
        # Sorted so the result does not depend on set iteration order.
        rec.HI = sorted(kw_name[x].AC for x in rec.HI)
        rec.category_name = rec.CA
        rec.CA = None if rec.is_category() else kw_name[rec.CA].AC

    return kw_list