## keywords parsing

In [3]:
import re
import mmap
from pprint import pprint
import json
from pprint import pprint

import pandas as pd

In [142]:
import mmap

class Keyword:
    """Attribute bag for a single keyword-list record.

    The record is built from a mapping whose keys become instance
    attributes.  A record carrying an ``ID`` key is treated as a keyword,
    one carrying an ``IC`` key as a category.  Reading an attribute that was
    never set yields ``None`` instead of raising ``AttributeError``.
    """

    def __init__(self, kw):
        # Every key of the mapping is exposed as an attribute.
        vars(self).update(kw)

    def get_name(self):
        """Return the ``ID`` value when present, otherwise the ``IC`` value."""
        if 'ID' in vars(self):
            return self.ID
        return self.IC

    def get_category(self):
        """Return the ``CA`` value, or ``None`` when the record is a category."""
        if self.is_category():
            return None
        return self.CA

    def is_category(self):
        """Tell whether the record was created with an ``IC`` key."""
        return 'IC' in vars(self)

    def is_keyword(self):
        """Tell whether the record was created with an ``ID`` key."""
        return 'ID' in vars(self)

    def __getattr__(self, attr):
        # Only invoked after normal lookup failed: unknown fields read as None.
        return vars(self).get(attr)

    def __repr__(self):
        return repr(vars(self))

    def __str__(self):
        return repr(self)

def parse_keywlist(path):
    """Parse a keyword list flat file into ``Keyword`` records.

    The file is expected to consist of records that start with an ``ID``
    (keyword) or ``IC`` (category) line, continue with an ``AC   KW-nnnn``
    line and further two-letter-coded lines, and end with a ``//`` line
    (presumably the UniProt ``keywlist.txt`` layout -- confirm against the
    data source).

    Handled line codes (the value is the text from column 6 on, with
    leading/trailing dots removed):

        DE  appended to ``rec.DE``, each line prefixed with one space
        SY  ';'-separated synonyms, accumulated over all SY lines
        GO  first 10 characters of the value are appended to ``rec.GO``
        HI  path split on ';' / ':'; the element before the last one
            (the direct predecessor) is added to the set ``rec.HI``
        CA  stored as ``rec.CA``
        WW  appended to ``rec.WW``

    Args:
        path: Path of the file to parse.

    Returns:
        ``(kw_names, kw_acs)``: two dicts holding the same records, keyed by
        ``rec.get_name()`` and by accession (``rec.AC``) respectively.
        Both are empty for an empty file.
    """
    # re.X makes the blanks inside the pattern insignificant; the dot that
    # terminates the name is escaped so that it only matches a literal '.'.
    record_re = re.compile(
        rb'^(ID|IC) \s+ ([^.]+) \.\s+  AC \s+ (KW-\d{4}) \s+ (.+?)^//',
        re.X | re.M | re.S)
    kw_split_re = re.compile(r'\s*[;:]\s*')

    kw_names = {}
    kw_acs = {}

    # The file is only read, so neither the handle nor the mapping is writable.
    with open(path, 'rb') as file:
        # mmap cannot map an empty file; there is nothing to parse anyway.
        if file.seek(0, 2) == 0:
            return kw_names, kw_acs

        with mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ) as mem:
            for m in record_re.finditer(mem):
                id_type, id_value, AC, lines = [x.decode() for x in m.groups()]
                rec = Keyword({id_type: id_value, 'AC': AC, 'GO': [], 'HI': set(),
                               'WW': [], 'SY': [], 'DE': ''})

                for line in lines.split('\n'):
                    k, v = line[:2], line[5:].strip('.')

                    if k == 'DE':
                        rec.DE += ' %s' % v
                    elif k == 'SY':
                        # Synonyms may continue on further SY lines: accumulate
                        # them, dropping the empty piece left by a trailing ';'.
                        rec.SY.extend(w.strip() for w in v.split(';') if w.strip())
                    elif k == 'GO':
                        rec.GO.append(v[:10])
                    elif k == 'HI':
                        # -1 is the record itself, -2 is its predecessor; a path
                        # with a single element has no predecessor to record.
                        path_parts = kw_split_re.split(v)
                        if len(path_parts) >= 2:
                            rec.HI.add(path_parts[-2])
                    elif k == 'CA':
                        rec.CA = v
                    elif k == 'WW':
                        rec.WW.append(v)

                kw_acs[rec.AC] = rec
                kw_names[rec.get_name()] = rec

    return kw_names, kw_acs

# Parse the keyword list once: kw_names is keyed by record name, kw_acs by accession.
kw_names, kw_acs = parse_keywlist('data/IN/keywlist.txt')

If get_category() returns None (i.e. null), the keyword is itself a category.

### export keywords

In [172]:
# Collect one node row per parsed record and one relationship row per
# hierarchy parent ('hi') and per mapped GO id ('map_to').
kw_df = []
kw_relations_df = []

for kw in kw_names.values():
    # A dict could be used instead of a tuple, but it would lose the column
    # order, and building an OrderedDict here would be clumsier than a tuple.
    kw_entry = (kw.AC, 'Keyword', kw.AC, kw.get_name(), kw.get_category(), kw.DE, kw.SY, kw.WW)
    kw_df.append(kw_entry)

    # kw.HI is a set: iterate it sorted so the exported row order is the same on every run.
    kw_relations_df.extend((kw.AC, kw_names[p].AC, 'hi') for p in sorted(kw.HI))
    kw_relations_df.extend((kw.AC, go, 'map_to') for go in kw.GO)


kw_df = pd.DataFrame(
    kw_df,
    columns=[':ID', ':LABEL', 'AC', 'name', 'category', 'def', 'synonyms', 'ww'],
).set_index(':ID')
kw_relations_df = pd.DataFrame(
    kw_relations_df,
    columns=[':START_ID', ':END_ID', ':TYPE'],
).set_index(':START_ID')

In [173]:
# Keep only the rows whose category is set (get_category() returns None for category records).
kw_df = kw_df.loc[~kw_df['category'].isnull()]

In [174]:
# Write the keyword rows and their relationship rows as CSV files under data/neo4j/.
kw_df.to_csv('data/neo4j/kw.csv')
kw_relations_df.to_csv('data/neo4j/kw_relations.csv')

# Import then export GO

In [5]:
from GO.obo_parser import  *
import json

In [6]:
# Read the whole OBO file and hand its text to the parser.
# The encoding is stated explicitly so the result does not depend on the
# platform's locale default.
with open("data/IN/go.obo", "r", encoding="utf-8") as obo_file:
    terms = parser.parse(obo_file.read())

In [7]:
# Store the attribute dict of every parsed term as an indented JSON array.
with open("data/IN/go.json", "w") as obo_file:
    json.dump([vars(x) for x in terms], obo_file, indent=4)

In [8]:
# Rebuild the term objects from the JSON dump: each entry is passed to Term
# together with its own 'id' value.
with open("data/IN/go.json", "r") as obo_file:
    terms = [Term(entry['id'], entry) for entry in json.load(obo_file)]

In [11]:
# Collect one node row per term and one relationship row per reference to
# another term that is present in the loaded term list.
term_df = []
term_relations_df = []

# Ids of all loaded terms; relationships pointing outside this set are dropped.
existing_go_set = {t.id for t in terms}

for t in terms:
    # A dict could be used instead of a tuple, but it would lose the column
    # order, and building an OrderedDict here would be clumsier than a tuple.
    term_entry = (
        t.id, 'Term', t.namespace_short(), t.id, t.name, t.namespace,
        t.is_obsolete, t.comment, t.definition[0], t.xref,
    )
    term_df.append(term_entry)

    if t.replaced_by in existing_go_set:
        term_relations_df.append((t.id, t.replaced_by, 'replaced_by'))

    # 'is_a' / 'consider' may be missing or empty on a term, hence the "or []".
    term_relations_df += [
        (t.id, go, rt)
        for rt in ['is_a', 'consider']
        for go in t.__dict__.get(rt) or []
        if go in existing_go_set
    ]
    # t.relationship yields (relationship type, target id) pairs.
    term_relations_df += [
        (t.id, go, rt)
        for rt, go in t.relationship
        if go in existing_go_set
    ]

term_df = pd.DataFrame(
    term_df,
    columns=[':ID', ':LABEL', ':LABEL', 'id', 'name', 'namespace',
             'is_obsolete:boolean', 'comment', 'def', 'xref'],
).set_index(':ID')
term_relations_df = pd.DataFrame(
    term_relations_df,
    columns=[':START_ID', ':END_ID', ':TYPE'],
).set_index(':START_ID')


In [17]:
# Write the term rows and their relationship rows as CSV files under data/neo4j/.
term_df.to_csv('data/neo4j/term.csv')
term_relations_df.to_csv('data/neo4j/term_relations.csv')