In [2]:
import pandas as pd
import re
import json

from collections import Counter

### Parsiranje i obrada UniprotKB kljucnih reci (keywords)

In [5]:
# Expected line layout, as encoded by the pattern:
#   UniProtKB-KW:KW-dddd <keyword> > <text without ';'> ; GO:<id>
# Groups: (1) keyword accession, (2) keyword identifier, (3) GO id.
regex = re.compile(r"UniProtKB-KW:(KW-\d{4})\s([^>]+)>[^;]+;\sGO:(.+)")

def getMapping(l):
    """
    Parse one line of the UniProtKB keyword -> GO mapping file.

    Returns the tuple ``(GO, AC, ID)``:

    GO   Term id in Gene Ontology, converted to int
    AC   Accession (KW-xxxx)
    ID   Identifier (keyword), surrounding whitespace removed

    Raises ValueError when the line does not match the expected layout,
    so that a malformed line is reported together with its content
    instead of failing on ``None.groups()``.
    """
    match = regex.match(l)
    if match is None:
        raise ValueError(f"unrecognised kw2go line: {l!r}")
    AC, ID, GO = match.groups()
    return int(GO), AC, ID.strip()

# Parse every mapping line, skipping '!' comment lines and blank lines.
# The context manager closes the file handle as soon as parsing is done.
with open('./data/uniprotkb_kw2go.txt') as kw2go_file:
    ls = [getMapping(l) for l in kw2go_file
          if l.strip() and not l.startswith('!')]

go_keyw_mapping = pd.DataFrame(ls, columns=['GO', 'AC', 'ID'])
# Keep a CSV copy of the parsed mapping, indexed by GO id.
go_keyw_mapping.set_index('GO').to_csv('./data/uniprotkb_kw2go.csv')

print(go_keyw_mapping.shape)
go_keyw_mapping

(852, 3)


Unnamed: 0,GO,AC,ID
0,51537,KW-0001,2Fe-2S
1,51538,KW-0003,3Fe-4S
2,51539,KW-0004,4Fe-4S
3,6754,KW-0066,ATP synthesis
4,5524,KW-0067,ATP-binding
5,9688,KW-0937,Abscisic acid biosynthesis
6,9738,KW-0938,Abscisic acid signaling pathway
7,45151,KW-0005,Acetoin biosynthesis
8,45150,KW-0006,Acetoin catabolism
9,30550,KW-0008,Acetylcholine receptor inhibitor


###  Postoje duplikati

In [3]:
# Number of distinct GO ids and distinct keyword accessions in the kw2go mapping.
go_n = len(set(go_keyw_mapping['GO']))
ac_n = len(set(go_keyw_mapping['AC']))
print(f"broj go:{go_n}, broj ac:{ac_n}")

broj go:803, broj ac:808


In [4]:
# GO ids that occur on more than one row of the kw2go mapping.
go_occurrences = Counter(go_keyw_mapping.GO)
repeated_go_ids = [go for go, count in go_occurrences.items() if count > 1]

# Every row carrying one of those ids, ordered by GO id.
has_repeated_go = go_keyw_mapping.GO.isin(repeated_go_ids)
duplicated_GO = go_keyw_mapping[has_repeated_go].sort_values('GO')

print(duplicated_GO.shape)
duplicated_GO.head()

(86, 3)


Unnamed: 0,GO,AC,ID
320,3677,KW-0371,Homeobox
201,3677,KW-0238,DNA-binding
193,3677,KW-0230,DNA invertase
515,3823,KW-0502,Monoclonal antibody
366,3823,KW-0394,Immunoglobulin V region


In [5]:
# Keyword accessions that occur on more than one row of the kw2go mapping.
ac_occurrences = Counter(go_keyw_mapping.AC)
repeated_accessions = [ac for ac, count in ac_occurrences.items() if count > 1]

# Every row carrying one of those accessions, in the mapping's original order.
has_repeated_ac = go_keyw_mapping.AC.isin(repeated_accessions)
duplicated_AC = go_keyw_mapping[has_repeated_ac]

print(duplicated_AC.shape)
duplicated_AC.head()


(82, 3)


Unnamed: 0,GO,AC,ID
22,3824,KW-0021,Allosteric enzyme
23,8152,KW-0021,Allosteric enzyme
70,3824,KW-0081,Bacteriolytic enzyme
71,8152,KW-0081,Bacteriolytic enzyme
72,19835,KW-0081,Bacteriolytic enzyme


### UniprotKB Keywords info file 

In [6]:
from collections import OrderedDict


def makeEmptyKeyw():
    """
    Return a fresh OrderedDict with an empty list for every keywlist line code.

    Line codes, in insertion order:

    ID   Identifier (keyword)           Once; starts a keyword entry
    IC   Identifier (category)          Once; starts a category entry
    AC   Accession (KW-xxxx)            Once
    DE   Definition                     Once or more
    SY   Synonyms                       Optional; once or more
    GO   Gene ontology (GO) mapping     Optional; once or more
    HI   Hierarchy                      Optional; once or more
    WW   Relevant WWW site              Optional; once or more
    CA   Category                       Once per keyword entry; absent in category entries
    """
    line_codes = ('ID', 'IC', 'AC', 'DE', 'SY', 'GO', 'HI', 'WW', 'CA')
    # The generator creates a separate list per code, so no list is shared.
    return OrderedDict((code, []) for code in line_codes)


def onceOrNone(keyw, k):
    """Replace keyw[k] with its first element, in place; do nothing if k is absent."""
    if k not in keyw:
        return
    first_value = keyw[k][0]
    keyw[k] = first_value
    

def makeKeyw(text):
    """
    Build one keyword record from the text of a single keywlist entry.

    text: newline-separated lines, each made of a line code, whitespace and
    the value for that code.

    Returns an OrderedDict holding only the line codes present in the entry.
    HI holds the joined text as one string; ID, IC, AC and CA hold the first
    ';'-separated piece as a string; every other field holds the list of
    ';'-separated pieces.

    Raises KeyError for a line code that makeEmptyKeyw does not declare.
    """
    keyw = makeEmptyKeyw()
    for line in text.split('\n'):
        if not line.strip():
            # A blank line carries no field; unpacking it below would fail.
            continue
        k, v = line.split(maxsplit=1)
        keyw[k].append(v)

    for k in list(keyw.keys()):
        if keyw[k]:
            # Continuation lines are joined with a space so the last word of
            # one line does not fuse with the first word of the next one.
            line = ' '.join(keyw[k]).strip('.')
            # HI is kept as raw text: how to split it is still undecided.
            keyw[k] = line if k == 'HI' else line.split(';')
        else:
            del keyw[k]

    # These codes are reduced to their first piece.
    for once_key in ('ID', 'IC', 'AC', 'CA'):
        onceOrNone(keyw, once_key)

    return keyw

In [7]:
# An entry starts on a line beginning with ID or IC and runs up to the
# newline that precedes the '//' terminator. The context manager closes the
# file, and the list is not named `iter`, so the builtin stays usable.
with open('data/keywlist.txt') as keywlist_file:
    keyword_texts = re.findall(r'^((?:ID|IC).+?)\n//', keywlist_file.read(), re.S | re.M)

keyword_list = [makeKeyw(keyword_text) for keyword_text in keyword_texts]

len(keyword_list)

1196

1196 - 10 kategorija = 1186 kljucnih reci

In [13]:
# One row per keyword record that has a GO field. Only the first piece of
# that field is read; its first three characters (presumably the 'GO:'
# prefix) are dropped before the int() conversion.
# NOTE(review): records with more than one GO piece contribute only their
# first id here - confirm that this is intended.
keywords_with_go = [k for k in keyword_list if 'GO' in k]
go_rows = [
    (int(k['GO'][0][3:]), k['AC'], k['ID'], k['CA'])
    for k in keywords_with_go
]

keywlist_go = pd.DataFrame(go_rows, columns=['GO', 'AC', 'ID', 'CA'])
print(keywlist_go.shape)
keywlist_go.head()

(808, 4)


Unnamed: 0,GO,AC,ID,CA
0,51537,KW-0001,2Fe-2S,Ligand
1,51538,KW-0003,3Fe-4S,Ligand
2,51539,KW-0004,4Fe-4S,Ligand
3,9688,KW-0937,Abscisic acid biosynthesis,Biological process
4,9738,KW-0938,Abscisic acid signaling pathway,Biological process


In [14]:
# Number of distinct GO ids and distinct keyword accessions in the keywlist table.
go_n = len(set(keywlist_go['GO']))
ac_n = len(set(keywlist_go['AC']))
print(f"broj go:{go_n}, broj ac:{ac_n}")

broj go:770, broj ac:808


In [10]:
# Outer join of both tables on the (GO, AC) pair; ID_1 comes from
# keywlist_go and ID_2 from go_keyw_mapping.
combined_mapping = pd.merge(
    keywlist_go,
    go_keyw_mapping,
    on=['GO', 'AC'],
    how='outer',
    suffixes=('_1', '_2'),
)
# Difference between the mappings: rows with no ID on the keywlist_go side.
combined_mapping.ID_1.isnull().sum()

46

In [11]:
# Rows whose GO id or accession is among the duplicated ones found earlier.
go_is_duplicated = combined_mapping.GO.isin(duplicated_GO.GO)
ac_is_duplicated = combined_mapping.AC.isin(duplicated_AC.AC)
tmp = combined_mapping[go_is_duplicated | ac_is_duplicated]

# How many of those rows have no ID on the keywlist_go side.
missing_on_keywlist_side = tmp[tmp.ID_1.isnull()]
len(missing_on_keywlist_side)


45

Dakle iz **keywlist.txt** imamo informaciju o 1186 kljucnih reci od kojih **808** ima referencu na GO.
- Ukupno **770** razlicitih GO (AC nema duplikata)


Iz **uniprotkb_kw2go.txt** imamo **852** mapiranja. Ali mapiranje nije injektivno, naime:
- Ukupno **803** razlicitih GO i **808** razlicitih AC
- **86** duplikata po GO
- **82** duplikata po AC (Accession number)


'uniprotkb_kw2go.txt' sadrzi ukupno **46** dodatnih mapiranja u odnosu na 'keywlist.txt' (redovi bez podudaranja u `keywlist_go`). Ali **45** od njih predstavljaju dupla mapiranja GO ili AC

In [64]:
# Export the 'Molecular function' keywords, indexed by GO id, using the
# identifier from the keywlist side of the merge.
is_molecular_function = combined_mapping.CA == 'Molecular function'
df = (
    combined_mapping.loc[is_molecular_function, ['GO', 'AC', 'ID_1']]
    .rename(columns={'GO': 'go', 'ID_1': 'ID'})
    .set_index('go')
)
df.to_csv('data/uniprotkb_kw2go_MF.csv')