In [1]:
import os
# Switch the working directory: the relative "data/..." paths used by the
# cells below are resolved against it.
# NOTE(review): presumably this is also what makes `models` and
# `helper_functions` importable -- confirm.
os.chdir("../../master")

In [2]:
import pandas as pd

In [3]:
import sqlalchemy as sa
import models
import importlib

# Connection URL of the "master" PostgreSQL database.  It can be supplied
# through the MASTER_DB_URL environment variable so that credentials do not
# have to live in the notebook; the literal is kept only as a
# backward-compatible fallback.
# NOTE(review): the fallback still embeds a user name and password -- remove
# it once every environment sets MASTER_DB_URL.
import os

DB_URL = os.environ.get("MASTER_DB_URL", "postgresql://goksi:124@localhost/master")
engine = sa.create_engine(DB_URL, echo=False)
Session = sa.orm.sessionmaker(bind=engine)
session = Session()

# Reload first so that edits made to models.py are picked up without
# restarting the kernel, then (re-)import its public names.
importlib.reload(models)
from models import *
from helper_functions import *
# `Base` is presumably exported by `models` (star import above); this creates
# the tables declared on its metadata.
Base.metadata.create_all(engine)

def sql(sql:str, no_index=False):
    """Run a SQL query against the notebook's ``engine`` and return a DataFrame.

    Parameters
    ----------
    sql : str
        Query text handed to ``pandas.read_sql_query``.
    no_index : bool, default False
        When false, the first column of the result becomes the index;
        when true, the frame is returned with its default index.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` is returned when the query raises an ``OperationalError``;
        in that case the error text is printed instead of being raised.
    """
    try:
        frame = pd.read_sql_query(sql, engine)
        if no_index:
            return frame
        first_column = frame.columns[0]
        return frame.set_index(first_column)
    except sa.exc.OperationalError as error:
        message = str(error)
        # Turn literal backslash-n sequences in the message into real line breaks.
        print(message.replace(r'\n', '\n'))

# Parse go.obo

In [4]:
from GO.obo_parser import *
# Read the whole ontology file and parse it into a list of term objects.
# `parser` is presumably provided by the star import from GO.obo_parser.
with open("data/go.obo") as obo:
    terms_list = parser.parse(obo.read())

# Lookup table: term id -> term object.
terms_dict = dict((term.id, term) for term in terms_list)

In [5]:
# One row per protein: the ":ID" / ":LABEL" aliases name the node id and label
# columns of the exported CSV, the remaining columns are node properties.
seq_nodes_query = """
select distinct prot_id as ":ID"
              , 'Seq' as ":LABEL"
              , prot_id as "id"
              , length
              , is_disordered
from go_pred_view 
where 
    true
    and predictor_name = 'VSL2b'
    and length >= 40
"""
seq_df = sql(seq_nodes_query)

# sql() has made ":ID" the index, so it is written as the first CSV column.
seq_df.to_csv("./data/seq.csv")
seq_df.head()

Unnamed: 0_level_0,:LABEL,id,length,is_disordered
:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A0A060X6Z0,Seq,A0A060X6Z0,489,True
A0A068FIK2,Seq,A0A068FIK2,1033,True
A0A075F932,Seq,A0A075F932,421,True
A0A078CGE6,Seq,A0A078CGE6,1299,True
A0A086F3E3,Seq,A0A086F3E3,192,False


In [9]:
# One row per (protein, GO term) annotation; sql() makes ":START_ID" the index.
seq_to_go = sql("""
select distinct prot_id as ":START_ID"
              , go as ":END_ID"
              , 'anotate_to' as ":TYPE"
from go_pred_view 
where 
    true
    and predictor_name = 'VSL2b'
    and length >= 40
""")
# The query returns numeric GO ids; render them in the zero-padded "GO:0000000" form
# used by the ontology terms.
seq_to_go[":END_ID"] = seq_to_go[":END_ID"].apply(lambda go_id: "GO:%07d" % go_id)

# Keep only annotations whose GO term exists in the parsed ontology.  The ids
# are taken from `terms_dict` (built right after go.obo is parsed) instead of
# `term_set`: that set is only created in the later "Export GO" cell, so using
# it here fails with a NameError when the notebook is run top to bottom.
known_go_ids = set(terms_dict)
seq_to_go = seq_to_go[seq_to_go[":END_ID"].isin(known_go_ids)]
seq_to_go.to_csv("./data/seq_rel.csv")
seq_to_go.head()

Unnamed: 0_level_0,:END_ID,:TYPE
:START_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
A0A060X6Z0,GO:0004511,anotate_to
A0A060X6Z0,GO:0043204,anotate_to
A0A060X6Z0,GO:0070852,anotate_to
A0A068FIK2,GO:0005737,anotate_to
A0A068FIK2,GO:0055028,anotate_to


In [1]:
import re

# A backslash followed by any single character except a line break.
# Compiled once instead of on every call.
_ESCAPED_CHAR_RE = re.compile(r"\\(.)")


def unescape_string(s:str):
    """Drop the backslash from every backslash-escaped character in ``s``.

    The character following the backslash is kept verbatim, e.g. ``\\"``
    becomes ``"`` and a doubled backslash becomes a single one.  No escape
    sequence is interpreted: ``\\n`` becomes the letter ``n``, not a newline.
    A backslash directly before a line break is left untouched because ``.``
    does not match a newline.

    Parameters
    ----------
    s : str or None
        Text to unescape.

    Returns
    -------
    str or None
        The unescaped text, or ``None`` when ``s`` is ``None``.
    """
    # Identity test: `== None` can be fooled by objects with a custom __eq__.
    if s is None:
        return None
    return _ESCAPED_CHAR_RE.sub(r"\1", s)

# Export GO

In [3]:
# Collect plain row lists first and build each DataFrame once at the end.
term_rows = []
term_set = set()       # ids of all parsed GO terms
relation_rows = []
for t in terms_list:
    term_set.add(t.id)
    namespace = t.namespace_short()
    # Column order must match the header passed to pd.DataFrame below.
    term_rows.append([
        t.id, "Term", namespace, t.id, t.name, namespace,
        t.is_obsolete, t.replaced_by, t.comment,
        unescape_string(t.definition[0]),
    ])

    # `parent_id` instead of `id`: the loop variable lives at notebook scope
    # and would otherwise replace the builtin id() for every later cell.
    for parent_id in t.is_a:
        relation_rows.append([t.id, parent_id, "is_a"])
    for _type, _id in t.relationship:
        relation_rows.append([t.id, _id, _type])

    if t.replaced_by:
        relation_rows.append([t.id, t.replaced_by, "replaced_by"])

    for consider_id in t.consider:
        relation_rows.append([t.id, consider_id, "consider"])

# Two ":LABEL" columns are written: the constant "Term" and the short namespace.
# NOTE(review): confirm the consumer of go_term.csv accepts a repeated ":LABEL" header.
term_df = pd.DataFrame(term_rows, columns=[":ID", ":LABEL", ":LABEL", "id", "name", "namespace", "is_obsolete:boolean", "replaced_by", "comment", "def"])
term_df = term_df.set_index([':ID'])
term_df.to_csv("data/go_term.csv")


# Despite its name, go_is_a.csv receives every relation type collected above
# (is_a, the typed relationships, replaced_by and consider).
relations_df = pd.DataFrame(relation_rows, columns=[":START_ID", ":END_ID", ":TYPE"])
relations_df = relations_df.set_index([":START_ID", ":END_ID"])
relations_df.to_csv("data/go_is_a.csv")

NameError: name 'terms_list' is not defined

# Parse & Export keywords

## 1. Parse

In [12]:
from collections import OrderedDict


def makeEmptyKeyw():
    """Return a new ordered template holding an empty list for every line code.

    The keys are inserted in the order listed below; each call creates fresh,
    independent lists.
    """
    line_codes = (
        'ID',  # Identifier (keyword)           Once; starts a keyword entry
        'IC',  # Identifier (category)          Once; starts a category entry
        'AC',  # Accession (KW-xxxx)            Once
        'DE',  # Definition                     Once or more
        'SY',  # Synonyms                       Optional; once or more
        'GO',  # Gene ontology (GO) mapping     Optional; once or more
        'HI',  # Hierarchy                      Optional; once or more
        'WW',  # Relevant WWW site              Optional; once or more
        'CA',  # Category                       Once per keyword entry; absent in category entries
    )
    return OrderedDict((code, []) for code in line_codes)


def onceOrNone(keyw, k):
    """Replace the sequence stored under ``k`` by its first element, in place.

    Nothing happens when ``k`` is not a key of ``keyw``.  Returns ``None``.
    """
    if k not in keyw:
        return
    values = keyw[k]
    keyw[k] = values[0]
    
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def makeKeyw(text):
    """Parse the text of one keyword/category entry into an ordered dict.

    ``text`` holds one line per record, each made of a line code (see
    ``makeEmptyKeyw``) followed by whitespace and a value.

    Returns an ``OrderedDict`` in which

    * ``ID``, ``IC``, ``AC`` and ``CA`` are single strings (present only when
      the entry has such a line);
    * ``DE`` is the definition as one string (``""`` when the entry has none);
    * ``HI`` is always present: the set of names listed after the first colon
      of the hierarchy lines;
    * every other code that occurs is a list of ``;``-separated items.  Items
      are not stripped, so all but the first usually keep a leading blank.

    Differences from the previous implementation:

    * blank lines and lines without a value are skipped instead of raising
      ``ValueError`` on tuple unpacking;
    * semicolons inside the definition text are preserved (they used to be
      replaced by blanks through a split/join round trip);
    * several ``GO`` lines no longer get merged into one item, so every GO id
      stays at the start of its own item;
    * an entry without ``DE`` lines yields ``""`` instead of ``KeyError``.

    Raises ``KeyError`` for a line code that is not part of the template.
    """
    keyw = makeEmptyKeyw()
    for line in text.split('\n'):
        parts = line.split(maxsplit=1)
        if len(parts) < 2:
            # Blank line or a bare line code: nothing to store.
            continue
        k, v = parts
        keyw[k].append(v)

    for k in list(keyw.keys()):
        values = keyw[k]
        if k == "HI":
            # "Category: A; B; C." -> {"A", "B", "C"}; the part before the colon is dropped.
            list_of_items = [x.split(":")[1].rstrip(".").split(";") for x in values]
            keyw[k] = { item.strip() for item in flatten(list_of_items) }
            continue

        if not values:
            del keyw[k]
        elif k == "DE":
            # Free text: join the continuation lines, never split on ';'.
            keyw[k] = ' '.join(values).strip('.')
        elif k == "GO":
            # Each GO line is a record of its own ("GO:id; name"), so the lines are
            # joined with the item separator rather than with a blank.
            keyw[k] = ';'.join(values).strip('.').split(';')
        else:
            keyw[k] = ' '.join(values).strip('.').split(';')

    onceOrNone(keyw, 'ID')
    onceOrNone(keyw, 'IC')
    onceOrNone(keyw, 'AC')
    onceOrNone(keyw, 'CA')

    keyw.setdefault("DE", "")
    return keyw

keyword_list = []   # every parsed entry, in file order
keyword_dict = {}   # entry name (IC or ID) -> parsed entry

# An entry starts at a line beginning with "ID" or "IC" and runs up to the
# "//" terminator line.  The file is closed as soon as it has been read, and
# the matches get their own name instead of rebinding the builtin `iter`.
with open('data/keywlist.txt') as keywlist_file:
    keyword_texts = re.findall(r'^((?:ID|IC).+?)\n//', keywlist_file.read(), re.S | re.M)

for keyword_text in keyword_texts:
    keyword = makeKeyw(keyword_text)
    keyword_list.append(keyword)

    # Category entries are indexed by their IC name, keyword entries by their ID name.
    if "IC" in keyword:
        keyword_dict[keyword["IC"]] = keyword
    elif "ID" in keyword:
        keyword_dict[keyword["ID"]] = keyword
        


In [13]:
# Sanity check: number of parsed entries (keyword and category entries together).
len(keyword_list)

1196

In [36]:
# Groups: keyword accession, keyword name, GO id (the GO term name between
# ">" and ";" is skipped).
regex = re.compile(r"UniProtKB-KW:(KW-\d{4})\s([^>]+)>[^;]+;\s(GO:.+)")

def getMapping(l):
    """
    Parse one mapping line into a ``(GO, AC, ID)`` tuple.

    AC   Accession (KW-xxxx)
    ID   Identifier (keyword)
    GO   Term id in Gene Ontology

    Both the keyword name and the GO id are returned without surrounding
    whitespace, so the GO id compares equal to ids obtained elsewhere.

    Raises ``ValueError`` (quoting the offending line) when ``l`` does not
    have the expected layout.
    """
    match = regex.match(l)
    if match is None:
        raise ValueError("unrecognised kw2go mapping line: %r" % (l,))
    AC, ID, GO = match.groups()
    return GO.strip(), AC, ID.strip()

# Lines starting with "!" are skipped as comments; blank lines carry no mapping
# either and would otherwise make getMapping fail.  The file is closed once read.
with open('./data/uniprotkb_kw2go.txt') as kw2go_file:
    ls = [getMapping(l) for l in kw2go_file if l.strip() and l[0] != '!']
go_keyw_mapping = pd.DataFrame(ls, columns=['GO', 'AC', 'ID'])
go_keyw_mapping.set_index('GO').to_csv('./data/uniprotkb_kw2go.csv')

print(go_keyw_mapping.shape)

# Preview only: print the mappings of the first accession group.
for ac, df in go_keyw_mapping.groupby("AC"):
    for go in df.GO:
        print(ac, '->', go)
    break

(852, 3)
KW-0001 -> GO:0051537


## 2. Export

In [49]:
# Row lists are collected under their own names and turned into DataFrames once.
keyw_rows = []
keyw_rel_rows = []
keyw_set = set()       # accessions of the exported keywords
keyw_rel_set = set()   # (accession, GO id) pairs found in the keyword entries


for k in keyword_list:
    if "ID" not in k:
        # Entries without an ID line (the category entries) are not exported.
        continue
    keyw_set.add(k["AC"])
    keyw_rows.append( [ k["AC"], "Keyword", k["AC"],  k["ID"], k["CA"], k["DE"] ] )
    for go in k.get("GO", []):
        # The GO list also contains the term names; only the ids start with "GO:".
        if go.startswith("GO:"):
            keyw_rel_rows.append( [k["AC"], go, "map_to"] )
            keyw_rel_set.add( (k["AC"], go) )

    for parent_name in k.get("HI", []):
        parent = keyword_dict.get(parent_name)
        # Skip unknown names and the keyword's own name.
        if parent and k["AC"] != parent["AC"]:
            keyw_rel_rows.append( [k["AC"], parent["AC"], "hi"] )


# Add the mappings that only appear in the kw2go file.
for ac, df in go_keyw_mapping.groupby("AC"):
    # Checked once per accession (it does not depend on the GO id), so the
    # warning is no longer repeated for every mapping of the same keyword.
    if ac not in keyw_set:
        print("falli: ", ac) # warn me
    for go in df.GO:
        if (ac, go) not in keyw_rel_set:
            keyw_rel_rows.append( [ac, go, "map_to_extra"] )

keyw_df = pd.DataFrame(keyw_rows, columns=[":ID", ":LABEL", "id", "name", "category", "def"])
keyw_df = keyw_df.set_index([':ID'])
keyw_df.to_csv("data/keyw.csv")

keyw_rel_df = pd.DataFrame(keyw_rel_rows, columns=[":START_ID", ":END_ID", ":TYPE"])
keyw_rel_df = keyw_rel_df.set_index([":START_ID", ":END_ID"])
keyw_rel_df.to_csv("data/keyw_rel.csv")