In [11]:
import sys
sys.path.append("..")
from rosalind_tools.config import *
from rosalind_tools.utils import parse_fasta, Record
import re
from typing import TextIO, List
import requests

Given: At most 15 UniProt Protein Database access IDs.

Return: For each protein possessing the N-glycosylation motif, output its given access ID followed by a list of locations in the protein string where the motif can be found.  

In [33]:
# Parse the FASTA text retrieved from a URL into a Record
def parse_text(r: str) -> Record:
    """Build a ``Record`` from the text of a single FASTA entry.

    The first line is treated as the header (leading ``>`` characters are
    removed) and every following line is concatenated into the sequence.

    Args:
        r: FASTA-formatted text for one entry, e.g. an HTTP response body.

    Returns:
        A ``Record`` constructed from a dict with the keys ``'name'`` and
        ``'seq'``. Empty input yields an empty name and an empty sequence.
    """
    # splitlines() treats '\n', '\r\n' and '\r' alike, so a CRLF response
    # cannot leak stray '\r' characters into the header or the sequence
    # (a stray '\r' would otherwise satisfy [^P] in the motif regex).
    lines = r.splitlines()
    header = lines[0] if lines else ''
    record = {'name': header.lstrip('>'),
              # strip() drops incidental padding around each sequence line
              'seq': ''.join(part.strip() for part in lines[1:])}
    return Record(record)
def find_n_glyco(record: Record) -> List[int]:
    """Locate every N-glycosylation motif in ``record.seq``.

    The motif is ``N``, any residue except ``P``, ``S`` or ``T``, then any
    residue except ``P``. The pattern is wrapped in a zero-width lookahead
    so that overlapping occurrences are all reported.

    Args:
        record: object exposing the protein string as ``record.seq``.

    Returns:
        1-based start positions of each motif occurrence, in ascending order.
    """
    motif = re.compile(r'(?=(N[^P][ST][^P]))')
    locations = []
    for hit in motif.finditer(record.seq):
        # regex offsets are 0-based; reported positions are 1-based
        locations.append(hit.start() + 1)
    return locations
def n_glyco_motif(f: TextIO,
                  base_url: str = 'https://rest.uniprot.org/uniprotkb/',
                  timeout: float = 30.0) -> None:
    """Print each UniProt ID in ``f`` whose protein has the N-glycosylation motif.

    For every non-blank line of ``f`` the FASTA entry is downloaded, parsed
    with ``parse_text`` and scanned with ``find_n_glyco``. When at least one
    motif is found, the ID exactly as given in the file is printed, followed
    by a line of space-separated positions as returned by ``find_n_glyco``.
    Proteins without the motif produce no output.

    Args:
        f: open text file with one UniProt access ID per line.
        base_url: URL prefix to which ``<accession>.fasta`` is appended.
        timeout: seconds to wait for each HTTP request before giving up.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    for raw_line in f:
        access_id = raw_line.strip()
        if not access_id:
            # a blank line would otherwise trigger a request for '.fasta'
            continue
        # IDs may carry an entry-name suffix (e.g. 'P07204_TRBM_HUMAN');
        # the REST endpoint is addressed by the primary accession only.
        accession = access_id.split('_', 1)[0]
        response = requests.get(base_url + accession + '.fasta', timeout=timeout)
        # Fail loudly rather than scanning an error page as if it were a sequence.
        response.raise_for_status()
        record = parse_text(response.text)
        pos = [str(p) for p in find_n_glyco(record)]
        # print seq names only when motif found
        if pos:
            print(access_id)
            print(' '.join(pos))

In [34]:
# Run the solver on the small sample dataset
sample_path = data_dir/"test_protein_motif.txt"
with open(sample_path, 'r') as sample_file:
    n_glyco_motif(sample_file)

B5ZC00
85 118 142 306 395
P07204_TRBM_HUMAN
47 115 116 382 409
P20840_SAG1_YEAST
79 109 135 248 306 348 364 402 485 501 614


In [35]:
# Run the solver on the downloaded Rosalind dataset
rosalind_path = data_dir/"rosalind_mprt.txt"
with open(rosalind_path, 'r') as rosalind_file:
    n_glyco_motif(rosalind_file)

P80370_DLK_HUMAN
100
P00744_PRTZ_BOVIN
59 191 289
Q924A4
74
P47002
35 552 608
P12763_A2HS_BOVIN
99 156 176
Q181G8
13 222 283
P03415_VME1_CVMA5
27
P04233_HG2A_HUMAN
130 136 256 270
Q55AB5
6
Q8LCP6
259 464 484
B3ET80
6
P02974_FMM1_NEIGO
67 68 121
P11171_41_HUMAN
258 281 358
