In [1]:
import requests
import json

# Input:
* protein: Amyloid-beta precursor protein P05067
* range of the protein's amino acid sequence to highlight in the associated structures:

In [2]:
# Start and end positions of the stretch to highlight, expressed on the
# protein's (P05067) amino acid sequence.
highlight = [290, 300]

# Output:
* the corresponding highlight range for each of the protein's associated structures (which will eventually be highlighted in Mol*)

First get the associated structures:

In [3]:
# Fetch the UniProtKB entry for P05067 and keep only its PDB cross-references.
# The timeout stops the cell from hanging forever on an unresponsive server,
# and raise_for_status() reports HTTP errors here rather than as a confusing
# JSON decode error or KeyError further down.
# NOTE(review): the host is a "wwwdev" address — confirm it is still the
# intended endpoint for this lookup.
response = requests.get(
    'https://wwwdev.ebi.ac.uk/uniprot/api/uniprotkb/accession/P05067',
    timeout=30,
)
response.raise_for_status()
P05067 = response.json()
pdb_entries = [
    xref
    for xref in P05067['uniProtKBCrossReferences']
    if xref['database'] == 'PDB'
]
pdb_entries[:3]

[{'database': 'PDB',
  'id': '1AAP',
  'properties': [{'key': 'Method', 'value': 'X-ray'},
   {'key': 'Resolution', 'value': '1.50 A'},
   {'key': 'Chains', 'value': 'A/B=287-344'}]},
 {'database': 'PDB',
  'id': '1AMB',
  'properties': [{'key': 'Method', 'value': 'NMR'},
   {'key': 'Resolution', 'value': '-'},
   {'key': 'Chains', 'value': 'A=672-699'}]},
 {'database': 'PDB',
  'id': '1AMC',
  'properties': [{'key': 'Method', 'value': 'NMR'},
   {'key': 'Resolution', 'value': '-'},
   {'key': 'Chains', 'value': 'A=672-699'}]}]

The first structure (1AAP) has two chains, A and B, each covering positions 287-344 of the protein's amino acid sequence, so the highlight range (290-300) falls within them. PDBe provides mappings (as assigned by the SIFTS process) from PDB structures to UniProt, so fetch this data for 1AAP:

In [4]:
# Fetch the PDB -> UniProt residue mappings for structure 1AAP from PDBe.
# Fail fast on timeouts and HTTP errors instead of trying to parse an error
# page as JSON.
response = requests.get(
    'https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/1AAP',
    timeout=30,
)
response.raise_for_status()
j = response.json()
j

{'1aap': {'UniProt': {'P05067': {'identifier': 'A4_HUMAN',
    'name': 'A4_HUMAN',
    'mappings': [{'entity_id': 1,
      'chain_id': 'A',
      'start': {'author_residue_number': 1,
       'author_insertion_code': '',
       'residue_number': 1},
      'unp_end': 344,
      'unp_start': 287,
      'end': {'author_residue_number': None,
       'author_insertion_code': '',
       'residue_number': 58},
      'struct_asym_id': 'A'},
     {'entity_id': 1,
      'chain_id': 'B',
      'start': {'author_residue_number': 1,
       'author_insertion_code': '',
       'residue_number': 1},
      'unp_end': 344,
      'unp_start': 287,
      'end': {'author_residue_number': None,
       'author_insertion_code': '',
       'residue_number': 58},
      'struct_asym_id': 'B'}]}}}}

In [5]:
# The response is keyed by the PDB id at the top level; take the first
# (here: only) entry and drill down to the per-chain mappings for P05067.
pdb_entry = list(j.values())[0]
mappings = pdb_entry['UniProt']['P05067']['mappings']
mappings

[{'entity_id': 1,
  'chain_id': 'A',
  'start': {'author_residue_number': 1,
   'author_insertion_code': '',
   'residue_number': 1},
  'unp_end': 344,
  'unp_start': 287,
  'end': {'author_residue_number': None,
   'author_insertion_code': '',
   'residue_number': 58},
  'struct_asym_id': 'A'},
 {'entity_id': 1,
  'chain_id': 'B',
  'start': {'author_residue_number': 1,
   'author_insertion_code': '',
   'residue_number': 1},
  'unp_end': 344,
  'unp_start': 287,
  'end': {'author_residue_number': None,
   'author_insertion_code': '',
   'residue_number': 58},
  'struct_asym_id': 'B'}]

Calculate the offset:

In [6]:
# Work from the first chain's mapping. The offset is the difference between
# where the mapped segment starts on the UniProt sequence and the residue
# number at which it starts in the structure.
mapping = mappings[0]
uniprot_start = mapping['unp_start']
structure_start = mapping['start']['residue_number']
offset = uniprot_start - structure_start
offset

286

Subtract the offset from the protein highlight range to get the final range to highlight in the structure:

In [7]:
# Shift both ends of the protein highlight range into structure numbering.
mapped_highlight = [position - offset for position in highlight]
mapped_highlight

[4, 14]

# Double check:
Ensure that the highlighted stretch of the protein sequence and the corresponding stretch of the structure sequence are identical.

First get the protein sequence:

In [8]:
# Download the FASTA record for P05067, failing fast on timeouts/HTTP errors.
response = requests.get('https://www.uniprot.org/uniprot/P05067.fasta', timeout=30)
response.raise_for_status()
r = response.text
# Drop the FASTA header (first line) and join the wrapped sequence lines.
# splitlines() also copes with CRLF line endings, which split('\n') would
# leave behind as stray '\r' characters inside the sequence.
A4 = ''.join(r.splitlines()[1:])
A4

'MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMNVQNGKWDSDPSGTKTCIDTKEGILQYCQEVYPELQITNVVEANQPVTIQNWCKRGRKQCKTHPHFVIPYRCLVGEFVSDALLVPDKCKFLHQERMDVCETHLHWHTVAKETCSEKSTNLHDYGMLLPCGIDKFRGVEFVCCPLAEESDNVDSADAEEDDSDVWWGGADTDYADGSEDKVVEVAEEEEVAEVEEEEADDDEDDEDGDEVEEEAEEPYEEATERTTSIATTTTTTTESVEEVVREVCSEQAETGPCRAMISRWYFDVTEGKCAPFFYGGCGGNRNNFDTEEYCMAVCGSAMSQSLLKTTQEPLARDPVKLPTTAASTPDAVDKYLETPGDENEHAHFQKAKERLEAKHRERMSQVMREWEEAERQAKNLPKADKKAVIQHFQEKVESLEQEAANERQQLVETHMARVEAMLNDRRRLALENYITALQAVPPRPRHVFNMLKKYVRAEQKDRQHTLKHFEHVRMVDPKKAAQIRSQVMTHLRVIYERMNQSLSLLYNVPAVAEEIQDEVDELLQKEQNYSDDVLANMISEPRISYGNDALMPSLTETKTTVELLPVNGEFSLDDLQPWHSFGADSVPANTENEVEPVDARPAADRGLTTRPGSGLTNIKTEEISEVKMDAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIATVIVITLVMLKKKQYTSIHHGVVEVDAAVTPEERHLSKMQQNGYENPTYKFFEQMQN'

Then get the sequence for the structure:

In [9]:
# Download the FASTA for structure 1AAP, failing fast on timeouts/HTTP errors.
response = requests.get('https://www.rcsb.org/fasta/entry/1AAP/display', timeout=30)
response.raise_for_status()
r = response.text
# Line 0 is the FASTA header; line 1 is taken as the sequence.
# NOTE(review): this assumes the sequence is not wrapped over several lines —
# confirm against the service's output format.
# splitlines() avoids a trailing '\r' if the response uses CRLF line endings.
AAP = r.splitlines()[1]
AAP

'VREVCSEQAETGPCRAMISRWYFDVTEGKCAPFFYGGCGGNRNNFDTEEYCMAVCGSA'

In [10]:
# Cut the highlighted stretch out of the protein sequence.
# Python slices are 0-based and end-exclusive, so this selects the characters
# at indices highlight[0] .. highlight[1] - 1.
# NOTE(review): if `highlight` is meant as 1-based residue numbers (as the
# offset arithmetic against `unp_start` suggests), this slice starts one
# residue late. The structure slice is built the same way, so the equality
# check still compares like with like — confirm the intended convention.
protein_start = highlight[0]
protein_stop = highlight[1]
highlight_protein = A4[protein_start:protein_stop]
highlight_protein

'CSEQAETGPC'

In [11]:
# Cut the mapped stretch out of the structure sequence.
# Python slices are 0-based and end-exclusive, so this selects the characters
# at indices mapped_highlight[0] .. mapped_highlight[1] - 1.
# NOTE(review): if `mapped_highlight` holds 1-based residue numbers, this
# slice starts one residue late. The protein slice is built the same way, so
# the two stay comparable — confirm the intended convention.
structure_start = mapped_highlight[0]
structure_stop = mapped_highlight[1]
highlight_structure = AAP[structure_start:structure_stop]
highlight_structure

'CSEQAETGPC'

Make sure they are equal:

In [12]:
# The offset is only valid if both slices spell out the same residues.
assert highlight_structure == highlight_protein