# Biomedical Data Bases, 2020-2021
###  Create Your Own Database
These are the notes by prof. Davide Salomoni (d.salomoni@unibo.it) for the Biomedical Data Base course at the University of Bologna, academic year 2020-2021.

## Reinstall the redis module

Remember that you should have already started the Redis server _with persistence_ enabled.

In [2]:
! pip install redis

Collecting redis
  Using cached redis-3.5.3-py2.py3-none-any.whl (72 kB)
Installing collected packages: redis
Successfully installed redis-3.5.3


In [82]:
import redis
# create the client object for the Redis server reachable under the host name "my_redis"
redis_host = "my_redis"
r = redis.Redis(host=redis_host)

## Verify how to map a Python dictionary to a Redis hash

In [83]:
# a small Python dictionary to be used as test data
my_dict = dict(one=1, two=2, three=3, four=4)

# write it to Redis as the hash "numbers":
# each item of the dictionary becomes a field of the hash
r.hset('numbers', mapping=my_dict)

# read the complete hash back; it is returned as a Python dictionary
# (as the output shows, field names and values come back as bytes)
new_dict = r.hgetall('numbers')
print(new_dict)

{b'one': b'1', b'two': b'2', b'three': b'3', b'four': b'4'}


In [84]:
# list every key in the DB whose name matches the pattern below
pattern = '*umb*'
my_keys = r.keys(pattern)
print(my_keys)

[b'numbers']


In [85]:
# remove the key 'numbers' from Redis
r.delete('numbers')

# reading the hash again now gives back an empty dictionary
remaining = r.hgetall('numbers')
print(remaining)

{}


## Query PDB and UniProt, and store the results in Redis

Refer to the slides for details about the data model.

In [86]:
import requests
# GraphQL query for the RCSB PDB data API: for the listed PDB entries it asks
# for the entry ID, some entry-level parameters (molecular weight, atom count,
# modeled monomer count) and, for every polymer entity, the source organism
# name plus the ID and name of the associated UniProt records
pdb_query = '''
{
  entries(entry_ids: ["4GYD", "1TU2"]) {
    entry {
      id
    }
    rcsb_entry_info {
      molecular_weight
      deposited_atom_count
      deposited_modeled_polymer_monomer_count
    }
    polymer_entities {
      rcsb_entity_source_organism {
        ncbi_scientific_name
      }
      uniprots {
        rcsb_uniprot_container_identifiers {
          uniprot_id
        }
        rcsb_uniprot_protein {
          name {
            value
          }
        }
      }
    }
  }
}
'''
# get the PDB data with GraphQL
# the query is passed through `params` so that requests URL-encodes it completely:
# building the URL by hand with requote_uri() leaves reserved characters such as
# '&', '#' and '+' untouched, which would corrupt any query containing them
p = requests.get('https://data.rcsb.org/graphql', params={'query': pdb_query}, timeout=30)
# stop here on an HTTP error, rather than on a confusing JSON/KeyError further down
p.raise_for_status()
j = p.json()
# the following cells all read j['data']: fail with a clear message if it is missing
if not j.get('data'):
    raise RuntimeError('PDB GraphQL query returned no data: %s' % j.get('errors'))

In [87]:
# explore what the returned data looks like:
# it is a set of nested Python data structures (dictionaries and lists),
# from which we will need to extract the values we are interested in
j['data']

{'entries': [{'entry': {'id': '4GYD'},
   'rcsb_entry_info': {'molecular_weight': 58.57,
    'deposited_atom_count': 4598,
    'deposited_modeled_polymer_monomer_count': 516},
   'polymer_entities': [{'rcsb_entity_source_organism': [{'ncbi_scientific_name': 'Nostoc sp. PCC 7120 = FACHB-418'}],
     'uniprots': [{'rcsb_uniprot_container_identifiers': {'uniprot_id': 'P0A3X7'},
       'rcsb_uniprot_protein': {'name': {'value': 'Cytochrome c6'}}}]}]},
  {'entry': {'id': '1TU2'},
   'rcsb_entry_info': {'molecular_weight': 39.04,
    'deposited_atom_count': 2747,
    'deposited_modeled_polymer_monomer_count': 359},
   'polymer_entities': [{'rcsb_entity_source_organism': [{'ncbi_scientific_name': 'Nostoc sp. PCC 7119'}],
     'uniprots': [{'rcsb_uniprot_container_identifiers': {'uniprot_id': 'P46444'},
       'rcsb_uniprot_protein': {'name': {'value': 'Plastocyanin'}}}]},
    {'rcsb_entity_source_organism': [{'ncbi_scientific_name': 'Nostoc sp. PCC 7119'}],
     'uniprots': [{'rcsb_uniprot_co

In [88]:
# as an example, print a few macromolecule parameters of every returned entry
for entry in j['data']['entries']:
    # one element of the 'entries' list per PDB ID
    entry_id = entry['entry']['id']
    print("ID : ", entry_id)
    print("Macromolecule parameters:")
    info = entry['rcsb_entry_info']
    print("  molecular weight (kDa); ", info['molecular_weight'])

ID :  4GYD
Macromolecule parameters:
  molecular weight (kDa);  58.57
ID :  1TU2
Macromolecule parameters:
  molecular weight (kDa);  39.04


In [161]:
# extract data and update the Redis database
#
# data model written by this cell:
#   PDB_ID                    hash: weight, atom_count, residue_count
#   PDB_ID:UNIPROT_ID         hash: organism, name
#   PDB_ID:UNIPROT_ID:GO_ID   hash: go_term, go_source
#   PDB:index                 set of all PDB IDs
#   UNIPROT:index             set of all Uniprot IDs
#
# let's start with a clean database (WARNING: THIS WILL DELETE ALL EXISTING ENTRIES)
# NOTE: flushall() empties every database of the Redis server, not only the current one
r.flushall()
# the print() statements below are for explanatory purposes
for protein in j['data']['entries']:
    # parameters at the individual PDB entry level
    pdb_id = protein['entry']['id']
    print("PDB:", pdb_id)
    weight = protein['rcsb_entry_info']['molecular_weight']
    atom_count = protein['rcsb_entry_info']['deposited_atom_count']
    residue_count = protein['rcsb_entry_info']['deposited_modeled_polymer_monomer_count']
    # store an entry (a hash) with the parameters above in Redis
    # the key will be the PDB ID
    pdb_dict = {'weight': weight, 'atom_count': atom_count, 'residue_count': residue_count}
    r.hset(pdb_id, mapping=pdb_dict)
    # update the PDB index
    r.sadd('PDB:index', pdb_id)
    # NOTE(review): the API can presumably return null instead of a list for
    # entities that carry no organism / Uniprot data; `or []` turns that case
    # into "nothing to do" instead of a TypeError
    for polymer in protein['polymer_entities'] or []:
        # parameters for the polymers
        organisms = polymer['rcsb_entity_source_organism'] or []
        # Redis cannot store None, so fall back to a placeholder string
        source_name = 'unknown'
        if organisms and organisms[0].get('ncbi_scientific_name'):
            source_name = organisms[0]['ncbi_scientific_name']
        for uprot in polymer['uniprots'] or []:
            # uniprot-related data
            uprot_id = uprot['rcsb_uniprot_container_identifiers']['uniprot_id']
            uprot_name = uprot['rcsb_uniprot_protein']['name']['value']
            print("Uniprot:", uprot_id, source_name, uprot_name)
            # store an entry (a hash) with the source_name and uprot_name in Redis
            # the key will be PDB_ID:UNIPROT_ID
            # (both fields are written with a single command)
            uprot_key = '%s:%s' % (pdb_id, uprot_id)
            r.hset(uprot_key, mapping={'organism': source_name, 'name': uprot_name})
            # update the Uniprot index
            r.sadd('UNIPROT:index', uprot_id)
            # call the uniprot REST API looking up uprot_id
            u = requests.get(
                'https://www.ebi.ac.uk/proteins/api/proteins',
                params={'offset': 0, 'size': 10, 'accession': uprot_id},
                headers={"Accept": "application/json"},
                timeout=30,
            )
            # stop on an HTTP error instead of trying to parse an error page
            u.raise_for_status()
            records = u.json()
            if not records:
                # the lookup matched nothing: there is no GO information to store
                print("  no Uniprot record found for", uprot_id)
                continue
            # the Gene Ontology information is stored in the 'dbReferences' structure (see slides)
            db_info = records[0].get('dbReferences', [])
            for db in db_info:
                if db['type'] != 'GO':
                    # not a Gene Ontology entry
                    continue
                go_id = db['id']
                go_term = db['properties']['term']
                go_source = db['properties']['source']
                print(go_id, go_term, go_source)
                # store an entry (a hash) with GO info in Redis
                # the key will be PDB_ID:UNIPROT_ID:GO_ID; since go_id itself starts
                # with "GO:", the stored keys look like PDB_ID:UNIPROT_ID:GO:nnnnnnn
                go_key = '%s:%s' % (uprot_key, go_id)
                go_dict = {'go_term': go_term, 'go_source': go_source}
                r.hset(go_key, mapping=go_dict)


PDB: 4GYD
Uniprot: P0A3X7 Nostoc sp. PCC 7120 = FACHB-418 Cytochrome c6
GO:0031977 C:thylakoid lumen IEA:UniProtKB-SubCell
GO:0009055 F:electron transfer activity IEA:UniProtKB-UniRule
GO:0020037 F:heme binding IEA:InterPro
GO:0005506 F:iron ion binding IEA:InterPro
GO:0015979 P:photosynthesis IEA:UniProtKB-UniRule
PDB: 1TU2
Uniprot: P46444 Nostoc sp. PCC 7119 Plastocyanin
GO:0042651 C:thylakoid membrane IEA:UniProtKB-SubCell
GO:0005507 F:copper ion binding IEA:UniProtKB-UniRule
GO:0009055 F:electron transfer activity IEA:UniProtKB-UniRule
Uniprot: Q93SW9 Nostoc sp. PCC 7119 Cytochrome f
GO:0031361 C:integral component of thylakoid membrane IEA:InterPro
GO:0009055 F:electron transfer activity IEA:UniProtKB-UniRule
GO:0020037 F:heme binding IEA:InterPro
GO:0005506 F:iron ion binding IEA:InterPro
GO:0015979 P:photosynthesis IEA:UniProtKB-UniRule


In [156]:
# all characteristics of a given PDB ID:
# they are the fields of the hash whose key is the PDB ID itself
r.hgetall('4GYD')

{b'weight': b'58.57', b'atom_count': b'4598', b'residue_count': b'516'}

In [157]:
# the set 'PDB:index' holds every PDB ID stored in the database
pdb_ids = r.smembers('PDB:index')
print(pdb_ids)

{b'1TU2', b'4GYD'}


In [158]:
# the set 'UNIPROT:index' holds every Uniprot ID stored in the database
uniprot_ids = r.smembers('UNIPROT:index')
print(uniprot_ids)

{b'Q93SW9', b'P0A3X7', b'P46444'}


In [159]:
# GO entries are stored under keys of the form PDB_ID:UNIPROT_ID:GO_ID,
# so a wildcard in place of the PDB ID finds all GO entries of one Uniprot ID
go_keys = r.keys('*:Q93SW9:GO:*')
print(go_keys)

[b'1TU2:Q93SW9:GO:0031361', b'1TU2:Q93SW9:GO:0009055', b'1TU2:Q93SW9:GO:0005506', b'1TU2:Q93SW9:GO:0015979', b'1TU2:Q93SW9:GO:0020037']


In [162]:
# first the hash describing the Uniprot ID itself (organism and name) ...
uniprot_info = r.hgetall('1TU2:Q93SW9')
print(uniprot_info)
# ... then every hash stored below that key, i.e. its GO entries
for go_key in r.keys('1TU2:Q93SW9:*'):
    go_info = r.hgetall(go_key)
    print(go_key, go_info)

{b'organism': b'Nostoc sp. PCC 7119', b'name': b'Cytochrome f'}
b'1TU2:Q93SW9:GO:0031361' {b'go_term': b'C:integral component of thylakoid membrane', b'go_source': b'IEA:InterPro'}
b'1TU2:Q93SW9:GO:0009055' {b'go_term': b'F:electron transfer activity', b'go_source': b'IEA:UniProtKB-UniRule'}
b'1TU2:Q93SW9:GO:0005506' {b'go_term': b'F:iron ion binding', b'go_source': b'IEA:InterPro'}
b'1TU2:Q93SW9:GO:0015979' {b'go_term': b'P:photosynthesis', b'go_source': b'IEA:UniProtKB-UniRule'}
b'1TU2:Q93SW9:GO:0020037' {b'go_term': b'F:heme binding', b'go_source': b'IEA:InterPro'}


In [180]:
# run this after restarting the Redis database, to check that the entries survived
# the client returns keys, field names and values as "bytes" by default,
# so everything is decoded to a string before it is used
r = redis.Redis(host="my_redis")
for raw_id in r.smembers('PDB:index'):
    fields = {}
    for raw_name, raw_value in r.hgetall(raw_id).items():
        fields[raw_name.decode('utf-8')] = raw_value.decode('utf-8')
    print("PDB ID:", raw_id.decode('utf-8'))
    print("  molecular weight (kDa):", fields['weight'])
    print("  atom count:", fields['atom_count'])
    print("  residue count:", fields['residue_count'])

PDB ID: 1TU2
  molecular weight (kDa): 39.04
  atom count: 2747
  residue count: 359
PDB ID: 4GYD
  molecular weight (kDa): 58.57
  atom count: 4598
  residue count: 516
