In [1]:
from Bio.PDB import PDBList

# Create an instance of the PDBList class
pdb_list = PDBList()

# Specify the PDB ID of the structure you want to download
pdb_id = "4hhb" # human deoxyhaemoglobin (haemoglobin), not a zinc finger

# Download the mmCIF file into data/PDB_files using the retrieve_pdb_file method;
# the value it returns is the path of the local file.
pdb_filename = pdb_list.retrieve_pdb_file(pdb_id, pdir="data/PDB_files", file_format="mmCif")

# Print the name of the downloaded file
print(pdb_filename)

Downloading PDB structure '4hhb'...
data/PDB_files\4hhb.cif


In [3]:
import requests

In [5]:
# Request the RCSB Data API "core/entry" record for PDB entry 4hhb.
data = requests.get("https://data.rcsb.org/rest/v1/core/entry/4hhb")

In [7]:
# HTTP status of the entry request (200 means the request succeeded).
data.status_code

200

In [9]:
# Decode the JSON response body into Python objects.
info_4hhb = data.json()

In [15]:
# List the top-level sections available in the entry record.
info_4hhb.keys()

dict_keys(['audit_author', 'cell', 'citation', 'diffrn', 'entry', 'exptl', 'exptl_crystal', 'pdbx_audit_revision_category', 'pdbx_audit_revision_details', 'pdbx_audit_revision_group', 'pdbx_audit_revision_history', 'pdbx_audit_revision_item', 'pdbx_database_pdbobs_spr', 'pdbx_database_related', 'pdbx_database_status', 'pdbx_vrpt_summary', 'pdbx_vrpt_summary_geometry', 'rcsb_accession_info', 'rcsb_entry_container_identifiers', 'rcsb_entry_info', 'rcsb_primary_citation', 'refine', 'refine_hist', 'struct', 'struct_keywords', 'symmetry', 'rcsb_id'])

In [17]:
# Experimental method(s) recorded for this entry.
info_4hhb["exptl"]

[{'method': 'X-RAY DIFFRACTION'}]

In [19]:
# Keywords attached to the structure.
info_4hhb["struct_keywords"]

{'pdbx_keywords': 'OXYGEN TRANSPORT', 'text': 'OXYGEN TRANSPORT'}

In [21]:
# Title of the structure.
info_4hhb["struct"]

{'title': 'THE CRYSTAL STRUCTURE OF HUMAN DEOXYHAEMOGLOBIN AT 1.74 ANGSTROMS RESOLUTION'}

In [23]:
# Summary counts and composition information for the entry.
info_4hhb["rcsb_entry_info"]

{'assembly_count': 1,
 'branched_entity_count': 0,
 'cis_peptide_count': 0,
 'deposited_atom_count': 4779,
 'deposited_deuterated_water_count': 0,
 'deposited_hydrogen_atom_count': 0,
 'deposited_model_count': 1,
 'deposited_modeled_polymer_monomer_count': 574,
 'deposited_nonpolymer_entity_instance_count': 6,
 'deposited_polymer_entity_instance_count': 4,
 'deposited_polymer_monomer_count': 574,
 'deposited_solvent_atom_count': 221,
 'deposited_unmodeled_polymer_monomer_count': 0,
 'disulfide_bond_count': 0,
 'entity_count': 5,
 'experimental_method': 'X-ray',
 'experimental_method_count': 1,
 'inter_mol_covalent_bond_count': 0,
 'inter_mol_metalic_bond_count': 4,
 'molecular_weight': 64.74,
 'na_polymer_entity_types': 'Other',
 'nonpolymer_bound_components': ['HEM'],
 'nonpolymer_entity_count': 2,
 'nonpolymer_molecular_weight_maximum': 0.62,
 'nonpolymer_molecular_weight_minimum': 0.09,
 'polymer_composition': 'heteromeric protein',
 'polymer_entity_count': 2,
 'polymer_entity_count

In [33]:
# Request the RCSB Data API "core/interface" record at path 4hhb/1/2.
# NOTE(review): the trailing "1/2" presumably selects assembly 1, interface 2 — confirm against the Data API docs.
interface = requests.get("https://data.rcsb.org/rest/v1/core/interface/4hhb/1/2")

In [35]:
# HTTP status of the interface request (200 means the request succeeded).
interface.status_code

200

In [37]:
# Decode the interface record and display its "rcsb_interface_info" section.
interface_info = interface.json()
interface_info["rcsb_interface_info"]

{'polymer_composition': 'Protein (only)',
 'interface_character': 'hetero',
 'interface_area': 824.2706996807091,
 'num_interface_residues': 43,
 'num_core_interface_residues': 11}

In [39]:
import json

# Full-text search request for the exact phrase "oxygen storage" (the inner
# double quotes are part of the search value), returning entry identifiers.
# The request is serialised straight to a JSON string for the Search API.
my_query = json.dumps(
    dict(
        query=dict(
            type="terminal",
            service="full_text",
            parameters=dict(value='"oxygen storage"'),
        ),
        return_type="entry",
    )
)

In [41]:
# Send the search request. The JSON document is passed via `params` so that
# requests URL-encodes it; pasting raw JSON into the URL breaks as soon as the
# query contains characters such as '&', '#' or '+'.
data = requests.get("https://search.rcsb.org/rcsbsearch/v2/query", params={"json": my_query})
# Fail loudly on an HTTP error instead of trying to decode an error body.
data.raise_for_status()
results = data.json()
results

{'query_id': '6be57596-b1ab-43dd-b26b-5b22b14770eb',
 'result_type': 'entry',
 'total_count': 673,
 'result_set': [{'identifier': '1UVY', 'score': 1.0},
  {'identifier': '1UVX', 'score': 0.9861786942153946},
  {'identifier': '2BMM', 'score': 0.9861786942153946},
  {'identifier': '1UX8', 'score': 0.9596514074005238},
  {'identifier': '2AWC', 'score': 0.9480013391522119},
  {'identifier': '7DDS', 'score': 0.9480013391522119},
  {'identifier': '2EB8', 'score': 0.9469157998118357},
  {'identifier': '2EF2', 'score': 0.9469157998118357},
  {'identifier': '1D8U', 'score': 0.9345138682873109},
  {'identifier': '1DUK', 'score': 0.9345138682873109}]}

In [43]:
# Take the identifier of the first hit in the result set.
first_result = results["result_set"][0]["identifier"]
print(first_result)

1UVY


In [45]:
# Look up the Data API entry record for the first search hit and show its title.
# Note: this rebinds `data`, previously the search response.
data = requests.get(f"https://data.rcsb.org/rest/v1/core/entry/{first_result}")
result = data.json()
result["struct"]

{'title': 'HEME-LIGAND TUNNELING IN GROUP I TRUNCATED HEMOGLOBINS'}

In [47]:
# Attribute search on struct_keywords.pdbx_keywords for the phrase
# "oxygen storage" (inner double quotes are part of the value), asking for the
# first 50 entries sorted by initial release date, oldest first.
# The request is serialised straight to a JSON string for the Search API.
my_query = json.dumps(
    dict(
        query=dict(
            type="terminal",
            service="text",
            parameters=dict(
                attribute="struct_keywords.pdbx_keywords",
                operator="contains_phrase",
                value='"oxygen storage"',
            ),
        ),
        request_options=dict(
            paginate=dict(start=0, rows=50),
            sort=[
                dict(
                    sort_by="rcsb_accession_info.initial_release_date",
                    direction="asc",
                )
            ],
        ),
        return_type="entry",
    )
)

In [49]:
# Send the attribute search. Passing the JSON via `params` lets requests
# URL-encode it; raw JSON pasted into the URL breaks on characters such as
# '&', '#' or '+'.
data = requests.get("https://search.rcsb.org/rcsbsearch/v2/query", params={"json": my_query})

In [51]:
# HTTP status of the attribute search (200 means the request succeeded).
data.status_code

200

In [53]:
# Decode the search response and display it; the hits are under "result_set".
results = data.json()
results

{'query_id': '4e2e46dc-b53a-4072-ae39-b5739a8abc1c',
 'result_type': 'entry',
 'total_count': 576,
 'result_set': [{'identifier': '1MBN', 'score': 1.0},
  {'identifier': '1MBD', 'score': 1.0},
  {'identifier': '1MBO', 'score': 1.0},
  {'identifier': '1MBC', 'score': 1.0},
  {'identifier': '4MBN', 'score': 1.0},
  {'identifier': '5MBN', 'score': 1.0},
  {'identifier': '1MBA', 'score': 1.0},
  {'identifier': '1PMB', 'score': 1.0},
  {'identifier': '3MBA', 'score': 1.0},
  {'identifier': '4MBA', 'score': 1.0},
  {'identifier': '2MB5', 'score': 1.0},
  {'identifier': '1MBI', 'score': 1.0},
  {'identifier': '5MBA', 'score': 1.0},
  {'identifier': '2FAL', 'score': 1.0},
  {'identifier': '2FAM', 'score': 1.0},
  {'identifier': '1MYG', 'score': 1.0},
  {'identifier': '1MYH', 'score': 1.0},
  {'identifier': '1MYI', 'score': 1.0},
  {'identifier': '1MYJ', 'score': 1.0},
  {'identifier': '1YCA', 'score': 1.0},
  {'identifier': '1YCB', 'score': 1.0},
  {'identifier': '2MGA', 'score': 1.0},
  {'ide

In [55]:
## Step 1
## Download the mmCIF file of every search hit into the 'pdb_files' directory.
import os

from Bio.PDB import PDBList

# Create an instance of the PDBList class
pdb_list = PDBList()

# Download all of the structure files
for result in results["result_set"]:
    pdb_id = result["identifier"].lower()

    # Download the MMCIF file using the retrieve_pdb_file method
    pdb_filename = pdb_list.retrieve_pdb_file(pdb_id, pdir="pdb_files", file_format="mmCif")


## Step 2
## For every structure, collect the residues that have at least one atom
## within 'cutoff_distance' of an iron atom.

from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB import NeighborSearch
from collections import Counter

# Create an MMCIFParser object to parse mmCIF files.
parser = MMCIFParser(QUIET=True)

# Define the maximum distance (in Ångströms) for identifying neighboring residues.
cutoff_distance = 5

# Maps each PDB ID to the set of residues found near its iron atoms.
residue_neighbors = {}

# Each result in 'results["result_set"]' represents a protein structure with a PDB ID.
for result in results["result_set"]:
    # Extract the PDB ID and convert it to lowercase.
    pdb_id = result["identifier"].lower()

    # The file is expected to be located in the 'pdb_files' directory.
    cif_path = f"pdb_files/{pdb_id}.cif"

    # A download can fail (network error, withdrawn entry, ...). Skip such
    # entries with a message instead of aborting the whole analysis.
    if not os.path.isfile(cif_path):
        print(f"Skipping {pdb_id}: {cif_path} not found (download may have failed)")
        continue

    # Parse the mmCIF file for the protein structure using the PDB ID.
    structure = parser.get_structure(pdb_id, cif_path)

    # Extract all atoms from the protein structure.
    atoms = list(structure.get_atoms())

    # Create a NeighborSearch object to perform neighbor searches.
    neighbor_search = NeighborSearch(atoms)

    # Residues found near any iron atom of this structure (may contain repeats).
    neighbor_list = []

    for atom in atoms:
        # Only iron (Fe) atoms are of interest as search centres.
        if atom.element != "FE":
            continue

        # Residue that contains the iron atom itself (e.g. the heme group).
        iron_residue = atom.get_parent()

        # Visit every atom within the cutoff distance of the iron atom.
        for neighbor in neighbor_search.search(atom.get_coord(), cutoff_distance):
            residue = neighbor.get_parent()

            # Ignore atoms that belong to the iron-containing residue itself.
            if residue != iron_residue:
                neighbor_list.append(residue)

    # Store the unique neighboring residues in the dictionary using the PDB ID as the key.
    residue_neighbors[pdb_id] = set(neighbor_list)

# The 'residue_neighbors' dictionary contains the neighboring residues for each protein structure.


## Step 3
## Now we will want to count the residue neighbor types.
# Counts how many structures share each combination of neighboring residue types.
combination_counts = Counter()

# Each item consists of a PDB ID ('pdb_id') and a set of neighboring residues ('neighbors') to iron atoms.
for pdb_id, neighbors in residue_neighbors.items():
    # Extract the (non-empty) residue name of each neighboring residue.
    resname = [x.get_resname() for x in neighbors if x.get_resname()]

    # Count the occurrences of each residue name in the current structure.
    res_counts = Counter(resname)

    # Convert the residue counts to a tuple of (residue, count) pairs, sorted by residue name.
    # This standardizes the representation of each combination so it can be used as a key.
    combination = tuple(sorted(res_counts.items()))

    # Update the combination_counts with the current combination.
    combination_counts.update([combination])

# The result is a list of tuples, where each tuple contains a combination and its count.
most_common_combinations = combination_counts.most_common()

# For example, to get the top 5 most common combinations, use 'most_common(5)'.
top_5_combinations = combination_counts.most_common(5)
print("\nTop 5 most common residue combinations for iron neighbors:")
for combination, count in top_5_combinations:
    # 'n_residues' is the per-residue-type count inside one combination; it is
    # deliberately named differently from 'count', the number of structures
    # that show this combination.
    combination_str = ', '.join([f"{n_residues} {residue}" for residue, n_residues in combination])
    print(f"Combination: {combination_str}, Count: {count}")

Downloading PDB structure '1mbn'...
Downloading PDB structure '1mbd'...
Downloading PDB structure '1mbo'...
Downloading PDB structure '1mbc'...
Downloading PDB structure '4mbn'...
Downloading PDB structure '5mbn'...
Downloading PDB structure '1mba'...
Downloading PDB structure '1pmb'...
Downloading PDB structure '3mba'...
Downloading PDB structure '4mba'...
Downloading PDB structure '2mb5'...
Downloading PDB structure '1mbi'...
Downloading PDB structure '5mba'...
Downloading PDB structure '2fal'...
Downloading PDB structure '2fam'...
Downloading PDB structure '1myg'...
Downloading PDB structure '1myh'...
Downloading PDB structure '1myi'...
Downloading PDB structure '1myj'...
Downloading PDB structure '1yca'...
Downloading PDB structure '1ycb'...
Downloading PDB structure '2mga'...
Downloading PDB structure '2mgb'...
Downloading PDB structure '2mgc'...
Downloading PDB structure '2mgd'...
Downloading PDB structure '2mge'...
Downloading PDB structure '2mgf'...
Downloading PDB structure '2