In [2]:
# Import default libraries
import pandas as pd
import numpy as np
# NOTE(review): this binds the top-level `matplotlib` package to `plt`, not
# `matplotlib.pyplot`, so calls such as `plt.plot(...)` would fail — confirm
# whether `import matplotlib.pyplot as plt` was intended.
import matplotlib as plt
import seaborn as sns
# Import Biopython utils
from Bio.PDB import PDBList, is_aa, PDBIO
from Bio.PDB.PDBParser import PDBParser
from Bio.SeqUtils import IUPACData
from Bio.PDB.PDBIO import Select

In [18]:
# Load the original LIPs dataset from its tab-separated text file
ds_original = pd.read_csv(
    './datasets/lips_dataset.txt',
    sep='\t',
)
# Preview the first rows of the loaded table
ds_original.head()

Unnamed: 0,pdb,chain,start,end,type
0,1cee,B,1,47,long
1,1dev,B,669,709,long
2,1dow,B,118,148,long
3,1fqj,C,63,87,long
4,1g3j,B,2,29,long


In [22]:
# Distinct PDB identifiers listed in the `pdb` column of the original dataset
pdb_ids = ds_original['pdb'].unique()
# Local directory where the downloaded structure files are stored
pdb_dir = './pdb_files'
# Biopython helper used to download structure files
pdbl = PDBList()
# Download one structure file per identifier, requesting the 'pdb' file format
for pdb_id in pdb_ids:
    pdbl.retrieve_pdb_file(
        pdb_id,
        file_format='pdb',
        pdir=pdb_dir,
    )

Downloading PDB structure '1cee'...
Downloading PDB structure '1dev'...
Downloading PDB structure '1dow'...
Downloading PDB structure '1fqj'...
Downloading PDB structure '1g3j'...
Downloading PDB structure '1hrt'...
Downloading PDB structure '1i7w'...
Downloading PDB structure '1j2j'...
Downloading PDB structure '1jsu'...
Downloading PDB structure '1kil'...
Downloading PDB structure '1l8c'...
Downloading PDB structure '1p4q'...
Downloading PDB structure '1pq1'...
Downloading PDB structure '1q68'...
Downloading PDB structure '1rf8'...
Downloading PDB structure '1sc5'...
Downloading PDB structure '1sqq'...
Downloading PDB structure '1tba'...
Downloading PDB structure '1th1'...
Downloading PDB structure '1xtg'...
Downloading PDB structure '1ymh'...
Downloading PDB structure '1zoq'...
Downloading PDB structure '2a6q'...
Downloading PDB structure '2auh'...
Downloading PDB structure '2c1t'...
Downloading PDB structure '2cly'...
Downloading PDB structure '2o8a'...
Downloading PDB structure '3

In [26]:
# Accumulates one tuple per amino-acid residue:
# (pdb id, model id, chain id, residue sequence number, residue name, LIP score, LIP flag)
ds_residues = list()
# Build the parser once and reuse it, instead of constructing a new one per protein
parser = PDBParser(QUIET=True)
# Loop through every protein
for pdb_id in ds_original.pdb.unique():
    # Parse the structure from the file stored in pdb_dir as 'pdb<id>.ent'
    structure = parser.get_structure(pdb_id, pdb_dir + '/pdb{}.ent'.format(pdb_id))
    # Loop through every model -> chain -> residue
    for model in structure:
        for chain in model:
            for residue in chain:
                # Do not take into account non-aminoacidic residues (e.g. water molecules)
                if not is_aa(residue):
                    continue
                # residue.id[1] is the second field of the residue id tuple;
                # LIP_SCORE and LIP are initialised to 0 as placeholders
                ds_residues.append((pdb_id, model.id, chain.id, residue.id[1], residue.get_resname(), 0, 0))

# Turn the list into a dataframe; passing the column names to the constructor
# keeps the expected columns even when no residue was collected (assigning
# `.columns` afterwards would raise a length-mismatch error on an empty frame)
ds_residues = pd.DataFrame(
    ds_residues,
    columns=['PDB_ID', 'MODEL_ID', 'CHAIN_ID', 'RES_ID', 'RES_NAME', 'LIP_SCORE', 'LIP'],
)
# Show first rows
ds_residues.head()

Unnamed: 0,PDB_ID,MODEL_ID,CHAIN_ID,RES_ID,RES_NAME,LIP_SCORE,LIP
0,1cee,0,A,1,MET,0,0
1,1cee,0,A,2,GLN,0,0
2,1cee,0,A,3,THR,0,0
3,1cee,0,A,4,ILE,0,0
4,1cee,0,A,5,LYS,0,0
