### Helping functions

In [1]:
def LIP_tagging(ds_original, ds_residues):
    #ds_residues is the target where all information about the res are
    #ds_original is the initial dataset
    #For every protein we take the information of where LIP residue are
    for i in range(0, np.shape(ds_original)[0]):
        row = ds_original.iloc[i, :]
        pdb = row[0]
        chain = row[1]
        start = row[2]
        end = row[3]
        #Now we set to 1 all the residue whose features are the one desired 
        ds_residues['LIP'][(ds_residues['PDB_ID'] == pdb)
                           & (ds_residues['CHAIN_ID'] == chain)
                           & (ds_residues['RES_ID'] <= end)
                           & (ds_residues['RES_ID'] >= start)] = 1
        
def down_sampling(df, number_of_samples, seed = 42):
    noLIP_index = set(df[df['LIP'] == 0].index)
    indexes = set(np.arange(0, np.shape(df)[0]))
    sample = random.sample(noLIP_index, len(noLIP_index) - number_of_samples)
    new_index = indexes.difference(sample)
    df1 = df.iloc[list(new_index), :]
    return df1

### Importing libraries

In [2]:
# Import default libraries
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import time
import random
import warnings
# Import Biopython utils
from Bio.PDB import PDBList, is_aa, PDBIO
from Bio.PDB.PDBParser import PDBParser
from Bio.SeqUtils import IUPACData
from Bio.PDB.PDBIO import Select

### Downloading proteins (SKIP if already done!)

In [None]:
#####To do only if proteins are not dowloaded!!!!!
# Select all proteins (pdb column)
pdb_ids = ds_original.pdb.unique()
# Define pdb files dir
pdb_dir = './pdb_files'
# Define pdb file fetching class
pdbl = PDBList()
# Fetch every protein
for pdb_id in pdb_ids:
    # Execute fetching of the protein (pdb file)
    pdbl.retrieve_pdb_file(pdb_id, pdir=pdb_dir, file_format='pdb')

### Importing dataset

In [3]:
# Read original dataset (lips_dataset)
ds_original = pd.read_csv('./datasets/lips_dataset.txt', sep='\t')
# Define new dataset
ds_original.head()

Unnamed: 0,pdb,chain,start,end,type
0,1cee,B,1,47,long
1,1dev,B,669,709,long
2,1dow,B,118,148,long
3,1fqj,C,63,87,long
4,1g3j,B,2,29,long


### Creating DataFrame of all the residue

In [4]:
# Select all proteins (pdb column)
pdb_ids = ds_original.pdb.unique()
# Define pdb files dir
pdb_dir = './pdb_files'
# Define pdb file fetching class
pdbl = PDBList()

In [5]:
# New list for residues
ds_residues = list()
# Loop thorugh every protein
for pdb_id in ds_original.pdb.unique():
    # Get structure of the protein
    structure = PDBParser(QUIET=True).get_structure(pdb_id, pdb_dir + '/pdb{}.ent'.format(pdb_id))
    # Loop through every model -> chain -> residue
    for model in structure:
        for chain in model:
            for residue in chain:
                # Do not take into account non-aminoacidic residues (e.g. water molecules)
                if(not is_aa(residue)): 
                    continue
                # Add an entry to the residues list
                ds_residues.append((pdb_id, model.id, chain.id, residue.id[1], residue.get_resname(), 0, 0))
                                   
# Turn list into dataframe
ds_residues = pd.DataFrame(ds_residues)
# Define dataset column names
ds_residues.columns = ['PDB_ID', 'MODEL_ID', 'CHAIN_ID', 'RES_ID', 'RES_NAME', 'LIP_SCORE', 'LIP']
# Show first rows
print("Numbers of proteins: {}".format(np.shape(ds_original)[0]))
print("Numbers of res: {}".format(np.shape(ds_residues)[0]))
ds_residues.head()

Numbers of proteins: 83
Numbers of res: 86909


Unnamed: 0,PDB_ID,MODEL_ID,CHAIN_ID,RES_ID,RES_NAME,LIP_SCORE,LIP
0,1cee,0,A,1,MET,0,0
1,1cee,0,A,2,GLN,0,0
2,1cee,0,A,3,THR,0,0
3,1cee,0,A,4,ILE,0,0
4,1cee,0,A,5,LYS,0,0


### Tagging LIP residues

In [6]:
start = time.time()
LIP_tagging(ds_original, ds_residues)
end = time.time()
print('Time: {} seconds'.format(round(end-start, 2)))
ds_residues[ds_residues['LIP'] == 1].head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Time: 6.88 seconds


Unnamed: 0,PDB_ID,MODEL_ID,CHAIN_ID,RES_ID,RES_NAME,LIP_SCORE,LIP
179,1cee,0,B,1,LYS,0,1
180,1cee,0,B,2,LYS,0,1
181,1cee,0,B,3,LYS,0,1
182,1cee,0,B,4,ILE,0,1
183,1cee,0,B,5,SER,0,1


### Down Sampling

In [7]:
print("Before down sampling:\nNumber of LIP: {}".format(len(ds_residues['LIP'][ds_residues['LIP'] == 1])))
print("Number of noLIP: {}".format(len(ds_residues['LIP'][ds_residues['LIP'] == 0])))
start = time.time()

#New dataframe with 50-50 LIP-noLIP
ds_residues_ds = down_sampling(ds_residues, np.shape(ds_residues[ds_residues['LIP'] == 1])[0])
end = time.time()

print('--------------------------------')
print('Time for procedure: {} seconds'.format(np.round((end-start), 2)))
print('--------------------------------')
print("After down sampling:\nNumber of LIP: {}".format(len(ds_residues_ds['LIP'][ds_residues_ds['LIP'] == 1])))
print("Number of noLIP: {}".format(len(ds_residues_ds['LIP'][ds_residues_ds['LIP'] == 0])))
ds_residues_ds.head()

Before down sampling:
Number of LIP: 9997
Number of noLIP: 76912
--------------------------------
Time for procedure: 0.24 seconds
--------------------------------
After down sampling:
Number of LIP: 9997
Number of noLIP: 9997


Unnamed: 0,PDB_ID,MODEL_ID,CHAIN_ID,RES_ID,RES_NAME,LIP_SCORE,LIP
0,1cee,0,A,1,MET,0,0
10,1cee,0,A,11,ASP,0,0
14,1cee,0,A,15,GLY,0,0
20,1cee,0,A,21,ILE,0,0
24,1cee,0,A,25,THR,0,0
