In [8]:
# Import default libraries
import pandas as pd
import numpy as np
import os
import json
import logging
import argparse
import re
from joblib import dump, load
# Import custom libraries
from modules.feature_extraction import *
from modules.feature_preprocessing import *
from modules.pipelines import *

In [9]:
with open('./config.json') as config_file:
    config = json.load(config_file)
config['lip_file'] = './datasets/lips_dataset.txt'

In [10]:
# Get LIP/non-LIP file
ds_training = pd.read_csv(config.get('lip_file'), sep='\t')
ds_training.head()

Unnamed: 0,pdb,chain,start,end,type
0,1cee,A,neg,neg,long
1,1cee,B,1,47,long
2,1dev,A,neg,neg,long
3,1dev,B,669,709,long
4,1dow,A,neg,neg,long


In [12]:
# Extract PDB ids
pdb_ids = set(ds_training.pdb.unique())
# Remove excluded PDB ids
if config.get('exclude'):
    pdb_ids -= set(config.get('exclude'))
# Filter out invalid chains
config['valid_chains'] = set([(row['pdb'], row['chain']) for idx, row in ds_training.iterrows()])
# Extract features
ds_residues, ds_predict = main_pipeline(pdb_ids, config)
logging.debug(ds_residues.isna().any())
ds_residues.head()

Structure exists: './pdb_files/pdb1p16.ent' 
Structure exists: './pdb_files/pdb1ymh.ent' 
Structure exists: './pdb_files/pdb1hv2.ent' 
Structure exists: './pdb_files/pdb2c1t.ent' 
Structure exists: './pdb_files/pdb1a81.ent' 
Structure exists: './pdb_files/pdb1r1r.ent' 
Structure exists: './pdb_files/pdb1cqt.ent' 
Structure exists: './pdb_files/pdb1nx1.ent' 
Structure exists: './pdb_files/pdb2pg1.ent' 
Structure exists: './pdb_files/pdb2ivz.ent' 
Structure exists: './pdb_files/pdb1jsu.ent' 
Structure exists: './pdb_files/pdb1p22.ent' 
Structure exists: './pdb_files/pdb1hrt.ent' 
Structure exists: './pdb_files/pdb2o8a.ent' 
Structure exists: './pdb_files/pdb1i8h.ent' 
Structure exists: './pdb_files/pdb1kdx.ent' 
Structure exists: './pdb_files/pdb2nl9.ent' 
Structure exists: './pdb_files/pdb2fym.ent' 
Structure exists: './pdb_files/pdb1dt7.ent' 
Structure exists: './pdb_files/pdb1sb0.ent' 
Structure exists: './pdb_files/pdb2a6q.ent' 
Structure exists: './pdb_files/pdb1lm8.ent' 
Structure 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  ds_dssp.REL_ASA[ds_dssp.REL_ASA.isna()] = ds_dssp.REL_ASA.mean()
  sliced = sliced.rolling(window = k, center = True).apply(lambda x: np.dot(x,window)/k)


Unnamed: 0,PDB_ID,MODEL_ID,CHAIN_ID,CHAIN_LEN,RES_ID,RES_NAME,LIP_SCORE,LIP,RES_NAME_ALA,RES_NAME_ARG,...,EDGE_LOC_SC_LIG,EDGE_LOC_SC_MC,EDGE_LOC_SC_SC,EDGE_TYPE_HBOND,EDGE_TYPE_IAC,EDGE_TYPE_IONIC,EDGE_TYPE_VDW,INTRA_CONTACTS,INTER_CONTACTS,INTRA_INTER_CONTACTS
0,1p16,0.0,B,378.0,1.0,MSE,0.0,0.0,0.0,0.0,...,0,0,0,0,3,0,0,3.0,0.0,30.0
1,1p16,0.0,B,378.0,2.0,VAL,0.0,0.0,0.0,0.0,...,11,0,0,0,10,0,1,11.0,0.0,110.0
2,1p16,0.0,B,378.0,3.0,GLN,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,1.0,0.0,10.0
3,1p16,0.0,B,378.0,4.0,LEU,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,1,1.0,0.0,10.0
4,1p16,0.0,B,378.0,5.0,GLU,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0


In [19]:
ds_predict[ds_predict.isnull()].any()

CHAIN_LEN                         False
RES_NAME_ALA                      False
RES_NAME_ARG                      False
RES_NAME_ASN                      False
RES_NAME_ASP                      False
RES_NAME_CYS                      False
RES_NAME_GDP                      False
RES_NAME_GLN                      False
RES_NAME_GLU                      False
RES_NAME_GLY                      False
RES_NAME_GTP                      False
RES_NAME_HIS                      False
RES_NAME_HYP                      False
RES_NAME_ILE                      False
RES_NAME_LEU                      False
RES_NAME_LYS                      False
RES_NAME_MET                      False
RES_NAME_MSE                      False
RES_NAME_PHE                      False
RES_NAME_PRO                      False
RES_NAME_PTR                      False
RES_NAME_SEP                      False
RES_NAME_SER                      False
RES_NAME_THR                      False
RES_NAME_TPO                      False


In [20]:

# Load saved model
model = load('{}/{}.joblib'.format(config.get('model_dir'), config.get('model_name')))
# Debug
logging.debug('Datasets for training:')
logging.debug(ds_predict.head())
logging.debug(ds_residues.head())
# Train model
model.fit(ds_predict, ds_residues['LIP'])
print('New model has been trained')
# Overwrite the model
dump(model, '{}/{}.joblib'.format(config.get('model_dir'), config.get('model_name')))
print('New model has been saved to disk as {}/{}.joblib'.format(config.get('model_dir'), config.get('model_name')))

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [32]:
dn = pd.read_csv('./tmp.csv')
dn[dn.isna().any(axis=1)]

Unnamed: 0,CHAIN_LEN,RES_NAME_ALA,RES_NAME_ARG,RES_NAME_ASN,RES_NAME_ASP,RES_NAME_CYS,RES_NAME_GDP,RES_NAME_GLN,RES_NAME_GLU,RES_NAME_GLY,...,SEC_STRUCT_ZERO,SEC_STRUCT_ISOLATED_BETA_BRIGE,SEC_STRUCT_STRAND,SEC_STRUCT_THREE_TEN_ELIX,SEC_STRUCT_ALPHA_ELIX,SEC_STRUCT_PI_ELIX,SEC_STRUCT_BEND,SEC_STRUCT_TURN,INTRA_CONTACTS,INTER_CONTACTS
1627,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10894,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14352,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15143,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17586,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,0.0,0.0
17587,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,,,,,,,,,0.0,0.0
17588,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,0.0,0.0
17589,18.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,0.0,0.0
