## Code to transform categorical variables into features through One-Hot-Encoding
### Libraries

In [33]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from modules.feature_extraction import *

### Importing dataset

In [34]:
# Load the residues table. A forward-slash relative path is used because it
# resolves on Windows, Linux and macOS alike, whereas a backslash is a path
# separator on Windows only.
df = pd.read_csv('datasets/residues.csv')
# Discard the 'Unnamed: 0' column (presumably an index saved with the CSV).
df = df.drop(columns='Unnamed: 0')
# Snapshot of the original column names; later cells diff against it to
# report which features each step added.
init_features = set(df.columns)
print('The shape of our dataframe is: {} x {}'.format(df.shape[0], df.shape[1]))
print('Original features: {}'.format(init_features))
df.head()

The shape of our dataframe is: 18063 x 23
Original features: {'O_NH_2_relidx', 'MODEL_ID', 'EDGE_TYPE', 'PSI', 'EDGE_LOC', 'PDB_ID', 'O_NH_2_energy', 'SEC_STRUCT', 'LIP_SCORE', 'INTER_CONTACTS', 'RES_NAME', 'O_NH_1_energy', 'NH_O_2_relidx', 'CHAIN_ID', 'NH_O_1_relidx', 'H_O_2_energy', 'INTRA_CONTACTS', 'LIP', 'REL_ASA', 'RES_ID', 'PHI', 'O_NH_1_relidx', 'NH_O_1_energy'}


Unnamed: 0,PDB_ID,MODEL_ID,CHAIN_ID,RES_ID,RES_NAME,LIP_SCORE,LIP,SEC_STRUCT,REL_ASA,PHI,...,O_NH_1_relidx,O_NH_1_energy,NH_O_2_relidx,H_O_2_energy,O_NH_2_relidx,O_NH_2_energy,INTRA_CONTACTS,INTER_CONTACTS,EDGE_LOC,EDGE_TYPE
0,1cee,0,A,1,MET,0,0,-,1.0,360.0,...,2.0,-0.3,0.0,0.0,50.0,-0.1,0.0,0.0,,
1,1cee,0,A,2,GLN,0,0,-,0.348485,-91.3,...,50.0,-1.7,51.0,-0.0,2.0,-0.3,2.0,1.0,SC_SC MC_MC SC_SC,VDW HBOND HBOND
2,1cee,0,A,3,THR,0,0,E,0.387324,-142.6,...,50.0,-0.2,48.0,-0.2,2.0,-0.2,0.0,0.0,,
3,1cee,0,A,4,ILE,0,0,E,0.005917,-94.3,...,50.0,-1.6,-2.0,-0.3,2.0,-0.6,5.0,0.0,MC_MC MC_MC SC_SC SC_SC SC_SC,HBOND HBOND VDW VDW VDW
4,1cee,0,A,5,LYS,0,0,E,0.346341,-112.5,...,71.0,-1.6,48.0,-0.1,2.0,-0.8,2.0,0.0,SC_MC MC_MC,VDW HBOND


### Procedure for RES_NAME

In [35]:
# Expand RES_NAME into feature columns with the project helper res2features
# (presumably one indicator column per residue name -- see the printed set).
df = res2features(df)
# Column snapshot after this step; its difference with the initial snapshot
# is exactly the set of columns that were added.
res_features = set(df.columns)
print('The features added are: {}'.format(res_features - init_features))
# Drop the columns that are not kept as features, then preview the frame.
df = df.drop(columns=['RES_NAME', 'MODEL_ID'])
df.head()

The features added are: {'PTR', 'GDP', 'ILE', 'ASN', 'MSE', 'VAL', 'TPO', 'TRP', 'GTP', 'TYR', 'GLN', 'LEU', 'PRO', 'LYS', 'ARG', 'SER', 'HIS', 'GLU', 'ALA', 'GLY', 'ASP', 'THR', 'SEP', 'CYS', 'HYP', 'PHE', 'MET'}


  return matrix(data, dtype=dtype, copy=False)


Unnamed: 0,PDB_ID,CHAIN_ID,RES_ID,LIP_SCORE,LIP,SEC_STRUCT,REL_ASA,PHI,PSI,NH_O_1_relidx,...,PHE,PRO,PTR,SEP,SER,THR,TPO,TRP,TYR,VAL
0,1cee,A,1,0,0,-,1.0,360.0,97.6,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1cee,A,2,0,0,-,0.348485,-91.3,147.8,48.0,...,0,0,0,0,0,0,0,0,0,0
2,1cee,A,3,0,0,E,0.387324,-142.6,136.7,-2.0,...,0,0,0,0,0,1,0,0,0,0
3,1cee,A,4,0,0,E,0.005917,-94.3,154.4,48.0,...,0,0,0,0,0,0,0,0,0,0
4,1cee,A,5,0,0,E,0.346341,-112.5,70.5,-2.0,...,0,0,0,0,0,0,0,0,0,0


### Procedure for SEC_STRUCT

In [36]:
# Expand SEC_STRUCT into feature columns with the project helper
# struct2features.
df = struct2features(df)
# Column snapshot after this step; diffing against the previous snapshot
# lists the columns that were added here.
struct_features = set(df.columns)
print('The features added are: {}'.format(struct_features - res_features))
# Drop the source column, then preview the frame.
df = df.drop(columns=['SEC_STRUCT'])
df.head()

The features added are: {'ALPHA_ELIX', 'ZERO', 'ISOLATED_BETA_BRIGE', '10_ELIX', 'TURN', 'BEND', 'PI_ELIX', 'NO_STRUCT', 'STRAND'}


  return matrix(data, dtype=dtype, copy=False)


Unnamed: 0,PDB_ID,CHAIN_ID,RES_ID,LIP_SCORE,LIP,REL_ASA,PHI,PSI,NH_O_1_relidx,NH_O_1_energy,...,VAL,10_ELIX,ALPHA_ELIX,BEND,ISOLATED_BETA_BRIGE,NO_STRUCT,PI_ELIX,STRAND,TURN,ZERO
0,1cee,A,1,0,0,1.0,360.0,97.6,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
1,1cee,A,2,0,0,0.348485,-91.3,147.8,48.0,-0.1,...,0,0,0,0,0,1,0,0,0,0
2,1cee,A,3,0,0,0.387324,-142.6,136.7,-2.0,-0.3,...,0,0,0,0,0,0,0,1,0,0
3,1cee,A,4,0,0,0.005917,-94.3,154.4,48.0,-1.9,...,0,0,0,0,0,0,0,1,0,0
4,1cee,A,5,0,0,0.346341,-112.5,70.5,-2.0,-0.2,...,0,0,0,0,0,0,0,1,0,0


In [46]:
# Spot check: RES_ID values of the rows belonging to structure '1a3b'
# (first 200 rows only).
df.loc[df.PDB_ID == '1a3b', 'RES_ID'].head(200)

7788     16
7789     17
7790     18
7791     19
7792     20
7793     21
7794     22
7795     23
7796     24
7797     25
7798     26
7799     27
7800     28
7801     29
7802     30
7803     31
7804     32
7805     33
7806     34
7807     35
7808     36
7809     37
7810     37
7811     37
7812     37
7813     38
7814     39
7815     40
7816     41
7817     42
       ... 
7958     81
7959     82
7960     83
7961     84
7962     85
7963     86
7964     87
7965     88
7966     89
7967     90
7968     91
7969     92
7970     93
7971     94
7972     95
7973     96
7974     97
7975     97
7976     97
7977     97
7978     98
7979     99
7980    100
7981    101
7982    102
7983    103
7984    104
7985    105
7986    106
7987    107
Name: RES_ID, Length: 200, dtype: int64

In [25]:
# Quick check of the (rows, columns) size of the frame at this point.
df.shape

(18063, 56)

### Procedure for EDGE_TYPE and EDGE_LOC

In [12]:
# Expand EDGE_LOC / EDGE_TYPE into feature columns with the project helper
# contacts2features.
df = contacts2features(df, ignore_warnings = True)
# Column snapshot after this step; diffing against the previous snapshot
# lists the columns that were added here.
ring_features = set(df.columns)
print('The features added are: {}'.format(ring_features - struct_features))
# Drop the two source columns, then preview the frame.
df = df.drop(columns=['EDGE_LOC', 'EDGE_TYPE'])
df.head()

The features added are: {'VDW', 'IAC', 'LIG_MC', 'PIPISTACK', 'SC_SC', 'NO_EDGE_LOC', 'HBOND', 'SC_MC', 'LIG_SC', 'MC_SC', 'MC_MC', 'NO_EDGE_TYPE'}


Unnamed: 0,PDB_ID,CHAIN_ID,RES_ID,LIP_SCORE,LIP,REL_ASA,PHI,PSI,NH_O_1_relidx,NH_O_1_energy,...,MC_MC,MC_SC,NO_EDGE_LOC,SC_MC,SC_SC,HBOND,IAC,NO_EDGE_TYPE,PIPISTACK,VDW
0,1cee,A,1,0,0,1.0,360.0,97.6,0.0,0.0,...,0,0,1,0,0,0,0,1,0,0
1,1cee,A,2,0,0,0.348485,-91.3,147.8,48.0,-0.1,...,1,0,0,0,2,2,0,0,0,1
2,1cee,A,3,0,0,0.387324,-142.6,136.7,-2.0,-0.3,...,0,0,1,0,0,0,0,1,0,0
3,1cee,A,4,0,0,0.005917,-94.3,154.4,48.0,-1.9,...,2,0,0,0,3,2,0,0,0,3
4,1cee,A,5,0,0,0.346341,-112.5,70.5,-2.0,-0.2,...,1,0,0,1,0,1,0,0,0,1


### Add chain lengths as variable

In [13]:
# Add the chain length through the project helper chain_len.
# NOTE(review): get_time=True presumably enables the "Time passed" printout
# and ignore_warnings=True presumably silences warnings -- confirm in the
# helper's implementation.
df = chain_len(df, get_time=True, ignore_warnings=True)
# Preview the first rows of the resulting frame.
df.head()

Time passed: 7.2083258628845215


Unnamed: 0,PDB_ID,CHAIN_ID,RES_ID,LIP_SCORE,LIP,REL_ASA,PHI,PSI,NH_O_1_relidx,NH_O_1_energy,...,MC_SC,NO_EDGE_LOC,SC_MC,SC_SC,HBOND,IAC,NO_EDGE_TYPE,PIPISTACK,VDW,CHAIN_LEN
0,1cee,A,1,0,0,1.0,360.0,97.6,0.0,0.0,...,0,1,0,0,0,0,1,0,0,179
1,1cee,A,2,0,0,0.348485,-91.3,147.8,48.0,-0.1,...,0,0,0,2,2,0,0,0,1,179
2,1cee,A,3,0,0,0.387324,-142.6,136.7,-2.0,-0.3,...,0,1,0,0,0,0,1,0,0,179
3,1cee,A,4,0,0,0.005917,-94.3,154.4,48.0,-1.9,...,0,0,0,3,2,0,0,0,3,179
4,1cee,A,5,0,0,0.346341,-112.5,70.5,-2.0,-0.2,...,0,0,1,0,1,0,0,0,1,179


### Recap of features

In [8]:
# Recap of how the feature set evolved. Every "added" group is the difference
# between two consecutive column snapshots taken in the cells above.
print("1. Initial features:\n{}\n"
      .format(init_features))

print("Features added with RES_NAME:\n{}\n"
      .format(res_features.difference(init_features)))

print("2. Features added with SEC_STRUCT:\n{}\n"
      .format(struct_features.difference(res_features)))

print("3. Features added with EDGE_TYPE and EDGE_LOC:\n{}\n"
      .format(ring_features.difference(struct_features)))

# Derived from the live columns rather than a hand-typed literal (the literal
# carried a stray trailing apostrophe). ring_features was captured before
# EDGE_LOC / EDGE_TYPE were dropped, so only columns added after that
# snapshot remain in the difference.
print("4. Features added with CHAIN_ID:\n{}\n"
      .format(set(df.columns).difference(ring_features)))

# Original columns that are no longer present in the final frame.
print("FEATURES_REMOVED:\n{}".
      format(init_features.difference(set(df.columns))))

1. Initial features:
{'NH_O_1_relidx', 'NH_O_1_energy', 'SEC_STRUCT', 'RES_ID', 'PSI', 'MODEL_ID', 'O_NH_1_energy', 'LIP_SCORE', 'INTER_CONTACTS', 'O_NH_2_relidx', 'CHAIN_ID', 'O_NH_2_energy', 'RES_NAME', 'PHI', 'EDGE_LOC', 'PDB_ID', 'REL_ASA', 'LIP', 'EDGE_TYPE', 'O_NH_1_relidx', 'NH_O_2_relidx', 'INTRA_CONTACTS', 'H_O_2_energy'}

Features added with RES_NAME:
{'GLU', 'SER', 'PHE', 'ASP', 'HIS', 'THR', 'TRP', 'PRO', 'LEU', 'ALA', 'HYP', 'GLY', 'VAL', 'LYS', 'CYS', 'ILE', 'ARG', 'TPO', 'GDP', 'MET', 'MSE', 'GLN', 'TYR', 'ASN', 'SEP', 'PTR', 'GTP'}

2. Features added with SEC_STRUCT:
{'NO_STRUCT', 'PI_ELIX', '10_ELIX', 'ISOLATED_BETA_BRIGE', 'ZERO', 'ALPHA_ELIX', 'STRAND', 'TURN', 'BEND'}

3. Features added with EDGE_TYPE and EDGE_LOC:
{'NO_EDGE_LOC', 'PIPISTACK', 'LIG_SC', 'SC_SC', 'MC_SC', 'MC_MC', 'VDW', 'NO_EDGE_TYPE', 'SC_MC', 'IAC', 'HBOND', 'LIG_MC'}

4. Features added with CHAIN_ID:
{'CHAIN_LEN'}'

FEATURES_REMOVED:
{'RES_NAME', 'EDGE_TYPE', 'MODEL_ID', 'EDGE_LOC', 'SEC_STRUCT'}

### Save dataset

In [22]:
# Persist the encoded table. The forward-slash relative path is portable
# across operating systems (a backslash would become part of the file name
# outside Windows); index=False is the documented way to leave the row index
# out of the file.
df.to_csv('datasets/one-hot-enc.csv', index=False)