## Transforming categorical variables into features through one-hot encoding
### Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from modules.feature_extraction import *

### Importing dataset

In [2]:
# Load the per-residue table. A forward slash keeps this relative path valid on
# Windows as well as Linux/macOS; a backslash separator only resolves on Windows.
df = pd.read_csv('datasets/residues.csv')
# Discard the stray 'Unnamed: 0' column (presumably a row index saved with the CSV).
# Reassigning instead of inplace=True keeps the lineage of `df` explicit.
df = df.drop('Unnamed: 0', axis=1)
print('The shape of our dataframe is: {} x {}'.format(*df.shape))
# Snapshot of the original column names; later cells diff against it to report
# which feature columns each encoding step added.
init_features = set(df.columns)
df.head()

The shape of our dataframe is: 18063 x 23


Unnamed: 0,PDB_ID,MODEL_ID,CHAIN_ID,RES_ID,RES_NAME,LIP_SCORE,LIP,SEC_STRUCT,REL_ASA,PHI,...,O_NH_1_relidx,O_NH_1_energy,NH_O_2_relidx,H_O_2_energy,O_NH_2_relidx,O_NH_2_energy,INTRA_CONTACTS,INTER_CONTACTS,EDGE_LOC,EDGE_TYPE
0,1cee,0,A,1,MET,0,0,-,1.0,360.0,...,2.0,-0.3,0.0,0.0,50.0,-0.1,0.0,0.0,,
1,1cee,0,A,2,GLN,0,0,-,0.348485,-91.3,...,50.0,-1.7,51.0,-0.0,2.0,-0.3,2.0,1.0,SC_SC MC_MC SC_SC,VDW HBOND HBOND
2,1cee,0,A,3,THR,0,0,E,0.387324,-142.6,...,50.0,-0.2,48.0,-0.2,2.0,-0.2,0.0,0.0,,
3,1cee,0,A,4,ILE,0,0,E,0.005917,-94.3,...,50.0,-1.6,-2.0,-0.3,2.0,-0.6,5.0,0.0,MC_MC MC_MC SC_SC SC_SC SC_SC,HBOND HBOND VDW VDW VDW
4,1cee,0,A,5,LYS,0,0,E,0.346341,-112.5,...,71.0,-1.6,48.0,-0.1,2.0,-0.8,2.0,0.0,SC_MC MC_MC,VDW HBOND


### Procedure for RES_NAME

In [3]:
# Expand RES_NAME into feature columns via the project helper res2features
# (presumably provided by the star import from modules.feature_extraction).
df = res2features(df)
# Column snapshot after this step; the SEC_STRUCT cell diffs against it.
res_features = set(df.columns)
# Report only the columns that were not present right after loading.
added_by_res = res_features - init_features
print('The features added are: {}'.format(added_by_res))
# Preview the first rows of the updated frame.
df.head()

The features added are: {'THR', 'ARG', 'LYS', 'ILE', 'MSE', 'CYS', 'PHE', 'ASN', 'PTR', 'MET', 'TRP', 'HYP', 'TYR', 'ALA', 'GLN', 'GDP', 'GLU', 'HIS', 'GLY', 'TPO', 'SER', 'ASP', 'LEU', 'PRO', 'GTP', 'VAL', 'SEP'}


Unnamed: 0,PDB_ID,MODEL_ID,CHAIN_ID,RES_ID,RES_NAME,LIP_SCORE,LIP,SEC_STRUCT,REL_ASA,PHI,...,PHE,PRO,PTR,SEP,SER,THR,TPO,TRP,TYR,VAL
0,1cee,0,A,1,MET,0,0,-,1.0,360.0,...,0,0,0,0,0,0,0,0,0,0
1,1cee,0,A,2,GLN,0,0,-,0.348485,-91.3,...,0,0,0,0,0,0,0,0,0,0
2,1cee,0,A,3,THR,0,0,E,0.387324,-142.6,...,0,0,0,0,0,1,0,0,0,0
3,1cee,0,A,4,ILE,0,0,E,0.005917,-94.3,...,0,0,0,0,0,0,0,0,0,0
4,1cee,0,A,5,LYS,0,0,E,0.346341,-112.5,...,0,0,0,0,0,0,0,0,0,0


### Procedure for SEC_STRUCT

In [4]:
# Expand SEC_STRUCT into feature columns via the project helper struct2features.
# NOTE(review): judging by the rendered output, the helper also rewrites the
# SEC_STRUCT codes themselves (e.g. '-' -> 'NO_STRUCT', 'E' -> 'STRAND') - confirm.
df = struct2features(df)
# Column snapshot after this step; the contacts cell diffs against it.
struct_features = set(df.columns)
# Report only the columns introduced since the RES_NAME step.
added_by_struct = struct_features - res_features
print('The features added are: {}'.format(added_by_struct))
# Preview the first rows of the updated frame.
df.head()

The features added are: {'ISOLATED_BETA_BRIGE', 'BEND', '10_ELIX', 'TURN', 'PI_ELIX', 'ZERO', 'ALPHA_ELIX', 'STRAND', 'NO_STRUCT'}


Unnamed: 0,PDB_ID,MODEL_ID,CHAIN_ID,RES_ID,RES_NAME,LIP_SCORE,LIP,SEC_STRUCT,REL_ASA,PHI,...,VAL,10_ELIX,ALPHA_ELIX,BEND,ISOLATED_BETA_BRIGE,NO_STRUCT,PI_ELIX,STRAND,TURN,ZERO
0,1cee,0,A,1,MET,0,0,NO_STRUCT,1.0,360.0,...,0,0,0,0,0,1,0,0,0,0
1,1cee,0,A,2,GLN,0,0,NO_STRUCT,0.348485,-91.3,...,0,0,0,0,0,1,0,0,0,0
2,1cee,0,A,3,THR,0,0,STRAND,0.387324,-142.6,...,0,0,0,0,0,0,0,1,0,0
3,1cee,0,A,4,ILE,0,0,STRAND,0.005917,-94.3,...,0,0,0,0,0,0,0,1,0,0
4,1cee,0,A,5,LYS,0,0,STRAND,0.346341,-112.5,...,0,0,0,0,0,0,0,1,0,0


### Procedure for contacts

In [5]:
# Load the RING contact table. A forward slash keeps this relative path valid on
# Windows as well as Linux/macOS; a backslash separator only resolves on Windows.
df_ring = pd.read_csv('datasets/ring.csv')
# Discard the stray 'Unnamed: 0' column (presumably a row index saved with the CSV).
# Reassigning instead of inplace=True keeps the lineage of `df_ring` explicit.
df_ring = df_ring.drop('Unnamed: 0', axis=1)
df_ring.head()

Unnamed: 0,PDB_ID,CHAIN_ID_A,RES_ID_A,CHAIN_ID_B,RES_ID_B,EDGE_LOC,ATOM_A,ATOM_B,EDGE_TYPE
0,1cee,A,180,A,16,LIG_SC,HZ2,MG,IAC
1,1cee,A,16,A,180,LIG_SC,MG,HZ2,IAC
2,1cee,A,180,A,17,LIG_SC,HG1,MG,IAC
3,1cee,A,17,A,180,LIG_SC,MG,HG1,IAC
4,1cee,A,180,A,17,LIG_SC,OG1,MG,IAC


In [6]:
# Build the contact features from the RING table via the project helper
# contacts2features and keep the frame it returns.
df = contacts2features(df, df_ring)
# Column snapshot after this step.
ring_features = set(df.columns)
# Report only the columns introduced since the SEC_STRUCT step.
added_by_ring = ring_features - struct_features
print('The features added are: {}'.format(added_by_ring))
# NOTE(review): the rendered output lists 'RES_ID_A' and 'CHAIN_ID_A' among the
# added columns (they look like leftover join keys) and shows NaN for residues
# without contacts - confirm whether these should be dropped / filled with 0.
# Preview the first rows of the updated frame.
df.head()

The features added are: {'SC_MC', 'MC_MC', 'MC_SC', 'LIG_SC', 'IAC', 'RES_ID_A', 'SC_SC', 'CHAIN_ID_A', 'HBOND', 'PIPISTACK', 'LIG_MC', 'VDW'}


Unnamed: 0,PDB_ID,MODEL_ID,CHAIN_ID,RES_ID,RES_NAME,LIP_SCORE,LIP,SEC_STRUCT,REL_ASA,PHI,...,LIG_MC,LIG_SC,MC_MC,MC_SC,SC_MC,SC_SC,HBOND,IAC,PIPISTACK,VDW
0,1cee,0,A,1,MET,0,0,NO_STRUCT,1.0,360.0,...,,,,,,,,,,
1,1cee,0,A,2,GLN,0,0,NO_STRUCT,0.348485,-91.3,...,0.0,0.0,1.0,0.0,0.0,2.0,2.0,0.0,0.0,1.0
2,1cee,0,A,3,THR,0,0,STRAND,0.387324,-142.6,...,,,,,,,,,,
3,1cee,0,A,4,ILE,0,0,STRAND,0.005917,-94.3,...,0.0,0.0,2.0,0.0,0.0,3.0,2.0,0.0,0.0,3.0
4,1cee,0,A,5,LYS,0,0,STRAND,0.346341,-112.5,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
