## Feature Preprocessing
This notebook prepares the data for input into a neural network. Features can differ from one another by several orders of magnitude, and such differences in scale can bias the learning algorithm, so it is important to normalize and standardize the data. Some molecular descriptor vectors contained NaN values, which were set to 0 so that they do not offset the data. Sklearn's RobustScaler was used to center and scale each feature based on percentiles; each scaled feature is then min-max normalized and linearly rescaled (`x * 2 - 1`).

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler

In [50]:
def FeatureProcessor(feature_vector, scaler, max, min):
    """Scale and normalize the feature vector of a single protein or chemical.

    NaNs are replaced with 0, the vector is reshaped to a single row and passed
    through ``scaler.transform``, the result is min-max normalized with
    ``min``/``max`` and finally mapped linearly with ``x * 2 - 1``.

    Parameters
    ----------
    feature_vector : array-like
        Features of one sample; reshaped to ``(1, n_features)``.
    scaler : fitted scaler
        Object exposing ``transform`` (pscaler for proteins, cscaler for
        chemicals).
    max, min : array-like
        Per-feature maximum and minimum (pmax/pmin for proteins, cmax/cmin for
        chemicals). The names shadow the builtins inside this function but are
        kept so existing callers keep working.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, n_features)``. A feature whose ``max`` equals its
        ``min`` is returned as -1.
    """
    row = np.nan_to_num(feature_vector).reshape(1, -1)
    scaled = np.nan_to_num(scaler.transform(row))

    lower = np.asarray(min, dtype=float)
    span = np.asarray(max, dtype=float) - lower
    shifted = scaled - lower

    # Divide only where the span is non-zero. A plain division produced inf for
    # constant features, which nan_to_num turned into the largest float and the
    # following `* 2` overflowed back to inf. Constant features stay at 0 here,
    # i.e. -1 after the final rescale.
    unit = np.zeros(np.broadcast(shifted, span).shape, dtype=float)
    np.divide(shifted, span, out=unit, where=span != 0)
    unit = np.nan_to_num(unit)

    return (unit * 2) - 1

## Preprocessing Protein Features:

In [56]:
# Reads every PROFEAT output part and stacks them into one `proteins` dataframe.
# The parts are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
# `plabels` keeps the 'Feature' column (the IDs); that column is then removed from
# `proteins`, which becomes the training matrix for the scaler.

protein_parts = [
    pd.read_csv('PROFEAT_part_' + str(i) + '.out', sep='\t')
    for i in range(1, 22)
]
proteins = pd.concat(protein_parts, ignore_index=True)

plabels = proteins[['Feature']]
proteins = proteins.drop(['Feature'], axis=1)

In [118]:
# Replaces NaN values with 0 and trains the robust scaler on proteins.
# FeatureProcessor applies its min-max step AFTER scaler.transform, so the
# per-feature min and max are taken from the robust-scaled training matrix.
# Taking them from the unscaled matrix pairs them with values on a different
# scale, and the normalized features then no longer span [-1, 1].

proteins = np.nan_to_num(proteins)
pscaler = RobustScaler().fit(proteins)
proteins_scaled = pscaler.transform(proteins)
pmin = proteins_scaled.min(axis=0)
pmax = proteins_scaled.max(axis=0)

In [97]:
# sanity check: pmax holds one maximum per column of the protein training matrix
pmax.shape

(1437,)

In [143]:
# Saves each new protein feature vector as an npy file named after its 'Feature' ID.
# Every row is visited: the previous range(0, len(proteins) - 1) stopped one short
# and never wrote the last protein.
# np.asarray lets the rows be read the same way whether `proteins` is still the
# dataframe or already the ndarray produced by the scaler-training cell; the
# positional `proteins[i, :]` only works on the ndarray, which is likely the
# source of the error previously noted here.
protein_matrix = np.asarray(proteins)
for label, row in zip(plabels['Feature'], protein_matrix):
    np.save(label + '.npy', FeatureProcessor(row, pscaler, pmax, pmin))

## Preprocessing Chemical Features:

In [25]:
# Reads every chemical descriptor file and stacks them into one `chemicals` dataframe.
# The parts are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated frame on every iteration.

chemical_parts = [
    pd.read_csv(str(i) + '_CIDm_MDs.tsv', sep='\t')
    for i in range(1, 53)
]
chemicals = pd.concat(chemical_parts, ignore_index=True)
    

In [26]:
chemicals.shape
# sanity check: all molecules and all feature columns are present
# (the first column, holding the ID, is still part of the frame at this point)

(259908, 1613)

In [42]:
# choose 25,000 random rows from chemicals to be the training matrix for the scaler;
# the fixed seed makes the draw (and so the fitted scaler and min/max) repeatable
# across kernel restarts
SAMPLE_SEED = 42
rand_chemicals = chemicals.sample(n=25000, random_state=SAMPLE_SEED)

In [43]:
# sanity check: the sample should have 25,000 rows and the same columns as chemicals
rand_chemicals.shape

(25000, 1613)

In [44]:
# renumber the sampled rows from 0, keep the 'chemical' ID column separately in
# rclabels, and leave only the remaining columns in rand_chemicals
rand_chemicals = rand_chemicals.reset_index(drop=True)
rclabels = rand_chemicals.loc[:, ['chemical']]
rand_chemicals = rand_chemicals.drop(columns=['chemical'])

In [45]:
# Replaces NaN values with 0 and trains the robust scaler on rand_chemicals.
# FeatureProcessor applies its min-max step AFTER scaler.transform, so the
# per-feature min and max are taken from the robust-scaled training matrix.
# Taking them from the unscaled matrix pairs them with values on a different
# scale, and the normalized features then no longer span [-1, 1].

rand_chemicals = np.nan_to_num(rand_chemicals)
cscaler = RobustScaler().fit(rand_chemicals)
rand_chemicals_scaled = cscaler.transform(rand_chemicals)
cmin = rand_chemicals_scaled.min(axis=0)
cmax = rand_chemicals_scaled.max(axis=0)

In [46]:
# the full chemicals frame still carries its 'chemical' ID column:
# keep it separately in clabels, then drop it from chemicals
clabels = chemicals.loc[:, ['chemical']]
chemicals = chemicals.drop(columns=['chemical'])

In [69]:
# Runs the feature processing function on every row of the original chemicals df
# (all chemicals) using the scaler and min/max that were trained on rand_chemicals
# (25,000 random chemicals), and saves each new chemical feature vector as an npy
# file named after its 'chemical' ID.
# Row i is paired with label i: the row index was previously pinned to 0, so every
# file received the first chemical's features. The loop also covers the last row,
# which range(0, len(chemicals) - 1) skipped.

for i, label in enumerate(clabels['chemical']):
    np.save(label + '.npy',
            FeatureProcessor(chemicals.iloc[[i], :], cscaler, cmax, cmin))
    

  if sys.path[0] == '':
