# Splitting of the Dataset and Scaling


## Packages

In [1]:
import pandas as pd
import numpy as np
import pickle
from datetime import datetime as dt

# Scikit-Learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

## Load Data
Read the list of drugs with molecular descriptors and remove all entries (rows) that contain NaN or infinite values.

In [2]:
# Relative location of the CSV with the drug list and its molecular descriptors.
file = '../../../nephrosan/molwork/babelfromdruglist.csv'

# Read, normalise and filter in one chain so that `df` is only ever bound to
# the fully cleaned frame.
df = (
    pd.read_csv(file, low_memory=False)
    # Turn the '#NAME?' marker, +/- infinity and the textual 'inf'/'-inf'/'nan'
    # entries into real NaN values ...
    .replace(['#NAME?', np.inf, -np.inf, 'inf', '-inf', 'nan'], np.nan)
    # ... and discard every row that contains at least one NaN.
    .dropna(axis=0, how='any')
)

## Train Test Split
Remove the columns that are not molecular descriptors from the dataset. Move the outcome "NEPHROTOXIC" into a separate DataFrame. Then split the data set into a training set and a test set.

In [3]:
# Target: the NEPHROTOXIC outcome, kept as a single-column DataFrame.
y2 = df.loc[:, ['NEPHROTOXIC']]

# Identifier columns (drug name and isomeric SMILES), kept apart from the features.
y1 = df.loc[:, ['Drug', 'Isomeric_smiles']]

# Features: everything except the identifier and outcome columns, cast to float.
X = df.drop(columns=['Drug', 'NEPHROTOXIC', 'Isomeric_smiles']).astype(float)

In [4]:
# Hold out 20% of the rows as the test set; the remaining 80% form the training set.
# random_state is fixed at 78 so that the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y2, test_size=0.2, random_state=78)

## Scaling
Fit each scaler on the training data only. Then transform both the training and the test data using the scaling parameters learned from the training data.

### MinMaxScaler

In [5]:
# Learn the min/max scaling parameters from the training features only.
scaler = MinMaxScaler().fit(X_train)

# Apply the fitted scaler to both partitions, re-wrapping each result in a
# DataFrame that keeps the original column labels and row index.
X_train_minmax, X_test_minmax = (
    pd.DataFrame(scaler.transform(part), columns=part.columns, index=part.index)
    for part in (X_train, X_test)
)

### StandardScaler

In [6]:
# Learn the standardisation parameters from the training features only.
scaler = StandardScaler().fit(X_train)

# Apply the fitted scaler to both partitions, re-wrapping each result in a
# DataFrame that keeps the original column labels and row index.
X_train_std, X_test_std = (
    pd.DataFrame(scaler.transform(part), columns=part.columns, index=part.index)
    for part in (X_train, X_test)
)

### RobustScaler

In [7]:
# Learn the robust scaling parameters from the training features only.
scaler = RobustScaler().fit(X_train)

# Apply the fitted scaler to both partitions, re-wrapping each result in a
# DataFrame that keeps the original column labels and row index.
X_train_rob, X_test_rob = (
    pd.DataFrame(scaler.transform(part), columns=part.columns, index=part.index)
    for part in (X_train, X_test)
)

## Export / Save for Next Working Step
### MinMaxScaler

In [8]:
# Bundle the MinMax-scaled features together with the training/test targets.
exobj = (X_train_minmax, X_test_minmax, y_train, y_test)

# A timestamp in the file name keeps earlier exports from being overwritten.
timestamp = dt.now().strftime("%Y%m%d_%H%M%S")
filename = '../pickles/MinMaxScaler_%s.pickle' % timestamp

# Use a context manager so the file is flushed and closed as soon as the dump
# finishes, instead of leaving the handle open for the lifetime of the kernel.
with open(filename, 'wb') as filehandler:
    pickle.dump(exobj, filehandler)

### StandardScaler

In [9]:
# Bundle the standard-scaled features together with the training/test targets.
exobj = (X_train_std, X_test_std, y_train, y_test)

# A timestamp in the file name keeps earlier exports from being overwritten.
timestamp = dt.now().strftime("%Y%m%d_%H%M%S")
filename = '../pickles/StandardScaler_%s.pickle' % timestamp

# Use a context manager so the file is flushed and closed as soon as the dump
# finishes, instead of leaving the handle open for the lifetime of the kernel.
with open(filename, 'wb') as filehandler:
    pickle.dump(exobj, filehandler)

### RobustScaler

In [10]:
# Bundle the robust-scaled features together with the training/test targets.
exobj = (X_train_rob, X_test_rob, y_train, y_test)

# A timestamp in the file name keeps earlier exports from being overwritten.
timestamp = dt.now().strftime("%Y%m%d_%H%M%S")
filename = '../pickles/RobustScaler_%s.pickle' % timestamp

# Use a context manager so the file is flushed and closed as soon as the dump
# finishes, instead of leaving the handle open for the lifetime of the kernel.
with open(filename, 'wb') as filehandler:
    pickle.dump(exobj, filehandler)