# Toxikind: Data Preprocessing

In [1]:
# OS I/O
import os
import pickle

# Data handling
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer

# Feature Scaler Functions

## Feature Scaler Fitting

In [2]:
def fit_feature_scaler(X_train_raw: pd.DataFrame) -> Pipeline:
    """
    Fit a min-max feature scaler on the raw training features.

    Every column of X_train_raw is routed through one MinMaxScaler (default
    settings) by a ColumnTransformer, which is the only step of the
    returned Pipeline.

    Parameters
    ----------
    X_train_raw : pd.DataFrame
        Raw training features; all of its columns are scaled.

    Returns
    -------
    Pipeline
        The pipeline after it has been fitted on X_train_raw.
    """
    # All columns of the training frame go through the same MinMaxScaler
    scaled_columns = X_train_raw.columns

    # Single-step pipeline wrapping the ColumnTransformer
    steps = [
        (
            "column_transformer",
            ColumnTransformer([("minmax_scaler", MinMaxScaler(), scaled_columns)]),
        )
    ]

    # Pipeline.fit returns the fitted pipeline itself
    return Pipeline(steps).fit(X_train_raw)

## Feature Scaler Fitting Wrapper with Disk I/O

In [3]:
def fit_save_feature_scaler(path_X_train_raw, path_feature_scaler) -> None:
    """
    Fit the feature scaler on raw training data from disk and pickle it.

    This is a wrapper for "fit_feature_scaler".
    It loads raw feature training data from the given path,
    calls "fit_feature_scaler" and saves the fitted scaler as
    "feature_scaler.pickle" inside the given directory. The directory is
    created first if it does not exist yet.

    It assumes the raw data index column being unnamed
    (it is addressed as "Unnamed: 0" after reading the CSV).

    Parameters
    ----------
    path_X_train_raw
        Path of the CSV file holding the raw training features.
    path_feature_scaler
        Directory in which "feature_scaler.pickle" is written.

    Returns
    -------
    None
    """
    # Load raw feature training data from disk and set index
    X_train_raw = pd.read_csv(path_X_train_raw).set_index("Unnamed: 0")

    # Fit feature scaler
    feature_scaler = fit_feature_scaler(X_train_raw)

    # Make sure the target directory exists; otherwise open() below fails
    # with FileNotFoundError on a fresh checkout
    if path_feature_scaler:
        os.makedirs(path_feature_scaler, exist_ok=True)

    # Save fitted scaler (path is built exactly as the loading wrapper expects it)
    path_pickle = f"{path_feature_scaler}/feature_scaler.pickle"
    with open(path_pickle, "wb") as file:
        pickle.dump(feature_scaler, file)

    return None

In [4]:
# Create/Save scaler / Test:
# fit the feature scaler on the raw training features and save it as
# "feature_scaler.pickle" inside path_feature_scaler
path_X_train_raw = "../raw_data/tox21_dense_train.csv.gz"
path_feature_scaler = "../production_model"
fit_save_feature_scaler(path_X_train_raw, path_feature_scaler)

## Feature Transformer

In [5]:
def transform_features(X_raw: pd.DataFrame, feature_scaler: Pipeline) -> pd.DataFrame:
    """
    Transform raw features with a scaler fitted on raw feature training data.

    If the scaler exposes the column names it was fitted on
    ("feature_names_in_") and all of them are present in X_raw, X_raw is
    first brought into that column order (additional columns are left out).
    Without this step a scaler that picks its input columns by label would
    return them in fitted order while the result was labelled in X_raw order,
    silently attaching values to the wrong column names.
    In every other case X_raw is handed to the scaler unchanged.

    Parameters
    ----------
    X_raw : pd.DataFrame
        Raw features to transform. The caller's frame is not modified.
    feature_scaler : Pipeline
        Fitted scaler; only its "transform" method and, if available,
        its "feature_names_in_" attribute are used.

    Returns
    -------
    pd.DataFrame
        Transformed features with the index of X_raw and the column labels
        of the (possibly re-ordered) input.
    """
    # Align the input with the fitted feature order when the scaler reports it
    fitted_names = getattr(feature_scaler, "feature_names_in_", None)
    if fitted_names is not None:
        fitted_names = list(fitted_names)
        # Only re-order when nothing is missing, so that a missing column is
        # still reported by the scaler itself
        if set(fitted_names).issubset(X_raw.columns) and fitted_names != list(X_raw.columns):
            X_raw = X_raw[fitted_names]

    # Transform features
    X = pd.DataFrame(feature_scaler.transform(X_raw), columns=X_raw.columns, index=X_raw.index)

    # Return transformed data
    return X

## Feature Transformer Wrapper with Disk I/O

In [6]:
def load_transform_save_features(path_feature_scaler, path_x_raw, path_x) -> None:
    """
    Load a fitted scaler and raw features, transform the features and save them.

    This is a wrapper for "transform_features".
    It loads the fitted scaler from "feature_scaler.pickle" inside the given
    directory, loads the raw feature data, calls "transform_features" and
    saves the transformed data as .csv. The directory of the output file is
    created first if it does not exist yet.

    It assumes the raw data index column being unnamed
    (it is addressed as "Unnamed: 0" after reading the CSV).

    Parameters
    ----------
    path_feature_scaler
        Directory that contains "feature_scaler.pickle".
    path_x_raw
        Path of the CSV file holding the raw features.
    path_x
        Path the transformed features are written to as CSV.

    Returns
    -------
    None
    """
    # Load feature_scaler
    # NOTE: unpickling can execute arbitrary code; only load scaler files
    # that were produced by a trusted source
    path_pickle = f"{path_feature_scaler}/feature_scaler.pickle"
    with open(path_pickle, "rb") as file:
        feature_scaler = pickle.load(file)

    # Load data
    X_raw = pd.read_csv(path_x_raw).set_index("Unnamed: 0")

    # Transform data
    X = transform_features(X_raw, feature_scaler)

    # Make sure the output directory exists; otherwise to_csv fails on a
    # fresh checkout (only done for real paths, not for file-like targets)
    if isinstance(path_x, (str, os.PathLike)):
        output_dir = os.path.dirname(os.fspath(path_x))
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    # Save data
    X.to_csv(path_x)

# Transform Data

## Training Data

**Load/Transform/Save Features**

In [7]:
# Scale the raw training features with the saved scaler and
# write the result to ../data/X_train.csv
path_feature_scaler = "../production_model"
path_x_raw = "../raw_data/tox21_dense_train.csv.gz"
path_x = "../data/X_train.csv"
load_transform_save_features(path_feature_scaler, path_x_raw, path_x)

**Load/Save Targets**

In [8]:
# Training targets are not transformed: the unnamed first column becomes the
# index and the frame is written to ../data/y_train.csv as is
path_targets_train = "../raw_data/tox21_labels_train.csv.gz"
y_train = pd.read_csv(path_targets_train).set_index("Unnamed: 0")
y_train.to_csv("../data/y_train.csv")

## Testing Data

**Load/Transform/Save Features**

In [9]:
# Scale the raw test features with the same saved scaler (loaded from
# path_feature_scaler) and write the result to ../data/X_test.csv
path_feature_scaler = "../production_model"
path_x_raw = "../raw_data/tox21_dense_test.csv.gz"
path_x = "../data/X_test.csv"
load_transform_save_features(path_feature_scaler, path_x_raw, path_x)

**Load/Save Targets**

In [10]:
# Test targets are not transformed: the unnamed first column becomes the
# index and the frame is written to ../data/y_test.csv as is
path_targets_test = "../raw_data/tox21_labels_test.csv.gz"
y_test = pd.read_csv(path_targets_test).set_index("Unnamed: 0")
y_test.to_csv("../data/y_test.csv")

# Load processed data (double check)

**Training Features**

In [11]:
# Reload the saved training features from disk, restore the index from the
# "Unnamed: 0" column and display the frame as a visual check
path_x_train = "../data/X_train.csv"
X_train = pd.read_csv(path_x_train).set_index("Unnamed: 0")
X_train

Unnamed: 0_level_0,AW,AWeight,Arto,BertzCT,Chi0,Chi1,Chi10,Chi2,Chi3,Chi3c,...,W3D,W3DH,WNSA1,WNSA2,WNSA3,WPSA1,WPSA2,WPSA3,grav,rygr
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NCGC00178831-03,5.436720e-01,0.012041,0.815898,0.832518,0.244068,0.251849,0.122744,0.255645,0.247884,0.131926,...,0.028705,0.025992,0.084419,0.992430,0.967627,0.086494,0.022640,0.042745,0.000218,0.231025
NCGC00166114-03,1.268818e-01,0.076781,0.774278,0.816115,0.222113,0.217725,0.158927,0.222859,0.235265,0.126033,...,0.023329,0.009094,0.142494,0.991487,0.972305,0.070178,0.012238,0.059568,0.000336,0.258584
NCGC00263563-01,3.076931e-02,0.012270,0.807649,0.836259,0.495232,0.475479,0.312274,0.508771,0.503347,0.393865,...,0.147491,0.215437,0.174151,0.963958,0.913740,0.259019,0.156472,0.097238,0.000542,0.320086
NCGC00013058-02,7.168569e-01,0.010464,0.760780,0.886043,0.539479,0.508602,0.148179,0.491618,0.425273,0.263047,...,0.147532,0.142056,0.165596,0.976968,0.949863,0.227592,0.092400,0.090178,0.000596,0.291745
NCGC00167516-01,7.989701e-02,0.011206,0.796400,0.941583,0.742331,0.736470,0.295701,0.712026,0.641167,0.352710,...,0.461955,0.460399,0.622595,0.825232,0.702753,0.536326,0.439447,0.260218,0.000796,0.500554
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NCGC00261292-01,1.428572e-01,0.020621,0.749906,0.669640,0.097777,0.100133,0.012882,0.092409,0.088271,0.038183,...,0.003261,0.004034,0.055917,0.998427,0.988861,0.030003,0.002463,0.013849,0.000069,0.157600
NCGC00261245-01,1.193182e-01,0.016474,0.772778,0.753669,0.223264,0.244135,0.098540,0.214848,0.208197,0.054503,...,0.027008,0.034580,0.069424,0.995076,0.981965,0.096647,0.020009,0.035527,0.000232,0.228288
NCGC00260828-01,9.818000e-08,0.007195,0.766779,0.813525,0.351043,0.324683,0.066131,0.333114,0.291330,0.238414,...,0.097991,0.123973,0.195661,0.978034,0.915763,0.260806,0.085466,0.103675,0.000227,0.514170
NCGC00260687-01,2.229000e-08,0.008401,0.850019,0.690360,0.108253,0.117148,0.048162,0.109089,0.115300,0.040802,...,0.004174,0.005102,0.028960,0.999130,0.991930,0.043702,0.003833,0.017157,0.000067,0.181901


**Training Targets**

In [12]:
# Reload the saved training targets from disk (this rebinds y_train),
# restore the index from the "Unnamed: 0" column and display the frame
path_y_train = "../data/y_train.csv"
y_train = pd.read_csv(path_y_train).set_index("Unnamed: 0")
y_train

Unnamed: 0_level_0,NR.AhR,NR.AR,NR.AR.LBD,NR.Aromatase,NR.ER,NR.ER.LBD,NR.PPAR.gamma,SR.ARE,SR.ATAD5,SR.HSE,SR.MMP,SR.p53
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
NCGC00178831-03,,,,,,,,,,0.0,,
NCGC00166114-03,,,,,,,,,,0.0,,
NCGC00263563-01,,,,,,,,,,0.0,,
NCGC00013058-02,,,,,,,,,,1.0,,
NCGC00167516-01,,0.0,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
NCGC00261292-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCGC00261245-01,0.0,0.0,0.0,,,0.0,0.0,,,,,
NCGC00260828-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCGC00260687-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Testing Features**

In [15]:
# Reload the saved test features from disk, restore the index from the
# "Unnamed: 0" column and display the frame as a visual check
path_x_test = "../data/X_test.csv"
X_test = pd.read_csv(path_x_test).set_index("Unnamed: 0")
X_test

Unnamed: 0_level_0,AW,AWeight,Arto,BertzCT,Chi0,Chi1,Chi10,Chi2,Chi3,Chi3c,...,W3D,W3DH,WNSA1,WNSA2,WNSA3,WPSA1,WPSA2,WPSA3,grav,rygr
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NCGC00261900-01,2.612482e-01,0.009436,0.834646,0.841727,0.394202,0.403771,0.300542,0.408977,0.398087,0.229952,...,0.103505,0.119139,0.142278,0.979173,0.932708,0.230205,0.098362,0.094579,0.000364,0.347710
NCGC00260869-01,8.333336e-02,0.043783,0.812523,0.754532,0.172691,0.172555,0.097883,0.187859,0.182665,0.135654,...,0.013413,0.007371,0.076107,0.996066,0.972912,0.065423,0.009872,0.055111,0.000187,0.232328
NCGC00261776-01,3.074000e-08,0.007837,0.886389,0.789065,0.155035,0.171825,0.176321,0.168661,0.186405,0.067600,...,0.011447,0.008864,0.068405,0.997175,0.982732,0.052118,0.006282,0.023628,0.000112,0.227702
NCGC00261380-01,8.000004e-02,0.017566,0.779903,0.732086,0.177179,0.186014,0.063751,0.171348,0.161561,0.063016,...,0.015035,0.012926,0.093085,0.995702,0.977549,0.059355,0.007999,0.025659,0.000143,0.227832
NCGC00261842-01,3.838000e-08,0.022434,0.782527,0.742158,0.178172,0.173317,0.033886,0.169590,0.151955,0.105279,...,0.012993,0.012214,0.098637,0.993249,0.955067,0.057505,0.011489,0.023847,0.000140,0.226269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NCGC00357168-01,1.000000e-08,0.038929,0.599925,0.246331,0.043519,0.038314,0.000000,0.022882,0.013918,0.000000,...,0.000256,0.000578,0.018569,0.999713,0.993413,0.013657,0.000617,0.005536,0.000013,0.111147
NCGC00357283-01,2.714000e-08,0.013148,0.749906,0.527482,0.117176,0.115798,0.005907,0.095839,0.080535,0.030828,...,0.004334,0.007400,0.037757,0.998458,0.979364,0.045173,0.005385,0.017875,0.000070,0.183334
NCGC00357210-01,9.050000e-09,0.033198,0.749906,0.573813,0.055716,0.052455,0.000000,0.048756,0.045081,0.023726,...,0.000517,0.000408,0.025334,0.999671,0.994989,0.011548,0.000437,0.005116,0.000028,0.105349
NCGC00357118-01,3.186000e-08,0.030749,0.821147,0.733813,0.159333,0.157683,0.030030,0.164267,0.159179,0.113943,...,0.009635,0.008028,0.082346,0.996334,0.976904,0.042581,0.005533,0.020194,0.000137,0.203271


**Testing Targets**

In [16]:
# Reload the saved test targets from disk (this rebinds y_test),
# restore the index from the "Unnamed: 0" column and display the frame
path_y_test = "../data/y_test.csv"
y_test = pd.read_csv(path_y_test).set_index("Unnamed: 0")
y_test

Unnamed: 0_level_0,NR.AhR,NR.AR,NR.AR.LBD,NR.Aromatase,NR.ER,NR.ER.LBD,NR.PPAR.gamma,SR.ARE,SR.ATAD5,SR.HSE,SR.MMP,SR.p53
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
NCGC00261900-01,0.0,1.0,,0.0,0.0,0.0,0.0,,0.0,0.0,,0.0
NCGC00260869-01,0.0,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCGC00261776-01,1.0,1.0,0.0,,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
NCGC00261380-01,,0.0,,1.0,0.0,,,1.0,0.0,,0.0,
NCGC00261842-01,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
NCGC00357168-01,0.0,0.0,0.0,0.0,,0.0,0.0,1.0,0.0,0.0,0.0,0.0
NCGC00357283-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NCGC00357210-01,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,,0.0
NCGC00357118-01,0.0,0.0,0.0,,,0.0,,1.0,0.0,,,1.0
