## Script to understand the effect of data preprocessing.

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import Imputer, RobustScaler, StandardScaler

In [2]:
# Load the dataset from the HDF5 file using pandas' built-in HDFStore support.
# Note that the "train" dataframe is the only dataframe in the file.
with pd.HDFStore("train.h5", "r") as store:
    df = store.get("train")

# Row position at which the data is split: everything before it is used as
# the training set, everything from it onwards as the validation set.
# Defined once so the two slices below cannot drift apart.
TRAIN_ROWS = 806298
train_df = df[:TRAIN_ROWS]
valid_df = df[TRAIN_ROWS:]

def PlotHistogram(train_x, valid_x, train_impute, valid_impute, train_norm, valid_norm, column):
    """Show a 3x2 grid of histograms of ``column`` across preprocessing stages.

    Each row is one stage (original, imputed, normalized); the left panel is
    drawn from the train frame and the right panel from the valid frame.
    Every panel uses 100 bins and a log-scaled count axis. Missing values are
    dropped only for the original frames. The figure is displayed with
    ``plt.show()``; nothing is returned.
    """
    _, axes = plt.subplots(3, 2, figsize=(16, 24))

    # (title prefix, train frame, valid frame, drop NaNs before plotting?)
    stages = (
        ('Original', train_x, valid_x, True),
        ('Impute', train_impute, valid_impute, False),
        ('Norm', train_norm, valid_norm, False),
    )

    for row, (stage, train_frame, valid_frame, drop_missing) in enumerate(stages):
        splits = (('train', train_frame), ('valid', valid_frame))
        for col, (split, frame) in enumerate(splits):
            series = frame[column]
            if drop_missing:
                series = series.dropna()
            panel = axes[row, col]
            series.hist(bins=100, log=True, ax=panel)
            panel.set_title(stage + ' ' + split + ': ' + column)

    plt.show()

# Columns removed before preprocessing (presumably identifiers and the
# target rather than features -- confirm against the dataset description).
dropped_columns = ['id', 'y', 'timestamp']
X_train = train_df.drop(dropped_columns, axis=1)
X_valid = valid_df.drop(dropped_columns, axis=1)
feature_names = X_train.columns

# Stage 1: fill missing values with the per-column mean. The statistics are
# learned from the training split only and then reused for the validation split.
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in
# scikit-learn 0.22; `sklearn.impute.SimpleImputer` is its replacement.
imputer = Imputer(strategy='mean')
imputer.fit(X_train)
train_impute = pd.DataFrame(imputer.transform(X_train), columns=feature_names)
valid_impute = pd.DataFrame(imputer.transform(X_valid), columns=feature_names)

# Stage 2: robust scaling of the imputed data, again fitted on the training
# split only and applied unchanged to the validation split.
normalizer = RobustScaler()
normalizer.fit(train_impute)
train_norm = pd.DataFrame(normalizer.transform(train_impute), columns=feature_names)
valid_norm = pd.DataFrame(normalizer.transform(valid_impute), columns=feature_names)

# for (ind, column) in enumerate(X_train.columns):
#     print(column, ': ', normalizer.center_[ind], normalizer.scale_[ind])

#for column in X_train.columns:
#    if 'technical_' in column:
#        PlotHistogram(X_train, X_valid, train_impute, valid_impute, train_norm, valid_norm, column)