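Data preprocessing: load the training and test CSVs, drop features that are constant in the training set, split the training data into features and labels, then shuffle and save the arrays.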

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import keras

Using TensorFlow backend.


In [2]:
def load_data(filename1, filename2, out_dir="data"):
    """Load train/test CSVs, drop constant features, shuffle, and save arrays.

    The training CSV must contain a ``target`` column, which becomes the
    label vector; every other remaining column becomes a feature. Features
    whose standard deviation over the training set is 0 are removed from
    both the training and the test set.

    The training rows are shuffled with a fixed seed (42) so the saved order
    is reproducible; features and labels are permuted together. Note that
    this seeds NumPy's global random state as a side effect.

    Three files are written to ``out_dir``: ``x.npy`` (shuffled training
    features), ``y.npy`` (shuffled training labels) and ``x_test.npy``
    (test features, original order).

    Args:
        filename1: Path to the training CSV.
        filename2: Path to the test CSV.
        out_dir: Directory the ``.npy`` files are written to. Created if it
            does not exist. Defaults to ``"data"``.

    Returns:
        None. Results are saved to disk.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import cell.
    import os

    df_train = pd.read_csv(filename1)
    df_test = pd.read_csv(filename2)

    # Collect the constant features first, then drop them in one go rather
    # than mutating the frame while walking its columns. 'target' is never
    # treated as a feature, so it is excluded from the scan.
    constant_features = [
        feature
        for feature in df_train.columns
        if feature != 'target' and np.std(df_train[feature].values) == 0
    ]
    new_df_train = df_train.drop(constant_features, axis=1)
    # errors='ignore': a constant training feature that is absent from the
    # test CSV should not abort preprocessing.
    new_df_test = df_test.drop(constant_features, axis=1, errors='ignore')

    # data splits
    x = new_df_train.loc[:, new_df_train.columns != 'target'].values
    y = new_df_train['target'].values
    x_test = new_df_test.values

    # Shuffle order of data. The permutation must actually be used as the
    # index; applying the same permutation to x and y keeps rows aligned.
    rand_order = np.arange(len(x))
    np.random.seed(42)
    np.random.shuffle(rand_order)

    new_x = x[rand_order]
    new_y = y[rand_order]

    # save training and test data
    os.makedirs(out_dir, exist_ok=True)
    np.save(os.path.join(out_dir, "x.npy"), new_x)
    np.save(os.path.join(out_dir, "y.npy"), new_y)
    np.save(os.path.join(out_dir, "x_test.npy"), x_test)

In [3]:
# Run preprocessing on the 2008 train/test CSVs; the resulting arrays are
# saved as .npy files by load_data.
load_data(
    filename1="data/train_2008.csv",
    filename2="data/test_2008.csv",
)