In [1]:
import gc
import pandas as pd 
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

## Helper functions to impute, encode, and generate features/targets

In [2]:
def one_hot_encode_categorical(cat_features, cat_names):
    """
    One-hot encodes categorical features using scikit-learn OneHotEncoder

    Runs on both old and new scikit-learn releases: the dense-output keyword and the
    feature-name accessor were renamed between versions, so the newer spelling is tried
    first and the older one is used as a fallback.

    Parameters
    ----------
    cat_features : pd.DataFrame
        DataFrame, with index, that has only the categorical columns to one-hot encode
    cat_names : list
        list of categorical column names, in the same order as the columns of cat_features;
        used as the prefixes of the generated column names

    Returns
    -------
    pd.DataFrame
        DataFrame that holds each of the one-hot encoded columns; it has the same index as
        cat_features and one column per (column name, category) pair
    """

    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old keyword;
    # releases older than 1.2 reject the new keyword with a TypeError
    try:
        enc = OneHotEncoder(sparse_output=False)
    except TypeError:
        enc = OneHotEncoder(sparse=False)

    encoded_values = enc.fit_transform(cat_features)

    # `get_feature_names` was removed in scikit-learn 1.2; `get_feature_names_out` replaces it
    if hasattr(enc, 'get_feature_names_out'):
        encoded_names = enc.get_feature_names_out(cat_names)
    else:
        encoded_names = enc.get_feature_names(cat_names)

    return pd.DataFrame(encoded_values, columns=encoded_names, index=cat_features.index)

In [3]:
def _cast_columns_in_place(frame, dtype):
    """Replaces every column of `frame` with a copy of itself cast to `dtype`, one column at a time"""
    for name in frame.columns:
        frame[name] = frame[name].astype(dtype)


def simple_impute_numerical(numeric_features, numeric_names):
    """
    Fills missing numerical values with the column mean via scikit-learn SimpleImputer()

    Note that the columns of `numeric_features` are widened to float32 in place, so the
    frame passed in by the caller is modified.

    Parameters
    ----------
    numeric_features : pd.DataFrame
        DataFrame, with index, that has only the numerical columns to impute
    numeric_names : list
        list of numerical column names; used as the column labels of the result

    Returns
    -------
    pd.DataFrame
        DataFrame of the imputed numerical columns, stored as float16 and carrying the
        index of `numeric_features`
    """

    # the incoming columns are float16, which does not work when computing mean(),
    # so widen them to float32 before fitting the imputer
    _cast_columns_in_place(numeric_features, np.float32)

    # replace every NaN with the mean of its column
    mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    filled_values = mean_imputer.fit_transform(numeric_features)
    imputed = pd.DataFrame(filled_values, columns=numeric_names, index=numeric_features.index)

    # shrink back to float16 to keep the memory footprint small
    _cast_columns_in_place(imputed, np.float16)

    return imputed

In [4]:
def generate_x_y(df_file_path, test=False):
    """
    Builds the features (X) and, for training data, the targets (y) from a feather file

    The file is loaded and indexed by 'customer_ID', the 'S_2' date column is dropped,
    the categorical columns are one-hot encoded and the remaining columns are
    mean-imputed.

    Parameters
    ----------
    df_file_path : string
        File path to generate DataFrame from
    test : boolean
        Whether or not the provided data file is the test set
        False = training set (the file must contain a 'target' column)
        True = test set

    Returns
    -------
    pd.DataFrame
        For the test set: only the features (X)

    OR

    Tuple(pd.DataFrame, pd.Series)
        For the training set: the features and the 'target' column as a tuple (X, y)
    """

    # load the file and index the rows by customer ID
    raw = pd.read_feather(df_file_path).set_index('customer_ID')

    # the date column is never used as a feature; only the training file carries the target
    if test:
        y = None
        X = raw.drop('S_2', axis=1)
    else:
        y = raw['target']
        X = raw.drop(['S_2', 'target'], axis=1)

    # drop the reference to the raw frame before the transforms below
    del raw
    gc.collect()

    # one-hot encode the categorical columns
    categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
    encoded_df = one_hot_encode_categorical(X[categorical_cols], categorical_cols)

    # everything that is left is numerical: fill the gaps with the column mean
    X = X.drop(categorical_cols, axis=1)
    X = simple_impute_numerical(X, list(X.columns))

    # imputed numerical columns first, one-hot columns after
    X = pd.concat([X, encoded_df], axis=1)

    return X if test else (X, y)
    
    

## Generate the features and targets, and then save to .ftr file

In [5]:
# build the training features and targets from the raw feather file
X_train, y_train = generate_x_y('../input/amexfeather/train_data.ftr')

# order the columns alphabetically so they match the test set, then turn the index
# back into a regular column (feather files do not support indexing)
X_train = X_train.reindex(sorted(X_train.columns), axis=1).reset_index()
y_train = y_train.reset_index()

# persist both frames
X_train.to_feather('X_train.ftr')
y_train.to_feather('y_train.ftr')

# drop the references to the frames now that they are on disk
del X_train, y_train
gc.collect()



0