In [1]:
import gc
import pandas as pd 
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [2]:
def one_hot_encode_categorical(cat_features, cat_names):
    """One-hot encode categorical columns into a dense DataFrame.

    A fresh ``OneHotEncoder`` is fit on ``cat_features`` every time this is
    called, so the generated columns reflect only the categories present in
    that particular frame.

    Parameters
    ----------
    cat_features : pandas.DataFrame
        The categorical columns to encode.
    cat_names : list of str
        One name per column of ``cat_features`` (same order); used as the
        prefix of each generated column, i.e. ``<name>_<category>``.

    Returns
    -------
    pandas.DataFrame
        Dense indicator columns sharing the index of ``cat_features``.
    """
    # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4 removed
    # the old keyword; older releases reject the new one with a TypeError.
    try:
        enc = OneHotEncoder(sparse_output=False)
    except TypeError:
        enc = OneHotEncoder(sparse=False)

    encoded = enc.fit_transform(cat_features)

    # ``get_feature_names`` was removed in scikit-learn 1.2. Build the names
    # from the fitted ``categories_`` instead, using the same
    # "<feature>_<category>" scheme, so this works on every version.
    column_names = [
        f"{name}_{category}"
        for name, categories in zip(cat_names, enc.categories_)
        for category in categories
    ]

    return pd.DataFrame(encoded, columns=column_names, index=cat_features.index)

In [3]:
def simple_impute_numerical(numeric_features, numeric_names):
    """Replace missing numeric values with the per-column mean.

    Parameters
    ----------
    numeric_features : pandas.DataFrame
        Numeric columns to impute. Its columns are upcast to float32 in
        place, so the caller's frame is modified by this call.
    numeric_names : list of str
        Column labels for the returned frame.

    Returns
    -------
    pandas.DataFrame
        Imputed values stored as float16, indexed like ``numeric_features``.

    NOTE(review): SimpleImputer is documented to discard columns that contain
    no observed values by default, which would make ``numeric_names`` too long
    for the transformed array - confirm every column has at least one value.
    """
    # The incoming columns are float16, which does not work for computing the
    # mean, so widen each one to float32 first (written back onto the input).
    for name in numeric_features.columns:
        widened = numeric_features[name].astype(np.float32)
        numeric_features[name] = widened

    mean_imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
    filled_values = mean_imputer.fit_transform(numeric_features)
    imputed = pd.DataFrame(
        filled_values,
        index=numeric_features.index,
        columns=numeric_names,
    )

    # Narrow back to float16 to keep the result light in memory.
    return imputed.astype(np.float16)

In [4]:
def generate_x_y(df_file_path, test=False, cat_features=None):
    """Load a Feather file and build the model-ready feature matrix.

    The frame is indexed by ``customer_ID``, the ``S_2`` date column is
    dropped, categorical columns are one-hot encoded, and the remaining
    columns are mean-imputed. Encoded columns are appended after the imputed
    numeric ones.

    Parameters
    ----------
    df_file_path : str
        Path of the Feather file to read.
    test : bool, default False
        When True the file is treated as having no ``target`` column and
        only the features are returned.
    cat_features : list of str, optional
        Names of the categorical columns to one-hot encode. Defaults to the
        eleven categorical columns this notebook has always used.

    Returns
    -------
    pandas.DataFrame or tuple
        ``X`` when ``test`` is True, otherwise ``(X, y)`` where ``y`` is the
        ``target`` column.
    """
    if cat_features is None:
        cat_features = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
    else:
        # Take a private list so the caller's sequence is never shared.
        cat_features = list(cat_features)

    # Read in data and set index to customer ID.
    df = pd.read_feather(df_file_path)
    df = df.set_index('customer_ID')

    # Split into X and y; dates (and the label, for training data) leave X.
    if test:
        X = df.drop('S_2', axis=1)
        y = None
    else:
        X = df.drop(['S_2', 'target'], axis=1)
        y = df['target']

    # Release the original dataframe before the memory-hungry steps below.
    del df
    gc.collect()

    # Encode categorical features.
    encoded_df = one_hot_encode_categorical(X[cat_features], cat_features)

    # Everything that is not categorical is imputed as numeric.
    X = X.drop(cat_features, axis=1)
    X = simple_impute_numerical(X, list(X.columns))

    # Final frame: imputed numeric features followed by the encoded ones.
    X = pd.concat([X, encoded_df], axis=1)

    if test:
        return X
    return (X, y)
    
    

In [5]:
# Build the training features and labels from the raw Feather file.
X_train, y_train = generate_x_y('../input/amexfeather/train_data.ftr')

# Sort columns for matching with the test set, then move customer_ID out of
# the index and back into a regular column before writing.
X_train = X_train.reindex(columns=sorted(X_train.columns)).reset_index()
y_train = y_train.reset_index()

# NOTE(review): these files are written in Feather format even though they
# carry a .parquet suffix - confirm downstream readers use pd.read_feather.
X_train.to_feather('X_train.parquet')
y_train.to_feather('y_train.parquet')

# Free the large frames now that they are persisted to disk.
del X_train
del y_train
gc.collect()



0