In [1]:
import pickle
import gc
import pandas as pd 
from sklearn.preprocessing import OneHotEncoder

In [2]:
def one_hot_encode(cat_df):
    """
    One-hot encodes categorical features using scikit-learn OneHotEncoder

    Parameters
    ----------
    cat_df: pd.DataFrame
        DataFrame, with index, that has only the categorical columns to one-hot encode

    Returns
    -------
    pd.DataFrame
        DataFrame that holds each of the one-hot encoded columns 
    """    
    
    enc = OneHotEncoder(sparse=False)
    encoded_df = pd.DataFrame(enc.fit_transform(cat_df), columns=enc.get_feature_names(['D_63_last', 'D_64_last']), index=cat_df.index)
    return encoded_df

In [3]:
def impute_helper(col):
    """
    Function to be passed into .apply() to help with imputing the different types of columns.

    Parameters
    ----------
    col: pd.Series
        A column of the DataFrame to be imputed

    Returns
    -------
    pd.Series
        New column imputed with either most common value or mean(), instead of NaNs 
    """        
    
    cat_features = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
    convert_dtype = False
    
    # convert float16's to float32 to calculate means without overflow 
    if col.dtype == 'float16':
        convert_dtype = True
        col = col.astype('float32')
    
    # if the column was originally a categorical feature then fill with the most common value
    # otherwise fill with mean()
    if '_'.join(col.name.split("_", 2)[:2]) in cat_features:
        col = col.fillna(col.value_counts().idxmax())
    else: 
        col = col.fillna(col.mean())
        
    # convert float16s back 
    if convert_dtype:
        col = col.astype('float16')
        
    return col

In [4]:
def impute_columns(df):
    """
    Fills NaN values for Aggregate data. Categorical columns are filled with most common value 
    and numerical are filled with mean.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame, with index, that should be imputed

    Returns
    -------
    pd.DataFrame
        DataFrame with no NaN values
    """

    # if the column is a categorical feature fill with the most common value, fill with mean() if column is numerical
    df = df.apply(impute_helper)
    
    return df

In [5]:
def generate_x_y(df_file_path, test=False):
    """
    Returns the features (X) and targets (y) for the given data file

    Parameters
    ----------
    df_file_path : string
        File path to generate DataFrame from 
    test : boolean
        Whether or not the provided data file is the test set
        False = training set 
        True = test set 

    Returns
    -------
    pd.DataFrame
        If it is the test dataset it will return only the features (X)
        
    OR 
    
    Tuple(pd.DataFrame, pd.DataFrame)
        If it is the training set it will return the features and targets in a tuple (X, y)
    """    
    
    df = pd.read_pickle(df_file_path, compression='gzip')
    y = None if test else df['target']
    
    # D_63_last and D_64_last columns are of type 'category', these are the only columns that need to be one-hot encoded
    # the other, original, categorical features are already modified from the aggregate functions
    encoded_df = one_hot_encode(df[['D_63_last', 'D_64_last']])
    
    # impute with numerical columns with mean() and categorical columns with most common value
    X = impute_columns(df.drop(['D_63_last', 'D_64_last'], axis=1) if test else df.drop(['D_63_last', 'D_64_last', 'target'], axis=1))
    
    del df
    gc.collect()
    
    # combine new dataframes and sort them to line up when training/predicting
    X = pd.concat([X, encoded_df], axis=1)
    X.reindex(sorted(X.columns), axis=1)
    
    if test: 
        return X
    else: 
        return (X, y)

In [6]:
X_train, y_train = generate_x_y('/kaggle/input/amex-agg-data-pickle/train_agg.pkl')

display(X_train.head())

X_train.to_pickle('X_train_agg.pkl', compression='gzip')
y_train.to_pickle('y_train_agg.pkl', compression='gzip')

del X_train, y_train
gc.collect()



Unnamed: 0_level_0,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_last,D_39_mean,D_39_std,D_39_min,D_39_max,D_39_last,...,D_63_last_CO,D_63_last_CR,D_63_last_XL,D_63_last_XM,D_63_last_XZ,D_64_last_-1,D_64_last_O,D_64_last_R,D_64_last_U,D_64_last_nan
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0.933594,0.024194,0.868652,0.960449,0.93457,0.010704,0.02444,0.001082,0.091492,0.009117,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,0.899902,0.022097,0.861328,0.929199,0.880371,0.21521,0.199123,0.002224,0.567383,0.178101,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,0.878418,0.028837,0.797852,0.904297,0.880859,0.004181,0.002759,0.000802,0.009705,0.009705,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
000041bdba6ecadd89a52d11886e8eaaec9325906c9723355abb5ca523658edc,0.599121,0.020082,0.567383,0.623535,0.621582,0.048859,0.08849,0.00066,0.268555,0.001082,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8ad51ca8b8c4a24cefed,0.891602,0.042316,0.805176,0.94043,0.87207,0.004642,0.002883,3e-05,0.008682,0.005573,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


0

In [7]:
X_test = generate_x_y('/kaggle/input/amex-agg-data-pickle/test_agg.pkl', test=True)
display(X_test.head())
X_test.to_pickle('X_test_agg.pkl', compression='gzip')

del X_test
gc.collect()



Unnamed: 0_level_0,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_last,D_39_mean,D_39_std,D_39_min,D_39_max,D_39_last,...,D_63_last_CL,D_63_last_CO,D_63_last_CR,D_63_last_XL,D_63_last_XM,D_63_last_XZ,D_64_last_O,D_64_last_R,D_64_last_U,D_64_last_nan
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.601562,0.020198,0.568848,0.631348,0.568848,0.069763,0.103129,0.001912,0.23877,0.121399,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.862305,0.031432,0.794434,0.913574,0.841309,0.154297,0.177177,0.001548,0.505859,0.126465,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.749023,0.061435,0.67334,0.834961,0.697754,0.181885,0.264004,0.000482,0.679688,0.002724,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694,0.474609,0.028883,0.428467,0.51416,0.513184,0.470459,0.128948,0.212524,0.683594,0.324707,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9a4693dd914fca22557,0.323975,0.049886,0.254395,0.425781,0.254395,0.353027,0.196611,0.035065,0.768066,0.768066,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


0