In [1]:
import gc
import pandas as pd 
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [2]:
def one_hot_encode_categorical(cat_features, cat_names):
    """One-hot encode categorical columns and return them as a dense DataFrame.

    The encoder is fitted on ``cat_features`` itself, so the output only
    contains columns for category values that actually occur in this data.

    Parameters
    ----------
    cat_features : pandas.DataFrame
        The categorical columns to encode.
    cat_names : list of str
        Names used as prefixes for the generated columns
        (``<name>_<category>``), one per column of ``cat_features`` and in the
        same order.

    Returns
    -------
    pandas.DataFrame
        The encoded columns, sharing the index of ``cat_features``.
    """
    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed
    # the old keyword; fall back to the old spelling on older releases.
    try:
        enc = OneHotEncoder(sparse_output=False)
    except TypeError:
        enc = OneHotEncoder(sparse=False)

    encoded_values = enc.fit_transform(cat_features)

    # `get_feature_names` was removed in scikit-learn 1.2 in favour of
    # `get_feature_names_out`; use whichever this installation provides.
    if hasattr(enc, 'get_feature_names_out'):
        encoded_columns = enc.get_feature_names_out(cat_names)
    else:
        encoded_columns = enc.get_feature_names(cat_names)

    encoded_df = pd.DataFrame(encoded_values, columns=encoded_columns, index=cat_features.index)
    return encoded_df

In [3]:
def generate_features(df_file_path, test=False):
    """Build the model feature matrix from a parquet file.

    Steps: load the file, index it by ``customer_ID``, drop the ``S_2``
    column, one-hot encode the categorical columns, and fill missing values
    in the remaining columns with each column's own mean.

    Parameters
    ----------
    df_file_path : str
        Path of the parquet file to read.
    test : bool, optional
        Accepted for callers that pass it; not used by this function.

    Returns
    -------
    pandas.DataFrame
        Mean-imputed non-categorical columns followed by the one-hot encoded
        categorical columns, indexed by ``customer_ID``.
    """
    categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

    def _fill_with_column_mean(column):
        # Replace missing entries with the mean of this same column.
        return column.fillna(column.mean())

    # Load the data keyed by customer ID.
    raw = pd.read_parquet(df_file_path).set_index('customer_ID')

    # Everything except the S_2 column is a feature.
    features = raw.drop(columns='S_2')

    # Release the raw frame before building further copies.
    del raw
    gc.collect()

    # One-hot encode the categorical columns.
    one_hot = one_hot_encode_categorical(features[categorical_cols], categorical_cols)

    # Impute the remaining columns; rebinding `features` lets the previous
    # frame be freed as soon as it is no longer needed.
    features = features.drop(columns=categorical_cols)
    features = features.apply(_fill_with_column_mean)

    # Imputed columns first, encoded columns after.
    features = pd.concat([features, one_hot], axis=1)
    return features
    
    

In [4]:
# Build the test-set feature matrix.
X_test = generate_features('/kaggle/input/amex-data-integer-dtypes-parquet-format/test.parquet', test=True)

# These one-hot columns belong to category values that are not present in the
# test set but are in training; they must exist (all zeros) for .predict().
# Only add a column when it is actually missing, so real values are never
# overwritten if one of these categories does appear in the input.
for missing_col in ('D_64_-1', 'D_64_1', 'D_66_0', 'D_68_0'):
    if missing_col not in X_test.columns:
        X_test[missing_col] = 0.0

# Put the columns in sorted order so the layout is deterministic.
X_test = X_test.reindex(sorted(X_test.columns), axis=1)

# Move the index back into a regular column before writing to disk.
X_test = X_test.reset_index()
X_test.to_parquet('X_test.parquet')

# Free the memory held by the feature matrix.
del X_test
gc.collect()




0