In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import gc
import warnings
warnings.filterwarnings('ignore')
import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv
/kaggle/input/amex-default-prediction-feather/00_slim_data.ipynb
/kaggle/input/amex-default-prediction-feather/test.feather
/kaggle/input/amex-default-prediction-feather/train.feather
/kaggle/input/amex-data-integer-dtypes-parquet-format/train.parquet
/kaggle/input/amex-data-integer-dtypes-parquet-format/test.parquet


In [2]:
# Reduce Memory Usage

In [3]:
def reduce_memory_usage(df,verbose = True):
    numerics = ['int16','int32','int64','float16','float32','float64']
    start_mem = df.memory_usage().sum()/1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
# Reading Preprocessed Data

In [5]:
%%time

amex_train = pd.read_feather("../input/amex-default-prediction-feather/train.feather")

print("Train size:", amex_train.shape)

Train size: (5531451, 190)
CPU times: user 10.9 s, sys: 3.31 s, total: 14.3 s
Wall time: 19.1 s


In [6]:
amex_train = reduce_memory_usage(amex_train)
amex_features = amex_train.drop(['customer_ID', 'S_2'], axis = 1).columns.to_list()
cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]
num_features = [col for col in amex_features if col not in cat_features]

Memory usage after optimization is: 2009.85 MB
Decreased by 0.0%


In [7]:
print('Starting train feature extraction')
amex_train_agg = amex_train.groupby("customer_ID")[num_features].agg(['first', 'mean', 'std', 'min', 'max', 'last'])
amex_train_agg.columns = ['_'.join(x) for x in amex_train_agg.columns]
amex_train_agg.reset_index(inplace = True)

Starting train feature extraction


In [8]:
for col in amex_train_agg:
    if 'last' in col and col.replace('last', 'first') in amex_train_agg:
        amex_train_agg[col + '_lag_sub'] = amex_train_agg[col] - amex_train_agg[col.replace('last', 'first')]
        amex_train_agg[col + '_lag_div'] = amex_train_agg[col] / amex_train_agg[col.replace('last', 'first')]

In [9]:
train_cat_agg = amex_train.groupby("customer_ID")[cat_features].agg(['count', 'first', 'last', 'nunique'])
train_cat_agg.columns = ['_'.join(x) for x in train_cat_agg.columns]
train_cat_agg.reset_index(inplace = True)

In [10]:
%%time

amex_train_labels = pd.read_csv('../input/amex-default-prediction/train_labels.csv')
amex_train = amex_train_agg.merge(train_cat_agg, how = 'inner', on = 'customer_ID').merge(amex_train_labels, how = 'inner', on = 'customer_ID')
print('Train shape: ', amex_train.shape)    
del amex_train_agg, train_cat_agg        
gc.collect()

Train shape:  (458913, 1462)
CPU times: user 7min 31s, sys: 2min 42s, total: 10min 14s
Wall time: 10min 15s


0

In [11]:
amex_test = pd.read_feather("../input/amex-default-prediction-feather/train.feather")
amex_test = reduce_memory_usage(amex_test)
print('Starting test feature extraction')
test_num_agg = amex_test.groupby("customer_ID")[num_features].agg(['first', 'mean', 'std', 'min', 'max', 'last'])
test_num_agg.columns = ['_'.join(x) for x in test_num_agg.columns]
test_num_agg.reset_index(inplace = True)

Memory usage after optimization is: 2009.85 MB
Decreased by 0.0%
Starting test feature extraction


In [12]:
for col in test_num_agg:
    if 'last' in col and col.replace('last', 'first') in test_num_agg:
        test_num_agg[col + '_lag_sub'] = test_num_agg[col] - test_num_agg[col.replace('last', 'first')]
        test_num_agg[col + '_lag_div'] = test_num_agg[col] / test_num_agg[col.replace('last', 'first')]

In [13]:
test_cat_agg = amex_test.groupby("customer_ID")[cat_features].agg(['count', 'first', 'last', 'nunique'])
test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]
test_cat_agg.reset_index(inplace = True)

In [14]:
%%time

amex_test = test_num_agg.merge(test_cat_agg, how = 'inner', on = 'customer_ID')
print('Test shape: ', amex_test.shape)
del test_num_agg, test_cat_agg
gc.collect()

Test shape:  (458913, 1461)
CPU times: user 7min 24s, sys: 2min 5s, total: 9min 30s
Wall time: 9min 31s


0