# Preprocessing Data for Clustering

In [2]:
import pandas as pd
import numpy as np
import dask.dataframe as dd

from pathlib import Path
from tqdm import tqdm
from joblib import Parallel, delayed

from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Notebook display settings: show every column, never wrap wide frames onto
# several lines, and never truncate long cell contents.
# Fully-qualified option keys are used throughout; pandas resolves shorthand
# keys by pattern matching, which can become ambiguous if new options appear.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', None)

## Loading

In [None]:
# Directory holding the loan-portfolio extracts; the glob pattern selects
# every `datasetv7*.csv` file so they are all read as one dataset.
data_root = Path('/home/c44406/datasets/atb_uofa/loan_portfolio/')
data_path = data_root / 'datasetv7*.csv'

# Explicit dtypes passed to the CSV reader instead of relying on inference.
# NOTE(review): presumably these columns contain missing values / mixed
# content that makes per-file inference inconsistent -- confirm.
float_cols = [
    'SUB_SYSTEM_FD',
    'SUB_SYSTEM_IN',
    'SUB_SYSTEM_RF',
    'SUB_SYSTEM_RP',
    'SUB_SYSTEM_SP',
    'SUB_SYSTEM_TF',
    'n_transactions',
    'transaction_type_Bank_Trsf_Deposit_Acct',
    'transaction_type_Customer_Transfer',
    'transaction_type_Incoming_Wire',
    'transaction_type_Loan_Disbursement',
    'transaction_type_Outgoing_Wire',
]
dtype = {'dunning_level': 'object', **dict.fromkeys(float_cols, 'float64')}

ddf = dd.read_csv(data_path, dtype=dtype)

# Materialise the lazy dask frame as a pandas DataFrame. Every dask partition
# keeps its own 0-based index, so the combined frame would carry duplicate
# labels; rebuild a single unique RangeIndex (the same result that
# `pd.concat(..., ignore_index=True)` over the individual files would give).
datasets = ddf.compute().reset_index(drop=True)

datasets.head()

## Cleaning

In [None]:
# Inclusive observation window applied to `cal_day`.
start_date = '2008-01-31'
end_date = '2021-09-30'

cols = datasets.columns
other_cols = [    # non-feature columns
    'bus_ptnr_group',
    'cal_day',
    'naics_id',
    'has_loan',
]

# A column is treated as a feature when its name starts with one of these
# prefixes. The strings must match the dataset's column names exactly
# (NOTE(review): 'Oustanding_principle...' presumably mirrors the spelling in
# the CSV header -- confirm before "fixing" it).
feature_prefixes = (
    'BRR',
    'impaired',
    'Oustanding_principle_on_posting_date',
    'percentage_rate',
    'abs_transactions',
    'n_transactions',
    'mth_since_brr_update',
    # 'defaults',    # deliberately left out for now
    'transactions',
    'SUB_SYSTEM',
    'transaction_type',
)
is_feature = cols.str.startswith(feature_prefixes)
used_cols = cols[is_feature].to_list()

# One explicit copy up front keeps `datasets` untouched; every later step
# already returns a new frame, so no further copies are needed.
data = datasets[other_cols + used_cols].copy()

# Unparseable dates become NaT ...
data['cal_day'] = pd.to_datetime(data['cal_day'], errors='coerce')
# ... and NaT compares False against both bounds, so this single mask drops
# rows with a missing date as well as rows outside the window.
in_window = data['cal_day'].between(start_date, end_date)
data = data[in_window]

# Treat +/-inf as missing so that later imputation can handle them.
data = data.replace([np.inf, -np.inf], np.nan)

# Chronological order; a stable sort keeps the incoming order of the many
# rows that share a `cal_day`, which makes the result reproducible.
data = data.sort_values('cal_day', kind='stable').reset_index(drop=True)

used_cols

In [None]:
# Preview the first rows of `data` after cleaning.
data.head()

## Scaling

In [None]:
# Steps applied, in order, to every feature column:
#   1. fill missing values with the column median,
#   2. Yeo-Johnson power transform, standardised afterwards,
#   3. fill anything still missing with the constant 0.
# NOTE(review): after step 1 no NaN should normally remain, so step 3 looks
# like a safety net -- confirm the intent.
numeric_steps = [
    ('imputing', SimpleImputer(strategy='median')),
    ('scaling', PowerTransformer(method='yeo-johnson', standardize=True)),
    ('masking', SimpleImputer(strategy='constant', fill_value=0.)),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Only the feature columns listed in `used_cols` go through the pipeline.
preprocessor = ColumnTransformer(
    transformers=[('num_col', numeric_transformer, used_cols)],
)

# Fit on the whole frame and write the transformed values back in place;
# the non-feature columns of `data` are left as they were.
scaled_features = preprocessor.fit_transform(data)
data[used_cols] = scaled_features

In [None]:
# Preview the first rows of `data` after the feature columns were scaled.
data.head()

## Adding Lagged Features

## Saving

In [None]:
# Show the full list of columns that are about to be saved.
data.columns.to_list()

In [None]:
# Write the preprocessed frame to `cluster_data.pkl`; the path is relative,
# so the file lands in the current working directory.
data.to_pickle('cluster_data.pkl')