### In this notebook, we train an autoregressive RNN to predict the future profiles of customers. The model is trained to predict a customer's profiles over the next 13 months, using the customer's past profiles as input. Training is self-supervised: the target column ("default or not") is never used. The predicted future profiles then serve as strong features for predicting a customer's likelihood of defaulting in the future. Specifically, we will:

- Create datasets and models for the Autoregressive RNN
- Utilize PyTorch Lightning to train the model
- Use the trained model to generate predictions of future customer profiles
- Compile and save the final model for Triton Inference Server.

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '7'

import numpy as np
from torch import nn
import pytorch_lightning as pl
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from random import randint
import torch
import yaml
from collections import namedtuple
import cudf
from tqdm import tqdm
import gc
from pathlib import Path

In [2]:
# Directory holding the competition files (train/test parquet and label CSVs, per the listing below).
# NOTE(review): this is a machine-specific absolute path; adjust it before running elsewhere.
PATH = '/raid/data/ml/kaggle/amex'
os.listdir(PATH)

['test.parquet',
 'train_labels.csv.zip',
 'train_labels.csv',
 'amex-data-integer-dtypes-parquet-format.zip',
 'train.parquet']

### Utility functions

In [3]:
def toT(x):
    """Convert ``x`` to a float32 torch tensor.

    NumPy arrays are wrapped with ``torch.from_numpy``; any other input is
    handed to ``torch.tensor``. The result is always cast with ``.float()``.
    """
    tensor = torch.from_numpy(x) if isinstance(x, np.ndarray) else torch.tensor(x)
    return tensor.float()

def dual_log1p(x):
    """Signed log transform: ``sign(x) * log1p(|x|)``, computed on a float32 copy of ``x``."""
    values = x.astype('float32')
    magnitude = np.log1p(np.abs(values))
    return np.sign(values) * magnitude

def dict_to_namedtuple(dic):
    """Build a ``Config`` namedtuple whose fields and values come from ``dic``."""
    config_cls = namedtuple('Config', dic.keys())
    return config_cls(**dic)

def load_yaml_to_dict(path):
    """Parse the YAML file at ``path`` and return ``{key: entry['value']}``.

    Every top-level entry is expected to be a mapping that carries a
    ``'value'`` key; only that value is kept.
    """
    with open(path) as f:
        raw = yaml.safe_load(f)
    return {key: raw[key]['value'] for key in raw}
    
def load_yaml(path):
    """Load the YAML config at ``path`` as a namedtuple, print it, and return it."""
    config = dict_to_namedtuple(load_yaml_to_dict(path))
    print(config)
    return config

def get_cat_cols():
    """Return a fresh list with the names of the categorical columns."""
    cat_cols = [
        'B_30',
        'B_38',
        'D_114',
        'D_116',
        'D_117',
        'D_120',
        'D_126',
        'D_63',
        'D_64',
        'D_66',
        'D_68',
    ]
    return cat_cols

def get_cus_count(df):
    """Return cumulative per-customer row counts, ordered by ``cid``.

    If ``df`` has no ``cid`` column, one is added IN PLACE by factorizing
    ``customer_ID``. The result has columns ``cid`` and ``cus_count``, where
    ``cus_count`` is the running total of ``S_2`` counts up to and including
    that ``cid``.
    """
    if 'cid' not in df.columns:
        codes, _uniques = df['customer_ID'].factorize()
        df['cid'] = codes
    counts = (
        df.groupby('cid')
        .agg({'S_2': 'count'})
        .rename(columns={'S_2': 'cus_count'})
        .reset_index()
        .sort_values('cid')
    )
    counts['cus_count'] = counts['cus_count'].cumsum()
    return counts

### Datasets

In [4]:
class RnnDataset(Dataset):
    """Base dataset that exposes one fixed-length feature sequence per customer.

    The constructor takes a GPU DataFrame (it relies on ``.values...get()`` and
    ``to_pandas()``) with ``customer_ID`` and ``S_2`` columns, drops
    object-typed and categorical columns, normalizes the rest and stores them
    as one host array in ``self.data``. ``self.ids[i]`` is the cumulative row
    count through customer ``i``, i.e. the end offset of that customer's rows.

    Subclasses implement ``__getitem__``.

    NOTE(review): slicing ``self.data`` by these offsets assumes each customer's
    rows are contiguous and appear in ``cid`` order - confirm for the input frame.
    """

    def __init__(self, df, config):
        """Preprocess ``df`` using ``config.seq`` (sequence length) and ``config.tcols``."""
        self.S = config.seq

        # get_cus_count also adds a 'cid' column to the caller's frame if it is missing.
        dg = get_cus_count(df)
        self.ids = dg.cus_count.values.astype('int32').get()

        df = self._remove_cols(df)
        target_cols = self._get_target_cols(df, config.tcols)
        self.xids,self.yids = self._get_x_y_cols_ids(df, target_cols)
        # The null mask must be taken before _normalize fills nulls with 0.
        self._set_y_mask(df)
        df = self._normalize(df)
        self.data = df.to_pandas().values

    def _get_target_cols(self, df, tcols):
        """Parse the comma-separated ``tcols``; ``'all'`` means every non-categorical column of ``df``."""
        cols = tcols.split(',')
        if len(cols) == 1 and cols[0] == 'all':
            cat = get_cat_cols()
            return [i for i in df.columns if i not in cat]
        return cols

    def __len__(self):
        """Number of customers (one entry per cumulative offset)."""
        return self.ids.shape[0]

    def _set_y_mask(self,df):
        """Record the target column names and their null mask (not read elsewhere in this class)."""
        self.tcols = df.columns.values[self.yids]
        #print(self.tcols)
        self.y_mask = df[self.tcols].isnull()

    def get_x_y_dims(self):
        """Return ``(number of non-target columns, number of target columns)``."""
        return len(self.xids), len(self.yids)

    def _pad(self,x):
        """Left-pad or truncate ``x`` of shape (rows, features) to exactly ``self.S`` rows.

        Returns ``(sequence, mask)``. ``mask`` has length ``self.S`` with 0 on
        padded positions and 1 on real rows. When ``x`` has more than ``self.S``
        rows only the last ``self.S`` are kept.
        """
        s = self.S
        mask = np.ones(s)
        if s < x.shape[0]:
            return x[-s:],mask
        m,n = x.shape
        # Pad with x's own dtype so both return paths agree and float32 data is
        # not silently promoted to float64 by vstack.
        tmp = np.zeros((s-m,n), dtype=x.dtype)
        mask[:s-m] = 0
        return np.vstack([tmp,x]),mask

    def _remove_cols(self,df):
        """Drop object-typed columns, ``cid`` and the categorical columns; prints the first two groups."""
        not_used = [i for i in df.columns if df[i].dtype=='O']+['cid']
        print("RnnDataset not used columns:")
        print(not_used)
        cat_cols = get_cat_cols()
        return df.drop(not_used+cat_cols, axis=1)

    def _normalize(self, df):
        """Fill nulls with 0, quantize to two decimals, then apply ``dual_log1p`` to every column."""
        for col in df.columns:
            # NOTE(review): the int16 cast can only represent |value*100| <= 32767;
            # confirm no feature exceeds roughly 327 in magnitude, otherwise the cast is lossy.
            df[col] = dual_log1p(((df[col].fillna(0)*100).astype('int16')*0.01).values)
        return df

    def _get_x_y_cols_ids(self, df, target_cols):
        """Split column positions of ``df`` into ``(non-target ids, target ids)``."""
        targets = set(target_cols)
        xids,yids = [],[]
        for pos,name in enumerate(df.columns.values):
            if name in targets:
                yids.append(pos)
            else:
                xids.append(pos)
        return xids,yids
    
class TrainRnnDataset(RnnDataset):
    """Training view: holds out each customer's final row.

    With more than two rows, the input is every row except the last two and
    the target is the same window shifted forward by one row. With one or two
    rows, the padded sequence serves as both input and target.
    """

    def __getitem__(self, idx):
        """Return ``(x, y, mask)`` float tensors for customer ``idx``."""
        start = 0 if idx == 0 else self.ids[idx-1].item()
        stop = self.ids[idx].item()
        targets = self.data[start:stop][:, self.yids]

        if targets.shape[0] <= 2:
            # Too short to shift: input and target are the same padded sequence.
            x, mask = self._pad(targets)
            return toT(x), toT(x), toT(mask)

        x, mask = self._pad(targets[:-2])
        y, _ = self._pad(targets[1:-1])
        return toT(x), toT(y), toT(mask)

class ValidRnnDataset(RnnDataset):
    """Validation view: predicts every row from its predecessor, including the final row.

    With more than one row, the input is every row except the last and the
    target is every row except the first. With a single row, the padded
    sequence serves as both input and target.
    """

    def __getitem__(self, idx):
        """Return ``(x, y, mask)`` float tensors for customer ``idx``."""
        start = 0 if idx == 0 else self.ids[idx-1].item()
        stop = self.ids[idx].item()
        targets = self.data[start:stop][:, self.yids]

        if targets.shape[0] <= 1:
            # A single row cannot be shifted: input and target coincide.
            x, mask = self._pad(targets)
            return toT(x), toT(x), toT(mask)

        x, mask = self._pad(targets[:-1])
        y, _ = self._pad(targets[1:])
        return toT(x), toT(y), toT(mask)
    
class TestRnnDataset(RnnDataset):
    """Inference view: yields each customer's full padded history, with no target or mask."""

    def __getitem__(self, idx):
        """Return the padded/truncated feature sequence of customer ``idx`` as a float tensor."""
        start = 0 if idx == 0 else self.ids[idx-1].item()
        stop = self.ids[idx].item()
        history = self.data[start:stop][:, self.yids]
        padded, _ = self._pad(history)
        return toT(padded)

In [5]:
# Read the training parquet with cudf and preview it.
train = cudf.read_parquet(f'{PATH}/train.parquet')
print(train.shape)
train.head()

(5531451, 190)


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0,0.008724,1.006838,0.009228,0.124035,0.0,0.004709,...,-1,-1,-1,0,0,0.0,,0,0.00061,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0,0.004923,1.000653,0.006151,0.12675,0.0,0.002714,...,-1,-1,-1,0,0,0.0,,0,0.005492,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,3,0.021655,1.009672,0.006815,0.123977,0.0,0.009423,...,-1,-1,-1,0,0,0.0,,0,0.006986,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0,0.013683,1.0027,0.001373,0.117169,0.0,0.005531,...,-1,-1,-1,0,0,0.0,,0,0.006527,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0,0.015193,1.000727,0.007605,0.117325,0.0,0.009312,...,-1,-1,-1,0,0,0.0,,0,0.008126,0


Note that this dataframe doesn't have a `target` column, so training our RNN is completely self-supervised.

In [6]:
# Hyperparameters (sequence length, hidden size, epochs, ...) come from rnn.yaml; load_yaml also prints them.
config = load_yaml('rnn.yaml')

Config(model='rnn', epochs=5, batch_size=512, seq=5, H1=512, H2=128, layers=1, E=192, dropout=0, lr=0.001, wd=0.0, tcols='all')


In [7]:
%%time

# All three datasets are built from the same `train` frame and each constructor repeats the
# full preprocessing; they differ only in how __getitem__ slices a customer's rows.
train_ds = TrainRnnDataset(train,config)
valid_ds = ValidRnnDataset(train,config)
test_ds = TestRnnDataset(train,config)

RnnDataset not used columns:
['customer_ID', 'S_2', 'cid']
RnnDataset not used columns:
['customer_ID', 'S_2', 'cid']
RnnDataset not used columns:
['customer_ID', 'S_2', 'cid']
CPU times: user 6.11 s, sys: 8.02 s, total: 14.1 s
Wall time: 14.1 s


In [8]:
# The datasets hold their own preprocessed copies, so the raw frame can be released.
del train
gc.collect()

421

### Model Definition

In [9]:
class RNN(pl.LightningModule):
    """GRU next-step predictor with a residual connection, trained with masked MSE.

    Args:
        x_dim: number of non-target features (accepted for interface symmetry; unused).
        y_dim: number of features per time step; both the input and output width.
        config: object exposing ``H1``, ``layers``, ``dropout``, ``lr``, ``wd`` and ``epochs``.
    """

    def __init__(self, x_dim, y_dim, config):
        super().__init__()

        self.config = config
        H = config.H1
        self.gru = nn.GRU(input_size=y_dim, hidden_size=H,
                          batch_first=True,bidirectional=False,
                          num_layers=config.layers, dropout=config.dropout)
        self.out = nn.Linear(H, y_dim)

    def forward(self, x):
        """Map ``x`` to a same-shaped prediction: ``x + Linear(ReLU(GRU(x)))``."""
        x0 = x
        x,_ = self.gru(x)
        x = F.relu(x)
        x = self.out(x)
        # Residual connection: the network only has to model the change from the input.
        return x + x0

    def _f(self, batch):
        """Unpack an ``(x, y, mask)`` batch and return ``(prediction, x, y, mask)``.

        Raises:
            ValueError: if ``batch`` is a bare tensor or does not have exactly three items.
        """
        # A bare tensor whose first dimension happens to be 3 must not be unpacked as (x, y, mask).
        if torch.is_tensor(batch) or len(batch) != 3:
            raise ValueError("expected a batch of the form (x, y, mask)")
        x,y,mask = batch
        return self(x),x,y,mask

    def training_step(self, batch, batch_nb):
        """Return the masked training loss; logged as ``train`` / ``train_last``."""
        return self._loss(batch, tag='train')

    def validation_step(self, batch, batch_nb):
        """Return the masked validation loss and also log the ``naive`` baseline."""
        return self._loss(batch, tag='valid', naive=True)

    def predict_step(self, batch, batch_nb):
        """Return predictions for either a bare input tensor or an ``(x, y, mask)`` batch."""
        if torch.is_tensor(batch):
            return self(batch)
        yp,_,_,_ = self._f(batch)
        return yp

    def _loss(self, batch, tag, naive=False):
        """Compute and log the loss for ``batch``.

        With ``naive=True`` the loss of using the input itself as the
        prediction is logged under the ``naive`` tag as a baseline.
        """
        yp,x2,y2,mask = self._f(batch)
        loss = self._compute_loss(yp,y2,mask,tag)
        if naive:
            self._compute_loss(x2,y2,mask,'naive')
        return loss

    def _compute_loss(self,yp,y2,mask,tag):
        """Masked MSE over all time steps, plus plain MSE on the last step.

        Logs the masked loss as ``{tag}`` and the last-step loss as
        ``{tag}_last``; returns the masked loss.
        """
        # Per-position mean squared error over the feature axis.
        loss = ((yp-y2)**2).mean(dim=-1)
        # Average over real (unmasked) positions only.
        loss = (loss*mask).sum()/mask.sum()
        lossp = F.mse_loss(yp[:,-1,:],y2[:,-1,:])
        self.log(f'{tag}', loss, prog_bar=True)
        self.log(f'{tag}_last', lossp, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Adam with cosine annealing spanning ``config.epochs`` scheduler steps."""
        config = self.config
        adam = torch.optim.Adam(self.parameters(), lr=config.lr,
                                weight_decay=config.wd)
        slr = torch.optim.lr_scheduler.CosineAnnealingLR(adam,
                                                         config.epochs)
        return [adam], [slr]
            
class AutoRegressiveRNN(nn.Module):
    """Inference-time wrapper that rolls the one-step GRU forward autoregressively.

    The submodules are named ``gru`` and ``out`` so that a state dict with
    those parameter names loads directly.

    Args:
        x_dim: number of non-target features (accepted for interface symmetry; unused).
        y_dim: number of features per time step; both the input and output width.
        config: object exposing ``H1``, ``layers`` and ``dropout``.
        horizon: number of future steps to generate (default 13).
    """

    def __init__(self, x_dim, y_dim, config, horizon=13):
        super().__init__()

        if horizon < 1:
            raise ValueError("horizon must be a positive integer")

        self.config = config
        self.horizon = horizon
        H = config.H1
        self.gru = nn.GRU(input_size=y_dim, hidden_size=H,
                          batch_first=True,bidirectional=False,
                          num_layers=config.layers, dropout=config.dropout)
        self.out = nn.Linear(H, y_dim)

    def f(self, x):
        """One-step model: ``x + Linear(ReLU(GRU(x)))``, same shape as ``x``."""
        x0 = x
        x,_ = self.gru(x)
        x = F.relu(x)
        x = self.out(x)
        return x + x0

    def forward(self, x):
        """Generate ``self.horizon`` future steps from the history ``x``.

        Args:
            x: tensor indexed as (batch, time, feature).

        Returns:
            Tensor of shape (batch, horizon, feature).
        """
        res = []
        for _ in range(self.horizon):
            p = self.f(x)
            res.append(p[:,-1,:])
            # Slide the window: drop the oldest step and append the newest prediction.
            x = torch.cat([x[:,1:,:],p[:,-1:,:]],dim=1)
        yp = torch.stack(res,dim=1)
        return yp

### Training

In [10]:
# With tcols='all' every remaining column is a target, so x_dim comes out as 0 (see output below).
x_dim, y_dim = train_ds.get_x_y_dims()
print('x_dim, y_dim', x_dim, y_dim)
model = RNN(x_dim,y_dim,config)

x_dim, y_dim 0 177


In [11]:
batch_size = config.batch_size
cpu_workers = 4
# Training batches are shuffled and the last partial batch is dropped.
train_dl = DataLoader(train_ds, batch_size=batch_size,
                shuffle=True, num_workers=cpu_workers,
                drop_last=True)

# Validation and test keep dataset order and every sample.
valid_dl = DataLoader(valid_ds, batch_size=batch_size,
                shuffle=False, num_workers=cpu_workers,
                drop_last=False)

test_dl = DataLoader(test_ds, batch_size=batch_size,
                    shuffle=False, num_workers=cpu_workers,
                    drop_last=False)

In [12]:
EPOCHS = config.epochs
msgs = {}
# Checkpoint on the last-step validation MSE ('valid_last'); lower is better.
checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor='valid_last', mode='min')
# `gpus=1` is deprecated (it triggers the rank_zero_deprecation warning);
# accelerator/devices is the supported way to request a single GPU.
trainer = pl.Trainer(accelerator='gpu', devices=1, max_epochs=EPOCHS,
                     callbacks=[checkpoint_callback],
                     )

  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [13]:
# Train on train_dl; valid_dl feeds the 'valid_last' metric watched by the checkpoint callback.
trainer.fit(model, train_dataloaders=train_dl, 
                val_dataloaders=valid_dl)

Missing logger folder: /home/nfs/jiweil/rapids/triton_amex/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [7]

  | Name | Type   | Params
--------------------------------
0 | gru  | GRU    | 1.1 M 
1 | out  | Linear | 90.8 K
--------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.609     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


In [14]:
# Drop the training objects; `model` is rebuilt below as the autoregressive inference module.
del trainer, model

### Inference and generate RNN features

In [15]:
# Same gru/out layers as the trained RNN, wrapped in a multi-step forward().
model = AutoRegressiveRNN(x_dim,y_dim,config)

In [16]:
# Prefer the checkpoint recorded by this session's ModelCheckpoint callback. The literal path
# only matches a first run (version_0) that ended at step 4480, so it is kept as a fallback.
ckpt_path = (checkpoint_callback.best_model_path
             or 'lightning_logs/version_0/checkpoints/epoch=4-step=4480.ckpt')
# Load onto the CPU; the model is moved to the GPU explicitly after the weights are restored.
weights = torch.load(ckpt_path, map_location='cpu')

In [17]:
# strict=True: every checkpoint key must match a gru/out parameter of AutoRegressiveRNN.
model.load_state_dict(weights['state_dict'],strict=True)

<All keys matched successfully>

In [18]:
model.forward

<bound method AutoRegressiveRNN.forward of AutoRegressiveRNN(
  (gru): GRU(177, 512, batch_first=True)
  (out): Linear(in_features=512, out_features=177, bias=True)
)>

In [19]:
# Switch to inference mode and move the model to the GPU.
model.eval()
model.cuda()

AutoRegressiveRNN(
  (gru): GRU(177, 512, batch_first=True)
  (out): Linear(in_features=512, out_features=177, bias=True)
)

In [20]:
len(test_dl)

897

In [21]:
%%time

res = []
# Inference only: without no_grad, autograd would record a graph for every one of the
# unrolled autoregressive steps of every batch.
with torch.no_grad():
    for x in tqdm(test_dl, total=len(test_dl)):
        x = x.cuda()
        yp = model(x)
        res.append(yp.cpu().numpy())

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 897/897 [00:13<00:00, 67.97it/s]

CPU times: user 9.91 s, sys: 3.12 s, total: 13 s
Wall time: 13.2 s





In [22]:
# Stack the per-batch predictions along the batch axis.
rnn_feas = np.concatenate(res)
rnn_feas.shape

(458913, 13, 177)

In [23]:
# Persist the predicted future profiles for use as downstream features.
np.save('rnn_feas.npy',rnn_feas)

In [24]:
x.shape

torch.Size([161, 5, 177])

### Write config file and compile the model

In [25]:
def generate_config(model_name, in_seq_len, num_feas, out_seq_len, max_batch_size=42134):
    """Write a Triton ``config.pbtxt`` for a libtorch model and return its path.

    Args:
        model_name: model name; also the directory the config is written into.
        in_seq_len: number of time steps in the input tensor.
        num_feas: number of features per time step (input and output).
        out_seq_len: number of time steps in the output tensor.
        max_batch_size: value written to the ``max_batch_size`` field.

    Returns:
        The path ``<model_name>/config.pbtxt``.
    """
    config_text = f"""name: "{model_name}"
platform: "pytorch_libtorch"
max_batch_size : {max_batch_size}
input [
  {{
    name: "input__0"
    data_type: TYPE_FP32
    dims: [{in_seq_len}, {num_feas}]
  }}
]
output [
  {{
    name: "output__0"
    data_type: TYPE_FP32
    dims: [{out_seq_len}, {num_feas}]
  }}
]

dynamic_batching {{
  max_queue_delay_microseconds: 100
}}"""
    # Create the model directory if needed so the function works on its own.
    os.makedirs(model_name, exist_ok=True)
    config_path = os.path.join(model_name, 'config.pbtxt')
    with open(config_path, 'w') as file_:
        file_.write(config_text)

    return config_path
# reshape {{ shape: [ 1, {in_seq_len}, {num_feas}]}}

In [26]:
model_name='AutoRegressiveRNN'
# Version directory '1' will hold the compiled model file.
Path(f'{model_name}/1').mkdir(parents=True, exist_ok=True)

# `x` and `yp` are the last batch left over from the inference loop; only their shapes are used.
generate_config(model_name=model_name, 
                in_seq_len=x.shape[1], 
                num_feas=x.shape[2], 
                out_seq_len=yp.shape[1])

'AutoRegressiveRNN/config.pbtxt'

In [27]:
%%time

# Trace with a single example taken from the last inference batch (already on the GPU),
# then save under the version directory created above.
traced_script_module = torch.jit.trace(model, x[:1,:,:])
traced_script_module.save(f"{model_name}/1/model.pt")

CPU times: user 318 ms, sys: 2.91 ms, total: 321 ms
Wall time: 326 ms
