In [1]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm

from tab_net import TabNet

from data_processor import ( PandasCatLoader, 
                             get_feature_sizes, 
                             LabelEnc)

from run_test import ( set_seed, 
                       na_imputer,
                       RealNormalizer,
                       data_processing, 
                       make_dataset)

### SAVE LEARNED EMBEDDINGS 

In [10]:
# Configuration shared by data_processing(), make_dataset() and TabNet below.
# NOTE(review): the saved `params` dump further down in this notebook shows
# different values (min_embed_size 4, cat_th 100, lr 0.004, drop_na 0.7), so it
# appears to come from an earlier run - confirm which settings the checkpoint
# was actually trained with.
params = {
    'TARGET': 'isFraud',
    'embed_scaler': 1/1.6,  # division value
    'embed_exponent': 0.65,
    'min_embed_size': 2,
    'net_type': 'embeddings',  #'linear'
    "layer_norm": False,
    "batch_norm": True,
    "mid_features": 512,
    "num_residual": 2,
    "drop_out": 0,
    'out_size': 2,
    'cat_th': 50,
    'batch_size': 512,
    'lr': 0.0004,
    'tags': ['Tabular NET', 'Fraud'],
    'drop_na': 0.6,
}

In [11]:
# Pre-split training / validation CSV files.
TRAIN_PATH = '../../../data_main/fraud/TRAIN.csv'
VAL_PATH = '../../../data_main/fraud/VAL.csv'

# Fix the seed before any processing happens.
set_seed(44)

# `params` is rebound to the value returned by data_processing, so re-running
# this cell feeds the already-processed dict back in.
train, val, params = data_processing(TRAIN_PATH, VAL_PATH, params)

# shuffle=False keeps batches in a fixed order for the feature dump below.
train_set, val_set = make_dataset(
    train,
    val,
    params,
    shuffle=False,
    batch_size=params['batch_size'],
)

Nan in card2 555, imputing ...
Nan in card3 110, imputing ...
Nan in card5 609, imputing ...
Nan in addr1 36328, imputing ...
Nan in addr2 36328, imputing ...
Nan in D1 129, imputing ...
Nan in D4 47618, imputing ...
Nan in D6 46194, imputing ...
Nan in D8 41815, imputing ...
Nan in D9 41815, imputing ...
Nan in D10 45091, imputing ...
Nan in D12 51257, imputing ...
Nan in D13 49384, imputing ...
Nan in D14 49154, imputing ...
Nan in D15 45605, imputing ...
Nan in V12 45121, imputing ...
Nan in V13 45121, imputing ...
Nan in V14 45121, imputing ...
Nan in V15 45121, imputing ...
Nan in V16 45121, imputing ...
Nan in V17 45121, imputing ...
Nan in V18 45121, imputing ...
Nan in V19 45121, imputing ...
Nan in V20 45121, imputing ...
Nan in V21 45121, imputing ...
Nan in V22 45121, imputing ...
Nan in V23 45121, imputing ...
Nan in V24 45121, imputing ...
Nan in V25 45121, imputing ...
Nan in V26 45121, imputing ...
Nan in V27 45121, imputing ...
Nan in V28 45121, imputing ...
Nan in V29 

In [5]:
# Quick sanity check of the processed training data's dimensions.
train.shape

(86701, 705)

In [16]:
# NOTE(review): duplicate of the shape check above. The saved output of this
# cell, (89957, 13), disagrees with the (86701, 705) shown by the identical
# cell before it and with the 86701 rows later reported by get_features, so it
# looks like a stale result from an earlier run - re-run or delete this cell.
train.shape

(89957, 13)

In [16]:
# Rebuild the network from `params` and restore the trained weights.
PATH = 'Tabular NET Fraud embeddings _Fraud.pth'
model = TabNet(params)
# The model is only used for inference in this notebook, so put it in
# evaluation mode up front (params['batch_norm'] is True, so presumably it
# contains BatchNorm layers whose behaviour differs between train and eval).
model.eval()
# map_location='cpu' lets a checkpoint that was saved from a GPU load on a
# CPU-only machine; get_features() converts activations with .numpy(), which
# requires CPU tensors anyway.
# load_state_dict stays the last expression so the cell still displays the
# key-matching report.
model.load_state_dict(torch.load(PATH, map_location='cpu'))

<All keys matched successfully>

In [9]:
# Display the full configuration, including the keys that are not set in the
# config cell (e.g. 'real_features_size', 'cat_embed_sizes').
# NOTE(review): the saved output of this cell does not match the config cell
# (min_embed_size 4 vs 2, cat_th 100 vs 50, lr 0.004 vs 0.0004, drop_na 0.7 vs
# 0.6, net_type 'embedidings'), so it appears to be stale - re-run it. The dump
# is also very long; consider showing only the scalar entries.
params

{'TARGET': 'isFraud',
 'embed_scaler': 0.625,
 'embed_exponent': 0.65,
 'min_embed_size': 4,
 'net_type': 'embedidings',
 'layer_norm': False,
 'batch_norm': True,
 'mid_features': 512,
 'num_residual': 2,
 'drop_out': 0,
 'out_size': 2,
 'cat_th': 100,
 'batch_size': 512,
 'lr': 0.004,
 'tags': ['Tabular NET', 'Fraud'],
 'drop_na': 0.7,
 'real_features_size': 136,
 'num_categoical': 586,
 'cat_embed_sizes': {'V17_col_na_binary': {'embedding_dim': 4,
   'num_categories': 2},
  'D9_col_na_binary': {'embedding_dim': 4, 'num_categories': 2},
  'V48_col_na_binary': {'embedding_dim': 4, 'num_categories': 2},
  'V178_col_na_binary': {'embedding_dim': 4, 'num_categories': 2},
  'V223_col_na_binary': {'embedding_dim': 4, 'num_categories': 2},
  'V203_col_na_binary': {'embedding_dim': 4, 'num_categories': 2},
  'V155': {'embedding_dim': 11, 'num_categories': 22},
  'V236_col_na_binary': {'embedding_dim': 4, 'num_categories': 2},
  'V18_col_na_binary': {'embedding_dim': 4, 'num_categories': 2},


In [12]:
# Expected number of columns of the extracted feature matrix; get_features()
# compares x.shape against this value. The key is not set in the config cell -
# presumably it is added by data_processing() or TabNet; verify.
params['input_concat_vector_size']

2700

In [13]:
def get_features(data):
    """Run every batch of `data` through the global `model` and collect features.

    After each forward pass the activations stored in `model.vector` and the
    batch's "target" entry are gathered and stacked along axis 0.

    The forward passes run in evaluation mode and under `torch.no_grad()`, so
    BatchNorm/Dropout layers (if any) behave deterministically, running
    statistics are not modified, and no autograd graph is built. The model's
    previous train/eval mode is restored before returning.

    Relies on the notebook-level globals `model` and `params`.

    Parameters
    ----------
    data : iterable of batches
        Must expose `data.dataset.data.shape[0]` (expected number of rows) and
        yield mapping-like batches accepted by `model` that contain a
        "target" tensor.

    Returns
    -------
    (x, y) : tuple of numpy.ndarray
        `x` holds the concatenated `model.vector` values, `y` the concatenated
        targets.

    Raises
    ------
    ValueError
        Raised by `np.concatenate` if `data` yields no batches.
    """
    x = []
    y = []

    # Expected output shape, used only for the sanity print below.
    rows = data.dataset.data.shape[0]
    columns = params['input_concat_vector_size']

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in tqdm(data):
                # The forward pass is run for its side effect of setting model.vector.
                model(batch)
                # .cpu() is a no-op for CPU tensors and keeps .numpy() valid otherwise.
                x.append(model.vector.detach().cpu().numpy())
                y.append(batch["target"].cpu().numpy())
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    x = np.concatenate(x, axis=0)
    y = np.concatenate(y, axis=0)
    print(x.shape)
    print(f'Dimensions match: {x.shape == (rows, columns)}')
    return x,y

In [17]:
# Extract the training-set features and targets, then persist both arrays.
train_x, train_y = get_features(train_set)
# np.save appends '.npy' when the name lacks it, so these land on disk as
# 'train_x.pkl.npy' / 'train_y.pkl.npy' (NumPy format, not pickle).
for _fname, _arr in (('train_x.pkl', train_x), ('train_y.pkl', train_y)):
    np.save(_fname, _arr)

100%|██████████| 170/170 [05:12<00:00,  1.84s/it]


(86701, 2700)
Dimensions match: True


In [18]:
# Extract the validation-set features and targets, then persist both arrays.
val_x, val_y = get_features(val_set)
# np.save appends '.npy' when the name lacks it, so these land on disk as
# 'val_x.pkl.npy' / 'val_y.pkl.npy' (NumPy format, not pickle).
for _fname, _arr in (('val_x.pkl', val_x), ('val_y.pkl', val_y)):
    np.save(_fname, _arr)

100%|██████████| 57/57 [01:52<00:00,  1.98s/it]


(28795, 2700)
Dimensions match: True
