In [1]:
import sys
# Add the parent directory (relative to the working directory) to the module
# search path; presumably this is what makes the `swapdae` package importable
# in the following cell — confirm against the repository layout.
sys.path.append('../')

In [2]:
import copy
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from swapdae.model import AutoEncoder
from swapdae.utils import MultiColumnLabelEncoder
from swapdae.data import swap_dataframe, SwapNoiseDataset

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, QuantileTransformer, MinMaxScaler


In [3]:
# Load the dataset from `adult.csv` in the current working directory and
# preview its first ten rows.
data = pd.read_csv('adult.csv')
data.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
6,29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K
8,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K
9,55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K


In [4]:
# Produce a noised version of the table with `swap_dataframe` at prob=0.3 and
# preview it. Presumably each cell is replaced, with that probability, by a
# value taken from another row of the same column — confirm against swapdae.data.
# NOTE(review): the whole frame is passed in, so the `income` label column is
# eligible for swapping as well (the preview output shows a changed label).
swap_data = swap_dataframe(data, prob=0.3)
swap_data.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,13,Never-married,Machine-op-inspct,Own-child,White,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,60,United-States,>50K
2,28,Local-gov,47541,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Female,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,9,Never-married,Exec-managerial,Own-child,White,Female,0,0,40,United-States,<=50K
5,34,Private,198693,10th,6,Never-married,Other-service,Husband,White,Male,0,0,40,United-States,<=50K
6,29,?,227026,HS-grad,9,Never-married,?,Husband,Black,Male,0,0,40,United-States,<=50K
7,63,Self-emp-not-inc,104626,Some-college,13,Married-civ-spouse,Prof-specialty,Not-in-family,White,Male,3103,0,60,United-States,>50K
8,30,Private,369667,11th,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K
9,55,Private,69579,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,45,United-States,<=50K


In [5]:
class Preprocessor(BaseEstimator, TransformerMixin):
    """Scale the numeric columns and label-encode the categorical columns.

    Parameters
    ----------
    num_indices : sequence of int
        Positional indices of the columns passed through ``scaler``. May be
        empty, in which case the scaler is never fitted or applied.
    cate_indices : sequence of int
        Positional indices handed to ``MultiColumnLabelEncoder``.
    scaler : object with ``fit``/``transform``, optional
        Scaler for the numeric columns; a ``StandardScaler`` when ``None``.
    """

    def __init__(self, num_indices, cate_indices, scaler=None):
        # Store ``num_indices`` exactly as given. Wrapping it in ``np.array``
        # here hands back a different object than the caller passed, which
        # makes ``sklearn.base.clone`` (used by grid search and
        # cross-validation) reject the estimator.
        self.num_indices = num_indices
        self.cate_indices = cate_indices
        self.scaler = StandardScaler() if scaler is None else scaler
        self.label_encoder = MultiColumnLabelEncoder(self.cate_indices)

    def _numeric_columns(self):
        """Return ``num_indices`` as an array usable for column indexing."""
        columns = np.asarray(self.num_indices)
        if columns.size == 0:
            # ``np.asarray([])`` is float64, which NumPy refuses as an index.
            columns = columns.astype(int)
        return columns

    @staticmethod
    def _as_array(X):
        """Unwrap DataFrames and make sure the array can hold scaled floats."""
        if isinstance(X, pd.DataFrame):
            X = X.values
        if isinstance(X, np.ndarray) and X.dtype.kind in 'biu':
            # Writing scaled values into a bool/integer array would silently
            # truncate them. Object dtype keeps every value as-is and is the
            # same dtype a mixed-type DataFrame yields from ``.values``.
            X = X.astype(object)
        return X

    def fit(self, X, y=None):
        """Fit the scaler on the numeric columns and the label encoder on ``X``.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        Returns ``self``.
        """
        X = self._as_array(X)

        numeric = X[:, self._numeric_columns()]
        # A zero-column slice means there is nothing to scale; skip the scaler
        # instead of asking it to fit on no features.
        if numeric.shape[1]:
            self.scaler.fit(numeric)
        self.label_encoder.fit(X)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with numeric columns scaled and categorical ones encoded.

        The input is not modified: the scaled values are written into a copy,
        and that copy is what the label encoder transforms and returns.
        ``y`` is accepted for scikit-learn API compatibility and ignored.
        """
        X = self._as_array(X)
        columns = self._numeric_columns()
        numeric = X[:, columns]

        X_transformed = copy.deepcopy(X)
        if numeric.shape[1]:
            X_transformed[:, columns] = self.scaler.transform(numeric)
        X_transformed = self.label_encoder.transform(X_transformed)
        return X_transformed

In [6]:
# Split the table by position: every column except the last becomes the
# feature frame `X`, and the last column becomes the target `y`.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [7]:
# Positional column roles within `X`, named so the two groups are easy to
# tell apart at the call site.
numeric_columns = [0, 2, 4, 10, 11, 12]
categorical_columns = [1, 3, 5, 6, 7, 8, 9, 13]

preprocessor = Preprocessor(
    num_indices=numeric_columns,
    cate_indices=categorical_columns,
)
# Fit on the feature frame and transform it in the same step.
X_train = preprocessor.fit_transform(X)

In [8]:
# Wrap the preprocessed features in a SwapNoiseDataset and look at one item.
# Per the displayed output, an item is a pair of arrays with one entry per column.
# NOTE(review): which element of the pair is the noised row and which is the
# clean one is not visible here — confirm against swapdae.data.
train_dataset = SwapNoiseDataset(pd.DataFrame(X_train))
train_dataset[0]

(array([-9.95128932e-01,  4.00000000e+00,  3.51674526e-01,  1.00000000e+00,
        -1.19725891e+00,  4.00000000e+00,  3.00000000e+00,  3.00000000e+00,
         2.00000000e+00,  1.00000000e+00, -1.44803531e-01, -2.17127099e-01,
        -3.40869635e-02,  3.90000000e+01]),
 array([-9.95128932e-01,  4.00000000e+00,  3.51674526e-01,  1.00000000e+00,
        -1.19725891e+00,  4.00000000e+00,  7.00000000e+00,  3.00000000e+00,
         2.00000000e+00,  1.00000000e+00, -1.44803531e-01, -2.17127099e-01,
        -3.40869635e-02,  3.90000000e+01]))

In [9]:
# Build the autoencoder from the objects defined in earlier cells rather than
# repeating their literal values, so the model cannot drift out of sync with
# the data or the preprocessor if the column layout changes.
model = AutoEncoder(
    input_dim=X.shape[1],  # one input per feature column (14 for this table)
    encoder_hidden_dims=[128, 64, 32],
    decoder_hidden_dims=[64, 64],
    # Same categorical positions the preprocessor was configured with; copied
    # so the model does not share the preprocessor's list object.
    cate_indices=list(preprocessor.cate_indices),
    cardinalities=preprocessor.label_encoder.cardinalities,
    cate_embedding_dim=4,
    dropout_rate=0.1,
    activation='mish',
    is_bias=False,
    block_type='mlp'
)

In [10]:
# Display the module's repr to inspect the layer layout.
model

AutoEncoder(
  (embedding): EmbeddingEncoder(
    (embedding_layers): ModuleList(
      (0): Embedding(9, 4)
      (1): Embedding(16, 4)
      (2): Embedding(7, 4)
      (3): Embedding(15, 4)
      (4): Embedding(6, 4)
      (5): Embedding(5, 4)
      (6): Embedding(2, 4)
      (7): Embedding(42, 4)
    )
  )
  (encoder): Sequential(
    (0): LinearBlock(
      (linear): Linear(in_features=38, out_features=128, bias=False)
      (bn): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (activation): Mish()
    )
    (1): LinearBlock(
      (linear): Linear(in_features=128, out_features=64, bias=False)
      (bn): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (activation): Mish()
    )
    (2): LinearBlock(
      (linear): Linear(in_features=64, out_features=32, bias=False)
      (bn): BatchNorm1d(32, eps=1e-05, momentum

In [11]:
import torch
from torch.utils.data import DataLoader

In [12]:
# Batch the dataset 512 items at a time; `shuffle` is left at its default.
loader = DataLoader(train_dataset, batch_size=512)

In [13]:
# Pull just the first batch. `next(iter(...))` states that intent directly and
# raises StopIteration right here if the loader is empty, rather than leaving
# `X_tensor` undefined for a later cell to trip over.
dd = next(iter(loader))
# Each batch is a pair; only its first element is kept.
X_tensor, _ = dd



In [17]:
# Run the batch through the model, which returns a pair unpacked here as
# `enc` and `dec`.
# NOTE(review): the "emb : ..." line printed under this cell is not produced by
# any code in the cell — presumably a debug print inside the model; confirm.
enc, dec = model(X_tensor)

emb :  torch.Size([512, 38])


In [15]:
 preprocessor.label_encoder.cardinalities  # the list passed to AutoEncoder as `cardinalities`; presumably distinct-category counts per encoded column — confirm

[9, 16, 7, 15, 6, 5, 2, 42]

In [18]:
# Shape of the first value returned by the model.
enc.size()

torch.Size([512, 32])

In [20]:
# `dec` is indexable; this is the shape of its first entry.
dec[0].size()

torch.Size([512, 1])

In [21]:
# Shape of the second entry of `dec`.
dec[1].size()

torch.Size([512, 9])