In [1]:
import pandas as pd
from pytorch_lightning import Trainer

from model.loader import CensusEncoder
from model.modules import Decoder, Encoder
from model.vae import VAE

In [2]:
census = pd.read_csv("~/Data/sample.csv")
census = census.drop("resident_id_m", axis=1)


def hash(line: pd.Series) -> str:
    return "".join([str(l) for l in line])


hashed = list(census.apply(hash))
hashset = set(hashed)
p = len(hashset) / len(hashed)
print(len(census))
print(f"Probability of unique person = {p:.3}")

census.head()

604351
Probability of unique person = 1.0


Unnamed: 0,approx_social_grade,country_of_birth_3a,economic_activity_status_10m,ethnic_group_tb_6a,health_in_general,hh_families_type_6a,hours_per_week_worked,in_full_time_education,industry_10a,iol22cd,legal_partnership_status_6a,occupation_10a,region,religion_tb,residence_type,resident_age_7d,sex,usual_short_student
0,4,1,1,4,1,4,4,2,4,-8,1,5,E12000003,2,1,4,2,1
1,-8,1,5,4,2,1,-8,2,7,-8,1,2,E12000005,2,1,7,2,1
2,-8,2,5,4,2,1,-8,2,7,-8,1,3,E12000002,2,1,7,2,1
3,-8,1,5,4,3,2,-8,2,8,-8,2,8,E12000006,2,1,7,1,1
4,4,1,8,4,2,1,-8,2,9,-8,1,9,E12000002,1,1,2,2,1


In [3]:
census_encoder = CensusEncoder(data=census)
census_encoder.names()

['approx_social_grade',
 'country_of_birth_3a',
 'economic_activity_status_10m',
 'ethnic_group_tb_6a',
 'health_in_general',
 'hh_families_type_6a',
 'hours_per_week_worked',
 'in_full_time_education',
 'industry_10a',
 'iol22cd',
 'legal_partnership_status_6a',
 'occupation_10a',
 'region',
 'religion_tb',
 'residence_type',
 'resident_age_7d',
 'sex',
 'usual_short_student']

In [4]:
dataloader = census_encoder.encode(data=census)

torch.Size([604351, 18])


In [5]:
encoder = Encoder(
    encodings=census_encoder.encodings(),
    embed_size=32,
    hidden_n=2,
    hidden_size=64,
    latent_size=8,
)
decoder = Decoder(
    encodings=census_encoder.encodings(),
    embed_size=32,
    hidden_n=2,
    hidden_size=64,
    latent_size=8,
)
vae = VAE(
    names=census_encoder.names(),
    encodings=census_encoder.encodings(),
    encoder=encoder,
    decoder=decoder,
    beta=0.001,
    lr=0.001,
)

In [6]:
trainer = Trainer(limit_train_batches=100, max_epochs=1)
trainer.fit(model=vae, train_dataloaders=dataloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/fred/miniforge3/envs/mirror/lib/python3.13/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default

  | Name    | Type    | Params | Mode 
--------------------------------------------
0 | encoder | Encoder | 10.7 K | train
1 | decoder | Decoder | 6.8 K  | train
--------------------------------------------
17.6 K    Trainable params
0         Non-trainable params
17.6 K    Total params
0.070     Total estimated model params size (MB)
34  

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

torch.Size([1024, 18])


RuntimeError: expected scalar type Long but found Int