In [1]:
import sys
import glob

sys.path.insert(0, '/home/eharper/github_projects/evenoldridge/BatchDataloader')

In [2]:
from batch_dataloader import BatchDataLoader
from batch_dataset import BatchDataset
from batch_dataset import TensorBatchDataset

import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import torch
import torch.nn as nn
import torch.optim as optim

In [3]:
PARQUET_DIR = '/home/eharper/yagr_projects/pytorch-dataloading/train_parquet'

INPUT_DIM = 7
LATENT_DIM = 3

In [4]:
parquet_filepaths = glob.glob(PARQUET_DIR + '/*.parquet')

In [5]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        
        self.encoder = nn.Sequential(
            nn.Linear(self.input_dim, 96),
            nn.Tanh(),
            nn.Dropout(.1),
            nn.Linear(96, 64),
            nn.Tanh(),
            nn.Dropout(.1),
            nn.Linear(64, 48),
            nn.Tanh(),
            nn.Dropout(.1),
            nn.Linear(48, 16),
            nn.Tanh(),
            nn.Dropout(.1),
            nn.Linear(16, self.latent_dim),
            nn.ReLU()
        )
        
        self.decoder = nn.Sequential(
            nn.Linear(self.latent_dim, 16),
            nn.Tanh(),
            nn.Dropout(.1),
            nn.Linear(16, 48),
            nn.Tanh(),
            nn.Dropout(.1),
            nn.Linear(48, 64),
            nn.Tanh(),
            nn.Dropout(.1),
            nn.Linear(64, 96),
            nn.Tanh(),
            nn.Dropout(.1),
            nn.Linear(96, self.input_dim),
            nn.ReLU()
        )
        
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [6]:
autoencoder = Autoencoder(INPUT_DIM, LATENT_DIM)

In [12]:
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)

In [13]:
num_epochs = 2

for epoch in range(num_epochs):
    
    for parquet_idx, parquet_path in enumerate(parquet_filepaths):
        
        parquet = pq.read_table(parquet_path)

        tensors = [torch.from_numpy(parquet.to_pandas().values)]

        tensor_batch_dataset = TensorBatchDataset(tensors, batch_size=20)

        batch_loader = BatchDataLoader(tensor_batch_dataset,
                                       shuffle=True,
                                       drop_last=True)

        for batch_idx, batch in enumerate(batch_loader):
            batch = batch[0]
            #print(batch)
            reconstructions = autoencoder(batch)
            loss = criterion(reconstructions, batch)
            loss.backward()
            optimizer.step()
            
            print('Epoch %d | Parquet %d | Batch %d | Loss %.4f' %
                  (epoch, parquet_idx, batch_idx, loss.item())
                 )

Epoch 0 | Parquet 0 | Batch 0 | Loss 0.2980
Epoch 0 | Parquet 0 | Batch 1 | Loss 0.2564
Epoch 0 | Parquet 1 | Batch 0 | Loss 0.2426
Epoch 0 | Parquet 1 | Batch 1 | Loss 0.2347
Epoch 0 | Parquet 2 | Batch 0 | Loss 0.2148
Epoch 0 | Parquet 2 | Batch 1 | Loss 0.2127
Epoch 1 | Parquet 0 | Batch 0 | Loss 0.2042
Epoch 1 | Parquet 0 | Batch 1 | Loss 0.1934
Epoch 1 | Parquet 1 | Batch 0 | Loss 0.1886
Epoch 1 | Parquet 1 | Batch 1 | Loss 0.1767
Epoch 1 | Parquet 2 | Batch 0 | Loss 0.1907
Epoch 1 | Parquet 2 | Batch 1 | Loss 0.1782
