In [6]:
import boto3
import pandas as pd
import yaml
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import constants as c
import numpy as np
import os
import json
import time
import pyarrow.parquet as pq
import pyarrow.fs
import io

from tqdm import tqdm
from datetime import datetime
from torch.utils.data import DataLoader, TensorDataset

# Load the AWS credentials from the local `secrets.yml` file.
with open('secrets.yml', 'r') as f:
    secrets = yaml.safe_load(f)

aws_access_key_id = secrets['aws_access_key_id']
aws_secret_access_key = secrets['aws_secret_access_key']

# pyarrow filesystem handle, used below to read parquet datasets from S3.
s3_fs = pyarrow.fs.S3FileSystem(
    region='us-east-1',
    access_key=aws_access_key_id,
    secret_key=aws_secret_access_key,
)

# boto3 client, used below to list objects and upload artifacts.
s3 = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)


In [7]:

# Read into memory every object in bucket c.S3_BUCKET whose key starts with
# c.LC_VPATH. Each object is read as a parquet dataset; no extension filter is
# applied, so the prefix is expected to contain parquet files only.
def read_s3_pq(path):
    """Read the parquet dataset at `path` ("bucket/key") through `s3_fs` into a DataFrame."""
    return pq.ParquetDataset(path, filesystem=s3_fs).read_pandas().to_pandas()


# A single list call returns at most 1000 keys, so paginate to avoid silently
# dropping files once the prefix grows beyond that.
paginator = s3.get_paginator('list_objects_v2')
file_urls = [
    f"{c.S3_BUCKET}/{obj['Key']}"
    for page in paginator.paginate(Bucket=c.S3_BUCKET, Prefix=c.LC_VPATH)
    for obj in page.get('Contents', [])  # 'Contents' is absent when a page has no objects
]

if not file_urls:
    raise FileNotFoundError(f"No objects found in bucket {c.S3_BUCKET} under prefix {c.LC_VPATH}")

# Read every file first and concatenate once: calling pd.concat inside the
# loop copies the accumulated frame on every iteration (quadratic cost).
dfs = [read_s3_pq(url) for url in tqdm(file_urls)]
df = pd.concat(dfs)

print(f"Loaded {len(df)} records with {len(df.columns)} columns")

# Make sure all classes are represented.
# NOTE(review): assumes every column name has at least three '_'-separated
# tokens and that the third one identifies the class -- confirm against the
# feature-generation code.
n_classes = len(c.LC_LEGEND)
n_sampled_classes = len({col.split('_')[2] for col in df.columns})

assert n_classes == n_sampled_classes, f"Expected {n_classes} classes, but only found {n_sampled_classes} classes in the data"
print(f"Found {n_sampled_classes} classes in the data")

1it [00:00,  1.11it/s]

Loaded 10000 records with 161 columns
Found 23 classes in the data





### Toggle binary columns on or off

In [8]:
# Set to True to keep the columns whose name contains 'binary'.
use_binary = False

if not use_binary:
    # `col` rather than `c`, so the loop variable cannot be mistaken for the
    # `constants as c` module imported at the top of the notebook.
    cols_kept = [col for col in df.columns if 'binary' not in col]
    df = df[cols_kept]

print(df.shape)

(10000, 161)


In [9]:
from layers import ResidualBlock

class ResNetAutoencoder(nn.Module):
    """Fully connected autoencoder assembled from `ResidualBlock` layers.

    The encoder maps `D -> hidden_units -> ... -> z_dim` and the decoder
    mirrors it back to `D`; each of them contains exactly `num_layers` blocks.

    Args:
        D: Number of input (and output) features.
        hidden_units: Width of the intermediate blocks.
        num_layers: Number of blocks in the encoder and in the decoder (>= 2).
        z_dim: Size of the latent code.
        is_softmax: Optional boolean mask of length `D`. Masked columns are
            normalised with a softmax over consecutive groups of `n_classes`
            columns; all other columns go through a sigmoid. When `None`,
            every output goes through a sigmoid.
        n_classes: Size of each softmax group. Required when `is_softmax` is
            given, and the number of masked columns must be a multiple of it.

    Raises:
        ValueError: If `is_softmax` is given without `n_classes`, or if the
            number of masked columns is not a multiple of `n_classes`
            (the latter is checked in `forward`).
    """

    def __init__(self, D, hidden_units, num_layers, z_dim, is_softmax=None, n_classes=None):
        super().__init__()

        assert num_layers >= 2, "Number of layers should be at least 2 for an autoencoder."
        if is_softmax is not None and n_classes is None:
            raise ValueError("n_classes must be provided when is_softmax is given.")

        self.is_softmax = is_softmax
        self.n_classes = n_classes

        self.encoder = self._build_stack(D, hidden_units, z_dim, num_layers)
        self.decoder = self._build_stack(z_dim, hidden_units, D, num_layers)

    @staticmethod
    def _build_stack(in_dim, hidden_units, out_dim, num_layers):
        """Chain `num_layers` residual blocks: in_dim -> hidden ... hidden -> out_dim."""
        blocks = [ResidualBlock(in_dim, hidden_units)]
        # -2 because the first and last blocks are added explicitly.
        blocks.extend(ResidualBlock(hidden_units, hidden_units) for _ in range(num_layers - 2))
        blocks.append(ResidualBlock(hidden_units, out_dim))
        return nn.Sequential(*blocks)

    def forward(self, x):
        """Reconstruct `x`; returns a tensor of shape `(n, D)` in the input's column order."""
        logits = self.decoder(self.encoder(x))

        if self.is_softmax is None:
            return torch.sigmoid(logits)

        # Count on the mask as given (a host array in this notebook), then move
        # it next to the logits so that indexing also works on an accelerator.
        mask = torch.as_tensor(self.is_softmax, dtype=torch.bool)
        n_grouped = int(mask.sum())
        if n_grouped % self.n_classes != 0:
            raise ValueError(
                f"{n_grouped} softmax columns cannot be split into groups of {self.n_classes}."
            )
        mask = mask.to(logits.device)

        # (n, k * n_classes) -> (n, k, n_classes): one softmax per group of
        # n_classes consecutive masked columns. Explicit sizes (no -1) keep the
        # reshape valid even when the mask selects no column at all.
        batch = logits.shape[0]
        grouped = logits[:, mask].reshape(batch, n_grouped // self.n_classes, self.n_classes)
        probs_grouped = torch.softmax(grouped, dim=-1).reshape(batch, n_grouped)

        # Write each result back into its original column. Concatenating
        # "softmax columns, then sigmoid columns" would permute the output
        # relative to the input whenever the mask is not a leading prefix.
        probs = torch.empty_like(logits)
        probs[:, mask] = probs_grouped
        probs[:, ~mask] = torch.sigmoid(logits[:, ~mask])
        return probs

class ConvResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers wrapped by a residual connection.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by the block.
        downsample: When true, the first convolution (and the shortcut) use
            stride 2; otherwise stride 1.
    """

    def __init__(self, in_channels, out_channels, downsample=False):
        super().__init__()

        if downsample:
            stride = 2
        else:
            stride = 1

        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # The skip path needs a 1x1 projection whenever the main path changes
        # the tensor shape; otherwise an empty Sequential passes x through.
        changes_shape = stride != 1 or in_channels != out_channels
        if changes_shape:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return relu(main_path(x) + shortcut(x))."""
        main = F.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))
        return F.relu(main + self.shortcut(x))


class ConvAutoencoder(nn.Module):
    """Convolutional autoencoder with a fully connected bottleneck.

    The encoder interleaves stride-2 and stride-1 `ConvResidualBlock`s, the
    bottleneck flattens the feature map through a `z_channels`-wide linear
    code, and the decoder mirrors the encoder with nearest-neighbour
    upsampling. The output is a softmax over `K` channels at every pixel.

    Args:
        in_channels: Channels of the input image.
        hidden_channels: Channels produced by every residual block.
        z_channels: Width of the linear bottleneck.
        K: Number of output channels (categories per pixel).
        linear_dims: Accepted for interface compatibility; not used.
        downsample_blocks: Number of stride-2 blocks in the encoder (and of
            upsampling stages in the decoder).
        non_downsample_blocks: Number of stride-1 blocks in the encoder (and
            in the decoder).
        H, W: Spatial size of the feature map that reaches the bottleneck,
            i.e. the encoder output, not the raw input.
    """

    def __init__(self, in_channels, hidden_channels, z_channels,
                 K, linear_dims, downsample_blocks, non_downsample_blocks, H, W):
        super().__init__()

        n_stages = max(downsample_blocks, non_downsample_blocks)

        # Encoder: interleave downsampling and non-downsampling blocks.
        # `current_channels` is refreshed after EVERY block; if it is only
        # refreshed after downsampling blocks, a configuration with
        # downsample_blocks == 0 builds layers with mismatched channel counts.
        encoder_layers = []
        current_channels = in_channels
        for i in range(n_stages):
            if i < downsample_blocks:
                encoder_layers.append(ConvResidualBlock(current_channels, hidden_channels, downsample=True))
                current_channels = hidden_channels

            if i < non_downsample_blocks:
                encoder_layers.append(ConvResidualBlock(current_channels, hidden_channels))
                current_channels = hidden_channels

        self.encoder_conv = nn.Sequential(*encoder_layers)

        # Bottleneck: flatten -> z_channels -> back to the feature-map shape.
        flat_dim = current_channels * H * W
        self.encoder_linear = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flat_dim, z_channels),
            nn.ReLU(),
            nn.Linear(z_channels, flat_dim),
            nn.Unflatten(1, (current_channels, H, W))
        )

        # Decoder: mirror of the encoder, upsampling by 2 where it downsampled.
        decoder_layers = []
        for i in range(n_stages):
            if i < downsample_blocks:
                decoder_layers.append(nn.Upsample(scale_factor=2, mode='nearest'))
                decoder_layers.append(ConvResidualBlock(current_channels, hidden_channels))
                current_channels = hidden_channels

            if i < non_downsample_blocks:
                decoder_layers.append(ConvResidualBlock(current_channels, hidden_channels))
                current_channels = hidden_channels

        # Final 1x1 convolution producing K channels. Its input width is
        # whatever the last layer emits, which is `in_channels` rather than
        # `hidden_channels` when the network has no residual blocks at all.
        decoder_layers.append(nn.Conv2d(current_channels, K, kernel_size=1, stride=1, padding=0))

        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return per-pixel probabilities over the K output channels, shape (b, K, h, w)."""
        x = self.encoder_conv(x)
        x = self.encoder_linear(x)
        x = self.decoder(x)

        # Softmax across the channel axis gives a categorical distribution at
        # each pixel; it can be applied directly to the 4-D tensor.
        return F.softmax(x, dim=1)
        
def custom_loss(recon_x, x, weights):
    """Weighted binary cross-entropy averaged over every element.

    Args:
        recon_x: Reconstructed probabilities.
        x: Target values, same shape as `recon_x`.
        weights: Scalar, or tensor broadcastable against the element-wise
            loss, that rescales each element before averaging.

    Returns:
        A scalar tensor holding the mean weighted loss.
    """
    # reduction='none' keeps the per-element loss so it can be re-weighted.
    per_element = F.binary_cross_entropy(recon_x, x, reduction='none')
    return (per_element * weights).mean()

def train(model, dataloader, optimizer, device, weights):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module to optimise; called as `model(x)`.
        dataloader: Iterable of batches whose first element is the input tensor.
        optimizer: Optimiser bound to the parameters of `model`.
        device: Device each batch is moved to before the forward pass.
        weights: Per-element loss weights handed to `custom_loss`.

    Returns:
        The sum of the batch losses divided by `len(dataloader)`.

    Raises:
        ZeroDivisionError: If `dataloader` has length 0.
    """
    # The batches are moved to `device`, so tensor weights must live there too;
    # otherwise the weighted loss mixes devices and fails on a GPU.
    if torch.is_tensor(weights):
        weights = weights.to(device)

    model.train()
    total_loss = 0
    for batch in dataloader:
        x = batch[0].float().to(device)
        optimizer.zero_grad()
        recon_x = model(x)

        loss = custom_loss(recon_x, x, weights)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

_, D = df.shape

# Model hyper-parameters
hidden_units = 32
num_layers = 2
z_dim = 8

# The first LC_N_CLASSES * n_rings columns are passed through a softmax (in
# groups of `n_classes`); the remaining columns go through a sigmoid.
# NOTE(review): the mask length uses c.LC_N_CLASSES while the group size uses
# n_classes = len(c.LC_LEGEND) from the loading cell -- the two must agree.
n_rings    = len(c.LC_K_RING_SETS)
is_softmax = np.zeros(D, dtype=bool)
is_softmax[0:c.LC_N_CLASSES * n_rings] = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = ResNetAutoencoder(D, hidden_units, num_layers, z_dim, is_softmax=is_softmax, n_classes=n_classes)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)

# Train the model
tensor = torch.tensor(df.values, dtype=torch.float32)

# Inverse-mean weighting of each column's loss term. The small constant avoids
# division by zero for all-zero columns. The weights are moved to `device` so
# they can be multiplied with the on-device losses.
weights = (1.0 / (tensor.mean(dim=0) + 1e-8)).to(device)

# Create a TensorDataset from tensor
dataset = TensorDataset(tensor)

# Define a DataLoader
batch_size = 64
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

num_epochs    = 50

train_time_start = time.time()
for epoch in range(num_epochs):
    loss = train(model, train_dataloader, optimizer, device, weights)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}')

train_time_end = time.time()
total_train_time = train_time_end - train_time_start

model_str = str(model)

# Evaluate on the first 1000 rows of the training frame (in-sample check).
x_eval    = df.iloc[0:1000].values

# Inference: switch to eval mode, skip autograd bookkeeping, feed the input on
# the model's device and bring the result back to the host before converting
# to NumPy (a CPU input / direct .numpy() fails when the model is on a GPU).
# NOTE(review): the model is still in eval mode when it is pickled below.
model.eval()
with torch.no_grad():
    x_hat = model(torch.as_tensor(x_eval, dtype=torch.float32, device=device)).cpu().numpy()
data_mean = df.mean(axis=0).values


def l1_err(x_hat, x_true):
    """Mean absolute error between a prediction and the ground truth."""
    return np.abs(x_hat - x_true).mean()


# Benchmark: predict the column means for every row.
model_error, benchmark_error = l1_err(x_hat, x_eval), l1_err(data_mean, x_eval)

metadata_record = {
    'num_epochs'     : num_epochs,
    'batch_size'     : batch_size,
    'hidden_units'   : hidden_units,
    'num_layers'     : num_layers,
    'z_dim'          : z_dim,
    'final_loss'     : loss,
    'model_str'      : model_str,
    'num_params'     : sum(p.numel() for p in model.parameters() if p.requires_grad),
    'train_time'     : total_train_time,
    # Cast NumPy scalars to plain floats so json.dump accepts them whatever
    # the dtype of the underlying arrays.
    'model_error'    : float(model_error),
    'benchmark_error': float(benchmark_error)

}

current_date_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# NOTE(review): the object is a torch-pickled model, not JSON, despite the
# '.json' suffix; the name is kept unchanged so existing consumers of the
# 'models/' prefix are unaffected.
model_filename = f'model-resnet-ae-{current_date_str}.json'

# Serialise the whole model in memory, then upload it under 'models/'.
buffer = io.BytesIO()
torch.save(model, buffer)
s3.put_object(Bucket=c.S3_BUCKET, Key=f'models/{model_filename}', Body=buffer.getvalue())

# Write the metadata to a temporary file in logs/, upload it, then delete it.
os.makedirs('logs', exist_ok=True)  # the folder may not exist on a fresh checkout
log_path = os.path.join('logs', f'log_{current_date_str}.json')

with open(log_path, 'w') as f:
    json.dump(metadata_record, f)

# Upload to s3
s3.upload_file(log_path, c.S3_BUCKET, f'logs/{os.path.basename(log_path)}')
os.remove(log_path)


Epoch 1/50, Loss: 7.3414
Epoch 2/50, Loss: 3.5506
Epoch 3/50, Loss: 2.8014
Epoch 4/50, Loss: 2.5816
Epoch 5/50, Loss: 2.4511
Epoch 6/50, Loss: 2.3622
Epoch 7/50, Loss: 2.2789
Epoch 8/50, Loss: 2.2370
Epoch 9/50, Loss: 2.1928
Epoch 10/50, Loss: 2.1651
Epoch 11/50, Loss: 2.1429
Epoch 12/50, Loss: 2.1102
Epoch 13/50, Loss: 2.1022
Epoch 14/50, Loss: 2.0789
Epoch 15/50, Loss: 2.0479
Epoch 16/50, Loss: 2.0445
Epoch 17/50, Loss: 2.0193
Epoch 18/50, Loss: 2.0102
Epoch 19/50, Loss: 1.9911
Epoch 20/50, Loss: 1.9925
Epoch 21/50, Loss: 1.9790
Epoch 22/50, Loss: 1.9695
Epoch 23/50, Loss: 1.9659
Epoch 24/50, Loss: 1.9603
Epoch 25/50, Loss: 1.9512
Epoch 26/50, Loss: 1.9411
Epoch 27/50, Loss: 1.9365
Epoch 28/50, Loss: 1.9312
Epoch 29/50, Loss: 1.9286
Epoch 30/50, Loss: 1.9286
Epoch 31/50, Loss: 1.9229
Epoch 32/50, Loss: 1.9111
Epoch 33/50, Loss: 1.9114
Epoch 34/50, Loss: 1.9090
Epoch 35/50, Loss: 1.8994
Epoch 36/50, Loss: 1.8991
Epoch 37/50, Loss: 1.8918
Epoch 38/50, Loss: 1.8929
Epoch 39/50, Loss: 1.

# Reconstruction visuals

In [10]:
# Side-by-side view of the evaluation rows: input, reconstruction and their
# absolute difference. All three panels use the same rows (`x_eval`) so they
# line up row by row.
panels = [
    ("Ground truth", x_eval),
    ("Reconstructed", x_hat),
    ("L1 error", np.abs(x_hat - x_eval)),
]

fig, axes = plt.subplots(1, 3, figsize=(14, 5))

for ax, (title, image) in zip(axes, panels):
    im = ax.imshow(image, vmin=0, vmax=1)
    fig.colorbar(im, ax=ax)
    ax.axis('off')
    ax.set_title(title)

fig.tight_layout()

os.makedirs(c.FIGURES_DIR, exist_ok=True)  # savefig does not create missing folders
fig.savefig(os.path.join(c.FIGURES_DIR, 'autoencoder-reconstruction.png'), dpi=500)
plt.close(fig)