In [49]:
import torch
import torchvision

In [50]:
import os
from tqdm import tqdm

# Dataset

In [51]:
from torch.utils.data import Dataset, DataLoader
import json


class ShabbyPagesDataset(Dataset):
    """Paired (shabby, clean) page images for one dataset split.

    Image names are discovered by listing ``{split}/{split}/{split}_shabby/``.
    The clean counterpart of each image is read from
    ``{split}/{split}/{split}_cleaned/`` under the same file name.

    Args:
        split: Name of the split. It is used to build both folder paths and
            the name of the JSON index file written by the constructor.

    Side effects:
        On construction, writes ``{split}_image_dict.json`` (index -> file
        name) into the current working directory. JSON object keys are
        strings, so the integer indices are stored as ``"0"``, ``"1"``, ...
    """

    def __init__(self, split="train"):
        super().__init__()

        self.folder_shabby = f"{split}/{split}/{split}_shabby/"
        self.folder_clean = f"{split}/{split}/{split}_cleaned/"

        # os.listdir() returns entries in arbitrary, filesystem-dependent
        # order. Sorting keeps the index -> file mapping (and the JSON dump
        # below) identical across runs and machines.
        image_names = sorted(os.listdir(self.folder_shabby))
        self.image_dict = dict(enumerate(image_names))

        with open(f"{split}_image_dict.json", "w") as f:
            json.dump(self.image_dict, f)

    def __len__(self):
        """Return the number of entries found in the shabby folder."""
        return len(self.image_dict)

    def __getitem__(self, idx):
        """Load the ``idx``-th image pair.

        Returns:
            ``(input_sample, target_sample)``: the shabby image and the clean
            image stored under the same file name. Both are decoded with
            ``torchvision.io.read_image`` and cast to ``torch.float32``;
            pixel values are not rescaled.
        """
        image_name = self.image_dict[idx]
        input_sample = torchvision.io.read_image(
            os.path.join(self.folder_shabby, image_name)
        ).to(dtype=torch.float32)
        target_sample = torchvision.io.read_image(
            os.path.join(self.folder_clean, image_name)
        ).to(dtype=torch.float32)
        return input_sample, target_sample

In [52]:
# DataLoader settings shared by all three splits.
BATCH_SIZE = 8
NUM_WORKERS = 8

# Training data is reshuffled every epoch.
train_dataset = ShabbyPagesDataset(split='train')
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS
)

# Each evaluation loader reads its own split. Previously all three datasets
# were built from split='train', so validation/test metrics were computed on
# the training images.
# NOTE(review): assumes `test/test/test_shabby/` (and `test_cleaned/` if
# targets are needed) exists next to the train folders -- confirm.
test_dataset = ShabbyPagesDataset(split='test')
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,  # evaluation order does not need to be randomised
    num_workers=NUM_WORKERS
)


# NOTE(review): assumes `validate/validate/validate_shabby/` and
# `validate/validate/validate_cleaned/` exist -- confirm.
validate_dataset = ShabbyPagesDataset(split='validate')
validate_dataloader = DataLoader(
    dataset=validate_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,  # evaluation order does not need to be randomised
    num_workers=NUM_WORKERS
)


# Deep learning model

In [53]:
from typing import Any
from torch.nn import (
    Conv2d,
    ConvTranspose2d,
    MaxPool2d,
    MaxUnpool2d,
    BatchNorm2d,
    MSELoss,
    ReLU,
)

import lightning


class DenoisingNet(lightning.LightningModule):
    """Convolutional encoder/decoder trained with MSE to denoise page images.

    The encoder applies two unpadded 5x5 convolutions (1 -> 8 -> 16 channels),
    each followed by batch norm and ReLU. The decoder mirrors them with two
    5x5 transposed convolutions (16 -> 8 -> 1), so the output has the same
    spatial size as the input. Inputs must have a single channel and be at
    least 9 pixels along each spatial dimension.

    No pooling/unpooling stages are used: ``MaxUnpool2d`` needs the indices
    produced by the matching ``MaxPool2d``, which ``torch.nn.Sequential``
    cannot route between layers.
    """

    def __init__(self):
        super().__init__()

        self.encoder = torch.nn.Sequential(
            BatchNorm2d(num_features=1),
            Conv2d(in_channels=1, out_channels=8, kernel_size=5),
            BatchNorm2d(num_features=8),
            ReLU(),
            Conv2d(in_channels=8, out_channels=16, kernel_size=5),
            BatchNorm2d(num_features=16),
            ReLU(),
        )

        self.decoder = torch.nn.Sequential(
            ConvTranspose2d(in_channels=16, out_channels=8, kernel_size=5),
            BatchNorm2d(num_features=8),
            ReLU(),
            ConvTranspose2d(in_channels=8, out_channels=1, kernel_size=5),
            BatchNorm2d(num_features=1),
            ReLU(),
        )

        self.mse_loss = MSELoss()

    def forward(self, x):
        """Run the encoder followed by the decoder.

        Defining ``forward`` makes ``model(x)`` usable for inference and gives
        the training and validation steps a single shared code path.
        """
        return self.decoder(self.encoder(x))

    def training_step(self, batch, batch_idx):
        """Compute the MSE between the reconstruction and the clean target.

        Logs the loss as ``mse_train`` (per step and per epoch) and returns it
        so Lightning can backpropagate.
        """
        x, y = batch
        loss = self.mse_loss(self(x), y)
        self.log("mse_train", loss, prog_bar=True, on_epoch=True, on_step=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the validation MSE and log it per epoch as ``mse_val``."""
        x, y = batch
        loss = self.mse_loss(self(x), y)
        self.log("mse_val", loss, prog_bar=True, on_epoch=True, on_step=False)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with lr=1e-3."""
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

In [1]:
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from lightning.pytorch.loggers import WandbLogger


# Stop training once the epoch-level 'mse_val' metric has not improved for
# 25 consecutive validation checks.
early_stopping = EarlyStopping(monitor='mse_val', mode='min', patience=25)

# Save the checkpoint with the lowest 'mse_val' as saved_models/best_model.
checkpoint_callback = ModelCheckpoint(
    monitor='mse_val',
    mode='min',
    dirpath='saved_models/',
    filename='best_model',
)

# Constructing WandbLogger requires the `wandb` package to be installed.
# NOTE(review): the project name mentions time series while this notebook
# denoises images -- confirm it is the intended W&B project.
wandb_logger = WandbLogger(project="kaggle-time-series")

# No accelerator is passed, so Lightning picks the device automatically.
trainer = lightning.Trainer(
    logger=wandb_logger,
    callbacks=[early_stopping, checkpoint_callback],
    max_epochs=250,
    precision="16-mixed",
)

ModuleNotFoundError: DistributionNotFound: The 'wandb>=0.12.10' distribution was not found and is required by the application. HINT: Try running `pip install -U 'wandb>=0.12.10'`

In [None]:
# Build a freshly initialised network.
model = DenoisingNet()

# Fit on the training loader; validation batches come from validate_dataloader.
trainer.fit(
    model=model,
    train_dataloaders=train_dataloader,
    val_dataloaders=validate_dataloader,
)