In [1]:
import pandas as pd
import os
import torch
from typing import Callable, Optional, Type, Union, List
import sklearn.model_selection
import pytorch_lightning as pl
from pytorch_lightning.callbacks import (
    ModelCheckpoint,
    StochasticWeightAveraging,
    GradientAccumulationScheduler,
    LearningRateMonitor,
)


# a = pd.read_parquet('/home/toomuch/kaggle-diffusion/vectors/embeddings-__-vit-h-14-laion2B-s32B-b79K-__-MiniLM.parquet')

In [2]:

class Dataset(torch.utils.data.DataLoader):
    def __init__(self, df):
        self.minilm_embeddings = list(df['MiniLM-emb'])
        self.clip_embeddings = list(df['CLIP-emb'])

    def __len__(self):
        return len(self.clip_embeddings)
    
    def __getitem__(self, idx):
        clip_emb = torch.tensor(self.clip_embeddings[idx], dtype=torch.float32)
        minilm_emb = torch.tensor(self.minilm_embeddings[idx], dtype=torch.float32)
        return clip_emb, minilm_emb


class DataModule(pl.LightningDataModule):
    def __init__(self, parquet_path: str, batch_size: int, test_size: float):
        super().__init__()
        self.parquet_path = parquet_path
        self.batch_size = batch_size
        self.test_size = test_size

    def prepare_data(self):
        self.df = pd.read_parquet(self.parquet_path)

    def setup(self, stage: Optional[str] = None):
        train_df, val_df = sklearn.model_selection.train_test_split(
            self.df, test_size=self.test_size
        )
        self.train_dataset = Dataset(train_df)
        self.val_dataset = Dataset(val_df)

    def train_dataloader(self) -> torch.utils.data.DataLoader:
        return torch.utils.data.DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4,
        )

    def val_dataloader(self) -> torch.utils.data.DataLoader:
        return torch.utils.data.DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=4,
        )

In [3]:
import pytorch_lightning as pl
import pandas as pd
from typing import Optional
import torch
import numpy as np
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import get_cosine_schedule_with_warmup


class Model(pl.LightningModule):
    def __init__(self, lr, warmup_ratio, total_optimization_steps):
        super().__init__()
        self.save_hyperparameters()
        self.lr = lr
        self.total_optimization_steps = total_optimization_steps
        self.warmup_ratio = warmup_ratio
        self.backbone = torch.nn.Sequential(
            torch.nn.Linear(in_features=1024, out_features=384),
        )
        self.criterion = torch.nn.MSELoss()

    def forward(self, x):
        out = self.backbone(x)
        return out

    def training_step(self, batch, batch_idx):
        domain_from, domain_to = batch
        domain_to_hat = self(domain_from)
        loss = self.criterion(domain_to_hat, domain_to)

        self.log("train_loss", loss, on_step=True, on_epoch=False, prog_bar=True)
        logs = {"train_loss": loss.detach().cpu().numpy()}
        return {"loss": loss, "log": logs,}

    def validation_step(self, batch, batch_idx):
        domain_from, domain_to = batch
        domain_to_hat = self(domain_from)
        loss = self.criterion(domain_to_hat, domain_to)
        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=False)
        return {"val_loss": loss,}

    # def validation_epoch_end(self, outputs):
    #     avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    #     print(
    #         "\n\nVAL Loss: {}\n".format(
    #             avg_loss,
    #         )
    #     )

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=0)
        lr_scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup_ratio * self.total_optimization_steps,
            num_training_steps=self.total_optimization_steps,
        )
        return [optimizer], [{"scheduler": lr_scheduler, "interval": "step"}]


2023-04-08 10:51:45.616985: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
RUN_NAME = '-Debug'
dm = DataModule(
    parquet_path="/home/toomuch/kaggle-diffusion/vectors/embeddings-__-vit-h-14-laion2B-s32B-b79K-__-MiniLM.parquet",
    # parquet_path="/home/toomuch/kaggle-diffusion/a.parquet",
    batch_size=8192,
    test_size=0.2,
)
model = Model(
    lr=1e-4, warmup_ratio=0.05, total_optimization_steps=3_000
)
logger = TensorBoardLogger("lightning_logs", name="SDP" + RUN_NAME)
checkpoint_callback = ModelCheckpoint(
    save_top_k=2,
    monitor="val_loss",
    mode="min",
    dirpath="runs/{}/".format(RUN_NAME),
    filename=RUN_NAME + "-{epoch:02d}-{val_loss:.4f}",
    save_last=False,
    verbose=True,
    every_n_epochs=1,
)
lr_monitor = LearningRateMonitor(logging_interval="step")
trainer = pl.Trainer(
    accelerator='cuda',
    devices=[2],
    precision=32,
    callbacks=[checkpoint_callback, lr_monitor],  # , swa_callback],
    max_epochs=1000,
    check_val_every_n_epoch=1,
    logger=logger,
    log_every_n_steps=1,
)

trainer.fit(model, dm)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name      | Type       | Params
-----------------------------------------
0 | backbone  | Sequential | 393 K 
1 | criterion | MSELoss    | 0     
-----------------------------------------
393 K     Trainable params
0         Non-trainable params
393 K     Total params
1.574     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 0, global step 178: 'val_loss' reached 0.02225 (best 0.02225), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=00-val_loss=0.0222.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 1, global step 356: 'val_loss' reached 0.00783 (best 0.00783), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=01-val_loss=0.0078.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 2, global step 534: 'val_loss' reached 0.00399 (best 0.00399), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=02-val_loss=0.0040.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 3, global step 712: 'val_loss' reached 0.00255 (best 0.00255), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=03-val_loss=0.0026.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 4, global step 890: 'val_loss' reached 0.00191 (best 0.00191), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=04-val_loss=0.0019.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 5, global step 1068: 'val_loss' reached 0.00160 (best 0.00160), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=05-val_loss=0.0016.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 6, global step 1246: 'val_loss' reached 0.00143 (best 0.00143), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=06-val_loss=0.0014.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 7, global step 1424: 'val_loss' reached 0.00133 (best 0.00133), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=07-val_loss=0.0013.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 8, global step 1602: 'val_loss' reached 0.00127 (best 0.00127), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=08-val_loss=0.0013.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 9, global step 1780: 'val_loss' reached 0.00124 (best 0.00124), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=09-val_loss=0.0012.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 10, global step 1958: 'val_loss' reached 0.00121 (best 0.00121), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=10-val_loss=0.0012.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 11, global step 2136: 'val_loss' reached 0.00120 (best 0.00120), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=11-val_loss=0.0012.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 12, global step 2314: 'val_loss' reached 0.00119 (best 0.00119), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=12-val_loss=0.0012.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 13, global step 2492: 'val_loss' reached 0.00119 (best 0.00119), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=13-val_loss=0.0012.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 14, global step 2670: 'val_loss' reached 0.00118 (best 0.00118), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=14-val_loss=0.0012.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 15, global step 2848: 'val_loss' reached 0.00118 (best 0.00118), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=15-val_loss=0.0012.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 16, global step 3026: 'val_loss' reached 0.00118 (best 0.00118), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=16-val_loss=0.0012.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 17, global step 3204: 'val_loss' reached 0.00118 (best 0.00118), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=17-val_loss=0.0012.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 18, global step 3382: 'val_loss' reached 0.00118 (best 0.00118), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=18-val_loss=0.0012.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 19, global step 3560: 'val_loss' reached 0.00118 (best 0.00118), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=19-val_loss=0.0012.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 20, global step 3738: 'val_loss' reached 0.00117 (best 0.00117), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=20-val_loss=0.0012.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 21, global step 3916: 'val_loss' reached 0.00116 (best 0.00116), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=21-val_loss=0.0012.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 22, global step 4094: 'val_loss' reached 0.00115 (best 0.00115), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=22-val_loss=0.0011.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 23, global step 4272: 'val_loss' reached 0.00114 (best 0.00114), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=23-val_loss=0.0011.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 24, global step 4450: 'val_loss' reached 0.00113 (best 0.00113), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=24-val_loss=0.0011.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 25, global step 4628: 'val_loss' reached 0.00112 (best 0.00112), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=25-val_loss=0.0011.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 26, global step 4806: 'val_loss' reached 0.00112 (best 0.00112), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=26-val_loss=0.0011.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 27, global step 4984: 'val_loss' reached 0.00112 (best 0.00112), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=27-val_loss=0.0011.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 28, global step 5162: 'val_loss' reached 0.00112 (best 0.00112), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=28-val_loss=0.0011.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 29, global step 5340: 'val_loss' reached 0.00112 (best 0.00112), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=29-val_loss=0.0011.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 30, global step 5518: 'val_loss' reached 0.00111 (best 0.00111), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=30-val_loss=0.0011.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 31, global step 5696: 'val_loss' reached 0.00111 (best 0.00111), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=31-val_loss=0.0011.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

Epoch 32, global step 5874: 'val_loss' reached 0.00111 (best 0.00111), saving model to '/home/toomuch/kaggle-diffusion/clip-to-minnilm/runs/-Debug/-Debug-epoch=32-val_loss=0.0011.ckpt' as top 2
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
