In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint

from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR

from torch.utils.data import DataLoader
from BookDataset import getBookDataset, BookDataset
from embeddings import EmbeddingSummation

from torch.utils.tensorboard import SummaryWriter

In [2]:
train, val, test, featureSizes = getBookDataset('../Data_Train.xlsx', '../Data_Test.xlsx', returnValidation=False)

trainLoader = BookDataset(train).loader(16)
testLoader  = BookDataset(test).loader(16, shuffle=False)
if val:
    valLoader = BookDataset(val).loader(16, shuffle=False)
    
print('Train: ', train.shape)
print('Test:  ', test.shape)

Train:  (6237, 13)
Test:   (1560, 12)


In [3]:
class BookPrice(pl.LightningModule):
    def __init__(self, config):
        super().__init__()
        self.config = config
        
        outDims = lambda x: int(x**.25) + 1

        # embeddings for categorical variables: Author, Genre, BookCategory
        self.AuthorEmbedding = nn.Embedding(config.Author,  outDims(config.Author))
        self.GenreEmbedding  = nn.Embedding(config.Genre,  outDims(config.Genre))
        self.BookCategoryEmbedding  = nn.Embedding(config.BookCategory,  outDims(config.BookCategory))
        categoricalUnits = 5 * outDims(config.Author) + outDims(config.Genre) + outDims(config.BookCategory)
        # ------
        
        
        # embeddings for text features Title, Synopsis
        self.TitleEmbedding    = EmbeddingSummation()
        self.SynopsisEmbedding = EmbeddingSummation()
        textUnits = 128 * 2
        #-------
        
        # Ratings, Reviews and Edition
        numericUnits = 3
        #-------

        total = categoricalUnits + textUnits + numericUnits
        
        self.Dense = nn.Sequential(
            nn.Linear(total, 64), nn.LayerNorm(64), nn.Tanh(), nn.Dropout(.2),
            nn.Linear(64, 32), nn.LayerNorm(32), nn.Tanh(), nn.Dropout(.2),
            nn.Linear(32, 1)
        )

    def forward(self, batch:dict) -> torch.Tensor:
        """
        Author          torch.int64     [b, 5]
        Genre           torch.int64     [b]
        BookCategory    torch.int64     [b]
        Numeric         torch.float32   [b, 3]
        TitleInput      torch.int64     [b, 37]
        TitleMask       torch.int64     [b, 37]
        SynopsisInput   torch.int64     [b, 346]
        SynopsisMask    torch.int64     [b, 346]
        Price           torch.float32   [b]
        """
        authDim = 5 * self.AuthorEmbedding.embedding_dim
        inputs = torch.cat([
            # categoricals
            self.AuthorEmbedding(batch['Author']).reshape(-1, authDim),
            self.GenreEmbedding(batch['Genre']),
            self.BookCategoryEmbedding(batch['BookCategory']),
            
            # text as embeddings
            self.TitleEmbedding(batch['TitleInput'], batch['TitleMask']),
            self.SynopsisEmbedding(batch['SynopsisInput'], batch['SynopsisMask']),
            
            # Numeric
            batch['Numeric']
        ], dim=-1)
        
        logits = self.Dense(inputs)
        return logits
    
    
    def training_step(self, batch:dict, batchIdx:int)->torch.Tensor:
        logits = self(batch)

        loss = F.mse_loss(logits, batch['Price'].reshape(-1, 1))
        self.log('loss', loss)
    
        return loss

    def training_epoch_end(self, outputs):
        loss  = torch.Tensor([ output['loss'] for output in outputs ]).mean()
        score = 1 - loss.pow(.5)
        self.log('score', score)
    
    def validation_step(self, batch:dict, batchIdx:int):
        logits = self(batch)
        loss = F.mse_loss(logits, batch['Price'].reshape(-1, 1))
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        optimizer = Adam(self.parameters(), lr=1e-3)
        scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch: 0.99 * epoch)
        return [optimizer], [scheduler]

    
model = BookPrice(featureSizes)
x = sum(p.numel() for p in model.parameters())
print(f'BookPrice model has {x:,} parameters')

BookPrice model has 7,836,741 parameters


In [4]:
writer = SummaryWriter('logs/model_graph')
batch = next(iter(trainLoader))
writer.add_graph(model, batch);
writer.close()

  position_ids = self.position_ids[:, :seq_length]


In [5]:
tbl = pl.loggers.TensorBoardLogger('logs/train')

ckpt_metric = 'val_loss'
save_mode = 'min'
b = 32

if val is None:
    ckpt_metric = 'loss'

stopping = EarlyStopping(monitor=ckpt_metric, patience=10, mode=save_mode)
ckpt = ModelCheckpoint(dirpath='checkpoints', filename='{epoch}-{'+ckpt_metric+':.5f}',
                       monitor=ckpt_metric, mode=save_mode,
                      save_weights_only=True, verbose=True)

In [10]:
trainer = pl.Trainer(gpus = 1, callbacks=[stopping, ckpt], logger=tbl)

if val is not None:
    valLoader = DataLoader(BookDataset(val), batch_size=16, shuffle=False)
    trainer.fit(model, trainLoader, valLoader)
else:
    trainer.fit(model, trainLoader);


  | Name                  | Type               | Params
-------------------------------------------------------------
0 | AuthorEmbedding       | Embedding          | 2.9 K 
1 | GenreEmbedding        | Embedding          | 832   
2 | BookCategoryEmbedding | Embedding          | 24    
3 | TitleEmbedding        | EmbeddingSummation | 3.9 M 
4 | SynopsisEmbedding     | EmbeddingSummation | 3.9 M 
5 | Dense                 | Sequential         | 20.9 K
-------------------------------------------------------------
7.8 M     Trainable params
0         Non-trainable params
7.8 M     Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

1

In [11]:
print("Saving model from best ckpt", ckpt.best_model_path)
model = BookPrice.load_from_checkpoint(ckpt.best_model_path, config=featureSizes).eval().cuda()

results = []
with torch.no_grad():
    for batch in testLoader:
        for key in batch:
            if key not in 'Title Synopsis'.split():
                batch[key] = batch[key].to(model.device)
        preds = model(batch)
        results.append(preds)
results = torch.cat(results).cpu().numpy().reshape(-1)

submission = pd.DataFrame({
    'Price': np.exp(results) - 1
})
print("Saving submission ", submission.shape)
submission.to_excel('submission.xlsx', index=False)

Saving model from best ckpt C:\Users\Deepak H R\Desktop\data\BookPrice\albert\checkpoints\epoch=24-loss=0.05248.ckpt
Saving submission  (1560, 1)
