In [5]:
import torch
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint

from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR

from torch.utils.data import DataLoader
from BookDataset import getBookDataset, BookDataset
from embeddings import EmbeddingSummation

In [6]:
train, val, test, featureSizes = getBookDataset('../Data_Train.xlsx', '../Data_Test.xlsx')

In [7]:
class BookPrice(pl.LightningModule):
    def __init__(self, config):
        super().__init__()
        self.config = config
        outDims = lambda x: int(x**.25) + 1
        self.outDims = outDims
        
        # embeddings for categorical variables: Author, Genre, BookCategory
        self.AuthorEmbedding = nn.Embedding(config.Author,  outDims(config.Author))
        self.GenreEmbedding  = nn.Embedding(config.Genre,  outDims(config.Genre))
        self.BookCategoryEmbedding  = nn.Embedding(config.BookCategory,  outDims(config.BookCategory))
        categoricalUnits = 5 * outDims(config.Author) + outDims(config.Genre) + outDims(config.BookCategory)
        # ------
        
        
        # embeddings for text features Title, Synopsis
        self.TitleEmbedding    = EmbeddingSummation()
        self.SynopsisEmbedding = EmbeddingSummation()
        textUnits = 128 * 2
        #-------
        
        # Ratings, Reviews and Edition
        numericUnits = 3
        #-------

        total = categoricalUnits + textUnits + numericUnits
        
        self.Dense = nn.Sequential(
            nn.Linear(total, 64), nn.LayerNorm(64), nn.Tanh(),
            nn.Linear(64, 32), nn.LayerNorm(32), nn.Tanh(), 
            nn.Linear(32, 1)
        )

    def forward(self, batch:dict) -> torch.Tensor:
        authors = torch.stack([ batch[f'author_{i}'] for i in range(5) ]).T
        authDim = 5 * self.outDims(self.config.Author)
        categoricals = [
            self.AuthorEmbedding(authors).reshape(-1, authDim),
            self.GenreEmbedding(batch['Genre']),
            self.BookCategoryEmbedding(batch['BookCategory'])
        ]
        
        text = [
            self.TitleEmbedding(batch['Title']),
            self.SynopsisEmbedding(batch['Synopsis'])
        ]
        
        numericals  = [torch.stack([
            batch['Ratings'],
            batch['Reviews'],
            batch['Edition']
        ]).T]
        
        inputs = torch.cat(categoricals + text + numericals, dim=-1)
        
        logits = self.Dense(inputs)
        return logits
    
    
    def training_step(self, batch:dict, batchIdx:int)->torch.Tensor:
        logits = self(batch)

        loss = F.mse_loss(logits, batch['Price'].reshape(-1, 1))
        self.log('loss', loss)

        return loss

    def validation_step(self, batch:dict, batchIdx:int):
        logits = self(batch)
        loss = F.mse_loss(logits, batch['Price'].reshape(-1, 1))
        
        score = 1 - loss.pow(.5)
        self.log('val_score', score)
        return score

    def configure_optimizers(self):
        optimizer = Adam(self.parameters(), lr=1e-3)
        scheduler = LambdaLR(optimizer, lr_lambda=lambda epoch: 0.99 * epoch)
        return [optimizer], [scheduler]

    
model = BookPrice(featureSizes)
# loader = DataLoader(BookDataset(train), batch_size=8)
# batch = next(iter(loader))
# with torch.no_grad():
#     print(model.training_step(batch, 0))
x = sum(p.numel() for p in model.parameters())
print(f'BookPrice model has {x:,} parameters')

BookPrice model has 7,836,458 parameters


In [None]:
stopping = EarlyStopping(monitor='val_score', patience=10, mode='max')
ckpt = ModelCheckpoint(dirpath='chekpoints', filename='{epoch}-{val_score:.5f}',
                       monitor='val_score', mode='max',
                      save_weights_only=True)

trainer = pl.Trainer(gpus = 1, callbacks=[stopping, ckpt])

loaders = []
for i, data in enumerate([train, val, test]):
    shuffle = i==0
    loaders.append( DataLoader(BookDataset(data), batch_size=32, shuffle=shuffle ) )

trainer.fit(model, loaders[0], loaders[1])

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                  | Type               | Params
-------------------------------------------------------------
0 | AuthorEmbedding       | Embedding          | 2.6 K 
1 | GenreEmbedding        | Embedding          | 804   
2 | BookCategoryEmbedding | Embedding          | 24    
3 | TitleEmbedding        | EmbeddingSummation | 3.9 M 
4 | SynopsisEmbedding     | EmbeddingSummation | 3.9 M 
5 | Dense                 | Sequential         | 20.9 K
-------------------------------------------------------------
7.8 M     Trainable params
0         Non-trainable params
7.8 M     Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

In [5]:
trainer.test(model, loaders[2])



1

In [6]:
results = []
model = model.eval()
with torch.no_grad():
    for batch in testLoader:
        for key in batch:
            batch[key] = batch[key].to(model.device)
        preds = model.test_step(batch, 0)
        results.append(preds)
results = torch.cat(results).cpu().numpy()

NameError: name 'testLoader' is not defined

In [None]:
pd.DataFrame({
    'Price': np.exp(results) - 1
}).to_excel('mysubmission_tanh.xlsx', index=False)