In [1]:
%%capture
!pip install optuna pytorch_lightning rouge-score transformers sentencepiece

In [5]:
import pandas as pd
import numpy as np
# Disabled clean-up steps kept for reference (they mapped "-1", "None" and -1 to NaN):
#.replace(to_replace="-1", value=np.nan)
#.replace(to_replace="None", value=np.nan).replace(to_replace=-1, value=np.nan)
CHAR_SEP = " "

# Load the training split of the listings dataset.
df = pd.read_parquet('../datasets/economicos/synthb/split/train.parquet')
display(df.shape)
# Keep the preview as the LAST expression of the cell: IPython only renders the
# final expression, so with an assignment after it the sample was never shown.
df.sample(3)


(545870, 17)

In [6]:
def _value_or_default(value, default):
    """Return ``value``, or ``default`` when ``value`` is missing or falsy.

    A plain ``value or default`` does not work for pandas data: NaN is truthy,
    so missing numbers slip through (and ``pd.NA`` raises when coerced to bool).
    """
    if value is None:
        return default
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return default
    return value or default


def convert(row):
    """Turn one listing row into a prompt/target pair for the seq2seq model.

    Args:
        row: object exposing the listing columns as attributes
            (``publication_date``, ``price``, ``property_type``,
            ``transaction_type``, ``state``, ``county``, ``rooms``, ``m_built``,
            ``m_size``, ``_price``, ``title``, ``address``, ``description``),
            e.g. a row produced by ``DataFrame.apply(..., axis=1)``.

    Returns:
        dict with
          * ``"text"``: a two-element list: the ``<name, value>`` feature string
            (newlines replaced by spaces) and the fixed instruction
            ``"descripción de esta publicación"``;
          * ``"target"``: ``row.description`` unchanged.

    Missing numeric fields (None / NaN / 0) are rendered as ``-1``; a missing
    ``publication_date`` is treated as an offset of 0 days from 2017-12-01.

    NOTE(review): the ``baños`` field is filled from ``row.rooms`` (same source
    as ``dormitorios``). That looks like a copy/paste slip, but the bathroom
    column name is not visible here, so it is left as-is — confirm and fix.
    """
    # publication_date is used as a number of days after 2017-12-01.
    days_offset = int(_value_or_default(row.publication_date, 0))
    publication_day = (
        pd.Timestamp('2017-12-01') + pd.DateOffset(days_offset)
    ).strftime('%Y-%m-%d')

    fields = [
        ("fecha", publication_day),
        ("precio", row.price),
        ("tipo", row.property_type),
        ("transacción", row.transaction_type),
        ("región", row.state),
        ("comuna", row.county),
        ("dormitorios", _value_or_default(row.rooms, -1)),
        ("baños", _value_or_default(row.rooms, -1)),
        ("construidos", _value_or_default(row.m_built, -1)),
        ("terreno", _value_or_default(row.m_size, -1)),
        ("precio_real", row._price),
        ("titulo", row.title),
        ("dirección", row.address),
    ]
    # Newlines inside values (e.g. multi-line titles) are flattened to spaces.
    features = " ".join(f"<{name}, {value}>" for name, value in fields).replace("\n", " ")

    return {
        "text": [features, "descripción de esta publicación"],
        "target": row.description,
    }

# Smoke-test convert() on one randomly sampled listing and show the resulting dict.
display(
    df.sample(1).apply(convert, axis=1).iloc[-1]
)

{'text': ['<fecha, 2020-09-23> <precio, 15500 UF> <tipo, Departamento> <transacción, Venta> <región, Metropolitana de Santiago> <comuna, Providencia> <dormitorios, 2.0> <baños, 2.0> <construidos, 260.0> <terreno, 5000.0> <precio_real, 15500.0> <titulo, PENTHOUSE DÚPLEX 260 mt2 LOTA / LUIS THAYER OJEDA> <dirección, LUIS THAYER OJEDA 615 Providencia, Metropolitana de Santiago>',
  'descripción de esta publicación'],
 'target': 'Última y Gran Oportunidad. Espectacular y Único Penthouse Dúplex recién remodelado 100% todo nuevo sin uso.\nMaravilloso departamento pisos 14 y 15, la mejor ubicación de Providencia, orientación: nororiente / norte / poniente y sur. Privilegiada vista panorámica asegurada a todo Santiago, rodeado de áreas verdes y arboles añosos, recién remodelado 100% (instalación eléctrica nueva 40 amperes, instalación sanitaria, gas y agua, ventanas con termo paneles winko, pisos: porcelanatos, cerámicas y maderas, pintura de muros realizada por un artista, foto 

In [4]:
# Run convert() over every listing and expand the returned dicts into a frame
# with one `text` column and one `target` column.
df_text = pd.DataFrame(df.apply(convert, axis=1).to_list())
# Preview three random converted rows.
df_text.sample(3)

Unnamed: 0,text,target
190693,"[<fecha, 2018-11-25> <precio, $ 100.000> <tipo...","un dormitorio con closets, comedor,baño con c..."
403286,"[<fecha, 2019-03-07> <precio, $> <tipo, Casa> ...","Vendo casa Quinta Normal, Salvador Gutiérrez ..."
428363,"[<fecha, 2020-01-02> <precio, $ 290.000> <tipo...","Arriendo departamento de dos dormitorios, coci..."


In [7]:
import argparse
from argparse import ArgumentParser
from os.path import join, isfile
from os import listdir
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from rouge_score import rouge_scorer
import shutil
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import  DataLoader, RandomSampler, SequentialSampler #Dataset,
from transformers import get_linear_schedule_with_warmup, AdamW
# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [7]:
# Tokenizer used only for measuring sequence lengths.
# NOTE(review): this is the t5-v1_1-xxl vocabulary, while training below uses
# args.model — confirm the length statistics transfer.
tkn = T5Tokenizer.from_pretrained("google/t5-v1_1-xxl")


def t2l(row):
    """Measure the tokenised length of a row's ``text`` and ``target``.

    Returns a dict ``{"text": <len>, "target": <len>}`` where each value is
    ``len(tkn(value)["input_ids"])``.

    NOTE(review): if ``row.text`` is a list of strings the tokenizer presumably
    treats it as a batch, in which case the length counts sequences rather than
    tokens — confirm against the shape convert() produces.
    """
    def _input_id_count(value):
        encoded = tkn(value)
        return len(encoded["input_ids"])

    text_len = _input_id_count(row.text)
    target_len = _input_id_count(row.target)
    return {"text": text_len, "target": target_len}
#txttgt = pd.DataFrame(df.apply(convert, axis=1).to_list())
#d = pd.DataFrame(txttgt).apply(t2l, axis=1).to_list()
#print(pd.DataFrame(d).quantile(0.1))
#print(pd.DataFrame(d).quantile(0.96))
## in: 12:36
## out: 12:528

In [8]:
import random
def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch (CPU, plus every CUDA device if present)."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Only touch the CUDA generators when a GPU is actually usable.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed everything twice over: once through the local helper and once through
# Lightning's own seeding utility (the recorded output shows it echoing 42).
set_seed(42)
pl.seed_everything(42)

  return torch._C._cuda_getDeviceCount() > 0
Global seed set to 42


42

In [9]:
class MetricsCallback(pl.Callback):
    """Lightning callback that records the trainer's callback metrics after each validation run."""

    def __init__(self):
        super().__init__()
        # One metrics dict per completed validation run, oldest first.
        self.metrics = []

    def on_validation_end(self, trainer, pl_module):
        """Append a snapshot of ``trainer.callback_metrics`` to ``self.metrics``.

        A shallow copy is stored rather than the object itself: the trainer may
        hand back the same dict and update it in place on later epochs, which
        would make every history entry show only the latest values.
        """
        self.metrics.append(dict(trainer.callback_metrics))

In [11]:
# Separator constant; not referenced by any code visible in this notebook section.
CHAR_SEP = " "
# Token length that encoder inputs are padded/truncated to in T5Finetuner.encode_text.
MAX_SRC_LEN = 200
# Token length that tokenised targets are padded/truncated to in T5Finetuner.encode_text.
MAX_TGT_LEN = 720
class T5Finetuner(pl.LightningModule):
    """LightningModule that fine-tunes a T5-style seq2seq model on (text, target) rows.

    Args:
        args: namespace providing at least ``model`` (HF name or path), ``bs``
            (validation/test batch size), ``lr``, ``max_epochs``, and, for
            :meth:`save_core_model`, ``output`` and ``name``.
        df: DataFrame with a ``text`` column (list of prompt pieces) and a
            ``target`` column (string to generate).
        batch_size: batch size of the training dataloader.
    """

    def __init__(self, args, df, batch_size=8):
        super().__init__()
        # NOTE(review): this records every __init__ argument, `df` included, so the
        # whole frame presumably ends up in each checkpoint; calling
        # load_from_checkpoint() without arguments relies on that — confirm before
        # excluding it.
        self.save_hyperparameters()
        self.args = args
        self.model = T5ForConditionalGeneration.from_pretrained(self.args.model)
        self.tokenizer = T5Tokenizer.from_pretrained(self.args.model)
        self.data = df
        self.scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.batch_size = batch_size

    def encode_text(self, text, target):
        """Tokenise one example.

        Args:
            text: list of prompt pieces (joined with ``"<SEP>"``) or an already
                assembled string.
            target: target string; may be empty when only the source is needed.

        Returns:
            ``(source_ids, source_mask, target_ids, target_labels)``, each a
            tensor with a leading batch dimension of 1. ``target_ids`` are the
            padded target token ids; ``target_labels`` are the same ids with
            padding replaced by -100 so the loss ignores those positions.
        """
        # A bare string would otherwise be joined character by character.
        source_text = text if isinstance(text, str) else "<SEP>".join(text)
        source = self.tokenizer.batch_encode_plus([source_text],
                                                  max_length=MAX_SRC_LEN,
                                                  truncation=True,
                                                  padding='max_length',
                                                  return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([target],
                                                  max_length=MAX_TGT_LEN,
                                                  truncation=True,
                                                  padding='max_length',
                                                  return_tensors='pt')
        # The T5 tokenizer adds no start token, so the targets must not be
        # shifted by hand: the model builds its decoder inputs from `labels`
        # (prepending its decoder start token), which is also what generate()
        # starts from. Manual y[:, :-1] / y[:, 1:] slicing skipped that start
        # token and never trained the first target token.
        target_id = target['input_ids']
        target_label = target_id.clone()
        target_label[target_label == self.tokenizer.pad_token_id] = -100
        return source['input_ids'], source['attention_mask'], target_id, target_label

    def prepare_data(self):
        """Tokenise every row of ``self.data`` and split it 80/10/10 into train/val/test."""
        source_ids, source_masks, target_ids, target_labels = [], [], [], []
        for _, row in self.data.iterrows():
            source_id, source_mask, target_id, target_label = self.encode_text(row.text, row.target)
            source_ids.append(source_id)
            source_masks.append(source_mask)
            target_ids.append(target_id)
            target_labels.append(target_label)

        # Convert the lists into tensors
        source_ids = torch.cat(source_ids, dim=0)
        source_masks = torch.cat(source_masks, dim=0)
        target_ids = torch.cat(target_ids, dim=0)
        target_labels = torch.cat(target_labels, dim=0)
        # splitting the data to train, validation, and test
        data = TensorDataset(source_ids, source_masks, target_ids, target_labels)
        train_size, val_size = int(0.8 * len(data)), int(0.1 * len(data))
        test_size = len(data) - (train_size + val_size)
        self.train_dat, self.val_dat, self.test_dat = \
            random_split(data, [train_size, val_size, test_size])

    def forward(self, batch, batch_idx):
        """Run the model on a batch of ``(source_ids, source_mask, target_ids, target_labels)``.

        Only ``labels`` is passed for the target side; the model derives the
        decoder inputs from it. The returned output has the loss at index 0.
        """
        source_ids, source_mask, _target_ids, target_labels = batch[:4]
        return self.model(input_ids=source_ids, attention_mask=source_mask,
                          labels=target_labels)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self(batch, batch_idx)[0]
        # Current Lightning reads metrics from self.log(); the 'log' key in the
        # returned dict is the legacy mechanism, kept for compatibility.
        self.log('train_loss', loss)
        return {'loss': loss, 'log': {'train_loss': loss}}

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss for one batch."""
        loss = self(batch, batch_idx)[0]
        return {'loss': loss, 'val_loss': loss}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation losses and log them as ``val_loss``."""
        loss = sum([o['loss'] for o in outputs]) / len(outputs)
        # Logged explicitly so callbacks monitoring "val_loss" can find it.
        self.log('val_loss', loss, prog_bar=True)
        out = {'val_loss': loss}
        return {**out, 'log': out}

    def test_step(self, batch, batch_idx):
        """Compute the test loss for one batch."""
        loss = self(batch, batch_idx)[0]
        return {'loss': loss}

    def test_epoch_end(self, outputs):
        """Average the per-batch test losses and log them as ``test_loss``."""
        loss = sum([o['loss'] for o in outputs]) / len(outputs)
        self.log('test_loss', loss)
        out = {'test_loss': loss}
        return {**out, 'log': out}

    def train_dataloader(self):
        """Shuffled loader over the training split (uses ``self.batch_size``)."""
        return DataLoader(self.train_dat, batch_size=self.batch_size,
                          num_workers=4, sampler=RandomSampler(self.train_dat))

    def val_dataloader(self):
        """Sequential loader over the validation split (uses ``args.bs``)."""
        return DataLoader(self.val_dat, batch_size=self.args.bs, num_workers=4,
                          sampler=SequentialSampler(self.val_dat))

    def test_dataloader(self):
        """Sequential loader over the test split (uses ``args.bs``)."""
        return DataLoader(self.test_dat, batch_size=self.args.bs, num_workers=4,
                          sampler=SequentialSampler(self.test_dat))

    def configure_optimizers(self):
        """AdamW with a linear decay to zero over the whole training run."""
        optimizer = AdamW(self.model.parameters(), lr=self.args.lr, eps=1e-4)
        # The schedule is expressed in optimizer steps, i.e. batches, not samples
        # (ceil division so a final partial batch counts as a step).
        steps_per_epoch = (len(self.train_dat) + self.batch_size - 1) // self.batch_size
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0,
            num_training_steps=self.args.max_epochs * steps_per_epoch)
        # Lightning steps schedulers once per epoch unless told otherwise; a
        # per-step schedule must be stepped per step.
        return {'optimizer': optimizer,
                'lr_scheduler': {'scheduler': scheduler, 'interval': 'step'}}

    def generate_summary(self, ctext, summ_len=150, text='', beam_search=2, repetition_penalty=2.5):
        """Generate text for ``ctext`` and optionally score it against ``text``.

        Args:
            ctext: prompt pieces (list) or prompt string, as for :meth:`encode_text`.
            summ_len: maximum generated length in tokens.
            text: optional reference; when non-empty, ROUGE scores are returned too.
            beam_search: number of beams.
            repetition_penalty: repetition penalty passed to ``generate``.

        Returns:
            ``prediction`` (list of decoded strings), or ``(prediction, scores)``
            when ``text`` is non-empty.

        Side effect: leaves ``self.model`` in eval mode.
        """
        source_id, source_mask, target_id, _ = self.encode_text(ctext, text)
        # Inputs are created on CPU; move them to wherever the model lives.
        device = self.model.device
        self.model.eval()
        with torch.no_grad():
            # `truncation` is a tokenizer option, not a generation argument, so
            # it is no longer forwarded to generate().
            generated_ids = self.model.generate(
                input_ids=source_id.to(device),
                attention_mask=source_mask.to(device),
                max_length=summ_len,
                num_beams=beam_search,
                repetition_penalty=repetition_penalty,
                length_penalty=1.0,
                early_stopping=True
                )
            prediction = [self.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
        if len(text) > 0:
            target = [self.tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in target_id]
            scores = self.scorer.score(target[0], prediction[0])
            return prediction, scores
        else:
            return prediction

    def save_core_model(self):
        """Save the underlying HF model and tokenizer to ``<args.output>/<args.name>/core``."""
        store_path = join(self.args.output, self.args.name, 'core')
        self.model.save_pretrained(store_path)
        self.tokenizer.save_pretrained(store_path)

    @staticmethod
    def add_model_specific_args(parent_parser):
        """Return a parser extending ``parent_parser`` with the model options.

        Adds ``-m/--model``, ``--bs/--batch_size``, ``--source_len`` and
        ``--summ_len``. The two length options are not read anywhere in this
        class, which uses MAX_SRC_LEN / MAX_TGT_LEN instead.
        """
        p = ArgumentParser(parents=[parent_parser], add_help=False)
        p.add_argument('-m', '--model', type=str, default='t5-base',
                       help='name of the model or the path pointing to it')
        p.add_argument('--bs', '--batch_size', type=int, default=2)
        p.add_argument('--source_len', type=int, default=120)
        p.add_argument('--summ_len', type=int, default=700)
        return p

In [12]:
def default_args():
    """Build the argument namespace used for the training run.

    Starts from whatever an empty ``ArgumentParser`` makes of the process
    arguments (unknown ones are ignored) and then sets the run settings:
    ``max_epochs``, ``model``, ``output`` (derived from the model name),
    ``name`` and ``bs`` (batch size).
    """
    namespace, _unknown = ArgumentParser().parse_known_args()

    model_name = "google/mt5-base"
    settings = {
        "max_epochs": 50,
        "model": model_name,
        "output": f"./B-{model_name.replace('/','_')}",
        "name": "DESCRIPCION_PROPIEDADES",
        "bs": 1,  # batch size
    }
    for key, value in settings.items():
        setattr(namespace, key, value)
    return namespace

In [13]:
# Assemble the run configuration, callbacks, model and trainer, then train and test.
args = default_args()
# Learning rate is fixed here (no hyperparameter sampling happens in this cell).
args.lr = 2e-5
# Checkpoint on the lowest monitored "val_loss".
# NOTE(review): the first positional argument is presumably the checkpoint
# directory in this Lightning version, which would make "{epoch}" a literal
# directory name rather than a filename template — confirm.
cp_callback = pl.callbacks.ModelCheckpoint(
    join(args.output, args.name, f"trial_0", "{epoch}"),
    monitor="val_loss", mode="min")
# Keeps a history of the metrics seen at the end of each validation run.
metrics_callback = MetricsCallback()
summarizer = T5Finetuner(args, df_text)         # loading the model
trainer = pl.Trainer.from_argparse_args(      # loading the trainer
    args, 
    accelerator="gpu",
    devices=1,
    default_root_dir=args.output, gradient_clip_val=1.0,
    callbacks=[metrics_callback, cp_callback],
    # -1 runs the sanity check over the whole validation set before training.
    num_sanity_val_steps=-1,
    logger=TensorBoardLogger(join(args.output, 'logs'), name=args.name, version=f'trial_0')
    )

trainer.fit(summarizer)                       # fitting the model
trainer.test(summarizer)                      # testing the model

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero.

In [15]:
# Restore a fine-tuned module from a Lightning checkpoint.
# NOTE(review): this is an absolute, machine-specific path and it points at a
# google_flan-t5-small run, not the google/mt5-base model configured in
# default_args() — confirm which checkpoint is intended.
NEW_MODEL="/home/gvillarroel/dev/synthetic-data-for-text/notebooks/google_flan-t5-small/logs/DESCRIPCION_PROPIEDADES/trial_0/checkpoints/epoch=1-step=87340.ckpt"
new_model = T5Finetuner.load_from_checkpoint(NEW_MODEL)



In [43]:
# Tokenise the prompt of row 200 by hand, padded/truncated to MAX_SRC_LEN.
# NOTE(review): `text` is passed here as-is, whereas T5Finetuner.encode_text joins
# the pieces with "<SEP>" first — presumably the two encodings differ; confirm
# which one matches training.
encoded = new_model.tokenizer.batch_encode_plus([df_text.iloc[200].text], 
                                                max_length= MAX_SRC_LEN, 
                                                truncation=True,
                                                padding='max_length',
                                                return_tensors='pt')

In [44]:
encoded

{'input_ids': tensor([[    3, 30010,     1,   846, 18083, 10106,     3,   102,  7886,    32,
          1514,  2226,    32,  9882,   736,    32,  4605,    26,     9,  3017,
             9,    75, 12765,  1533,  3483,    26,    32,  5925,    23, 15742,
          3144, 29461,  6252,     9,     3,    29,   152, 28349,    23,    32,
             7,     3,    29,   152,  4698,     2,    32,     7,     3,    29,
           152,  6900,    17,    76, 28594,     3,    29,   152, 10225,    29,
            32,     3,    29,   152,     3,   102,  7886,    32,   834,  6644,
             3, 11739,    10,     1,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [47]:
# Switch to eval mode and generate a description for the encoded prompt with
# greedy decoding (a single beam) and at most 500 tokens.
new_model.eval()
y_hat = new_model.model.generate(
    inputs=encoded['input_ids'],
    attention_mask=encoded['attention_mask'],
    num_beams=1,
    max_length=500,
    repetition_penalty=2.5,
    length_penalty=1.0,
    early_stopping=True,
    use_cache=True
)


In [50]:
# Decode every generated sequence (special tokens dropped) and join them with spaces.
" ".join([new_model.tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True) for gen_id in y_hat])

'arriendo casa amoblada, 2 dormitoriOS. 998283509.'

In [41]:
# NOTE(review): the recorded output of this cell is a TypeError, and its execution
# count (41) is lower than that of the cell that assigns the current `y_hat` (47),
# so it presumably ran against an older `y_hat` object — looks like a stale
# experiment; confirm whether it should be removed.
new_model.tokenizer.decode(y_hat.logits.squeeze(), skip_special_tokens=True, clean_up_tokenization_spaces=True)

TypeError: int() argument must be a string, a bytes-like object or a number, not 'list'

In [21]:
# Encode a hand-written prompt with an empty target to inspect encode_text's output.
# The four results are, in order: source ids, source attention mask, target ids,
# target labels.
# NOTE(review): this prompt uses a plain "key value" layout, not the "<key, value>"
# layout that convert() produces — presumably from an earlier prompt format; confirm.
a, b, c, d  = new_model.encode_text(
        ['Describe',
 '2018-04-15 precio $ tipo Casa Amoblada transacción Arriendo región Los Lagos comuna nan dormitorios nan baños nan constuidos nan terreno nan precio_real 0.0:'], ""
    )