In [1]:
%%capture
!pip install -U optuna pytorch_lightning==1.8.5.post0 rouge-score transformers sentencepiece pandas

In [2]:
import pandas as pd
import numpy as np
#.replace(to_replace="-1", value=np.nan)
df = pd.read_parquet('../datasets/economicos/synth/split/train.parquet').replace(to_replace="None", value=np.nan).replace(to_replace=-1, value=np.nan)
display(df.shape)

CHAR_SEP = " "
df.sample(3)


(22059, 17)

Unnamed: 0,url,description,price,property_type,transaction_type,state,county,publication_date,rooms,bathrooms,m_built,m_size,source,title,address,owner,_price
521501,https://www.economicos.cl/propiedades/departam...,Gran departamento a la venta en el Plan de Vin...,$ 230.000.000,Departamento,Venta,Valparaíso,Viña del Mar,2022-02-23 00:00:19,3.0,2.0,150.0,300.0,El Mercurio,Departamento en Venta en Viña del Mar 3 dormit...,"Plan de Viña del mar Viña del Mar, Valparaíso",Mi llave,7308.062513
587134,https://www.economicos.cl/propiedades/local-o-...,"Casa/Local Comercial, Chillán Valor: 7500 UF ...",7.500 UF,Local o Casa comercial,Venta,Biobío,Chillán,2022-02-23 08:07:00,3.0,5.0,315.0,327.0,El Mercurio,Local o Casa comercial en Venta en Chillán 3 d...,"Propiedad comercial en Venta Chillán Chillán, ...",Agente 365,7500.0
591505,https://www.economicos.cl/propiedades/departam...,DEPTO. MEJOR BARRIO DE VITACURA - 3 DORMITORIO...,9.720 UF,Departamento,Venta,Metropolitana de Santiago,Vitacura,2022-02-23 00:00:28,3.0,2.0,100.0,115.0,El Mercurio,Departamento en Venta en Vitacura 3 dormitorio...,"Calle Navidad / Las Nieves Vitacura, Metropoli...",Nexxos,9720.0


In [3]:
def convert(row):
    return {
        "text": ["Describe", f"""{row.publication_date.strftime('%Y-%m-%d')}
precio {row.price}
tipo {row.property_type}
transacción {row.transaction_type}
región {row.state}
comuna {row.county}
dormitorios {row.rooms}
baños {row.rooms}
constuidos {row.m_built}
terreno {row.m_size}
precio_real {row._price}""".replace("\n", " ")],
        "target": row.description
        }

print(
    df.sample(1).apply(convert, axis=1).iloc[-1]
)

{'text': ['Describe', '2022-07-21 precio 4.680 UF tipo Departamento transacción Venta región Metropolitana de Santiago comuna Providencia dormitorios 1.0 baños 1.0 constuidos 40.0 terreno 49.0 precio_real 4680.0'], 'target': 'Se Vende Departamento 1 Dorm. Diego de Almagro 2163. Providencia. Metro Inés de Suárez IMPECABLE, IDEAL INVERSIONISTAS! Incluye ESTACIONAMIENTO Y BODEGA.  - Descripción Departamento: Año 2016 / Constructora Pebal  - 40 m2 útiles - 49 m2 totales, gran terraza - Amplio Dormitorio Principal en suite con closet. - Cocina encimera, mesón granito. - Conexión para lavadora. - Piso 3 de 7. - Orientación Oriente.  Gastos comunes $100.000 aprox. Contribuciones $48.000.-  * Descripción áreas comunes: - Sala multiuso, Gimnasio, Lavandería, Estacionamiento de visitas, Conserjería 24/7  VENDE Y ASESORA ARCO GESTION INMOBILIARIA Agente encargado Carol Mancilla 9.48586901'}


In [4]:
df_text = pd.DataFrame(df.apply(convert, axis=1).to_list())
df_text.sample(3)

Unnamed: 0,text,target
16121,"[Describe, 2022-07-21 precio 120 UF tipo Depar...",Departamento en condominio con espectacular á...
6947,"[Describe, 2019-11-22 precio 11.190 UF tipo Ca...","Casa de dos pisos en barrio tranquilo, con ori..."
9752,"[Describe, 2022-02-23 precio 19.999 UF tipo De...",Exclusivo condominio. Precioso departamento do...


In [8]:
import argparse
from argparse import ArgumentParser
from os.path import join, isfile
from os import listdir
import optuna
from optuna.integration import PyTorchLightningPruningCallback
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from rouge_score import rouge_scorer
import shutil
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import  DataLoader, RandomSampler, SequentialSampler #Dataset,
from transformers import get_linear_schedule_with_warmup, AdamW
# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [9]:
class MetricsCallback(pl.Callback):
    def __init__(self):
        super().__init__()
        self.metrics = []

    def on_validation_end(self, trainer, pl_module):
        self.metrics.append(trainer.callback_metrics)

In [10]:
CHAR_SEP = " "
MAX_SRC_LEN = 150
MAX_TGT_LEN = 720
from functools import partial

class T5Finetuner(pl.LightningModule):

    def __init__(self, args, df, batch_size=8):
        super().__init__()
        self.save_hyperparameters()
        self.args = args
        self.model = T5ForConditionalGeneration.from_pretrained(self.args.model)
        self.tokenizer = T5Tokenizer.from_pretrained(self.args.model)
        self.data = df
        #self.scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.batch_size = batch_size
        self.decode = partial(self.tokenizer.decode, skip_special_tokens=True)

    def _encode_text(self, text_input, target):
      ctext = str(text_input)
      ctext = CHAR_SEP.join(ctext.split())
      target = str(target) #summarized text
      target = CHAR_SEP.join(target.split())
      source = self.tokenizer.batch_encode_plus([ctext], 
                                                max_length= MAX_SRC_LEN, 
                                                truncation=True,
                                                padding='max_length',
                                                return_tensors='pt')
      target = self.tokenizer.batch_encode_plus([target], 
                                                max_length=MAX_TGT_LEN,
                                                truncation=True,
                                                padding='max_length',
                                                return_tensors='pt')
      y = target['input_ids']
      target_id = y[:, :-1].contiguous()
      target_label = y[:, 1:].clone().detach()
      target_label[y[:, 1:] == self.tokenizer.pad_token_id] = -100 #in case the labels are not provided, empty string
      return source['input_ids'], source['attention_mask'], target_id, target_label
    
    def encode_text(self, text, target):
        source = self.tokenizer.batch_encode_plus([text], 
                                                max_length= MAX_SRC_LEN, 
                                                truncation=True,
                                                padding='max_length',
                                                return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([target], 
                                                max_length=MAX_TGT_LEN,
                                                truncation=True,
                                                padding='max_length',
                                                return_tensors='pt')
        y = target['input_ids']
        target_id = y[:, :-1].contiguous()
        target_label = y[:, 1:].clone().detach()
        target_label[y[:, 1:] == self.tokenizer.pad_token_id] = -100 #in case the labels are not provided, empty string
        return source['input_ids'], source['attention_mask'], target_id, target_label

    
    def prepare_df(self, df):
        source_ids, source_masks, target_ids, target_labels = [], [], [], [] 
        for _, row in df.iterrows():
            source_id, source_mask, target_id, target_label = self.encode_text(row.text, row.target)
            source_ids.append(source_id)
            source_masks.append(source_mask)
            target_ids.append(target_id)
            target_labels.append(target_label)

        # Convert the lists into tensors
        source_ids = torch.cat(source_ids, dim=0)
        source_masks = torch.cat(source_masks, dim=0)
        target_ids = torch.cat(target_ids, dim=0)
        target_labels = torch.cat(target_labels, dim=0)
        # splitting the data to train, validation, and test
        return TensorDataset(source_ids, source_masks, target_ids, target_labels)
    
    def prepare_data(self):
        # splitting the data to train, validation, and test
        data = self.prepare_df(self.data)
        train_size, val_size = int(0.8 * len(data)), int(0.1 * len(data))
        test_size = len(data) - (train_size + val_size)
        self.train_dat, self.val_dat, self.test_dat = \
            random_split(data, [train_size, val_size, test_size])
    
    def forward(self, batch, batch_idx):
        source_ids, source_mask, target_ids, target_labels = batch[:4]
        return self.model(input_ids = source_ids, attention_mask = source_mask, 
                          decoder_input_ids=target_ids, labels=target_labels)
        
    def training_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        return {'loss': loss, 'log': {'train_loss': loss}}
    
    def predict_step(self, batch, batch_idx: int, dataloader_idx: int = 0):
        return self(batch, batch_idx)
        return " ".join(map(self.decode, y_hat))

    def validation_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        return {'loss': loss, 'val_loss': loss}

    def validation_epoch_end(self, outputs):
        loss = sum([o['loss'] for o in outputs]) / len(outputs)
        out = {'val_loss': loss}
        return {**out, 'log': out}

    def test_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        return {'loss': loss}

    def test_epoch_end(self, outputs):
        loss = sum([o['loss'] for o in outputs]) / len(outputs)
        out = {'test_loss': loss}
        return {**out, 'log': out}
    
    def train_dataloader(self):
        return DataLoader(self.train_dat, batch_size=self.batch_size,
                          num_workers=4, sampler=RandomSampler(self.train_dat))

    def val_dataloader(self):
        return DataLoader(self.val_dat, batch_size=self.args.bs, num_workers=4,
                          sampler=SequentialSampler(self.val_dat))

    def test_dataloader(self):
        return DataLoader(self.test_dat, batch_size=self.args.bs, num_workers=4,
                          sampler=SequentialSampler(self.test_dat))    

    def configure_optimizers(self):
        optimizer = AdamW(self.model.parameters(), lr=self.args.lr, eps=1e-4)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0,
            num_training_steps=self.args.max_epochs * len(self.train_dat))
        return {'optimizer': optimizer, 'lr_scheduler': scheduler}
    
    def generate_summary(self, ctext, summ_len=150, text='', beam_search=2, repetition_penalty=2.5):
        source_id, source_mask, target_id, target_label = self.encode_text(ctext, text)
        self.model.eval()
        with torch.no_grad():
            generated_ids = self.model.generate(
                input_ids = source_id,
                attention_mask = source_mask, 
                max_length=summ_len, 
                truncation=True,
                num_beams=beam_search,
                repetition_penalty=repetition_penalty, 
                length_penalty=1.0, 
                early_stopping=True
                )
            prediction = [self.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
        if len(text) > 0:
            target = [self.tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in target_id]
            scores = self.scorer.score(target[0], prediction[0])
            return prediction, scores
        else:
            return prediction
        

    def save_core_model(self):
        store_path = join(self.args.output, self.args.name, 'core')
        self.model.save_pretrained(store_path)
        self.tokenizer.save_pretrained(store_path)
        
    @staticmethod
    def add_model_specific_args(parent_parser):
        p = ArgumentParser(parents=[parent_parser], add_help=False)
        p.add_argument('-m', '--model', type=str, default='t5-base',
                       help='name of the model or the path pointing to it')
        p.add_argument('--bs', '--batch_size', type=int, default=2)
        p.add_argument('--source_len', type=int, default=120)
        p.add_argument('--summ_len', type=int, default=700)
        return p

In [11]:
def default_args():
    p = ArgumentParser()
    p.add_argument('-p', '--path', type=str,  
                   default='/content/gdrive/My Drive/Colab Notebooks/data/text_summarization_t5/news_summary.csv',
                  help='path to the data file')
    p.add_argument('-o', '--output', type=str, default='/tmp/tpu-template',
                  help='path to the output directory for storing the model')
    p.add_argument('-n', '--name', type=str, default='google/t5-v1_1-xxl',
                  help='this name will be used on tensorboard for the model')
    p.add_argument('-t', '--trials', type=int, default=1,
                  help='number of trials for hyperparameter search')
    p.add_argument('--seed', type=int, default=0, help='randomization seed')
    p = T5Finetuner.add_model_specific_args(p)
    p = pl.Trainer.add_argparse_args(p)
    args,_ = p.parse_known_args()
    args.max_epochs = 2
    return args

def default_args():
    p = ArgumentParser()
    args,_ = p.parse_known_args()
    args.max_epochs = 2
    #args.model = "google/flan-t5-small"
    #args.model = "google/flan-t5-xl"
    args.model = "google/flan-t5-large"
    #args.model = "google/flan-t5-base"
    args.output = f"./{args.model.replace('/','_')}"
    args.name = "DESCRIPCION_PROPIEDADES"
    args.bs = 1 # batch size
    return args

def optuna_objective(trial):
    args = default_args()
    # sampling the hyperparameters
    args.lr = trial.suggest_categorical("lr", [1e-6, 5e-6, 1e-5, 5e-5, 1e-4])
    # setting up the right callbacks
    cp_callback = pl.callbacks.ModelCheckpoint(
        join(args.output, args.name, f"trial_{trial.number}", "{epoch}"),
        monitor="val_loss", mode="min")
    pr_callback = PyTorchLightningPruningCallback(trial, monitor="val_loss")
    metrics_callback = MetricsCallback()
    summarizer = T5Finetuner(args, df_text)         # loading the model
    trainer = pl.Trainer.from_argparse_args(      # loading the trainer
        args, 
        accelerator="gpu",
        devices=1,
        default_root_dir=args.output, gradient_clip_val=1.0,
        #checkpoint_callback=cp_callback,
        callbacks=[metrics_callback],
        #early_stop_callback=pr_callback, 
        num_sanity_val_steps=-1,
        #auto_scale_batch_size="power",
        # select TensorBoad or Wandb logger
        logger=TensorBoardLogger(join(args.output, 'logs'), name=args.name, version=f'trial_{trial.number}')
        )
  
    trainer.fit(summarizer)                       # fitting the model
    trainer.test(summarizer)                      # testing the model
    return min([x['val_loss'].item() for x in metrics_callback.metrics])

In [12]:
#NEW_MODEL="/home/gvillarroel/dev/synthetic-data-for-text/notebooks/google_flan-t5-base/logs/DESCRIPCION_PROPIEDADES/trial_0/checkpoints/epoch=1-step=436696.ckpt"
NEW_MODEL = "/home/gvillarroel/dev/synthetic-data-for-text/notebooks/google_flan-t5-large/logs/DESCRIPCION_PROPIEDADES/trial_0/checkpoints/epoch=1-step=873392.ckpt"
new_model = T5Finetuner.load_from_checkpoint(NEW_MODEL)



In [23]:
from textwrap import wrap
from numpy import random

new_model.model.eval()
new_model.model.cuda()
#record = df_text.sample(1).iloc[0]
def gen_new_text(record):
    #encoded = new_model.tokenizer.batch_encode_plus(record.text, max_length= MAX_SRC_LEN, truncation=False,padding=False,return_tensors='pt')
    encoded = new_model.tokenizer.batch_encode_plus(record.text, 
                                                max_length= 120, 
                                                truncation=True,
                                                padding='max_length',
                                                return_tensors='pt')
    #encoded = new_model.tokenizer("<sep>".join(record.text), return_tensors="pt")
    
    max_length= random.randint(200, 600)
    min_length = random.randint(20, max_length-5)
    min_length=60
    max_length=100
    input_ids =encoded['input_ids'].cuda() 
    att = encoded['attention_mask'].cuda()
    y_hat = new_model.model.generate(
        inputs= input_ids,
        attention_mask=att,
        num_beams=1,
        min_length=min_length,
        max_length=max_length,
        repetition_penalty=5.5,
        length_penalty=0.1,
        early_stopping=True,
        temperature=random.random(),
        use_cache=False,
        top_p=random.random(),
        #top_k=0,
        do_sample=True
    )
    del input_ids
    del att
    return " ".join([new_model.tokenizer.decode(gen_id, skip_special_tokens=True) for gen_id in y_hat])

display(" ".join(record.text))
display(gen_new_text(record))
display(record.target)

'Describe 2022-07-21 precio 22.000 UF tipo Casa transacción Venta región Araucanía comuna Temuco dormitorios 5.0 baños 5.0 constuidos 360.0 terreno 5000.0 precio_real 22000.0'

'- Especialistas in art, culture y music. ----- Ubicados: Ibarra 910 Oficina A-614 Concepción; Fono (09) 88217303/ +56984723607 Mail (jdistribucionist@gmail.com); mosa casa en Condominio Valle de la Araucana, Temuco. Casa con 5 dormitorios 3 baos Sala estar Living comedor separados Piscina Quincho Terraza Bodegas Estupendamente cuidadizada Ubicada al norte del condomino La propiedad consta: Primer piso Hall acceso living Comedin Comodo dormitor'

'Se vende casa en Lomas Del Carmen, Temuco. Se vende casa 360m2 construidos, 5000m2 terreno en exclusivo Condominio Lomas del Carmen en Temuco. Propiedad estilo chilena con influencia colonia alemana rural. 2 niveles más un altillo para uso recreacional, 5 dormitorios, 5 baños, más casa de huéspedes de 82 m2. Lujosa, amplia y cómoda casa para disfrutar en familia. Condominio con seguridad las 24hrs del día.'

In [11]:
#import swifter
data_g  = df_text.head(1000).apply(gen_new_text, axis=1)
df_new_text = df_text.head(1000).assign(generated=data_g)
df_new_text.to_parquet("m2.parquet")

In [None]:
import polars as prs
df_text.to_parquet("text.parquet")
df_p_text = prs.read_parquet("text.parquet")

In [None]:
def gen_new_text_p(record):
    #encoded = new_model.tokenizer.batch_encode_plus(record.text, max_length= MAX_SRC_LEN, truncation=False,padding=False,return_tensors='pt')
    encoded = new_model.tokenizer.batch_encode_plus(record[0], 
                                                max_length= 120, 
                                                truncation=True,
                                                padding='max_length',
                                                return_tensors='pt')
    #encoded = new_model.tokenizer("<sep>".join(record.text), return_tensors="pt")
    
    max_length= random.randint(200, 600)
    min_length = random.randint(20, max_length-5)
    input_ids =encoded['input_ids'].cuda() 
    att = encoded['attention_mask'].cuda()
    y_hat = new_model.model.generate(
        inputs= input_ids,
        attention_mask=att,
        num_beams=1,
        min_length=min_length,
        max_length=max_length,
        repetition_penalty=5.5,
        length_penalty=0.1,
        early_stopping=True,
        temperature=random.random(),
        use_cache=False,
        top_p=random.random(),
        top_k=0,
        do_sample=True
    )
    del input_ids
    del att
    return " ".join([new_model.tokenizer.decode(gen_id, skip_special_tokens=True) for gen_id in y_hat])
#df_out = df_p_text.head(1000).apply(gen_new_text_p)