In [1]:
%%capture
!pip install -U optuna pytorch_lightning==1.8.5.post0 rouge-score transformers sentencepiece pandas

In [2]:
import pandas as pd
import numpy as np
#.replace(to_replace="-1", value=np.nan)
df = pd.read_parquet('../datasets/economicos/synth/split/train.parquet').replace(to_replace="None", value=np.nan).replace(to_replace=-1, value=np.nan)
display(df.shape)

CHAR_SEP = " "
df.sample(3)


(545870, 17)

Unnamed: 0,url,description,price,property_type,transaction_type,state,county,publication_date,rooms,bathrooms,m_built,m_size,source,title,address,owner,_price
193185,https://www.economicos.cl/propiedades/vendo-de...,"Vendo depto. nuevo, piso 9 y último piso con ...",9500 UF,Departamento,Venta,Metropolitana de Santiago,Providencia,2019-03-27 21:44:03,3.0,2.0,8304.0,9586.0,,Vendo depto. nuevo 3 dormitorios 9.500 UF,"Silvina Hurtado / Antonio Varas Providencia, M...",-1,9500.0
291391,https://www.economicos.cl/propiedades/departam...,420.000 3 dormitorios acceso controlado 99435...,$ 420.000,Departamento,Arriendo,Metropolitana de Santiago,Maipú,2018-10-31 00:00:27,3.0,,,,El Mercurio,Departamento en Arriendo en Maipú 3 dormitorios,"Maipú, Metropolitana de Santiago",-1,15.310531
743799,https://www.economicos.cl/propiedades/departam...,Se arrienda hermoso y amplio departamento en M...,$ 380.000,Departamento,Arriendo,Metropolitana de Santiago,Macul,2020-11-19 12:25:08,2.0,2.0,,,,Departamento a pasos del metro Las Torres,"Macul, Metropolitana de Santiago",Katherine Torres,13.123355


In [3]:
def convert(row):
    return {
        "text": ["Describe", f"""{row.publication_date.strftime('%Y-%m-%d')}
precio {row.price}
tipo {row.property_type}
transacción {row.transaction_type}
región {row.state}
comuna {row.county}
dormitorios {row.rooms}
baños {row.rooms}
constuidos {row.m_built}
terreno {row.m_size}
precio_real {row._price}""".replace("\n", " ")],
        "target": row.description
        }

print(
    df.sample(1).apply(convert, axis=1).iloc[-1]
)

{'text': ['Describe', '2018-06-15 precio $ 366.000.000 tipo Casa transacción Venta región Metropolitana de Santiago comuna Providencia dormitorios 5.0 baños 5.0 constuidos 210.0 terreno 275.0 precio_real 13496.511247776923'], 'target': 'Amplia Casa, cerca de la Estación Metro Baquedano. \nHERMOSA CASA SEÑORIAL.\nHall de entrada, amplio living separado de amplio comedor, 4 dormitorios, 2 baños, cocina amplia, pieza y baño de empleada, patio, garage para un auto.\nRevestimiento piso: madera.\nMetros útiles: 210 m2.\nMetros terreno: 275 m2.\nCerca de supermercados, bancos, comercios, colegios, etc.\nValor: $ 366.000.000.-\n'}


In [4]:
df_text = pd.DataFrame(df.apply(convert, axis=1).to_list())
df_text.sample(3)

Unnamed: 0,text,target
8851,"[Describe, 2022-02-23 precio $ 299.040.000 tip...","Se vende casa en condominio excelente estado, ..."
328228,"[Describe, 2019-03-19 precio $ 58.000.000 tipo...","Arriendo departamento en Quilpué, sector Paso..."
212342,"[Describe, 2018-02-23 precio 6300 UF tipo Depa...",Se vende departamento hermosa vista al mar. 3 ...


In [5]:
import argparse
from argparse import ArgumentParser
from os.path import join, isfile
from os import listdir
import optuna
from optuna.integration import PyTorchLightningPruningCallback
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from rouge_score import rouge_scorer
import shutil
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import  DataLoader, RandomSampler, SequentialSampler #Dataset,
from transformers import get_linear_schedule_with_warmup, AdamW
# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [6]:
class MetricsCallback(pl.Callback):
    def __init__(self):
        super().__init__()
        self.metrics = []

    def on_validation_end(self, trainer, pl_module):
        self.metrics.append(trainer.callback_metrics)

In [7]:
CHAR_SEP = " "
MAX_SRC_LEN = 150
MAX_TGT_LEN = 720
from functools import partial

class T5Finetuner(pl.LightningModule):

    def __init__(self, args, df, batch_size=8):
        super().__init__()
        self.save_hyperparameters()
        self.args = args
        self.model = T5ForConditionalGeneration.from_pretrained(self.args.model)
        self.tokenizer = T5Tokenizer.from_pretrained(self.args.model)
        self.data = df
        #self.scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.batch_size = batch_size
        self.decode = partial(self.tokenizer.decode, skip_special_tokens=True)

    def _encode_text(self, text_input, target):
      ctext = str(text_input)
      ctext = CHAR_SEP.join(ctext.split())
      target = str(target) #summarized text
      target = CHAR_SEP.join(target.split())
      source = self.tokenizer.batch_encode_plus([ctext], 
                                                max_length= MAX_SRC_LEN, 
                                                truncation=True,
                                                padding='max_length',
                                                return_tensors='pt')
      target = self.tokenizer.batch_encode_plus([target], 
                                                max_length=MAX_TGT_LEN,
                                                truncation=True,
                                                padding='max_length',
                                                return_tensors='pt')
      y = target['input_ids']
      target_id = y[:, :-1].contiguous()
      target_label = y[:, 1:].clone().detach()
      target_label[y[:, 1:] == self.tokenizer.pad_token_id] = -100 #in case the labels are not provided, empty string
      return source['input_ids'], source['attention_mask'], target_id, target_label
    
    def encode_text(self, text, target):
        source = self.tokenizer.batch_encode_plus([text], 
                                                max_length= MAX_SRC_LEN, 
                                                truncation=True,
                                                padding='max_length',
                                                return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([target], 
                                                max_length=MAX_TGT_LEN,
                                                truncation=True,
                                                padding='max_length',
                                                return_tensors='pt')
        y = target['input_ids']
        target_id = y[:, :-1].contiguous()
        target_label = y[:, 1:].clone().detach()
        target_label[y[:, 1:] == self.tokenizer.pad_token_id] = -100 #in case the labels are not provided, empty string
        return source['input_ids'], source['attention_mask'], target_id, target_label

    
    def prepare_df(self, df):
        source_ids, source_masks, target_ids, target_labels = [], [], [], [] 
        for _, row in df.iterrows():
            source_id, source_mask, target_id, target_label = self.encode_text(row.text, row.target)
            source_ids.append(source_id)
            source_masks.append(source_mask)
            target_ids.append(target_id)
            target_labels.append(target_label)

        # Convert the lists into tensors
        source_ids = torch.cat(source_ids, dim=0)
        source_masks = torch.cat(source_masks, dim=0)
        target_ids = torch.cat(target_ids, dim=0)
        target_labels = torch.cat(target_labels, dim=0)
        # splitting the data to train, validation, and test
        return TensorDataset(source_ids, source_masks, target_ids, target_labels)
    
    def prepare_data(self):
        # splitting the data to train, validation, and test
        data = self.prepare_df(self.data)
        train_size, val_size = int(0.8 * len(data)), int(0.1 * len(data))
        test_size = len(data) - (train_size + val_size)
        self.train_dat, self.val_dat, self.test_dat = \
            random_split(data, [train_size, val_size, test_size])
    
    def forward(self, batch, batch_idx):
        source_ids, source_mask, target_ids, target_labels = batch[:4]
        return self.model(input_ids = source_ids, attention_mask = source_mask, 
                          decoder_input_ids=target_ids, labels=target_labels)
        
    def training_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        return {'loss': loss, 'log': {'train_loss': loss}}
    
    def predict_step(self, batch, batch_idx: int, dataloader_idx: int = 0):
        return self(batch, batch_idx)
        return " ".join(map(self.decode, y_hat))

    def validation_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        return {'loss': loss, 'val_loss': loss}

    def validation_epoch_end(self, outputs):
        loss = sum([o['loss'] for o in outputs]) / len(outputs)
        out = {'val_loss': loss}
        return {**out, 'log': out}

    def test_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        return {'loss': loss}

    def test_epoch_end(self, outputs):
        loss = sum([o['loss'] for o in outputs]) / len(outputs)
        out = {'test_loss': loss}
        return {**out, 'log': out}
    
    def train_dataloader(self):
        return DataLoader(self.train_dat, batch_size=self.batch_size,
                          num_workers=4, sampler=RandomSampler(self.train_dat))

    def val_dataloader(self):
        return DataLoader(self.val_dat, batch_size=self.args.bs, num_workers=4,
                          sampler=SequentialSampler(self.val_dat))

    def test_dataloader(self):
        return DataLoader(self.test_dat, batch_size=self.args.bs, num_workers=4,
                          sampler=SequentialSampler(self.test_dat))    

    def configure_optimizers(self):
        optimizer = AdamW(self.model.parameters(), lr=self.args.lr, eps=1e-4)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0,
            num_training_steps=self.args.max_epochs * len(self.train_dat))
        return {'optimizer': optimizer, 'lr_scheduler': scheduler}
    
    def generate_summary(self, ctext, summ_len=150, text='', beam_search=2, repetition_penalty=2.5):
        source_id, source_mask, target_id, target_label = self.encode_text(ctext, text)
        self.model.eval()
        with torch.no_grad():
            generated_ids = self.model.generate(
                input_ids = source_id,
                attention_mask = source_mask, 
                max_length=summ_len, 
                truncation=True,
                num_beams=beam_search,
                repetition_penalty=repetition_penalty, 
                length_penalty=1.0, 
                early_stopping=True
                )
            prediction = [self.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
        if len(text) > 0:
            target = [self.tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in target_id]
            scores = self.scorer.score(target[0], prediction[0])
            return prediction, scores
        else:
            return prediction
        

    def save_core_model(self):
        store_path = join(self.args.output, self.args.name, 'core')
        self.model.save_pretrained(store_path)
        self.tokenizer.save_pretrained(store_path)
        
    @staticmethod
    def add_model_specific_args(parent_parser):
        p = ArgumentParser(parents=[parent_parser], add_help=False)
        p.add_argument('-m', '--model', type=str, default='t5-base',
                       help='name of the model or the path pointing to it')
        p.add_argument('--bs', '--batch_size', type=int, default=2)
        p.add_argument('--source_len', type=int, default=120)
        p.add_argument('--summ_len', type=int, default=700)
        return p

In [8]:
def default_args():
    p = ArgumentParser()
    p.add_argument('-p', '--path', type=str,  
                   default='/content/gdrive/My Drive/Colab Notebooks/data/text_summarization_t5/news_summary.csv',
                  help='path to the data file')
    p.add_argument('-o', '--output', type=str, default='/tmp/tpu-template',
                  help='path to the output directory for storing the model')
    p.add_argument('-n', '--name', type=str, default='google/t5-v1_1-xxl',
                  help='this name will be used on tensorboard for the model')
    p.add_argument('-t', '--trials', type=int, default=1,
                  help='number of trials for hyperparameter search')
    p.add_argument('--seed', type=int, default=0, help='randomization seed')
    p = T5Finetuner.add_model_specific_args(p)
    p = pl.Trainer.add_argparse_args(p)
    args,_ = p.parse_known_args()
    args.max_epochs = 2
    return args

def default_args():
    p = ArgumentParser()
    args,_ = p.parse_known_args()
    args.max_epochs = 2
    #args.model = "google/flan-t5-small"
    #args.model = "google/flan-t5-xl"
    args.model = "google/flan-t5-large"
    #args.model = "google/flan-t5-base"
    args.output = f"./{args.model.replace('/','_')}"
    args.name = "DESCRIPCION_PROPIEDADES"
    args.bs = 1 # batch size
    return args

def optuna_objective(trial):
    args = default_args()
    # sampling the hyperparameters
    args.lr = trial.suggest_categorical("lr", [1e-6, 5e-6, 1e-5, 5e-5, 1e-4])
    # setting up the right callbacks
    cp_callback = pl.callbacks.ModelCheckpoint(
        join(args.output, args.name, f"trial_{trial.number}", "{epoch}"),
        monitor="val_loss", mode="min")
    pr_callback = PyTorchLightningPruningCallback(trial, monitor="val_loss")
    metrics_callback = MetricsCallback()
    summarizer = T5Finetuner(args, df_text)         # loading the model
    trainer = pl.Trainer.from_argparse_args(      # loading the trainer
        args, 
        accelerator="gpu",
        devices=1,
        default_root_dir=args.output, gradient_clip_val=1.0,
        #checkpoint_callback=cp_callback,
        callbacks=[metrics_callback],
        #early_stop_callback=pr_callback, 
        num_sanity_val_steps=-1,
        #auto_scale_batch_size="power",
        # select TensorBoad or Wandb logger
        logger=TensorBoardLogger(join(args.output, 'logs'), name=args.name, version=f'trial_{trial.number}')
        )
  
    trainer.fit(summarizer)                       # fitting the model
    trainer.test(summarizer)                      # testing the model
    return min([x['val_loss'].item() for x in metrics_callback.metrics])

In [9]:
#NEW_MODEL="/home/gvillarroel/dev/synthetic-data-for-text/notebooks/google_flan-t5-base/logs/DESCRIPCION_PROPIEDADES/trial_0/checkpoints/epoch=1-step=436696.ckpt"
NEW_MODEL = "/home/gvillarroel/dev/synthetic-data-for-text/notebooks/google_flan-t5-large/logs/DESCRIPCION_PROPIEDADES/trial_0/checkpoints/epoch=1-step=873392.ckpt"
new_model = T5Finetuner.load_from_checkpoint(NEW_MODEL)



In [28]:
from textwrap import wrap
from numpy import random

new_model.model.eval()
new_model.model.cuda()
record = df_text.sample(1).iloc[0]
def gen_new_text(record):
    #encoded = new_model.tokenizer.batch_encode_plus(record.text, max_length= MAX_SRC_LEN, truncation=False,padding=False,return_tensors='pt')
    encoded = new_model.tokenizer.batch_encode_plus(record.text, 
                                                max_length= 120, 
                                                truncation=True,
                                                padding='max_length',
                                                return_tensors='pt')
    #encoded = new_model.tokenizer("<sep>".join(record.text), return_tensors="pt")
    
    max_length= random.randint(200, 600)
    min_length = random.randint(20, max_length-5)
    input_ids =encoded['input_ids'].cuda() 
    att = encoded['attention_mask'].cuda()
    y_hat = new_model.model.generate(
        inputs= input_ids,
        attention_mask=att,
        num_beams=1,
        min_length=min_length,
        max_length=max_length,
        repetition_penalty=5.5,
        length_penalty=0.1,
        early_stopping=True,
        temperature=random.random(),
        use_cache=False,
        top_p=random.random(),
        top_k=0,
        do_sample=True
    )
    del input_ids
    del att
    return " ".join([new_model.tokenizer.decode(gen_id, skip_special_tokens=True) for gen_id in y_hat])

display(" ".join(record.text))
display(wrap(gen_new_text(record)))
display(wrap(record.target))

'Describe 2019-09-03 precio 6850 UF tipo Departamento transacción Venta región Metropolitana de Santiago comuna Ñuñoa dormitorios 2.0 baños 2.0 constuidos 105.0 terreno nan precio_real 6850.0'

['Si quieres una increible experiencia, está la mejor oportunidad de que',
 'no tienes todo! Ubicados just antes del centro comercial Santa Isabel.',
 'A pasó se pude ver este hermoso punto con sus siguientes',
 'atractivitudes: •Piscina para adultos •Club house (con',
 'bajadas)•Gimnacio equipamiental completamente amable •Area verde',
 'interior tipico “L”(por su formación), idealmente disfrutando al mismo',
 'moment. Para mayor informacion contactarnos por email y/o whattapp +56',
 '9 744 84794 departamento en venta, 2 dormitorios (principal con',
 'walking closet), bao principal completamente remodelados. Living',
 'comedor separador de ambiente, cocina americana equipada y amuéblada',
 'con logia incorporada, estar familiar que pude ser utilizado tambièn',
 'para oficion de salón multiuso, piscina temperada al interior del',
 'condominio más un quinchero mujerizado por la comunidad de los',
 'propietarios durante todas sus tardes; accesible controlanando su',
 'ingreso 24/7']

['VENDO DEPARTAMENTO DE 2 DORMITORIOS 2 BAÑOS CON LOGGIA Y 2',
 'ESTACIONAMIENTOS Y 1 BODEGA DE 102 MTS2  Superficie Interior: 84.4 m2',
 'Superficie Logia: 2.5 m2 Superficie Terraza: 15.0 m2 Superficie Total:',
 '101.9 m2  VALOR VENTA 6850 UF  FICHA EDIFICIO CHILE ESPAÑA  • Moderno',
 'hall de acceso en doble altura y recepción finamente amoblada  • Gran',
 'conserjería para control de acceso vehicular y peatonal  • 3 Amplios',
 'ascensores Mitsubishi de última tecnología con sincronización',
 'simultánea y puertas de apertura central  • Grupo electrógeno de',
 'emergencia para ascensores y algunos espacios comunes  • Circuito',
 'cerrado de TV y grabación de imágenes con múltiples cámaras en',
 'accesos y ascensores  • 2 Amplias salas multiuso y sala de juegos  •',
 'Gimnasio equipado  • Bicicenter  • Lavandería  • Piscina exterior en',
 'primer piso  • Áreas verdes y terrazas  • Baños y cocina para',
 'personal de servicio  • Gran cantidad de estacionamientos para vi

In [11]:
#import swifter
data_g  = df_text.head(1000).apply(gen_new_text, axis=1)
df_new_text = df_text.head(1000).assign(generated=data_g)
df_new_text.to_parquet("m2.parquet")

In [29]:
import polars as prs
df_text.to_parquet("text.parquet")
df_p_text = prs.read_parquet("text.parquet")

In [34]:
def gen_new_text_p(record):
    #encoded = new_model.tokenizer.batch_encode_plus(record.text, max_length= MAX_SRC_LEN, truncation=False,padding=False,return_tensors='pt')
    encoded = new_model.tokenizer.batch_encode_plus(record[0], 
                                                max_length= 120, 
                                                truncation=True,
                                                padding='max_length',
                                                return_tensors='pt')
    #encoded = new_model.tokenizer("<sep>".join(record.text), return_tensors="pt")
    
    max_length= random.randint(200, 600)
    min_length = random.randint(20, max_length-5)
    input_ids =encoded['input_ids'].cuda() 
    att = encoded['attention_mask'].cuda()
    y_hat = new_model.model.generate(
        inputs= input_ids,
        attention_mask=att,
        num_beams=1,
        min_length=min_length,
        max_length=max_length,
        repetition_penalty=5.5,
        length_penalty=0.1,
        early_stopping=True,
        temperature=random.random(),
        use_cache=False,
        top_p=random.random(),
        top_k=0,
        do_sample=True
    )
    del input_ids
    del att
    return " ".join([new_model.tokenizer.decode(gen_id, skip_special_tokens=True) for gen_id in y_hat])
#df_out = df_p_text.head(1000).apply(gen_new_text_p)