In [1]:
import pandas as pd
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
import os

import torch
import pandas as pd
from sklearn.model_selection import train_test_split

# Clase para cargar el tokenizador
from transformers import AutoTokenizer
# Clase para secuencias a clasificación (N -> 1)
from transformers import AutoModelForSequenceClassification

# clases para entrenamiento
from transformers import Trainer, TrainingArguments

# Numpy
import numpy as np

# operative system
import os


In [2]:
TRAIN_FILE = "train_data.parquet"
TEST_FILE = "test_data.parquet"

In [3]:
df_train = pd.read_parquet(TRAIN_FILE)
df_test = pd.read_parquet(TEST_FILE)



In [4]:
import re
def get_price(price_text):
    value = int('0' + re.sub('[^0-9]', '', str(price_text)))

    if "uf" in price_text.lower():
        return value * 31000
    return value
# Convierto precio a numeros
df_train["r_price"] = df_train["price"].map(get_price)
df_test["r_price"] = df_test["price"].map(get_price)


In [5]:
df_train_venta = df_train[df_train.operation == "Venta"].reset_index(drop=True)
df_train_arriendo = df_train[df_train.operation == "Arriendo"].reset_index(drop=True)

df_test_venta = df_test[df_test.operation == "Venta"].reset_index(drop=True)
df_test_arriendo = df_test[df_test.operation == "Arriendo"].reset_index(drop=True)

In [6]:
# Selecciono el modelo base y creo un tokenizador para realizar pruebas
MODEL_NAME = "dccuchile/bert-base-spanish-wwm-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Imprimo el tamaño de la vocabulario
display(tokenizer.vocab_size)
# Imprimo los token especiales
display(tokenizer.special_tokens_map)

31002

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [7]:
# constante para entrenar o no
TRAIN_MODE = False
#MODEL_PATH="best_price/pytorch_model.bin"
MODEL_PATH ="results/checkpoint-24372/pytorch_model.bin"
#SNAPSHOT="results/checkpoint-45846/pytorch_model.bin"
SNAPSHOT=None
TRAIN_EPOCHS=10
# Cuda si existe grafica, cpu si no
device = "cuda" if torch.cuda.is_available() else "cpu"

In [8]:
# Creo el dataset, los minimos elementos a implementar son
# __init__, __len__ y __getitem__
# esto es porque itera con un for simple
class PandasDataset(torch.utils.data.Dataset):
  def __init__(self, df, x, y, tokenizer):    
    self.x = df[x]
    self.y = df[y]
    self.tokenizer = tokenizer

  def __len__(self):
    return len(self.x)

  def __getitem__(self, ix):
    return {
                **self.tokenizer(self.x[ix], truncation=True, padding="max_length", max_length=300),
                **{"label": self.y[ix], "text": self.x[ix]}
            }

In [9]:
# Creo el modelo para clasificación, para el numero de label de nuestro problema
# model_name es el nombre del modelo que quiero usar como base
num_labels = 1
model_venta = (AutoModelForSequenceClassification
         .from_pretrained(MODEL_NAME, num_labels=num_labels)
         .to(device))
model_arriendo = (AutoModelForSequenceClassification
         .from_pretrained(MODEL_NAME, num_labels=num_labels)
         .to(device))

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuc

In [10]:
# Determino los parametros del entrenamiento
BATCH_SIZE = 40
logging_steps = len(df_train) // BATCH_SIZE
training_args = TrainingArguments(output_dir="results",
                                  num_train_epochs=TRAIN_EPOCHS,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=BATCH_SIZE,
                                  per_device_eval_batch_size=BATCH_SIZE,
                                  load_best_model_at_end=True,
                                  #metric_for_best_model="f1",
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,)

In [11]:
trainer = Trainer(model=model_arriendo, args=training_args, train_dataset=PandasDataset(df_train_arriendo, "description", "r_price", tokenizer), eval_dataset=PandasDataset(df_test_arriendo, "description", "r_price", tokenizer))
if TRAIN_MODE:
    trainer.train();

results/checkpoint-2708

In [12]:
def collate_fn(batch):
    return tuple(zip(*batch))

# Dataset parecido al de entrenamiento, pero dedicado a predecir
class PandasDatasetTest(torch.utils.data.Dataset):
  def __init__(self, df, x, y, tokenizer):    
    self.x = df[x]
    self.y = df[y]
    self.tokenizer = tokenizer

  def __len__(self):
    return len(self.x)

  def __getitem__(self, ix):
    data = self.tokenizer(self.x[ix], truncation=True, padding="max_length", max_length=512)
    return data["input_ids"], data["attention_mask"]

test_dataloader = torch.utils.data.DataLoader(PandasDatasetTest(df_test_arriendo, "description", "r_price", tokenizer), batch_size=BATCH_SIZE, collate_fn=collate_fn)

In [13]:
history_models = {}

model = model_arriendo

if not TRAIN_MODE:
  model.load_state_dict(torch.load(MODEL_PATH))

# Dejo al modelo en modo evaluación (evita calculos innecesarios)
model.eval()

# donde guardo los resultados
s_results = []
# itero por los batches del dataset de test
for item in test_dataloader:
  # Obtengo el batch de input_ids dimencion (B, 23, )
  # 23 es el largo debido al padding
  input_ids = torch.tensor(item[0]).to(device)
  # Obtengo la mascara usada (B, 23, )
  attention_mask = torch.tensor(item[1]).to(device)
  # Obtengo el resultado 
  result = model(input_ids, attention_mask=attention_mask)
  s_results.append([float(x) for x in result.logits])
  # Borro los elemetos, importante para liberar memoria
  del input_ids
  del attention_mask
  del result



In [14]:
df = df_test_arriendo

df["y_pred"] = [round(x) for x in np.concatenate([np.array(i) for i in s_results])]
df["y_pred_float"] = np.concatenate([np.array(i) for i in s_results])
# Muestro los que no fueron bien asignados
x_labels = list(range(1,8,1))
y_labels = list(range(1,8,1))
matrix = np.zeros((len(x_labels), len(y_labels)))
for x in x_labels:
  for y in y_labels: 
    matrix[x-1,y-1] = df.loc[(df.rooms == x) & (round(df.y_pred_float) == y)].shape[0]
# Muestro la matrix
display(pd.DataFrame(matrix, index=x_labels, columns=y_labels))

Unnamed: 0,1,2,3,4,5,6,7
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Sección de pruebas
record = df_test_arriendo.sample(1).iloc[0]
TEXTO_PRUEBA=record.description
# Obtengo el vector y la mascara de uso
input_ids, _, attention_mask = tokenizer(TEXTO_PRUEBA).values()
# Paso a memoria de la grafica
input_ids, attention_mask = torch.tensor(input_ids).unsqueeze(0).to(device), torch.tensor(attention_mask).unsqueeze(0).to(device)
# Calculo
result = model(input_ids, attention_mask=attention_mask)

print("="*30)
display(record.description)
display(record.url)
print("="*30)
display(result)
print(f"y={record.r_price}")
print(f"y={record.n_price}")
print(f"y_pred={record.y_pred_float}")
print("="*30)
del result
del input_ids
del attention_mask



'DOSQUE & RIQUELME PROPIEDADES ARRIENDA\n\nCaracterísticas de la unidad.  \nDEPARTAMENTO CON VISTA ORIENTE, PISO 27, DE 2 DORMITORIOS, 1 BAÑOS, COCINA AMERICANA AMOBLADA Y EQUIPADA, PISO CERÁMICO EN ZONAS HÚMEDAS, AGUA CALIENTE POR TERMO ELÉCTRICO, INSTALACION DE LAVADORA Y BALCÓN.\n\nNO INCLUYE ESTACIONAMIENTO\nNO INCLUYE BODEGA\nGGCC APROX $50.000\n\nCaracterísticas del Edificio.  \nHALL DE ACCESO EN DOBLE ALTURA, CONSERJERÍA, SALA MULTIUSO, PISCINA, GIMNASIO, LAVANDERÍA, QUINCHOS, ESTACIONAMIENTOS DE VISITA, SEGURIDAD CONTROLADA LAS 24 HORAS CON CÁMARAS DE VIGILANCIA.  \n \nInformación Adicional.  \nGRAN CONECTIVIDAD DE LOCOMOCIÓN COLECTIVA MUY CERCA DE METRO ESTACIÓN IRARRÁZAVAL, SUPERMERCADOS, COMERCIO, OUTLET, PARQUE BUSTAMANTE, BANCOS, ETC. \n\nSOLICITE REQUISITOS Y CONDICIONES AL FORMULARIO DE CONTACTO\nPARA ASEGURAR UNA ATENCIÓN APROPIADA, FAVOR LLAMAR DE LUNES A VIERNES ENTRE LAS 10:00 Y 18:00 HORAS\n'

'https://www.economicos.cl/propiedades/arriendo-2d1b-en-vicuna-mackenna-1207-codAAM7KZI.html'



SequenceClassifierOutput(loss=None, logits=tensor([[216.5208]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

y=320000
y=126
y_pred=216.52076721191406


In [22]:
display(df_train.shape)
display(df_test.shape)

(216403, 18)

(24045, 18)

In [16]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
print(classification_report(df_test_arriendo.r_price.values, df_test_arriendo.y_pred.values))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      11.0
           1       0.00      0.00      0.00       2.0
           3       0.00      0.00      0.00       1.0
          22       0.00      0.00      0.00       1.0
          30       0.00      0.00      0.00       2.0
         100       0.00      0.00      0.00       1.0
         150       0.00      0.00      0.00       1.0
         160       0.00      0.00      0.00       1.0
         200       0.00      0.00      0.00       1.0
         217       0.00      0.00      0.00       0.0
         220       0.00      0.00      0.00       1.0
         240       0.00      0.00      0.00       1.0
         300       0.00      0.00      0.00       1.0
         340       0.00      0.00      0.00       1.0
         350       0.00      0.00      0.00       1.0
         360       0.00      0.00      0.00       1.0
         400       0.00      0.00      0.00       2.0
         420       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
df_test[(df_test.rooms == df_test.y_pred_float.round())].shape[0] / df_test.shape[0]

AttributeError: 'DataFrame' object has no attribute 'y_pred_float'

In [None]:
df_test[(df_test.rooms != df_test.y_pred)].to_csv("no_match.csv", index=False)
df_test.loc[(df_test.rooms != df_test.y_pred_float.round()), ['description', 'rooms', 'y_pred', 'y_pred_float']]

In [None]:
df_test[(df_test.rooms != df_test.y_pred_float.round())].iloc[0].description