In [5]:
import pandas as pd
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
import os

import torch
import pandas as pd
from sklearn.model_selection import train_test_split

# Clase para cargar el tokenizador
from transformers import AutoTokenizer
# Clase para secuencias a clasificación (N -> 1)
from transformers import AutoModelForSequenceClassification

# clases para entrenamiento
from transformers import Trainer, TrainingArguments

# Numpy
import numpy as np

# Operating system utilities
import os

from transformers import pipeline                                                   



In [2]:
# Parquet files holding the train and test splits
DATA_TRAIN = "train_data.parquet"
DATA_TEST = "test_data.parquet"

# Read both splits (train first, then test) and give each a fresh 0..N-1 index
df_train, df_test = (
    pd.read_parquet(parquet_path).reset_index(drop=True)
    for parquet_path in (DATA_TRAIN, DATA_TEST)
)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

In [7]:
# Number of outputs of the classification head
NUM_LABELS = 1

# Name of the pretrained checkpoint the model and tokenizer are built from
MODEL_NAME = "dccuchile/bert-base-spanish-wwm-uncased"
# Local file with the weights that are loaded into the model afterwards
MODEL_PATH = "best_bathrooms/pytorch_model.bin"

In [8]:
# Sequence-classification model built from the MODEL_NAME checkpoint,
# with a head of NUM_LABELS outputs
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)
# Tokenizer loaded from the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuc

In [9]:
# Load the fine-tuned weights into the model created above.
# map_location="cpu": nothing in this notebook has moved the model off the CPU
# at this point, and without it a checkpoint that was saved from a GPU cannot be
# deserialised on a CPU-only machine.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load(MODEL_PATH, map_location="cpu"))

<All keys matched successfully>

In [18]:
# Wrap the model and tokenizer in a "feature-extraction" pipeline
pipe = pipeline("feature-extraction", model=model, tokenizer=tokenizer)

# Show the first two test descriptions, then the pipeline output for them
sample_texts = [df_test.iloc[row].description for row in (0, 1)]
display(sample_texts)
display(pipe(sample_texts))

['Excelente casa grande con mas de 400m2, 4 habitaciones, 2 baños, sala, cocina, comedor, estacionamiento para 3, jardin y terreno. Buena ubicación a 2 cuadras de estacion del metro',
 'Departamento ubicado en Condominio El Jardín del Llano. 3 dormitorios, 2 baños, estacionamiento (uso y goce), bodega. 72 m2 aprox.']

[[[2.006520986557007]], [[1.9985408782958984]]]

'Excelente casa grande con mas de 400m2, 4 habitaciones, 2 baños, sala, cocina, comedor, estacionamiento para 3, jardin y terreno. Buena ubicación a 2 cuadras de estacion del metro'

In [14]:
# Show the first test description, then what the pipeline returns for it
sample_text = df_test["description"].iloc[0]
display(sample_text, pipe(sample_text))

'Excelente casa grande con mas de 400m2, 4 habitaciones, 2 baños, sala, cocina, comedor, estacionamiento para 3, jardin y terreno. Buena ubicación a 2 cuadras de estacion del metro'

[{'label': 'LABEL_0', 'score': 0.8814799785614014}]

In [15]:
# Show the second test description, then what the pipeline returns for it
sample_text = df_test["description"].iloc[1]
display(sample_text, pipe(sample_text))

'Departamento ubicado en Condominio El Jardín del Llano. 3 dormitorios, 2 baños, estacionamiento (uso y goce), bodega. 72 m2 aprox.'

[{'label': 'LABEL_0', 'score': 0.8806438446044922}]

In [None]:
def collate_fn(batch):
    """Transpose a batch of samples into per-field tuples.

    ``batch`` is an iterable of equally structured samples, e.g.
    ``[(ids_0, mask_0), (ids_1, mask_1)]``. The result groups the i-th
    element of every sample together: ``((ids_0, ids_1), (mask_0, mask_1))``.
    An empty batch yields an empty tuple.
    """
    per_field = zip(*batch)
    return tuple(per_field)

# Dataset similar to the training one, but dedicated to prediction
class PandasDatasetTest(torch.utils.data.Dataset):
    """Tokenise one text column of a DataFrame, one row at a time.

    Args:
        df: DataFrame holding the data.
        x: Name of the column with the input texts.
        y: Name of the target column. It is kept on ``self.y`` but is not
            returned by ``__getitem__``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=max_length)``; its result must
            support ``["input_ids"]`` and ``["attention_mask"]``.
        max_length: Value forwarded to the tokenizer as ``max_length``
            (default 512, the value that used to be hard-coded).
    """

    def __init__(self, df, x, y, tokenizer, max_length=512):
        self.x = df[x]
        self.y = df[y]
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the text column."""
        return len(self.x)

    def __getitem__(self, ix):
        """Return ``(input_ids, attention_mask)`` for the row at position ``ix``."""
        # DataLoader samplers hand out positions 0..len-1, so index by position.
        # Plain ``self.x[ix]`` is a label lookup on an integer index and raises
        # KeyError as soon as the DataFrame index is not 0..N-1 (for example
        # after filtering rows without reset_index).
        data = self.tokenizer(
            self.x.iloc[ix],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
        )
        return data["input_ids"], data["attention_mask"]

# Batches of 20 samples over the test descriptions; collate_fn keeps each batch
# as per-field tuples instead of stacking tensors
test_dataloader = torch.utils.data.DataLoader(
    dataset=PandasDatasetTest(df_test, "description", "bathrooms", tokenizer),
    batch_size=20,
    collate_fn=collate_fn,
)

In [None]:
history_models = {}

# TRAIN_MODE is not defined anywhere in this notebook; treat a missing flag as
# "inference" so a fresh Restart & Run All does not stop with a NameError.
if not globals().get("TRAIN_MODE", False):
    # map_location lets a checkpoint saved from a GPU load on a CPU-only machine.
    model.load_state_dict(torch.load(MODEL_PATH, map_location=device))

# The batches below are moved to `device`, so the model has to live there too;
# otherwise the forward pass fails with a device mismatch when CUDA is available.
model.to(device)
# eval() switches layers such as dropout to inference behaviour. It does NOT
# turn off gradient tracking - torch.no_grad() below does that.
model.eval()

# One list of float predictions per batch, in dataloader order
s_results = []
with torch.no_grad():
    # Iterate over the batches of the test dataset
    for item in test_dataloader:
        # item[0]: token ids of every sample in the batch, shape (B, L), where L
        # is the padded length used by PandasDatasetTest (max_length=512)
        input_ids = torch.tensor(item[0]).to(device)
        # item[1]: the matching attention masks, shape (B, L)
        attention_mask = torch.tensor(item[1]).to(device)
        result = model(input_ids, attention_mask=attention_mask)
        # With NUM_LABELS == 1 the logits are (B, 1); flatten to B plain floats
        s_results.append(result.logits.reshape(-1).cpu().tolist())
        # Drop the references so the batch tensors can be freed right away
        del input_ids, attention_mask, result

# Concatenate once; an empty dataloader yields an empty prediction vector
y_pred_float = (
    np.concatenate([np.asarray(batch, dtype=float) for batch in s_results])
    if s_results
    else np.array([], dtype=float)
)
assert len(y_pred_float) == len(df_test), "expected one prediction per test row"

# `df` was never defined in this notebook. The predictions come from df_test, so
# build `df` from it without mutating df_test (the cell stays re-runnable).
# np.rint rounds halves to even, the same rule as the built-in round().
df = df_test.assign(
    y_pred=np.rint(y_pred_float).astype("int64"),
    y_pred_float=y_pred_float,
)