In [1]:
import pandas as pd
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
import os

import torch
import pandas as pd
from sklearn.model_selection import train_test_split

# Clase para cargar el tokenizador
from transformers import AutoTokenizer
# Clase para secuencias a clasificación (N -> 1)
from transformers import AutoModelForSequenceClassification

# clases para entrenamiento
from transformers import Trainer, TrainingArguments

# Numpy
import numpy as np

# operative system
import os


In [61]:
from math import log

# Natural logarithm of 130 million.
# NOTE(review): presumably a sanity check of the magnitude that the log-price
# bucketing further down produces for a large price; confirm or delete.
log(130_000_000)

18.683045008419857

In [2]:
# Folder holding the raw scraped JSON files (one DataFrame per file).
DATA_FOLDER="../data/economicos/data_cruda"
# os.listdir returns entries in arbitrary order. Sorting the names makes the
# row order of the concatenated frame deterministic, which matters because the
# later drop_duplicates(subset='url') keeps whichever duplicate comes first.
df = pd.concat([pd.read_json(f"{DATA_FOLDER}/{imgfile}") for imgfile in sorted(os.listdir(DATA_FOLDER))])
# Persist the consolidated frame so later runs can start from a single file.
df.to_parquet("../data/economicos/join.parquet")


In [6]:
# Path of the consolidated parquet file that the loading cell reads.
DATA_FILE = "../data/economicos/join.parquet"

In [7]:
# Load the consolidated listings and give the Spanish source headers short
# English names. Columns already named in English (url, description, price,
# title, address, images) keep their names, so they need no mapping entry.
df = pd.read_parquet(DATA_FILE).rename(
    columns={
        'Tipo:': 'type',
        'Operación:': 'operation',
        'm  construidos:': 'm_built',
        'm  terreno:': 'm_size',
        'Región:': 'state',
        'Comuna:': 'county',
        'Fecha Publicación:': 'date',
        'Diario:': 'source',
        'Dormitorios:': 'rooms',
        'Baños:': 'bathrooms',
    }
)


In [13]:
# Keep only listings from the Santiago Metropolitan Region that are houses or
# apartments offered for rent or sale. The mask has its own name so that the
# builtin `filter` is not shadowed for the rest of the notebook.
stgo_mask = (
    (df.state == 'Metropolitana de Santiago')
    & df['type'].isin(['Departamento', 'Casa', 'Departamento Amoblado'])
    & df['operation'].isin(['Arriendo', 'Venta'])
)
# One row per listing URL, with a fresh 0..n-1 index.
stgo = df[stgo_mask].drop_duplicates(subset='url').reset_index(drop=True)
display(stgo.shape[0])

240448

In [79]:
import re
from math import log


def _log_price_bucket(raw_price):
    """Turn a raw price value into an integer log-scale bucket.

    Every non-digit character is stripped from ``str(raw_price)``; the
    remaining digits are read as one integer (0 when there are none), and the
    result is ``int(log(value + 1) * 10)``.
    """
    digits = re.sub('[^0-9]', '', str(raw_price))
    amount = int('0' + digits)
    return int(log(amount + 1) * 10)


stgo["n_price"] = stgo.price.map(_log_price_bucket)

In [None]:
# Preview the digit-only form of the first few raw prices.
stgo["price"].head().map(lambda raw: re.sub('[^0-9]', '', str(raw)))

In [104]:
# Number of listings per log-price bucket. The counts are kept as a Series and
# filtered directly, so the code does not depend on the column name that
# value_counts() produces (it is "count" from pandas 2.0, "n_price" before).
price_counts = stgo.n_price.value_counts()
# Buckets that occur exactly once are merged into the catch-all bucket 1000;
# the stratified split on n_price that follows needs more than one row per class.
singleton_buckets = price_counts[price_counts == 1].index
stgo.loc[stgo.n_price.isin(singleton_buckets), "n_price"] = 1000

In [105]:
# Stratified 90/10 split on the log-price bucket. A fixed random_state makes
# the split, and the parquet files written from it, reproducible across runs.
df_train, df_test = train_test_split(stgo, test_size=0.1, stratify=stgo.n_price, random_state=42)

In [107]:
# Write both partitions of the split to parquet files in the working directory.
df_train.to_parquet("train_data.parquet")
df_test.to_parquet("test_data.parquet")

In [108]:
# (rows, columns) of the train and test partitions, in that order.
display(df_train.shape, df_test.shape)

(216403, 17)

(24045, 17)

In [None]:
# Description text and room count of the first listing.
display(stgo["description"].iloc[0], stgo["rooms"].iloc[0])

In [None]:
# How many listings report fewer than 1 or more than 6 rooms.
display(stgo[(stgo.rooms < 1) | (stgo.rooms > 6)].shape[0])
# Keep listings whose room count lies in the closed range [1, 6].
stgo_rooms = stgo[stgo.rooms.between(1, 6)]
display(stgo_rooms.shape[0])

In [None]:
display(stgo_rooms.rooms.agg(['min', 'max']))
# Min-max scale the room count into [0, 1]. assign() returns a new frame, so
# the column is not written onto a slice of `stgo` (the original in-place
# assignment triggers pandas' SettingWithCopyWarning).
rooms_min, rooms_max = stgo_rooms.rooms.min(), stgo_rooms.rooms.max()
stgo_rooms = stgo_rooms.assign(n_rooms=(stgo_rooms.rooms - rooms_min) / (rooms_max - rooms_min))

display(stgo_rooms.n_rooms.agg(['min', 'max']))

In [None]:
# Choose the base checkpoint and build its tokenizer for experimentation.
model_name = "dccuchile/bert-base-spanish-wwm-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Show the vocabulary size, then the special-token map.
display(tokenizer.vocab_size, tokenizer.special_tokens_map)

In [None]:
# Switch: when True the trainer cell runs training; when False the inference
# cell loads the weights stored at MODEL_PATH instead.
TRAIN_MODE = True
# State-dict file loaded by the inference cell when TRAIN_MODE is False.
MODEL_PATH="best_rooms/15282-1.34102135/pytorch_model.bin"
# Optional state-dict loaded right after the model is created; None skips it.
# Example: SNAPSHOT="results/checkpoint-45846/pytorch_model.bin"
SNAPSHOT=None
# Number of training epochs passed to TrainingArguments.
TRAIN_EPOCHS=20
# Use CUDA when a GPU is available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Build the train/test partitions for the rooms model: a stratified 90/10 split
# on the room count. A fixed random_state keeps the split, and every metric
# computed on df_test, reproducible across runs.
df_train, df_test = train_test_split(stgo_rooms, test_size=0.1, stratify=stgo_rooms.rooms, random_state=42)

In [None]:
# Give both partitions a fresh 0..n-1 index; the dataset classes below look
# rows up by that position.
df_train, df_test = (part.reset_index(drop=True) for part in (df_train, df_test))

In [None]:
# Longest description in the training partition, measured in characters.
df_train["description"].map(len).max()

In [None]:
# Minimal map-style dataset: __init__, __len__ and __getitem__ are all that is
# needed, because consumers simply index and iterate over it.
class PandasDataset(torch.utils.data.Dataset):
  """Serve tokenized text/label pairs taken from two DataFrame columns.

  Args:
    df: source DataFrame.
    x: name of the column holding the input text.
    y: name of the column holding the target value.
    tokenizer: callable invoked as
      ``tokenizer(text, truncation=True, padding="max_length", max_length=...)``
      and returning a mapping of model inputs.
    max_length: padding/truncation length handed to the tokenizer.
  """

  def __init__(self, df, x, y, tokenizer, max_length=300):
    self.x = df[x]
    self.y = df[y]
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    """Return the number of rows served by the dataset."""
    return len(self.x)

  def __getitem__(self, ix):
    """Return the tokenizer output for row ``ix`` plus ``label`` and ``text``.

    Rows are looked up by position (``.iloc``), so the dataset works for any
    DataFrame index, not only a freshly reset 0..n-1 index.
    """
    text = self.x.iloc[ix]
    encoded = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_length)
    return {**encoded, "label": self.y.iloc[ix], "text": text}

In [None]:
# Build the sequence-classification model on top of the base checkpoint named
# by model_name. A single output label is used: the model emits one continuous
# value per text, which later cells round to a room count.
num_labels = 1
model = (AutoModelForSequenceClassification
         .from_pretrained(model_name, num_labels=num_labels)
         .to(device))
if SNAPSHOT:
    # map_location lets a snapshot saved on a GPU load on a CPU-only machine
    # (and the other way round) instead of failing while deserializing.
    model.load_state_dict(torch.load(SNAPSHOT, map_location=device))

In [None]:
# Training hyper-parameters.
BATCH_SIZE = 40
# Number of full batches in the training frame; used as the logging interval
# (roughly once per epoch on a single device).
logging_steps = len(df_train) // BATCH_SIZE
training_args = TrainingArguments(
    output_dir="results",
    # Optimisation
    num_train_epochs=TRAIN_EPOCHS,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and checkpoint every epoch, then reload the best checkpoint.
    # (metric_for_best_model="f1" was tried and left disabled.)
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Reporting
    logging_steps=logging_steps,
    disable_tqdm=False,
)

In [None]:
# Wire the model, the arguments and both datasets (description text -> rooms)
# into a Trainer; training only runs when TRAIN_MODE is set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=PandasDataset(df_train, "description", "rooms", tokenizer),
    eval_dataset=PandasDataset(df_test, "description", "rooms", tokenizer),
)
if TRAIN_MODE:
    trainer.train();

results/checkpoint-4284

In [None]:
def collate_fn(batch):
    """Transpose a batch of per-item tuples into one tuple per field.

    ``[(a0, b0), (a1, b1)]`` becomes ``((a0, a1), (b0, b1))``; an empty batch
    yields an empty tuple.
    """
    fields = zip(*batch)
    return tuple(fields)

# Dataset similar to the training one, but dedicated to prediction: it yields
# only the model inputs, not the label.
class PandasDatasetTest(torch.utils.data.Dataset):
  """Serve ``(input_ids, attention_mask)`` pairs for the texts of a DataFrame.

  Args:
    df: source DataFrame.
    x: name of the column holding the input text.
    y: name of the target column (stored on the instance, not returned).
    tokenizer: callable invoked as
      ``tokenizer(text, truncation=True, padding="max_length", max_length=...)``
      and returning a mapping with ``input_ids`` and ``attention_mask``.
    max_length: padding/truncation length handed to the tokenizer.
      NOTE(review): the training dataset in this notebook tokenizes with
      max_length=300 while this default is 512; confirm that is intended.
  """

  def __init__(self, df, x, y, tokenizer, max_length=512):
    self.x = df[x]
    self.y = df[y]
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    """Return the number of rows served by the dataset."""
    return len(self.x)

  def __getitem__(self, ix):
    """Return ``(input_ids, attention_mask)`` for the row at position ``ix``.

    Rows are looked up by position (``.iloc``), so the dataset works for any
    DataFrame index, not only a freshly reset 0..n-1 index.
    """
    data = self.tokenizer(self.x.iloc[ix], truncation=True, padding="max_length", max_length=self.max_length)
    return data["input_ids"], data["attention_mask"]

# Batched loader over the test partition; collate_fn regroups each batch into
# one tuple of input_ids and one tuple of attention masks.
test_dataloader = torch.utils.data.DataLoader(
    PandasDatasetTest(df_test, "description", "rooms", tokenizer),
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
)

In [None]:
history_models = {}

# When not training in this session, restore previously saved weights.
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
if not TRAIN_MODE:
  model.load_state_dict(torch.load(MODEL_PATH, map_location=device))

# Switch the model to evaluation mode.
model.eval()

# One list of float predictions per batch.
s_results = []
# no_grad stops autograd from recording the forward pass, so no graph is kept
# in memory during inference (model.eval() alone does not do that).
with torch.no_grad():
  # Iterate over the batches of the test dataset.
  for item in test_dataloader:
    # Batch of token ids, shape (B, L) where L is the padded length produced
    # by the test dataset.
    input_ids = torch.tensor(item[0]).to(device)
    # Matching attention masks, same shape.
    attention_mask = torch.tensor(item[1]).to(device)
    # Forward pass.
    result = model(input_ids, attention_mask=attention_mask)
    s_results.append([float(x) for x in result.logits])
    # Drop the references right away to release memory between batches.
    del input_ids
    del attention_mask
    del result



In [None]:
df = df_test

# Flatten the per-batch predictions once; store both the raw value and its
# rounded room count on the test frame.
flat_predictions = np.concatenate([np.array(batch) for batch in s_results])
df["y_pred"] = [round(value) for value in flat_predictions]
df["y_pred_float"] = flat_predictions
# Count matrix of actual rooms (rows) against rounded prediction (columns),
# for the labels 1..7.
x_labels = list(range(1, 8))
y_labels = list(range(1, 8))
matrix = np.zeros((len(x_labels), len(y_labels)))
rounded_prediction = round(df.y_pred_float)
for actual in x_labels:
  for predicted in y_labels:
    hits = (df.rooms == actual) & (rounded_prediction == predicted)
    matrix[actual - 1, predicted - 1] = hits.sum()
# Show the matrix.
display(pd.DataFrame(matrix, index=x_labels, columns=y_labels))

In [None]:
# Test section: re-run the model on one listing whose rounded prediction does
# not match its room count (the 7th such row) and show the details.
record = df_test[(df_test.rooms != df_test.y_pred_float.round())].iloc[6]
TEXTO_PRUEBA=record.description
# Tokenize with truncation so a long description cannot exceed the 512-token
# length used for the test dataset, and read the fields by name rather than
# relying on the order of the tokenizer's output values.
encoded = tokenizer(TEXTO_PRUEBA, truncation=True, max_length=512)
# Add the batch dimension and move both tensors to the selected device.
input_ids = torch.tensor(encoded["input_ids"]).unsqueeze(0).to(device)
attention_mask = torch.tensor(encoded["attention_mask"]).unsqueeze(0).to(device)
# Inference only: no autograd graph is needed.
with torch.no_grad():
    result = model(input_ids, attention_mask=attention_mask)

print("="*30)
display(record.description)
display(record.url)
print("="*30)
display(result)
print(f"y={record.rooms}")
print(f"y_pred={record.y_pred_float}")
print("="*30)
# Release the tensors.
del result
del input_ids
del attention_mask

In [None]:
# Share of test listings whose rounded prediction equals the real room count.
int((df_test.rooms == df_test.y_pred_float.round()).sum()) / len(df_test)

In [None]:
# Save every mispredicted listing to CSV, then show the key columns of the
# mismatches inline.
df_test.loc[df_test.rooms != df_test.y_pred].to_csv("no_match.csv", index=False)
df_test.loc[
    df_test.rooms != df_test.y_pred_float.round(),
    ['description', 'rooms', 'y_pred', 'y_pred_float'],
]

In [None]:
# Description text of the first mispredicted listing.
df_test.loc[df_test.rooms != df_test.y_pred_float.round(), "description"].iloc[0]