In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Author signature banner ("Made by fvadell").
print("Hecho \n    por\n      fvadell\n          ^•ﻌ•^ฅ♡")

In [None]:
import numpy as np
import pandas as pd
import torch

#### Preparación de datos

In [None]:
# In the end only the leads were used, but both files are still loaded here.
hits_file = '../../Data/hits_ZPAR.csv'
hits = pd.read_csv(hits_file)
leads_file = '../../Data/leads_ZPAR.csv'
leads = pd.read_csv(leads_file)

In [None]:
# The dataset is expected to have these columns:
#['idaviso', 'ciudad', 'provincia', 'precio', 'tipodeoperacion',
#'tipodepropiedad', 'habitaciones', 'metrostotales', 'iscurrent',
#'idusuario', 'lead']

In [None]:
# (rows, columns) of both raw frames.
hits.shape, leads.shape

In [None]:
# Tag every row with its origin: 0 for hits, 1 for leads.
hits['lead'] = 0
leads['lead'] = 1

In [None]:
# Preview the first rows of hits.
hits.head()

In [None]:
# Keep in leads only the columns present in hits, in the same order.
leads = leads[hits.columns]

In [None]:
# Preview the first rows of leads.
leads.head()

In [None]:
# Stack hits and leads into a single frame with a fresh 0..n-1 index.
hits_leads = pd.concat([hits, leads], ignore_index=True)

In [None]:
# (rows, columns) of the stacked frame.
hits_leads.shape

In [None]:
# When a (user, listing) pair appears more than once, keep only the lead:
# sorting by 'lead' descending puts lead rows first, and drop_duplicates
# keeps the first occurrence of each pair.
hits_leads = hits_leads.sort_values('lead', ascending = False).drop_duplicates(['idusuario', 'idaviso'])

In [None]:
# Keep only users with more than 5 and fewer than 50 leads
# (both bounds are exclusive, i.e. 6 to 49 leads).
lead_counts = leads.idusuario.value_counts()
active_users = lead_counts.index[lead_counts.gt(5) & lead_counts.lt(50)]
# .copy() detaches the filtered rows from the original frame so that adding
# columns to `leads` later does not raise pandas' SettingWithCopyWarning.
leads = leads[leads.idusuario.isin(active_users)].copy()
leads.head(1)

In [None]:
# Build the fake leads: copy the real leads and shuffle the user column so
# that listings get paired with other users. 'Match' marks real pairs (1)
# versus shuffled pairs (0).
# A fixed random_state makes the shuffle reproducible across kernel restarts.
# NOTE(review): a shuffled pair may coincide with a real (user, listing) lead,
# which gives that pair contradictory labels; such rows are not filtered here.
false_leads = leads.copy()
false_leads['idusuario'] = false_leads['idusuario'].sample(frac=1, random_state=42).values
leads['Match'] = 1
false_leads['Match'] = 0

In [None]:
# Both frames should have the same shape at this point.
leads.shape, false_leads.shape

In [None]:
# Stack real and fake leads. ignore_index=True gives the result a unique
# 0..n-1 index instead of repeating every original row label twice.
leads = pd.concat([leads, false_leads], ignore_index=True)
del false_leads
leads.shape

In [None]:
# The leads frame now holds both real and fake leads.
# The model will try to tell them apart, i.e. to predict 'Match'.
# Show all rows of one randomly sampled user, real leads first.
userid = leads.idusuario.sample(1).item()
leads[leads['idusuario']==userid].sort_values('Match', ascending = False)

In [None]:
# Summary of the training frame: row count and cardinality of the key columns.
total_rows = leads.shape[0]
unique_users = leads.idusuario.nunique()
unique_listings = leads.idaviso.nunique()
unique_cities = leads.ciudad.nunique()

summary_lines = [
    '-- Caracteristicas de dataset --\n',
    'Total de filas: {}'.format(total_rows),
    'La mitad de las filas son leads falsos (la variable Match está en 0)',
    'Total de usuarios: {}'.format(unique_users),
    'En promedio cada usuario aparece {:.2f} veces'.format(total_rows/unique_users),
    'Total de avisos: {}'.format(unique_listings),
    'En promedio cada aviso aparece {:.2f} veces'.format(total_rows/unique_listings),
    'Total de ciudades: {}'.format(unique_cities),
]
for summary_line in summary_lines:
    print(summary_line)

In [None]:
# Work on an independent copy so that changes to `data` do not touch `leads`.
data = leads.copy()

#### Modelo

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class FeedForwardNN(nn.Module):
  """Feed-forward network for tabular data.

  Each categorical variable goes through its own embedding layer; the
  embeddings are concatenated with the batch-normalised continuous
  variables and passed through a stack of Linear -> ReLU -> BatchNorm ->
  Dropout blocks followed by a linear output layer.
  """

  def __init__(self, emb_dims, no_of_cont, lin_layer_sizes,
               output_size, emb_dropout, lin_layer_dropouts):

    """
    Parameters
    ----------

    emb_dims: List of tuples.
        One tuple per categorical variable. Each tuple holds the number
        of distinct values of the variable and the embedding dimension.

    no_of_cont: Integer
        Number of continuous (numeric) variables.

    lin_layer_sizes: List of integers.
        Output size of each hidden linear layer.

    output_size: Integer
        Size of the output layer.

    emb_dropout: Float
        Dropout probability applied to the concatenated embeddings.

    lin_layer_dropouts: List of floats
        Dropout probability applied after each hidden linear layer.
        Must provide at least one value per entry of lin_layer_sizes.

    Raises
    ------
    ValueError
        If the network would receive no input at all, or if there are
        fewer dropout values than hidden layers.
    """

    super().__init__()

    no_of_embs = sum(emb_dim for _, emb_dim in emb_dims)

    if no_of_embs + no_of_cont == 0:
      raise ValueError(
          "the network needs at least one embedding or continuous input")
    if len(lin_layer_dropouts) < len(lin_layer_sizes):
      # forward() zips layers with dropouts; a short dropout list would
      # silently skip the trailing linear layers.
      raise ValueError(
          "lin_layer_dropouts needs one value per hidden layer: got {} "
          "for {} layers".format(len(lin_layer_dropouts),
                                 len(lin_layer_sizes)))

    # Embedding layers
    self.emb_layers = nn.ModuleList([nn.Embedding(n_values, emb_dim)
                                     for n_values, emb_dim in emb_dims])

    self.no_of_embs = no_of_embs
    self.no_of_cont = no_of_cont

    # Linear Layers
    first_lin_layer = nn.Linear(self.no_of_embs + self.no_of_cont,
                                lin_layer_sizes[0])

    self.lin_layers = nn.ModuleList(
        [first_lin_layer] +
        [nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i + 1])
         for i in range(len(lin_layer_sizes) - 1)])

    for lin_layer in self.lin_layers:
      nn.init.kaiming_normal_(lin_layer.weight)

    # Output Layer
    self.output_layer = nn.Linear(lin_layer_sizes[-1],
                                  output_size)
    nn.init.kaiming_normal_(self.output_layer.weight)

    # Batch Norm Layers
    self.first_bn_layer = nn.BatchNorm1d(self.no_of_cont)
    self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size)
                                    for size in lin_layer_sizes])

    # Dropout Layers
    self.emb_dropout_layer = nn.Dropout(emb_dropout)
    # The attribute name (sic) is kept so existing code that accesses it
    # keeps working.
    self.droput_layers = nn.ModuleList([nn.Dropout(p)
                                        for p in lin_layer_dropouts])

  def forward(self, cont_data, cat_data):
    """Run a forward pass.

    Parameters
    ----------
    cont_data: tensor indexed as (batch, continuous variable); passed
        through the input batch norm when no_of_cont != 0.
    cat_data: integer tensor indexed as (batch, categorical variable);
        column i is looked up in the i-th embedding layer.

    Returns
    -------
    The output of the final linear layer, without any activation.
    """
    parts = []

    if self.no_of_embs != 0:
      # Cast to long so integer codes of any width are valid indices.
      embedded = [emb_layer(cat_data[:, i].long())
                  for i, emb_layer in enumerate(self.emb_layers)]
      parts.append(self.emb_dropout_layer(torch.cat(embedded, 1)))

    if self.no_of_cont != 0:
      parts.append(self.first_bn_layer(cont_data))

    x = parts[0] if len(parts) == 1 else torch.cat(parts, 1)

    for lin_layer, dropout_layer, bn_layer in\
        zip(self.lin_layers, self.droput_layers, self.bn_layers):
      x = F.relu(lin_layer(x))
      x = bn_layer(x)
      x = dropout_layer(x)

    return self.output_layer(x)

In [None]:
# Preview one row of the modelling frame.
data.head(1)

In [None]:
# Names of the categorical input columns and of the target column.
categorical_features = ["idusuario", "idaviso", "ciudad", "tipodeoperacion"]
output_feature = "Match"

In [None]:
# Normalise the price.
# Values above the 90th percentile are capped at it (this may be too aggressive).
# TODO: normalise per operation type for this to be correct.
# .copy() detaches the column subset so the assignments below do not raise
# pandas' SettingWithCopyWarning.
data = data[categorical_features + ['precio'] + ['Match']].copy()
m = data.precio.quantile(.90)
# Vectorised cap. fillna(m) keeps the behaviour of the previous row-wise
# min(m, x), which also mapped missing prices to the cap value.
data['precio'] = data.precio.clip(upper=m).fillna(m)
data['precio'] = (data['precio']-data['precio'].mean())/data['precio'].std()
data.head()

In [None]:
# Histogram of the price column with 30 bins.
data.precio.plot.hist(bins=30)

#### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
# Label-encode every categorical column and keep, for each one, the
# original value -> integer code dictionary in `mappings` (same order as
# categorical_features).
mappings = []
for col in categorical_features:
    encoder = LabelEncoder()
    codes = pd.Series(encoder.fit_transform(data[col]), index=data.index)
    data[col] = codes.astype('category')
    mappings.append({label: code for code, label in enumerate(encoder.classes_)})

#### Creo el Dataset de PyTorch

In [None]:
from torch.utils.data import Dataset, DataLoader


class TabularDataset(Dataset):
  def __init__(self, data, cat_cols=None, output_col=None, device='cpu'):
    """
    Characterizes a Dataset for PyTorch

    Parameters
    ----------

    data: pandas data frame
      The data frame object for the input data. It must
      contain all the continuous, categorical and the
      output columns to be used.

    cat_cols: List of strings
      The names of the categorical columns in the data.
      These columns will be passed through the embedding
      layers in the model. These columns must be
      label encoded beforehand.

    output_col: string
      The name of the output variable column in the data
      provided.

    device: string or torch.device
      Device on which the tensors are stored.
    """

    self.n = data.shape[0]

    if output_col:
      y = data[output_col].astype(np.float32).values.reshape(-1, 1)
    else:
      # No target available: use a zero placeholder.
      y = np.zeros((self.n, 1), dtype=np.float32)
    self.y = torch.as_tensor(y, dtype=torch.float32).to(device)

    self.cat_cols = cat_cols if cat_cols else []
    # Every column that is neither categorical nor the target is continuous.
    self.cont_cols = [col for col in data.columns
                      if col not in self.cat_cols + [output_col]]

    if self.cont_cols:
      cont_X = data[self.cont_cols].astype(np.float32).values
    else:
      cont_X = np.zeros((self.n, 1), dtype=np.float32)
    self.cont_X = torch.as_tensor(cont_X, dtype=torch.float32).to(device)

    if self.cat_cols:
      cat_X = data[self.cat_cols].astype(np.int64).values
    else:
      cat_X = np.zeros((self.n, 1), dtype=np.int64)
    # Build the integer tensor directly. Going through a float32 tensor
    # first rounds codes above 2**24 and would index the wrong embedding.
    self.cat_X = torch.as_tensor(cat_X, dtype=torch.long).to(device)

  def __len__(self):
    """Number of rows in the dataset."""
    return self.n

  def __getitem__(self, idx):
    """
    One sample of data as [target, continuous features, categorical codes].
    """
    return [self.y[idx], self.cont_X[idx], self.cat_X[idx]]

In [None]:
# Preview the first rows of the modelling frame.
data.head()

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
# Wrap the frame in a TabularDataset whose tensors live on `device`.
dataset = TabularDataset(data=data, cat_cols=categorical_features,output_col=output_feature, device=device)

#### Dataloader

In [None]:
# Shuffled mini-batches of 2**8 = 256 rows.
batchsize = 2**8
dataloader = DataLoader(dataset, batchsize, shuffle=True)

In [None]:
# Embedding sizes, capped at 100.
# The (x + 1) // 2 rule of thumb was taken from somewhere (source not recorded).
# Each entry of emb_dims is (number of distinct values, embedding dimension).
cat_dims = [int(data[col].nunique()) for col in categorical_features]
emb_dims = [(x, min(100, (x + 1) // 2)) for x in cat_dims]
emb_dims

In [None]:
# One continuous input, two hidden layers (50 and 100 units) and a single
# output unit; the model is moved to `device`.
model = FeedForwardNN(emb_dims, no_of_cont=1, lin_layer_sizes=[50, 100],
                          output_size=1, emb_dropout=0.04,
                          lin_layer_dropouts=[0.001,0.01]).to(device)

In [None]:
def train(model, epochs=20, lr = 0.01, loader=None):
    """Train `model` with Adam and mean-squared-error loss.

    Parameters
    ----------
    model: module called as model(cont_x, cat_x).
    epochs: number of passes over the data.
    lr: learning rate for Adam.
    loader: iterable yielding (y, cont_x, cat_x) batches. Defaults to the
        notebook-level `dataloader`.

    Prints the sample-weighted mean loss of each epoch (nan when the loader
    yields no batches) and returns None.
    """
    if loader is None:
        loader = dataloader
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Make sure dropout and batch norm run in training mode.
    model.train()
    for epoch in range(epochs):
        running_loss, seen = 0.0, 0
        for y, cont_x, cat_x in loader:

            # Forward Pass
            preds = model(cont_x, cat_x)
            loss = criterion(preds, y)

            # Backward Pass and Optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * len(y)
            seen += len(y)
        # Report the epoch average instead of whatever the last batch gave.
        epoch_loss = running_loss / seen if seen else float('nan')
        print('Epoch {} - Loss {:.5f}'.format(epoch, epoch_loss), end = '\r')
    print('Entrené en {} Epochs.  ฅ^•ﻌ•^ฅ OK!'.format(epochs), end = '\r')

#### Entrenamiento

Nota: no estoy usando un set de validación. Idealmente habría que agregar uno para poder aplicar early stopping.

In [None]:
%time train(model, epochs=10)

In [None]:
# Accuracy on a single batch.
# Evaluate in eval mode and without autograd, so dropout is disabled and
# batch norm uses its running statistics. The previous mode is restored
# afterwards so later training is unaffected.
was_training = model.training
model.eval()
with torch.no_grad():
    y,cont,cat = next(iter(dataloader))
    batch_accuracy = (model(cont, cat).round()==y).sum()/len(y)
model.train(was_training)
batch_accuracy

In [None]:
# Column names next to the categorical codes of the batch drawn above.
categorical_features,cat

In [None]:
def _mapping_for(column):
    """Return the value -> code dictionary of a categorical column.

    The dictionary is looked up by column name, so it stays correct if the
    order of `categorical_features` changes.
    """
    return mappings[categorical_features.index(column)]


def get_user_position(userid):
    """Return the integer code assigned to `userid` by the label encoder."""
    return _mapping_for("idusuario")[userid]


def get_ciudad_position(ciudad):
    """Return the integer code assigned to `ciudad` by the label encoder."""
    return _mapping_for("ciudad")[ciudad]


# Reverse lookups: integer code -> original value.
pos_to_ciudad = {pos: ciudad for ciudad, pos in _mapping_for("ciudad").items()}
pos_to_user_id = {pos: userid for userid, pos in _mapping_for("idusuario").items()}

#### Annoy

Uso Annoy para conseguir las ciudades más similares. También se podría hacer lo mismo con los embeddings de usuarios y de items.

In [None]:
from annoy import AnnoyIndex

# Index the learned city embeddings with Annoy.
# The embedding layer is located by column name, and the vector length `f`
# is read from the weights. A hard-coded 100 fails whenever the city
# embedding is narrower than the cap of 100.
ciudad_layer_idx = categorical_features.index('ciudad')
ciudad_embeddings = model.emb_layers[ciudad_layer_idx].weight.detach().cpu().numpy()
n_embeddings, f = ciudad_embeddings.shape  # f: length of each vector

c = AnnoyIndex(f, 'euclidean')

for i, emb in enumerate(ciudad_embeddings):
    c.add_item(i, emb)
    print('Progress: {}/{}'.format(i,n_embeddings), end = '\r')
print('Progress: {}/{} ... ฅ^•ﻌ•^ฅ OK!'.format(n_embeddings, n_embeddings), end = '\r')

c.build(40) # Number of ANN trees

In [None]:
def ciudades_similares(ciudad, cantidad=4):
    """Return up to `cantidad` city names closest to `ciudad` in the Annoy index.

    The queried city itself is never part of the result.
    """
    pos = get_ciudad_position(ciudad)
    # Ask for one extra neighbour: the query item is normally returned too.
    neighbours = c.get_nns_by_item(pos, cantidad + 1)
    # Filter by position instead of list.remove(): the search is approximate,
    # and remove() raises ValueError when the query item is not in the result.
    return [pos_to_ciudad[n] for n in neighbours if n != pos][:cantidad]

In [None]:
%time ciudades_similares('Núñez')