In [None]:
import pandas as pd

In [None]:
# Load the listings table; the first CSV column becomes the row index.
avisos = pd.read_csv('avisos.csv', index_col=0)

In [None]:
# Discard the two columns that are not used anywhere below.
avisos = avisos.drop(columns=['provincia', 'iscurrent'])

In [None]:
# Split the listings in two: sale listings ('Venta') go to `avisos_venta`,
# every other operation type stays in `avisos`.
es_venta = avisos['tipodeoperacion'] == 'Venta'
avisos_venta = avisos[es_venta]
avisos = avisos[~es_venta]

In [None]:
# Standardise the price (z-score) separately inside each of the two frames,
# so each group is centred and scaled with its own mean and standard deviation.
# `assign` returns a new frame instead of writing a column into a frame that was
# obtained by boolean filtering, which is what triggers SettingWithCopyWarning.
avisos = avisos.assign(
    precio_norm=(avisos['precio'] - avisos['precio'].mean()) / avisos['precio'].std()
)
avisos_venta = avisos_venta.assign(
    precio_norm=(avisos_venta['precio'] - avisos_venta['precio'].mean()) / avisos_venta['precio'].std()
)

In [None]:
# Stack the two frames back into a single table (non-sale rows first, then sale rows).
# Row labels are kept as they were; no ignore_index is passed.
avisos = pd.concat([avisos, avisos_venta])

In [None]:
from sklearn.preprocessing import LabelEncoder
categorical_features = ['ciudad', 'tipodeoperacion', 'tipodepropiedad']
# Label-encode every categorical column into a new '<column>_encoded' column and
# keep one {class: code} dictionary per column, in the same order, in `mappings`.
mappings = []
for col in categorical_features:
    le = LabelEncoder()
    avisos[f'{col}_encoded'] = le.fit_transform(avisos[col])
    #avisos[col + '_encoded'] = avisos[col + '_encoded'].astype('category')
    mappings.append({clase: codigo for codigo, clase in enumerate(le.classes_)})

In [None]:
# Replace the row labels with a fresh 0..n-1 index, discarding the old one.
avisos = avisos.reset_index(drop=True)

In [None]:
# Impute missing room counts and total areas with the mean of their own column.
medias = {
    columna: avisos[columna].mean()
    for columna in ('habitaciones', 'metrostotales')
}
avisos = avisos.fillna(value=medias)

In [None]:
# Standardise (z-score) total area and number of rooms into '<column>_norm' columns.
for columna in ('metrostotales', 'habitaciones'):
    valores = avisos[columna]
    avisos[columna + '_norm'] = (valores - valores.mean()) / valores.std()


In [None]:
# Load the table of 2-D city embeddings; the first CSV column becomes the row index.
# NOTE(review): presumably holds a 'ciudad' key plus 't1'/'t2' coordinates, since later
# cells merge on 'ciudad' and rename 't1'/'t2' — confirm against the file.
embeddings_ciudad_2d = pd.read_csv('embeddings_ciudad_2d.csv', index_col = 0)

In [None]:
# Attach the city embedding to every listing (left join on 'ciudad', so all
# listings are kept) and give the two coordinates descriptive names.
avisos = (
    avisos
    .merge(embeddings_ciudad_2d, how='left', on='ciudad')
    .rename(columns={'t1': 'ciudad_x', 't2': 'ciudad_y'})
)

In [None]:
# Keep a single row per listing id: the first occurrence wins.
avisos = avisos.drop_duplicates(subset='idaviso', keep='first')

In [None]:
# Normalise the embeddings (what could go wrong?): z-score each coordinate.
# The original statement assigned the same expression to the same target twice;
# a single assignment is enough.
# NOTE(review): the embeddings were attached with a left merge, so listings whose
# city has no embedding would carry NaN here and stay NaN — confirm and decide how
# to impute before these columns are used as features.
columnas_ciudad = ['ciudad_x', 'ciudad_y']
avisos[columnas_ciudad] = (
    avisos[columnas_ciudad] - avisos[columnas_ciudad].mean()
) / avisos[columnas_ciudad].std()

In [None]:
# Columns that make up each listing's vector, in the order used to build the matrix.
features = [
    'ciudad_x',
    'ciudad_y',
    'tipodeoperacion_encoded',
    'tipodepropiedad_encoded',
    'habitaciones_norm',
    'metrostotales_norm',
    'precio_norm',
]
len(features)

In [None]:
# Weights of the different variables: each listed column is multiplied by its weight.
# A weight of 1 leaves the values as they are.
# Note: the columns are overwritten, so re-running this cell applies the weights again.
pesos = {
    'tipodeoperacion_encoded': 100,
    'precio_norm': 1,
    'ciudad_x': 1,
    'ciudad_y': 1,
}
for columna, peso in pesos.items():
    avisos[columna] = avisos[columna] * peso

In [None]:
# Renumber the rows 0..n-1 so that row i of X and index label i of `avisos`
# refer to the same listing.
avisos = avisos.reset_index(drop=True)
# Select with [] instead of .filter(): .filter(items) silently skips labels that
# are missing, which would yield a narrower matrix than `features` without any
# error. Plain column selection raises KeyError and keeps the order of `features`.
X = avisos[features].to_numpy()

In [None]:
from annoy import AnnoyIndex
#### Annoy
# Build the approximate-nearest-neighbour index over the rows of X.

# n_embeddings: number of vectors; f: length of each vector
n_embeddings, f = X.shape

c = AnnoyIndex(f, 'manhattan')

# Item ids are the row positions in X.
for i, emb in enumerate(X):
    c.add_item(i, emb)
    print('Progress: {}/{}'.format(i, n_embeddings), end='\r')
print('Progress: {}/{} ... ฅ^•ﻌ•^ฅ OK!'.format(n_embeddings, n_embeddings), end='\r')

c.build(100)  # number of ANN trees

In [None]:
def pos_to_itemid(pos):
    """Return the `idaviso` of the row of `avisos` whose index label equals `pos`.

    `.item()` raises ValueError unless exactly one row matches.
    """
    coincidencias = avisos.loc[avisos.index == pos, 'idaviso']
    return coincidencias.item()

def itemid_to_pos(itemid):
    """Return the index label of the row of `avisos` whose `idaviso` equals `itemid`.

    `.item()` raises ValueError unless exactly one row matches.
    """
    es_el_aviso = avisos['idaviso'] == itemid
    return avisos.loc[es_el_aviso].index.item()

def items_similares(itemid, cantidad = 5, remove_original = True):
    """Return the listings most similar to `itemid` according to the Annoy index `c`.

    Parameters:
        itemid: `idaviso` of the query listing.
        cantidad: number of neighbours wanted besides the query listing itself.
        remove_original: when True, the query listing is left out of the result.

    Returns:
        (similar_items, dist): two lists of equal length, where dist[k] is the
        distance of similar_items[k] to the query listing. With
        remove_original=False up to cantidad + 1 entries are returned (the query
        listing included); with remove_original=True at most cantidad.
    """
    pos = itemid_to_pos(itemid)
    # One extra neighbour is requested because the query item is normally returned too.
    posiciones, distancias = c.get_nns_by_item(pos, cantidad + 1, include_distances=True)
    vecinos = list(zip(posiciones, distancias))
    if remove_original:
        # Drop the query item together with its distance so both lists stay
        # aligned. The search is approximate: if the query item was not returned
        # there is nothing to drop, so trim to `cantidad` instead of raising.
        vecinos = [(p, d) for p, d in vecinos if p != pos][:cantidad]
    similar_items = [pos_to_itemid(p) for p, _ in vecinos]
    dist = [d for _, d in vecinos]
    return similar_items, dist

def items_similares_from_features(features, cantidad):
    """Return the `idaviso` of the `cantidad` listings closest to a feature vector.

    Parameters:
        features: query vector; it must have the dimensionality of the Annoy
            index `c`. (The parameter shadows the module-level `features` list.)
        cantidad: number of neighbours to return.

    Returns:
        List of `idaviso` values, nearest first.
    """
    # Only the ids are needed. Asking for distances returns an (ids, distances)
    # pair, and iterating over that pair instead of the ids was the original bug.
    # No extra neighbour is requested: an arbitrary vector has no "original" item
    # to discard from the result.
    posiciones = c.get_nns_by_vector(features, cantidad)
    return [pos_to_itemid(p) for p in posiciones]

In [None]:
# Hand-written query vector with seven values (the same length as `features`).
# In the visible code it is only referenced from a commented-out call to
# items_similares_from_features.
# NOTE(review): presumably the values follow the order of `features` — confirm.
artificial = [-0.086216,-0.453138, 0, 0, 0.182190,-0.116949, -1.172157]

In [None]:
# Pick one listing id at random and display its row.
aviso = avisos['idaviso'].sample(n=1).item()
avisos.loc[avisos['idaviso'] == aviso]

In [None]:
import numpy as np

In [None]:
%time similar, dist = items_similares(itemid=aviso, cantidad = 20, remove_original = False)
#similar = items_similares_from_features(artificial, 20)
res = avisos[avisos['idaviso'].isin(similar)].sort_values(['ciudad', 'habitaciones'])
res[np.array(dist)>-1]

In [None]:
# Show the distances returned by the last items_similares call.
dist

In [None]:
# How many of the neighbours fall in each (city, property type) combination.
res.value_counts(subset=['ciudad', 'tipodepropiedad'])

In [None]:
# Summary statistics of the neighbour table `res`.
res.describe()

In [None]:
# Time the neighbour lookup (default cantidad, query listing kept) for the first 30 listings.
%time avisos.head(30).idaviso.apply(items_similares, remove_original=False)