In [2]:
import numpy as np
import pandas as pd

# JSONTransformer

In [3]:
from unidecode import unidecode


class JSONTransformer:
    @staticmethod
    def clear(string):
        string = unidecode(string.lower().strip().replace(' ', '_'))

        return float(string) if string.replace('.', '', 1).isdigit() else string

    @staticmethod
    def preprocess_json(json):
        new_json = []

        for instance in filter(lambda i: i['tags'] is not None, json):
            new_instance = { 'id': str(instance['id']) }
            
            for (key, value) in map(lambda t: t.split(':'), instance['tags']):
                key = JSONTransformer.clear(key)

                if key not in new_instance:
                    new_instance[key] = JSONTransformer.clear(value)

            new_json.append(new_instance)

        return new_json

    @staticmethod
    def json_to_df(json):
        df_dict = {
            key: [None] * len(json)
            for instance in json
            for key in instance
        }

        for index, instance in enumerate(json):
            for key, value in instance.items():
                df_dict[key][index] = value

        return pd.DataFrame.from_dict(df_dict)

    def fit(self, X=None, y=None):
        return self
    
    def transform(self, X=None):
        preprocessed_json = JSONTransformer.preprocess_json(X)
        
        return JSONTransformer.json_to_df(preprocessed_json)

## Functions to transform a data frame

Probably this functions must be part of a pipeline transformer, but I haven't figured how to.

In [51]:
def object_cols_to_category(data_frame):
    # Set <object> dtype columns to <category>
    
    category_cols = data_frame.select_dtypes(include=['object']).columns.values
    
    for col in category_cols:
        data_frame[col] = data_frame[col].astype('category')

    return data_frame


def category_cols_to_codes(data_frame):
    # Set <category> dtype columns values to <int8>
    
    category_cols = data_frame.select_dtypes(include=['category']).columns.values
    
    for col in category_cols:
        cat_code_mapping = {
            cat: code
            for cat, code in zip(data_frame[col].cat.categories, data_frame[col].cat.codes)
        }
        
        print(col, list(zip(data_frame[col].cat.categories.values, data_frame[col].cat.codes)), cat_code_mapping, data_frame[col].cat.categories.values, data_frame[col].cat.codes)
        
        data_frame[col] = data_frame[col].cat.codes
    
    return data_frame

# Gower distance

In [52]:
class GowerDistance:
    def __init__(self, cols_hash, cat_cols, con_cols, W_i, R_i):
        self.cols_hash = cols_hash
        self.cat_cols  = cat_cols
        self.con_cols  = con_cols
        self.W_i       = W_i
        self.R_i       = R_i
        self.W_i_sum   = np.sum(W_i)  # Micro-optimization
    
    @staticmethod
    def cat_dist(c_j, c_k):
        # Categorical distance function

        return int(not c_j == c_k)

    @staticmethod
    def con_dist(x_j, x_k, r_i):
        # Continuous distance function

        return 1 - np.divide(np.absolute(x_j - x_k), r_i)
    
    def __call__(self, X_j, X_k):
        distance = 0

        for col in self.cat_cols:
            distance += np.dot(self.W_i[self.cols_hash[col]], GowerDistance.cat_dist(X_j[self.cols_hash[col]], X_k[self.cols_hash[col]]))

        for col in self.con_cols:
            distance += np.dot(self.W_i[self.cols_hash[col]], GowerDistance.con_dist(X_j[self.cols_hash[col]], X_k[self.cols_hash[col]], self.R_i[self.cols_hash[col]]))

        return distance / self.W_i_sum

In [53]:
class DataFrameTransformer:
    def __init__(self):
        self.cat_cols = None
        self.con_cols = None
    
    def fit(self, X, y=None):
        df = object_cols_to_category(X)

        self.cat_cols = df.select_dtypes(include=['category']).columns.values
        self.con_cols = df.select_dtypes(include=['float64']).columns.values

        df = category_cols_to_codes(df)

    def transform(self, X=None):
        pass

# BallTree

In [54]:
from sklearn.neighbors import BallTree, DistanceMetric


class BallTreePredictor:
    def __init__(self, k=5):
        self.k = k
        
        self.tree = None

    def fit(self, X, y=None):
        # Acá debería guardar los posibles valores de las variables categóricas.
        
        df = object_cols_to_category(X)

        cat_cols = df.select_dtypes(include=['category']).columns.values
        con_cols = df.select_dtypes(include=['float64']).columns.values

        df = category_cols_to_codes(df)

        cols_hash = { col: i for i, col in enumerate(df.columns.values) }

        W_i = [.66 if col in cat_cols else 1 for col in cols_hash]
        R_i = [np.max(df[col]) - np.min(df[col]) if col in con_cols else 1 for col in cols_hash]

        gower_distance = GowerDistance(cols_hash, cat_cols, con_cols, W_i, R_i)
        metric         = DistanceMetric.get_metric('pyfunc', func=gower_distance)
        self.tree      = BallTree(df, metric=metric)
        
        return self
    
    def predict(self, X):
        # Esta parte está mala debido a que no se está teniendo en cuenta todos
        # los otros posibles valores de las características categóricas.
        
        X = object_cols_to_category(X)
        X = category_cols_to_codes(X)
        
        return self.tree.query(X, self.k, return_distance=False)

# Creating pipeline

In [55]:
import json as js
from sklearn.pipeline import Pipeline


raw_json = js.load(open('./../data.json', encoding='utf-8'))

pipeline = Pipeline([('json', JSONTransformer()),('tree', BallTreePredictor())])
pipeline.fit(raw_json)

id [('10', 1), ('3', 2), ('4', 4), ('5', 3), ('6', 6), ('7', 0), ('8', 5), ('9', 7)] {'10': 1, '3': 2, '4': 4, '5': 3, '6': 6, '7': 0, '8': 5, '9': 7} ['10' '3' '4' '5' '6' '7' '8' '9'] 0    1
1    2
2    4
3    3
4    6
5    0
6    5
7    7
dtype: int8
tonalidad [('bicolor_franjas_horizontales', 2), ('bicolor_franjas_verticales', 0), ('monocromatico', 2), ('unicolor', 1)] {'bicolor_franjas_horizontales': 2, 'bicolor_franjas_verticales': 0, 'monocromatico': 2, 'unicolor': 1} ['bicolor_franjas_horizontales' 'bicolor_franjas_verticales'
 'monocromatico' 'unicolor'] 0    2
1    0
2    2
3    1
4    0
5    3
6    2
7    0
dtype: int8
luminosidad [('iluminada', 0), ('oscuro', 0)] {'iluminada': 0, 'oscuro': 0} ['iluminada' 'oscuro'] 0    0
1    0
2    1
3    0
4    0
5    0
6    0
7    1
dtype: int8
lineas [('compleja', 1), ('medio', -1), ('simple', 2)] {'compleja': 1, 'medio': -1, 'simple': 2} ['compleja' 'medio' 'simple'] 0    1
1   -1
2    2
3    0
4    2
5    2
6    2
7    1
dtype: int8


Pipeline(memory=None,
     steps=[('json', <__main__.JSONTransformer object at 0x0B8CAC70>), ('tree', <__main__.BallTreePredictor object at 0x0B8EB810>)])

# Testing pipeline

In [9]:
test = {
    "id": 10,
    "project_id": 5838,
    "photo_id": 11807,
    "tags": [
      "Tonalidad:Unicolor",
      "Tonalidad:Claros",
      "Tonalidad:Cálidos",
      "Tonalidad:Cenizas",
      "Tonalidad:Maderas",
      "Luminosidad:Iluminada",
      "Luminosidad:Claro",
      "Líneas:Simple",
      "Contraste:Bajo",
      "Espacialidad:Aireada (despejada)",
      "Estilo:Vanguardista",
      "Estilo:New rich",
      "Estilo:Minimalista",
      "Estilo:Metálicos",
      "Estilo:Pulcro",
      "Materialidad:Melamina",
      "Percepción de tamaño:XL",
      "Percepción de tamaño:M",
      "Configuración:En L",
      "Textura:Tablero textura",
      "Textura:Cubierta color liso",
      "Cubierta:Cuarzo",
      "Color cubierta:Grises",
      "Espesor cubierta:Gruesa",
      "Visualización:Pesada",
      "Volumetrías:Basal y aéreo",
      "Módulos:Harta puerta",
      "Módulos:Poco cajón",
      "Accesorios:Sin tiradores",
      "Accesorios:Simple (bisagras)"
    ],
    "created_at": "2018-08-27 16:18:43.499832",
    "updated_at": "2018-09-01 20:19:52.274711",
    "technical_tags": {
      "type": "Cocina",
      "zones": [
        "Santiago Oriente"
      ],
      "equipment": "",
      "door_count": "21",
      "drawer_count": "6",
      "countertop_type": "Cuarzo",
      "dimension_basal": "7",
      "estimated_price": "7000000",
      "dimension_aerial": "5",
      "countertop_design": "Melamina Olmo Alpino",
      "dimension_countertop": "4"
    },
    "numerical_tags": {
      "Ejecución": 3,
      "Diseño y creatividad": 4,
      "Exigencias técnicas": 4,
      "Valor casa": 12000,
      "Precio estimado": 1
    },
    "photo_url": "https://res.cloudinary.com/hylemqjoq/image/upload/v1515173720/vsgjz5y70rj7nfasmoqd.jpg"
}

prediction = pipeline.predict([test])

col: 0    0
dtype: int8
col: 0    0
dtype: int8
col: 0    0
dtype: int8
col: 0    0
dtype: int8
col: 0    0
dtype: int8
col: 0    0
dtype: int8
col: 0    0
dtype: int8
col: 0    0
dtype: int8
col: 0    0
dtype: int8
col: 0    0
dtype: int8
col: 0    0
dtype: int8
col: 0    0
dtype: int8
col: 0    0
dtype: int8
col: 0    0
dtype: int8
col: 0    0
dtype: int8
col: 0    0
dtype: int8
col: 0    0
dtype: int8
col: 0    0
dtype: int8


In [33]:
prediction

array([[4, 5, 1, 7, 3]], dtype=int32)