In [None]:
from sklearn import neighbors
import json
import numpy as np
import pandas as pd
import unidecode

In [None]:
class JSONTransformer:
    """Pipeline step that turns tagged JSON records into a DataFrame.

    Every input record is read through its ``'tags'`` entry, an iterable of
    ``'key:value'`` strings. Each distinct normalised key becomes one column
    of the resulting frame; each record becomes one row.
    """

    @staticmethod
    def clear(string):
        """Normalise one tag fragment (a key or a value).

        The text is lower-cased, stripped, has its spaces replaced by
        underscores and is passed through ``unidecode.unidecode``.

        Returns:
            A ``float`` when the normalised text consists only of digits with
            at most one ``'.'``; otherwise the normalised string.
        """
        string = unidecode.unidecode(string.lower().strip().replace(' ', '_'))

        return float(string) if string.replace('.', '', 1).isdigit() else string

    @staticmethod
    def preprocess_json(json):
        """Convert raw records into a list of ``{key: value}`` dicts.

        Args:
            json: iterable of records; each record must expose ``['tags']``
                as an iterable of ``'key:value'`` strings. (The parameter
                name shadows the ``json`` module inside this method; it is
                kept so keyword callers keep working.)

        Returns:
            One dict per record. Keys and values are normalised with
            ``clear``. When a key occurs more than once in a record, the
            first occurrence wins.

        Raises:
            ValueError: if a tag contains no ``':'`` separator.
        """
        new_json = []

        for instance in json:
            new_instance = {}

            for tag in instance['tags']:
                # Split on the FIRST colon only, so values that themselves
                # contain colons (URLs, times, ...) stay intact instead of
                # breaking the two-way unpacking.
                key, separator, value = tag.partition(':')

                if not separator:
                    raise ValueError(
                        "tag %r is not of the form 'key:value'" % (tag,)
                    )

                key = JSONTransformer.clear(key)

                if key not in new_instance:
                    new_instance[key] = JSONTransformer.clear(value)

            new_json.append(new_instance)

        return new_json

    @staticmethod
    def json_to_df(json):
        """Build a DataFrame with one row per dict and one column per key.

        Args:
            json: sized sequence of dicts, as produced by ``preprocess_json``.

        Returns:
            ``pandas.DataFrame`` whose columns appear in first-seen key order.
            Cells for keys a record does not have are filled from ``None``.
        """
        row_count = len(json)
        df_dict = {}

        for index, instance in enumerate(json):
            for key, value in instance.items():
                column = df_dict.get(key)

                if column is None:
                    # Allocate each column exactly once, on first sight of
                    # its key, pre-filled with None for every row.
                    column = df_dict[key] = [None] * row_count

                column[index] = value

        return pd.DataFrame.from_dict(df_dict)

    def fit(self, X=None, y=None):
        """No-op fit step; returns ``self``."""
        return self

    def transform(self, X=None):
        """Run ``preprocess_json`` followed by ``json_to_df`` on ``X``."""
        preprocessed_json = JSONTransformer.preprocess_json(X)

        return JSONTransformer.json_to_df(preprocessed_json)

In [None]:
def object_cols_to_category(data_frame):
    """Cast every ``object``-dtype column of ``data_frame`` to ``category``.

    The conversion is written back into ``data_frame`` itself, and that same
    frame is returned.
    """
    object_view = data_frame.select_dtypes(include=['object'])

    for name in object_view.columns.values:
        as_category = data_frame[name].astype('category')
        data_frame[name] = as_category

    return data_frame


def category_cols_to_codes(data_frame):
    """Replace every ``category``-dtype column with its ``.cat.codes``.

    The codes are written back into ``data_frame`` itself, and that same
    frame is returned.
    """
    categorical_view = data_frame.select_dtypes(include=['category'])

    for name in categorical_view.columns.values:
        codes = data_frame[name].cat.codes
        data_frame[name] = codes

    return data_frame

In [None]:
class GowerDistance:
    """Callable computing a weighted Gower-style distance between two rows.

    Args:
        cols_hash: mapping from column name to its position in a row vector.
        cat_cols: names of the columns compared with ``cat_dist``.
        con_cols: names of the columns compared with ``con_dist``.
        W_i: per-position weights, indexed like a row vector.
        R_i: per-position value ranges, indexed like a row vector; used to
            normalise the continuous columns.
    """

    def __init__(self, cols_hash, cat_cols, con_cols, W_i, R_i):
        self.cols_hash = cols_hash
        self.cat_cols = cat_cols
        self.con_cols = con_cols
        self.W_i = W_i
        self.R_i = R_i
        # Normalising constant: the sum of ALL weights passed in.
        self.W_i_sum = np.sum(W_i)

    @staticmethod
    def cat_dist(c_j, c_k):
        """Categorical distance: 0 when the two values are equal, else 1."""

        return int(not c_j == c_k)

    @staticmethod
    def con_dist(x_j, x_k, r_i):
        """Continuous distance: absolute difference divided by the range.

        This is a dissimilarity (0 for equal values), matching ``cat_dist``,
        so that two identical rows end up at distance 0. A scalar range of 0
        (constant column) contributes 0 instead of dividing by zero.
        """
        gap = np.absolute(x_j - x_k)

        if np.ndim(r_i) == 0 and r_i == 0:
            # Zeros shaped like ``gap``; the ``[()]`` turns the 0-d case
            # into a plain NumPy scalar.
            return np.zeros(np.shape(gap))[()]

        return np.divide(gap, r_i)

    def __call__(self, X_j, X_k):
        """Return the weighted mean per-column distance between two rows.

        Args:
            X_j, X_k: row vectors indexable by the positions in ``cols_hash``.

        Returns:
            Sum of ``weight * per-column distance`` over the categorical and
            continuous columns, divided by the sum of all weights.
        """
        distance = 0

        for col in self.cat_cols:
            position = self.cols_hash[col]
            distance += self.W_i[position] * GowerDistance.cat_dist(
                X_j[position], X_k[position]
            )

        for col in self.con_cols:
            position = self.cols_hash[col]
            distance += self.W_i[position] * GowerDistance.con_dist(
                X_j[position], X_k[position], self.R_i[position]
            )

        return distance / self.W_i_sum

In [None]:
class BallTreePredictor:
    """Nearest-neighbour lookup backed by a ball tree with a custom metric.

    Args:
        k: number of neighbours returned by ``predict``.
    """

    def __init__(self, k=5):
        self.k = k

        self.tree = None
        # Learned in fit() so predict() can encode raw query frames the same
        # way the training frame was encoded.
        self.columns_ = None      # training column order
        self.categories_ = {}     # categorical column -> its category index

    def fit(self, X, y=None):
        """Encode ``X`` and build the ball tree over it.

        Args:
            X: ``pandas.DataFrame``. Its object columns are converted to
                categories and then to integer codes by the helper functions,
                which write into the frame they are given.
            y: unused.

        Returns:
            ``self``.
        """
        df = object_cols_to_category(X)

        cat_cols = df.select_dtypes(include=['category']).columns.values
        con_cols = df.select_dtypes(include=['float64']).columns.values

        # Remember the vocabulary BEFORE the categories are replaced by codes.
        self.columns_ = list(df.columns)
        self.categories_ = {col: df[col].cat.categories for col in cat_cols}

        df = category_cols_to_codes(df)

        cols_hash = { col: i for i, col in enumerate(df.columns.values) }

        cat_names = set(cat_cols)
        con_names = set(con_cols)

        # Categorical columns weigh .66, every other column weighs 1.
        W_i = [.66 if col in cat_names else 1 for col in cols_hash]
        # Value range of each continuous column; 1 for all others.
        R_i = [np.max(df[col]) - np.min(df[col]) if col in con_names else 1 for col in cols_hash]

        gower_distance = GowerDistance(cols_hash, cat_cols, con_cols, W_i, R_i)
        # Hand the callable straight to BallTree: ``neighbors.DistanceMetric``
        # is no longer available in recent scikit-learn releases, whereas the
        # 'pyfunc' metric with a ``func`` keyword is accepted directly.
        self.tree = neighbors.BallTree(df, metric='pyfunc', func=gower_distance)

        return self

    def _encode_query(self, X):
        """Encode a query DataFrame with the layout learned in ``fit``.

        Columns are reordered to the training order (unknown columns are
        dropped). Categorical columns are mapped to the training codes;
        unseen or missing values get -1. Integer-typed categorical columns
        are taken to be codes already and are left untouched.
        """
        frame = X.reindex(columns=self.columns_)

        for col, categories in self.categories_.items():
            if col not in X.columns:
                frame[col] = -1
            elif not pd.api.types.is_integer_dtype(frame[col]):
                frame[col] = pd.Categorical(frame[col], categories=categories).codes

        return frame

    def predict(self, X):
        """Query the tree for the ``k`` nearest neighbours of each row of ``X``.

        Args:
            X: query rows. A ``pandas.DataFrame`` is first encoded with the
                training layout; anything else is passed to the tree as is.

        Returns:
            Whatever ``BallTree.query(X, k)`` returns.
        """
        if isinstance(X, pd.DataFrame) and self.columns_ is not None:
            X = self._encode_query(X)

        return self.tree.query(X, self.k)

In [None]:
from sklearn.pipeline import Pipeline

# Read the raw records inside a context manager so the file handle is closed
# as soon as parsing finishes.
with open('./../data.json', encoding='utf-8') as data_file:
    raw_json = json.load(data_file)

# Step 'json' turns the records into a DataFrame; step 'tree' indexes it.
pipeline = Pipeline([('json', JSONTransformer()),('tree', BallTreePredictor())])
pipeline.fit(raw_json)

In [None]:
# Use the first raw record as the query: show it, then look up its neighbours.
query_record = raw_json[0]
print(query_record)

pipeline.predict([query_record])