# Sk learn classes

Clases para generar un pipeline de sk.

## JSON preprocessor


In [1]:
# dependencies
import json
from functools import reduce
from nltk import word_tokenize

In [2]:

'''
transformer object that turns the data json into arrays of tokenized words.
'''

class JsonTransform():
    
    '''
    add category determines if tag category is added to the final arrays.
    '''
    def __init__(self,add_category=False):
        self.add_category = add_category
        
    '''
    returns arrays of word arrays from tags
    '''    

    
    def fit(self,X=None,y=None):
        return self
    
    
    def transform(self,X):
        return [ self.process_tag_array(instance) for instance in self.data_to_tags(data)]
    
    
    '''
    returns tag as array of words (no ':').
    add_category determines if the tag category is added to the array.
    '''
    def tag_to_words(self,tag):
        tag = tag.lower()
        category, text = tag.split(":")
        if self.add_category:
            return word_tokenize("{} {}".format(category,text).lower())
        return word_tokenize(text.lower())

    '''
    transforms array of tags into array of words.
    add_category determines if the tag categories are added to the text.
    '''
    def process_tag_array(self,tags):
        aux_tags = list(tags)
        aux_tags[0] = self.tag_to_words(tags[0])
        return reduce(lambda x,y : x + self.tag_to_words(y),aux_tags)

    '''
    returns array of tag arrays from original json.
    '''
    def data_to_tags(self,data):
        return [instance["tags"] for instance in filter(lambda x: x["tags"],data)]


In [4]:
# Test
#constants
DATA_PATH="../data.json"
data = json.load(open(DATA_PATH))
json_transformer = JsonTransform(False)
documents = json_transformer.transform(data)

## Json To Tags preprocessor

Similar To JsonTransformer but returns arrays of tags instead of tokenized words

In [10]:
class JsonToTagsTransform(JsonTransform):
    
    def process_tag_array(self,tags):
        return [tag.lower() for tag in tags]

In [11]:
# Test
#constants
DATA_PATH="data.json"
data = json.load(open(DATA_PATH))
json_transformer = JsonToTagsTransform(False)
json_transformer.transform(data)

[['tonalidad:monocromático',
  'tonalidad:unicolor',
  'tonalidad:claros',
  'luminosidad:iluminada',
  'luminosidad:claro',
  'líneas:medio',
  'contraste:bajo',
  'espacialidad:compacta',
  'espacialidad:apretada',
  'estilo:sin estilo',
  'materialidad:melamina',
  'percepción de tamaño:s',
  'configuración:en l',
  'textura:tablero liso',
  'textura:cubierta texturas',
  'cubierta:granito',
  'color cubierta:colores',
  'espesor cubierta:gruesa',
  'visualización:pesada',
  'volumetrías:basal y aéreo',
  'módulos:sin puerta elevable',
  'módulos:harta puerta',
  'módulos:poco cajón',
  'accesorios:tiradores simples',
  'accesorios:simple (bisagras)'],
 ['tonalidad:bicolor franjas horizontales',
  'tonalidad:unicolor',
  'tonalidad:multicolor',
  'tonalidad:clásicos',
  'tonalidad:oscuros',
  'tonalidad:claros',
  'tonalidad:maderas',
  'luminosidad:iluminada',
  'luminosidad:oscuro',
  'contraste:alto',
  'espacialidad:alta',
  'estilo:clásico',
  'estilo:familiar',
  'estilo:tosca

## Tf-Idf Glove Transformer


In [5]:
# dependencies

import numpy as np
from gensim import corpora
from gensim.models import TfidfModel
from gensim.models.keyedvectors import KeyedVectors

In [6]:
'''
Calculates embedding, mapping a tokenized document to a vector.
To calculate the vector we use a weighted average of precomputed Glove Vectors. 
Weights of the average are given by TFIDF scores.
'''

class TfIdfGloveTransformer():
    
    '''
    word_embedder is pretrained gensim.KeyedVectors model
    
    dim is the dimension on word_embedder
    '''
    def __init__(self,word_embedder,dim=300):
        self.word_embedder = word_embedder
        self.dim=dim
        self.word_dict = None
        self.bows = None
        self.tfidf = None
        self.token2id = None
        
    '''
    Fits from corpus of tokenized documents.
    '''
    def fit(self,X,y=None):
        self.word_dict = corpora.Dictionary(X,prune_at=None)
        self.bows = [self.word_dict.doc2bow(doc) for doc in X]
        self.tfidf = TfidfModel(self.bows,normalize=True)
        self.token2id = self.word_dict.token2id
        return self
    
    
    '''
    returns embedding representation of documents in X
    '''
    
    def transform(self,X):
        new_bows = [self.word_dict.doc2bow(doc) for doc in X]
        result = np.zeros((len(X),self.dim))
        # perhaps this can be implemented better in a vectorial way
        for i, (doc,bow) in enumerate(zip(X,new_bows)):
            score_hash = { tup[0]:tup[1] for tup in self.tfidf.__getitem__(bow,-1)} # threshold
            weighted_embeddings = np.array([np.dot(model[word],score_hash[self.token2id[word]]) if word in model else np.zeros((1,self.dim)) for word in doc])
            result[i] = np.sum(weighted_embeddings, axis=0)
        return result
            

In [8]:
# Test
# constants
GLOVE_PATH= "glove-sbwc.i25.vec"
DATA_PATH="../data.json"
vectors = 855380
model=KeyedVectors.load_word2vec_format(GLOVE_PATH,limit=vectors)

In [9]:
data = json.load(open(DATA_PATH))
json_transformer = JsonTransform(False)
documents = json_transformer.transform(data)
tfidf = TfIdfGloveTransformer(model).fit(documents)
result = tfidf.transform(documents)
print(result.shape)

(216, 300)


In [19]:
word_hash = {}
for word in tfidf.word_dict.token2id.keys():
    if word in model:
        word_hash[word] =  model[word]
    else:
        print(word)

(
)
postformado
desaturado


In [23]:
import pickle
with open("word_hash","wb") as file:
    pickle.dump(word_hash, file)

In [29]:
with open("../word_hash","rb") as file:
    word_hash = pickle.load(file)
word_hash["puerta"]

array([ 5.773290e-01,  7.781000e-03, -1.859380e-01, -1.262670e-01,
       -1.129880e-01,  3.057520e-01,  7.133700e-02,  4.095100e-01,
        9.069800e-02, -2.276500e-02,  1.068800e-02,  3.541600e-02,
        3.348530e-01, -1.530170e-01, -1.678720e-01,  1.030600e-01,
        1.791380e-01, -3.583900e-02, -1.279320e-01, -2.860120e-01,
       -2.579240e-01, -5.553100e-02, -2.381400e-02, -2.162200e-02,
       -3.663550e-01,  2.766150e-01,  2.496960e-01,  4.906480e-01,
       -1.022730e-01,  8.008110e-01, -1.527730e-01, -6.266740e-01,
        6.844200e-02, -2.942780e-01, -2.784940e-01, -4.767060e-01,
        6.741700e-02,  1.959190e-01, -3.368140e-01, -3.359520e-01,
        2.660420e-01,  7.222400e-01, -2.985280e-01, -4.650600e-01,
        5.339550e-01, -1.286800e-01, -4.107000e-01,  3.115910e-01,
       -4.419170e-01,  6.064880e-01,  2.871180e-01,  3.168780e-01,
        2.407450e-01,  1.846570e-01,  3.651000e-03,  1.582030e-01,
       -1.785390e-01, -4.708830e-01, -6.253270e-01, -2.215600e

## Tf-Idf LDA Transformer

In [14]:
# dependencies

import numpy as np
from gensim import corpora
from gensim.models import TfidfModel, LdaModel

In [32]:
"""
Generates doc embeddings baed on topic modelling.
Does Tf-Idf transformation and then computes probability distibutions with LDA algorithm.
"""

class LdaTransformer():
    """
    dim: amount of topics to model. aka output vector dimension.
    """
    def __init__(self,dim=20):
        self.dim=dim
        self.word_dict = None
        self.bows = None
        self.tfidf = None
        self.token2id = None
        self.lda = None
    
    def fit(self,X,y=None):
        self.word_dict = corpora.Dictionary(X,prune_at=None)
        self.bows = [self.word_dict.doc2bow(doc) for doc in X]
        self.tfidf = TfidfModel(self.bows,normalize=True)
        self.token2id = self.word_dict.token2id
        self.lda = LdaModel(self.tfidf[self.bows],num_topics=self.dim,minimum_probability=0)
        return self
    
    """
    receives tokenized documents and returns the distribution of each.
    """
    def transform(self,X):
        new_bows = [self.word_dict.doc2bow(doc) for doc in X]
        distributions = np.array(self.lda[self.tfidf[new_bows]])
        return np.reshape(np.delete(distributions,np.s_[:1],2),(len(X),self.dim))

In [16]:
#test
DATA_PATH="data.json"

data = json.load(open(DATA_PATH))
json_transformer = JsonTransform(False)
documents = json_transformer.transform(data)
lda = LdaTransformer().fit(documents)

## kNeighbours

Wrapper of sklearn balltree to put in the pipeline

In [27]:
# dependencies
from sklearn.neighbors import BallTree

In [28]:

'''
wrapper for sklearn BallTree that can be added to a pipeline
'''

class BallTreePredictor():
    
    def __init__(self,k=5):
        self.tree = None
        self.k=k
        
    def set_neighbors(self,k):
        self.k = k
        
    def fit(self,X,y=None):
        self.tree = BallTree(X)
        return self
        
    def predict(self,X):
        return self.tree.query(X,self.k)
    

In [29]:
#test

tree = BallTreePredictor(k=2).fit(result)
tree.predict(result[0:5])

NameError: name 'result' is not defined

## Pipeline Test

In [30]:
from sklearn.pipeline import Pipeline

In [33]:
pipe = Pipeline([("json",JsonToTagsTransform()),("embedder",LdaTransformer()),("tree",BallTreePredictor())])
pipe.fit(data) # fit and predict directly on json files
pipe.predict(data[0])

(array([[8.11908197e-09, 2.07296441e-03, 1.41270945e-02, 8.17489231e-01,
         9.75248843e-01],
        [0.00000000e+00, 2.07296623e-03, 1.62000589e-02, 8.19006014e-01,
         9.76745688e-01],
        [6.03206639e-08, 1.01501406e+00, 1.07125076e+00, 1.13166617e+00,
         1.13304041e+00],
        [1.61327600e-04, 8.07379661e-01, 8.17650552e-01, 8.19167334e-01,
         9.05632820e-01],
        [1.25462667e-07, 1.02403689e+00, 1.07981334e+00, 1.13976369e+00,
         1.14112914e+00],
        [6.71852376e-04, 9.05991325e-01, 9.65776475e-01, 9.75920683e-01,
         9.77417534e-01],
        [0.00000000e+00, 1.41270927e-02, 1.62000589e-02, 8.07218334e-01,
         9.65104707e-01],
        [4.05954099e-09, 1.00560674e+00, 1.06233136e+00, 1.12323865e+00,
         1.13304046e+00]]), array([[0, 1, 6, 3, 5],
        [1, 0, 6, 3, 5],
        [2, 3, 5, 6, 7],
        [3, 6, 0, 1, 5],
        [4, 3, 5, 6, 7],
        [5, 3, 6, 0, 1],
        [6, 0, 1, 3, 5],
        [7, 3, 5, 6, 2]]))