# scikit-learn classes

Classes for building a scikit-learn pipeline.

## JSON preprocessor


In [2]:
# dependencies
import json
from functools import reduce
from nltk import word_tokenize

In [36]:

class JsonTransform():
    """Transformer that turns the JSON data into arrays of tokenized words.

    Every record is indexed with ``record["tags"]`` and every tag is a
    ``"category:text"`` string. The transformer is stateless: ``fit`` learns
    nothing and ``transform`` only depends on its argument.

    Parameters
    ----------
    add_category : bool, default False
        When True, the tag category is tokenized together with the tag text
        and therefore appears in the resulting word arrays.
    """

    def __init__(self, add_category=False):
        self.add_category = add_category

    def fit(self, X=None, y=None):
        """Return ``self`` unchanged; there is nothing to learn."""
        return self

    def transform(self, X):
        """Return one array of lower-cased word tokens per record in ``X``.

        ``X`` is normally a list of records. A single record (a ``dict``) is
        also accepted and treated as a one-element list, so callers can
        transform one instance without wrapping it themselves.
        """
        if isinstance(X, dict):
            X = [X]
        # Use the argument, not a module-level variable, so the transformer
        # works on whatever data the caller or pipeline hands over.
        return [self.tag_array_to_word_array(tags) for tags in self.data_to_tags(X)]

    def tag_to_words(self, tag):
        """Return the tag as an array of lower-cased words (no ':').

        ``self.add_category`` determines whether the tag category is
        tokenized together with the text.

        Raises
        ------
        ValueError
            If ``tag`` contains no ':' separator.
        """
        # Split on the first ':' only, so a colon inside the text part does
        # not break the unpacking.
        category, text = tag.split(":", 1)
        if self.add_category:
            return word_tokenize("{} {}".format(category, text).lower())
        return word_tokenize(text.lower())

    def tag_array_to_word_array(self, tags):
        """Flatten an array of tags into a single array of words.

        An empty tag array yields an empty word array.
        """
        return [word for tag in tags for word in self.tag_to_words(tag)]

    def data_to_tags(self, data):
        """Return the array of tag arrays taken from the original JSON records."""
        return [instance["tags"] for instance in data]


In [4]:
# Test
# constants
DATA_PATH = "data.json"
# Read the records inside a context manager so the file handle is closed
# as soon as parsing finishes; JSON text is decoded as UTF-8.
with open(DATA_PATH, encoding="utf-8") as data_file:
    data = json.load(data_file)
json_transformer = JsonTransform(False)
documents = json_transformer.transform(data)

## Embedding Transformer


In [5]:
# dependencies

import numpy as np
from gensim import corpora
from gensim.models import TfidfModel
from gensim.models.keyedvectors import KeyedVectors

In [20]:
class TfIdfGloveTransformer():
    """Embed tokenized documents as TF-IDF weighted sums of word vectors.

    Each document vector is the sum, over the tokens of the document, of the
    pretrained word vector multiplied by the token's TF-IDF score in that
    document. The TF-IDF model is built with ``normalize=True``.

    Parameters
    ----------
    word_embedder :
        Pretrained ``gensim`` ``KeyedVectors`` model; it is used through
        ``word in word_embedder`` and ``word_embedder[word]``.
    dim : int, default 300
        Dimension of the vectors returned by ``word_embedder``.
    """

    def __init__(self, word_embedder, dim=300):
        self.word_embedder = word_embedder
        self.dim = dim
        # Learned in fit(); the constructor must not depend on any corpus.
        self.word_dict = None
        self.bows = None
        self.tfidf = None
        self.token2id = None

    def fit(self, X, y=None):
        """Fit the dictionary and the TF-IDF model on tokenized documents.

        ``X`` is a sequence of documents, each document a sequence of tokens.
        Returns ``self``.
        """
        # Build the vocabulary from the training corpus itself so the
        # dictionary and the TF-IDF statistics always describe the same data.
        self.word_dict = corpora.Dictionary(X, prune_at=None)
        self.bows = [self.word_dict.doc2bow(doc) for doc in X]
        self.tfidf = TfidfModel(self.bows, normalize=True)
        self.token2id = self.word_dict.token2id
        return self

    def transform(self, X):
        """Return an array of shape ``(len(X), dim)`` with one row per document.

        Tokens unknown to the fitted dictionary or missing from
        ``word_embedder`` contribute nothing, so a document made only of
        such tokens (or an empty document) maps to the zero vector.
        """
        result = np.zeros((len(X), self.dim))
        for i, doc in enumerate(X):
            bow = self.word_dict.doc2bow(doc)
            # eps=-1 disables the weight threshold, so every term of the
            # bag-of-words keeps its score.
            weights = dict(self.tfidf.__getitem__(bow, eps=-1))
            # NOTE(review): iterating the tokens (not the unique terms) means
            # a repeated token is added once per occurrence, on top of the
            # term frequency already contained in its TF-IDF score.
            for word in doc:
                token_id = self.token2id.get(word)
                if token_id is None or word not in self.word_embedder:
                    continue
                result[i] += weights.get(token_id, 0.0) * np.asarray(self.word_embedder[word])
        return result
            

In [11]:
# Test
# constants
GLOVE_PATH = "glove-sbwc.i25.vec"
# Upper bound on how many vectors are read from the embeddings file.
vectors = 855380
model = KeyedVectors.load_word2vec_format(
    GLOVE_PATH,
    limit=vectors,
)

In [22]:
# Fit the embedder on the tokenized documents, then embed those same documents.
tfidf = TfIdfGloveTransformer(model)
tfidf.fit(documents)
result = tfidf.transform(documents)
print(result.shape)

(5, 300)


array([-1.20531225e-01, -1.32767827e-01, -2.40587811e-01, -4.50649503e-01,
        5.71917176e-01,  7.97919041e-01,  2.06611624e-01, -2.39433183e-01,
       -3.97522111e-01,  6.60949659e-01, -4.01172718e-02, -3.52866864e-01,
       -3.64130336e-01, -1.29405238e-01, -6.71148494e-01,  3.34426119e-01,
       -3.54335664e-01, -4.58082952e-02,  4.45215941e-01,  2.09159656e-01,
        6.29068147e-01, -4.20855797e-01, -3.75286537e-01,  1.77885970e-01,
        9.77248454e-01,  1.76781183e-02,  4.89480383e-01,  7.42718880e-01,
       -5.76446240e-02,  3.31525399e-01, -2.88255258e-01, -2.26693455e-02,
        5.21837659e-01, -1.96507813e-01, -5.47155233e-01,  1.25160180e-02,
       -2.66761573e-01,  1.52969447e-01, -6.37570491e-01, -5.42342140e-02,
        6.28757097e-01,  7.46052316e-01, -1.88470703e-01,  3.21771419e-02,
        5.75350987e-02, -1.38833109e-01,  1.68983996e-01, -2.25347807e-01,
       -7.10564686e-01,  3.80240281e-01, -3.97450305e-01,  3.92858050e-02,
        1.96420556e-01, -

## kNeighbours

Wrapper around scikit-learn's `BallTree` so that it can be used as a step in the pipeline.

In [25]:
# dependencies
from sklearn.neighbors import BallTree

In [28]:

class BallTreePredictor():
    """Wrapper for sklearn ``BallTree`` that can be added to a pipeline.

    Parameters
    ----------
    k : int, default 5
        Number of neighbors requested from the tree on every ``predict``.
    """

    def __init__(self, k=5):
        self.k = k
        # No tree exists until fit() is called.
        self.tree = None

    def set_neighbors(self, k):
        """Change the number of neighbors returned by later ``predict`` calls."""
        self.k = k

    def fit(self, X, y=None):
        """Build the ball tree over the rows of ``X`` and return ``self``."""
        self.tree = BallTree(X)
        return self

    def predict(self, X):
        """Query the fitted tree for the ``k`` nearest neighbors of each row of ``X``.

        Returns whatever ``BallTree.query`` returns for these arguments.
        """
        neighbors = self.tree.query(X, k=self.k)
        return neighbors
    

In [33]:
# test
# Index the embedded documents, then look up the 2 nearest neighbors of the
# first five rows.
tree = BallTreePredictor(k=2)
tree.fit(result)
tree.predict(result[:5])

(array([[0.        , 7.66705467],
        [0.        , 8.68959395],
        [0.        , 7.78825885],
        [0.        , 7.66705467],
        [0.        , 7.78825885]]), array([[0, 3],
        [1, 3],
        [2, 4],
        [3, 0],
        [4, 2]]))

## Pipeline Test

In [34]:
from sklearn.pipeline import Pipeline

In [37]:
# Chain the three steps: JSON records -> token arrays -> embeddings -> neighbors.
pipe = Pipeline([
    ("json", JsonTransform()),
    ("embedder", TfIdfGloveTransformer(model)),
    ("tree", BallTreePredictor()),
])
pipe.fit(data)  # fit directly on the loaded JSON records
# predict takes a collection of records, just like fit, so the single record
# is wrapped in a list instead of being passed bare.
pipe.predict([data[0]])