# Sk learn classes

Clases para generar un pipeline de sk.

## JSON preprocessor


In [10]:
# dependencies
import json
from functools import reduce
from nltk import word_tokenize

In [11]:

'''
transformer object that turns the data json into arrays of tokenized words.
'''

class JsonTransform():
    
    '''
    add category determines if tag category is added to the final arrays.
    '''
    def __init__(self,add_category=False):
        self.add_category = add_category
        
    '''
    returns arrays of word arrays from tags
    '''    

    
    def fit(self,X=None,y=None):
        return self
    
    
    def transform(self,X):
        return [ self.tag_array_to_word_array(instance) for instance in self.data_to_tags(data)]
    
    
    '''
    returns tag as array of words (no ':').
    add_category determines if the tag category is added to the array.
    '''
    def tag_to_words(self,tag):
        category, text = tag.split(":")
        if self.add_category:
            return word_tokenize("{} {}".format(category,text).lower())
        return word_tokenize(text.lower())

    '''
    transforms array of tags into array of words.
    add_category determines if the tag categories are added to the text.
    '''
    def tag_array_to_word_array(self,tags):
        aux_tags = list(tags)
        aux_tags[0] = self.tag_to_words(tags[0])
        return reduce(lambda x,y : x + self.tag_to_words(y),aux_tags)

    '''
    returns array of tag arrays from original json.
    '''
    def data_to_tags(self,data):
        return [instance["tags"] for instance in data]


In [4]:
# Test
#constants
DATA_PATH="data.json"
data = json.load(open(DATA_PATH))
json_transformer = JsonTransform(False)
documents = json_transformer.transform(data)

## Tf-Idf Glove Transformer


In [5]:
# dependencies

import numpy as np
from gensim import corpora
from gensim.models import TfidfModel
from gensim.models.keyedvectors import KeyedVectors

In [20]:
'''
Calculates embedding, mapping a tokenized document to a vector.
To calculate the vector we use a weighted average of precomputed Glove Vectors. 
Weights of the average are given by TFIDF scores.
'''

class TfIdfGloveTransformer():
    
    '''
    word_embedder is pretrained gensim.KeyedVectors model
    
    dim is the dimension on word_embedder
    '''
    def __init__(self,word_embedder,dim=300):
        self.word_embedder = word_embedder
        self.dim=dim
        self.word_dict = corpora.Dictionary(documents,prune_at=None)
        self.bows = None
        self.tfidf = None
        self.token2id = None
        
    '''
    Fits from corpus of tokenized documents.
    '''
    def fit(self,X,y=None):
        self.bows = [self.word_dict.doc2bow(doc) for doc in X]
        self.tfidf = TfidfModel(self.bows,normalize=True)
        self.token2id = self.word_dict.token2id
        return self
    
    
    '''
    returns embedding representation of documents in X
    '''
    
    def transform(self,X):
        new_bows = [self.word_dict.doc2bow(doc) for doc in X]
        result = np.zeros((len(X),self.dim))
        # perhaps this can be implemented better in a vectorial way
        for i, (doc,bow) in enumerate(zip(X,new_bows)):
            score_hash = { tup[0]:tup[1] for tup in self.tfidf.__getitem__(bow,-1)} # threshold
            weighted_embeddings = np.array([np.dot(model[word],score_hash[self.token2id[word]]) if word in model else np.zeros((1,self.dim)) for word in doc])
            result[i] = np.sum(weighted_embeddings, axis=0)
        return result
            

In [11]:
# Test
# constants
GLOVE_PATH= "glove-sbwc.i25.vec"
DATA_PATH="data.json"
vectors = 855380
model=KeyedVectors.load_word2vec_format(GLOVE_PATH,limit=vectors)

In [4]:
data = json.load(open(DATA_PATH))
json_transformer = JsonTransform(False)
documents = json_transformer.transform(data)
tfidf = TfIdfGloveTransformer(model).fit(documents)
result = tfidf.transform(documents)
print(result.shape)

NameError: name 'json' is not defined

## Tf-Idf LDA Transformer

In [6]:
# dependencies

import numpy as np
from gensim import corpora
from gensim.models import TfidfModel, LdaModel

In [94]:
"""
Generates doc embeddings baed on topic modelling.
Does Tf-Idf transformation and then computes probability distibutions with LDA algorithm.
"""

class LdaTransformer():
    """
    dim: amount of topics to model. aka output vector dimension.
    """
    def __init__(self,dim=20):
        self.dim=dim
        self.word_dict = corpora.Dictionary(documents,prune_at=None)
        self.bows = None
        self.tfidf = None
        self.token2id = None
        self.lda = None
    
    def fit(self,X,y=None):
        self.bows = [self.word_dict.doc2bow(doc) for doc in X]
        self.tfidf = TfidfModel(self.bows,normalize=True)
        self.token2id = self.word_dict.token2id
        self.lda = LdaModel(self.tfidf[self.bows],num_topics=self.dim,minimum_probability=0)
        return self
    
    """
    receives tokenized documents and returns the distribution of each.
    """
    def transform(self,X):
        new_bows = [self.word_dict.doc2bow(doc) for doc in X]
        distributions = np.array(self.lda[self.tfidf[new_bows]])
        return np.reshape(np.delete(distributions,np.s_[:1],2),(len(X),self.dim))

In [87]:
#test
DATA_PATH="data.json"

data = json.load(open(DATA_PATH))
json_transformer = JsonTransform(False)
documents = json_transformer.transform(data)
lda = LdaTransformer().fit(documents)

## kNeighbours

Wrapper of sklearn balltree to put in the pipeline

In [89]:
# dependencies
from sklearn.neighbors import BallTree

In [90]:

'''
wrapper for sklearn BallTree that can be added to a pipeline
'''

class BallTreePredictor():
    
    def __init__(self,k=5):
        self.tree = None
        self.k=k
        
    def set_neighbors(self,k):
        self.k = k
        
    def fit(self,X,y=None):
        self.tree = BallTree(X)
        return self
        
    def predict(self,X):
        return self.tree.query(X,self.k)
    

In [33]:
#test

tree = BallTreePredictor(k=2).fit(result)
tree.predict(result[0:5])

(array([[0.        , 7.66705467],
        [0.        , 8.68959395],
        [0.        , 7.78825885],
        [0.        , 7.66705467],
        [0.        , 7.78825885]]), array([[0, 3],
        [1, 3],
        [2, 4],
        [3, 0],
        [4, 2]]))

## Pipeline Test

In [92]:
from sklearn.pipeline import Pipeline

In [95]:
pipe = Pipeline([("json",JsonTransform()),("embedder",LdaTransformer()),("tree",BallTreePredictor())])
pipe.fit(data) # fit and predict directly on json files
pipe.predict(data[0])

(array([[4.36829008e-09, 6.04630833e-02, 1.08262327e+00, 1.10715184e+00,
         1.12590370e+00],
        [5.96046448e-08, 1.08262331e+00, 1.11142600e+00, 1.12710288e+00,
         1.13011397e+00],
        [4.05954099e-09, 6.04630832e-02, 1.12710284e+00, 1.15077592e+00,
         1.16889645e+00],
        [9.31322575e-10, 2.50989367e-02, 1.10715185e+00, 1.11142595e+00,
         1.15077592e+00],
        [4.26785619e-09, 2.50989376e-02, 1.12590370e+00, 1.13011393e+00,
         1.16889645e+00]]), array([[0, 2, 1, 3, 4],
        [1, 0, 3, 2, 4],
        [2, 0, 1, 3, 4],
        [3, 4, 0, 1, 2],
        [4, 3, 0, 1, 2]]))