In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models.phrases import Phrases, Phraser
from ipynb.fs.defs.preprocess import lemmatizer, remove_stopwords, sentence_tokenizer
from pandas import DataFrame
from collections import defaultdict
from nltk import word_tokenize
import gensim
import numpy as np
import dill
import pickle

In [None]:
class Text_preprocessor(BaseEstimator, TransformerMixin):
    """Pipeline step that applies a text-cleaning function to every document.

    Parameters
    ----------
    func : callable, optional
        Function called once per document. When omitted (``None``), the
        ``remove_stopwords`` helper imported from the preprocess notebook
        is used.
    """

    def __init__(self, func=None):
        # scikit-learn's get_params()/clone() read constructor arguments back
        # from attributes of the same name, so ``func`` must be stored as-is.
        self.func = func
        # ``func`` used to be ignored; remove_stopwords is now only the fallback.
        self.clean_func = func if func is not None else remove_stopwords

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        print("fit Text_preprocessor")
        return self

    def transform(self, X):
        """Return ``X.apply(...)`` with ``clean_func`` called on each element.

        ``X`` must expose ``.apply`` (e.g. a pandas Series).
        """
        print("Start transform Text_preprocessor")
        print(type(X))
        print(self.clean_func)
        clean = self.clean_func
        # A plain wrapper keeps the call strictly element-wise. The earlier
        # dill dumps/loads round-trip pickled the lambda (and the estimator it
        # closed over) on every call without changing the result.
        return X.apply(lambda text: clean(text))

# Example
#preproc = Text_preprocessor(preprocess)
#preproc.fit_transform(df.raw_text)
#preproc.transform(df.raw_text)

In [None]:
class Text_preprocessor_sentence_tokenize(BaseEstimator, TransformerMixin):
    """Pipeline step that applies a sentence-tokenizing function to every document.

    Parameters
    ----------
    func : callable, optional
        Function called once per document. When omitted (``None``), the
        ``sentence_tokenizer`` helper imported from the preprocess notebook
        is used.
    """

    def __init__(self, func=None):
        # scikit-learn's get_params()/clone() read constructor arguments back
        # from attributes of the same name, so ``func`` must be stored as-is.
        self.func = func
        # ``func`` used to be ignored; sentence_tokenizer is now only the fallback.
        self.clean_func = func if func is not None else sentence_tokenizer

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        print("fit Text_preprocessor_sentence_tokenizer")
        print(self.clean_func)
        return self

    def transform(self, X):
        """Return ``X.apply(...)`` with ``clean_func`` called on each element.

        ``X`` must expose ``.apply`` (e.g. a pandas Series).
        """
        print("Start transform Text_preprocessor_sentence_tokenizer")
        print(type(X))
        print(self.clean_func)
        clean = self.clean_func
        # A plain wrapper keeps the call strictly element-wise. The earlier
        # dill dumps/loads round-trip pickled the lambda (and the estimator it
        # closed over) on every call without changing the result.
        return X.apply(lambda text: clean(text))

In [None]:
class Text_preprocessor_lemmatizer(BaseEstimator, TransformerMixin):
    """Pipeline step that applies a lemmatizing function to every document.

    Parameters
    ----------
    func : callable, optional
        Function called once per document. When omitted (``None``), the
        ``lemmatizer`` helper imported from the preprocess notebook is used.
    """

    def __init__(self, func=None):
        # scikit-learn's get_params()/clone() read constructor arguments back
        # from attributes of the same name, so ``func`` must be stored as-is.
        self.func = func
        # ``func`` used to be ignored; lemmatizer is now only the fallback.
        self.clean_func = func if func is not None else lemmatizer

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        print("fit Text_preprocessor_lemmatizer")
        print(self.clean_func)
        return self

    def transform(self, X):
        """Return ``X.apply(...)`` with ``clean_func`` called on each element.

        ``X`` must expose ``.apply`` (e.g. a pandas Series).
        """
        print("Start transform Text_preprocessor_lemmatizer")
        print(type(X))
        print(self.clean_func)
        clean = self.clean_func
        # A plain wrapper keeps the call strictly element-wise. The earlier
        # dill dumps/loads round-trip pickled the lambda (and the estimator it
        # closed over) on every call without changing the result.
        return X.apply(lambda text: clean(text))

In [None]:
# The Text_extrafeatures estimator (defined below) creates new features from text.
# It takes a dictionary mapping a feature name to a function that processes the
# text of one document and returns a single value for it.
# Anonymous lambdas are used here as examples; ordinary named functions that
# return one value would work just as well.
# The ``x`` each lambda receives is the subtitle text of one movie.

func_dict = {
    feature_name: dill.loads(dill.dumps(feature_func))  # same dill round-trip as before
    for feature_name, feature_func in (
        # number of characters in the document
        ("largo_char_count", lambda x: len(x)),
        # number of chunks obtained by splitting on single spaces
        ("cant_palabras", lambda x: len(x.split(' '))),
        # characters per space-separated chunk
        ("letras_x_palabras", lambda x: 1.0 * len(x) / len(x.split(' '))),
        # count of lowercase, unaccented vowels
        ("vocales_num", lambda x: sum(1 for ch in x if ch in "aeiou")),
    )
}

class Text_extrafeatures(BaseEstimator, TransformerMixin):
    """Builds one feature column per entry of ``func_dict``.

    Parameters
    ----------
    func_dict : dict
        Maps a feature name to a callable that receives one document and
        returns a single value.
    """

    def __init__(self, func_dict):
        self.func_dict = func_dict
        # Retained for code that reads this attribute directly. The methods
        # below derive the names from ``func_dict`` at call time instead, so
        # they stay correct after ``set_params(func_dict=...)``.
        self.func_dict_keys = list(func_dict.keys())

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def get_feature_names(self):
        """Return the feature names, in the column order used by ``transform``."""
        return list(self.func_dict.keys())

    def get_feature_names_out(self, input_features=None):
        """Same names as ``get_feature_names`` under the newer scikit-learn method name.

        ``input_features`` is accepted for signature compatibility and ignored.
        """
        return np.asarray(self.get_feature_names(), dtype=object)

    def transform(self, X):
        """Evaluate every feature function on every element of ``X``.

        ``X`` must expose ``.apply`` (e.g. a pandas Series). Returns the
        ``.values`` array of a DataFrame with one column per feature, in
        ``func_dict`` order.
        """
        print("Start transform EXTRA FEATURES")
        columns = {}
        for name, feature_func in self.func_dict.items():
            # The wrapper keeps the call strictly per element. The earlier
            # dill round-trip of a lambda closing over ``self`` pickled the
            # estimator on every call without changing the result.
            columns[name] = X.apply(lambda text, f=feature_func: f(text))
        return DataFrame(columns).values

# Example
#extra_feat = Text_extrafeatures(func_dict)
#res = extra_feat.fit_transform(Series(texts))
#DataFrame(res, columns= extra_feat.get_feature_names())

In [None]:
#a version that uses tf-idf weighting scheme for good measure
class TfidfEmbeddingVectorizer(BaseEstimator, TransformerMixin):
    """Embeds each document as the IDF-weighted mean of its Word2Vec word vectors.

    Parameters
    ----------
    tfidfv : object
        Vectorizer exposing ``fit(X)`` and, once fitted, ``idf_`` and
        ``vocabulary_`` (for example a scikit-learn ``TfidfVectorizer``).
    """

    def __init__(self, tfidfv):
        # Stored under the constructor argument's own name so that
        # scikit-learn's get_params()/clone() can read it back.
        self.tfidfv = tfidfv
        self.tfidf = tfidfv       # alias used by the rest of the class
        self.word2vec = None      # token -> vector, filled by fitW2V
        self.word2weight = None   # token -> idf, filled by fitTfidfVectorizer
        self.max_idf = None       # weight for tokens missing from word2weight
        self.dim = 0              # embedding size, filled by fitW2V

    def fitTfidfVectorizer(self, X):
        """Fit the TF-IDF vectorizer on ``X`` and record one IDF weight per vocabulary word."""
        self.tfidf.fit(X)
        idf = self.tfidf.idf_
        # A word that was never seen must be at least as infrequent as any
        # known word, so its fallback weight is the largest known idf.
        self.max_idf = max(idf)
        # Plain dict plus an explicit fallback value. The earlier expression
        # passed the (word, idf) list to dill.dumps instead of to defaultdict,
        # so fitting always raised. A dict also pickles without a lambda.
        self.word2weight = {
            word: idf[index] for word, index in self.tfidf.vocabulary_.items()
        }

    def fitW2V(self, X):
        """Train a Word2Vec model on the ``word_tokenize``-d documents of ``X``."""
        sentences = [list(word_tokenize(doc)) for doc in X]
        self.model = gensim.models.Word2Vec(
            sentences,
            size=50,  # 100 by default
            # window=10,  # 5 by default
            min_count=1)  # drops every word rarer than this; 5 by default
            # workers=10)  # for parallel processing
        self.model.train(sentences, total_examples=len(sentences), epochs=10)
        self.word2vec = dict(zip(self.model.wv.index2word, self.model.wv.vectors))
        # Each row of ``vectors`` is one word vector, so its length is the
        # second dimension of the matrix.
        self.dim = self.model.wv.vectors.shape[1]

    def fit(self, X, y=None):
        """Fit the TF-IDF weights and the Word2Vec model on ``X``; returns ``self``."""
        self.fitTfidfVectorizer(X)
        self.fitW2V(list(X))
        return self

    def transform(self, X):
        """Return one embedding row per document of ``X``.

        Documents with no token known to the Word2Vec model map to a zero
        vector of length ``dim``.
        """
        print("EMBEDDING")
        rows = []
        for doc in X:
            # Strings are tokenized exactly as in fitW2V; iterating the raw
            # string would look up single characters instead of words.
            # Anything that is not a string is treated as already tokenized.
            tokens = word_tokenize(doc) if isinstance(doc, str) else doc
            # NOTE(review): tokens keep their original case here; if the
            # TF-IDF vocabulary is lower-cased they fall back to max_idf.
            weighted = [
                self.word2vec[token] * self.word2weight.get(token, self.max_idf)
                for token in tokens
                if token in self.word2vec
            ]
            rows.append(np.mean(weighted or [np.zeros(self.dim)], axis=0))
        return np.array(rows)
    

In [None]:
class CollocationGenerator(BaseEstimator, TransformerMixin):
    """Learns collocations with gensim ``Phrases`` and rewrites documents with them.

    Parameters
    ----------
    min_count, threshold, scoring
        Forwarded unchanged to ``gensim.models.phrases.Phrases`` in ``fit``.
    """

    def __init__(self, min_count=5, threshold=0.5, scoring='npmi'):
        self.gensim_model = None  # Phraser built by fit()
        self.min_count = min_count
        self.threshold = threshold
        self.scoring = scoring

    def fit(self, X, y=None):
        """Train the phrase model on every sentence of every document in ``X``.

        Each element of ``X`` is iterated and its items are handed to
        ``Phrases`` as sentences (presumably lists of tokens; confirm against
        the upstream sentence tokenizer).
        """
        sentences = self.flat_list(X)
        collocations = Phrases(
            sentences=sentences,
            min_count=self.min_count,
            threshold=self.threshold,
            scoring=self.scoring,
        )
        self.gensim_model = Phraser(collocations)
        return self

    def transform(self, X):
        """Apply the fitted phrase model to each document and join the result into one string.

        ``X`` must expose ``.apply`` (e.g. a pandas Series).
        """
        print("Start transform COLLOCATIONS")
        # Called directly: the earlier dill round-trip of a lambda closing
        # over ``self`` pickled the estimator, trained Phraser included, on
        # every call without changing the result.
        return X.apply(lambda doc: self.subtitle(self.gensim_model[doc]))

    def flat_list(self, X):
        """Flatten one nesting level: a list of lists becomes a list of items."""
        return [item for sublist in X for item in sublist]

    def subtitle(self, X):
        """Flatten ``X`` one level and join the items with single spaces."""
        return ' '.join(self.flat_list(X))
    