In [1]:
import progressbar
from random import random, choice
from tokenizer.custom_tokenizer import CustomToken
from sklearn.neighbors import KDTree
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Node:
    def __init__(self, value, p_space = 0.9):
        self.value = value
        self.input_edges = {}
        self.in_degree = 0
        self.out_degree = 0
        self.output_edges = {}
        self.types = {}
        self.len_types = 0
        self.cluster = 0
        self.p_space = p_space

    def add_type(self, type_tuple):
        self.len_types += 1
        try:
            self.types[type_tuple] += 1
        except KeyError:
            self.types[type_tuple] = 1

    def add_edge(self, n: "Node", out= False):
        if out:
            self.out_degree += 1
            try:
                self.output_edges[n.value] += 1
            except KeyError:
                self.output_edges[n.value] = 1
        else:
            self.in_degree += 1
            try:
                self.input_edges[n.value] += 1
            except KeyError:
                self.input_edges[n.value] = 1
    
    def __str__(self):
        alpha = random()
        return (' ' if self.p_space > alpha else '') + self.value


def __token_struct__(t: CustomToken):
    type_ = t.syntax[2]
    spacing = 0.95
    if t.is_emoji(): type_, spacing = 'emoji', 0.35
    if t.is_url(): type_ = 'url'
    if t.is_symbol: type_, spacing  = 'symbol', 0.2
    
    return (type_, t.syntax[3], t.text == '.'), spacing

def sampler(list):
    _len = sum([value for _, value in list])
    list.sort(key=lambda x:x[1], reverse=True)
    x = random()
     
    result, pivot = list[0]
    for i in range(1, len(list)):
        key, value = list[i]
        if x < (pivot + value)/_len: return result
        result, pivot = key, pivot + value
    
    return list[-1][0]

In [3]:
class StructureSelectByMarkovModel:
    def __init__(self, p=1, k=1000000):
        """
            params:
                p: density of probability in that search next step 
                k: length of list in that search next step 
                sent_length: count of sent generate 
        """
        
        self.p = p
        self.k = k
        self.structure_graph = {}


    def __get_structure_node__(self, t: CustomToken) -> Node:
        pivot_struct, _ = __token_struct__(t)
        value = (pivot_struct, self.__sent_index)
        try: 
            return self.structure_graph[pivot_struct][self.__sent_index]
        except KeyError: 
            try:
                self.structure_graph[pivot_struct][self.__sent_index] = Node(value)
                return self.structure_graph[pivot_struct][self.__sent_index]
            except KeyError:
                self.structure_graph[pivot_struct] = {}
                self.structure_graph[pivot_struct][self.__sent_index] = Node(value)
                return self.structure_graph[pivot_struct][self.__sent_index]
    
    def fit(self, X: list[CustomToken]):
        self.__current_sent = X[0].sent
        self.__sent_index = 0
        self.__previous_words = []
        
        pivot_struct =  self.__get_structure_node__(X[0])
        self.__sent_index = 1
        for i in range(1, len(X)):
            if X[i].sent != self.__current_sent or self.__sent_index > len(self.__current_sent):
                self.__previous_words = []
                self.__sent_index = 0

            new_struct = self.__get_structure_node__(X[i])
            
            pivot_struct.add_edge(new_struct, out=True)
            new_struct.add_edge(pivot_struct)


            pivot_struct = new_struct
            self.__sent_index += 1
            self.__previous_words.append(X[i].text)

    def __next(self, struct, index):        
        node : Node = self.structure_graph[struct][index]
        next_list = list(node.output_edges.keys())
        options = [(key, node.output_edges[(key, i)]) for key, i in next_list if i == index + 1]

        if not any(options): return None
        return sampler(options)
        
    def __previous(self, struct, index):        
        node : Node = self.structure_graph[struct][index]
        next_list = list(node.input_edges.keys())
        options = [(key, node.input_edges[(key, i)]) for key, i in next_list if i == index - 1]

        if not any(options): return None
        return sampler(options)
    
    def __final_struct(self, struct):
        return struct is None or struct[2]

    def predict(self, struct):
        next_structs, previous_structs = [struct], [struct]
        options = list(self.structure_graph[struct].keys())
        index = choice(options)
        ni, pi = index, index

        while pi > 0 or not self.__final_struct(next_structs[-1]):
            if not next_structs[-1] is None and  not self.__final_struct(next_structs[-1]) :
                n, ni = self.__next(next_structs[-1], ni), ni + 1
                next_structs.append(n)
            if pi > 0 and not previous_structs[-1] is None and not self.__final_struct(previous_structs[-1]):
                p, pi = self.__previous(previous_structs[-1], pi), pi - 1
                previous_structs.append(p)
            
        previous_structs.pop(0)
        previous_structs.reverse()
        return [value for value in previous_structs + next_structs if not value is None]

In [4]:
class StructureSelectByProbabilisticDict:
    def __init__(self, p=1, k=1000000):
        """
            params:
                p: density of probability in that search next step 
                k: length of list in that search next step 
        """
        
        self.p = p
        self.k = k
        self.structure_dict = {}


    def fit(self, X: list[CustomToken]):
        struct = []
        sent = None
        for t in X:
            if sent != t.sent:
                struct.append([])
                sent = t.sent
            s, _ = __token_struct__(t)
            struct[-1].append(s)
        
        for s in struct:
            s = tuple(s)  
            try: 
                self.structure_dict[s] += 1
            except KeyError: 
                self.structure_dict[s] = 1
        
    def predict(self, struct):
        list_ = [(key, self.structure_dict[key]) for key in self.structure_dict if struct in key]
        _len = sum([value for _, value in list_])
        list_.sort(key=lambda x:x[1], reverse=True)
        list_ = list_[0: self.k]
        dist, pivot = [], 0
        for key, value in list_:
            if pivot/_len > self.p: 
                break
            pivot += value
            dist.append((key, value))

        return sampler(dist)

In [5]:
class MarkovGenerativeTextModel:
    def __init__(self, p=1, k=None, sent_length = 2, strength_filter=0, memory_size=1):
        ## TODO csp in structure constraint
        ## TODO kbtree search and filter 
        """
            params:
                p: density of probability in that search next step 
                k: length of list in that search next step 
                sent_length: count of sent generate 
                structure_graph: dict[(type_word, pos_word, is_end_word)] : Node
                language_graph: dict[str] : Node
        """
        
        self.p = p
        self.k = k
        self.sent_length = sent_length
        self.strength_filter = strength_filter
        self.memory_size = memory_size
        self.language_graph = {}
        self.strength = {}

    def __get_language_node__(self, t: CustomToken) -> Node:
        pivot_struct, spacing = __token_struct__(t)
        try: 
            self.language_graph[t.text].add_type(pivot_struct)
        except: 
            self.language_graph[t.text] = Node(t.text, p_space=spacing)
            self.language_graph[t.text].add_type(pivot_struct)
            
        return self.language_graph[t.text]
    
    def __strength_update__(self, t: CustomToken):
        keys = [t.text] + [(word, t.text) for word in self.__previous_words if word != t.text]
        for key in keys:
            try:
                self.strength[key] += 1
            except KeyError:
                self.strength[key] = 1
        

    def fit(self, X: list[CustomToken]):
        self.__current_sent = X[0].sent
        self.__previous_words = []
        
        pivot_word = self.__get_language_node__(X[0])
        self.__strength_update__(X[0])

        for i in range(1, len(X)):
            if X[i].sent != self.__current_sent:
                self.__previous_words = []

            self.__strength_update__(X[i])
            new_word = self.__get_language_node__(X[i])
            
            pivot_word.add_edge(new_word, out=True)
            new_word.add_edge(pivot_word)


            pivot_word = new_word
            self.__previous_words.append(X[i].text)
    
    def __strength_filter(self, state, word_list, _next = True):
        result = []
        for key in word_list:
            for word in state:
                tupl = (word, key) if _next else (key, word)
                try:
                    value = self.strength[tupl]/ self.strength[key]
                    result.append((key, value))
                except KeyError: 
                    pass
        
        result.sort(key=lambda x:x[1], reverse=True)
        return [key for key, _ in set(result[0:self.strength_filter])]
    
    def __struct_filter(self, struct, word_list):
        return [word for word in word_list if struct in self.language_graph[word].types]

    def __probabilistic_filter(self, p_dict):
        p_dict.sort(key=lambda x:x[1], reverse=True)
        _len = sum([value for _, value in p_dict])
        result, pivot = [], 0
        for key, value in p_dict:
            if pivot/_len > self.p: break
            result.append((key, value))
        
        return result

    def __save_filter(self, filter, actual):
        if any(filter): return filter
    
        return actual

    def _next(self, actual_state, struct, dict_func, _next=True):
        word = actual_state[-1]
        _dict = dict_func(word)
        word_list = list(_dict.keys())
        word_init = word_list
        
        word_struct = self.__struct_filter(struct, word_list)
        word_list = self.__save_filter(word_struct, word_list)

        if self.strength_filter > 0:
            word_strength = self.__strength_filter(actual_state[-self.memory_size:], word_list, _next)
            word_list = self.__save_filter(word_strength, word_list)


        p_dict = [(key, _dict[key]) for key in word_list]
        p_dict = self.__save_filter(self.__probabilistic_filter(p_dict), p_dict)

        if len(p_dict) > self.k:
            p_dict = p_dict[0: self.k]
        
        return sampler(p_dict)
        
    def predict(self, word, structure_select_model):
        node: Node = self.language_graph[word]
        struct = sampler(list(node.types.items()))

        sent_struct = structure_select_model.predict(struct)
        index = sent_struct.index(struct)

        previous = [word]
        for i in range(index-1, -1, -1):
            previous.append(
                self._next(previous, sent_struct[i], lambda x: self.language_graph[x].input_edges, _next=False)
            )
        
        _next = [word]
        for i in range(index+1, len(sent_struct)):
            _next.append(
                self._next(_next, sent_struct[i], lambda x: self.language_graph[x].output_edges)
            )

        previous.pop(0)
        previous.reverse()
        result = ''
        for word in previous + _next:
            result += str(self.language_graph[word])
        
        return result

In [6]:
import os
import ast

def length(folder):
    return len(os.listdir(f'{folder}'))


def load(folder):
    for filename in os.listdir(f'{folder}'):
        with open(f'{folder}/{filename}', 'r') as f:
            text = f.read()
            f.close()
            yield ast.literal_eval(text)
        
def pb(len_, name = ""):
    bar = progressbar.ProgressBar(len_, widgets=[progressbar.Bar('🥵', '[', ']', ), name, progressbar.Percentage()])
    bar.start()
    return bar

def data_to_token(data):
    token = CustomToken(data["text"], lex = data['lemma'], is_stop=data['is_stop'], is_sy=data['is_symbol'], sent=data['sent'])
    token.syntax = data['syntax']
    return token


In [7]:
text_generator = MarkovGenerativeTextModel(p=0.8, k=20, strength_filter=10, memory_size=4)
structure_graph_model = StructureSelectByMarkovModel(p=0.7, k=20)
probabilistic_structure = StructureSelectByProbabilisticDict(p=0.6, k=50)

## TODO evaluate model
## TODO parameter optimization

In [8]:
_len_ = length('tokens')
bar = pb(_len_, f" training model {_len_} ")

for i, data in enumerate(load('tokens')):
    try:
        tokens = [data_to_token(obj) for obj in data[1] if obj['text'] != '']
    except:
        continue
    if not any(tokens): continue
    text_generator.fit(tokens)
    structure_graph_model.fit(tokens)
    probabilistic_structure.fit(tokens)
    # if i > _len_/100 : break
    bar.update(i+1)
bar.finish()

[🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵] training model 324937 100%


In [9]:
text_generator.predict('vida', structure_graph_model)

' yanqui q me to What It’s making me of# CubaSalva# LoveDeathRobots este dia de @MemeruemXD @L_Lawlietc Soy fans roast-beafayant divulgué la targeta la alimetacion de CREO QUE RARA VEZ MAS ALLÁ PERO PARA QUE RESOLVER NADA DETODO PERO SI TIENE EN CUENTA PROPISTAS Y QUE HAY NI CUIDAR A mi vida nada incomoda mas bonito que codician entonces no nossigas yse logre salir pedrito el medio de paqar asociación que seincluyen algunos chubascos y siria Iraq para enfrentarla dialogarán hoy sea honesto trabajo para colmo entonces compraría toda vela la pesza de lis que cortarlo por tantas cosas perturbadoras: ARRIBA DE OTRA COSA QUE TAN LAMENTABLE LO JURO! traidores más sentidas condolencias y ventajistas'

In [10]:
text_generator.predict('vida', structure_graph_model)

IndexError: list index out of range

In [11]:
text_generator.predict('vida', structure_graph_model)

', para informarte q las prueva se deberían mejorar financieramente estábamos en vidasexual q usa armas a nuestros ancestros y vini xq planes de ésa es apercibida por EDICIONES ILCSA S I am up and you go to be clear as my passion as minhas músicas estridente hasta disimularlo. Momento oportuno hacer con pinzas y quimico contra trump amadrazos y electoras que le esta fuera y pósteriormentelos q usa nasobuco pueden tenerlas aqui comentan distorsionando los yanqui se puede defenderce como le hubieran más a nuestros qnunkllegara este bienése sector en cup q lienso se le retrocediera el banquero presidente y consultándolo todo alentando a q cr 7 Pues para todos lados y como otros usan poco mas cereyentes de ésa es ser desnazificada ydeivados y sr esto seremos escuchados y recluye en qtodo carismo una misma para colmo es repudiado por quésucedio en manzanillo que eldependiente quiera estremarse para cojer el monaco el usdc. mas pié porque él y q todo esmero a lo colgarón aqui como si percibe

In [12]:
text_generator.predict('vida', structure_graph_model)

IndexError: list index out of range

In [13]:
text_generator.predict('vida', structure_graph_model)

' la verda q el pavimento y cuba q me alegro mucho teiempo porvida que dividirse en estar enrregresar al propio pais productor de pluriemplear a q nuestro beisboll que estén bien se dijohace años ahogados en cubasel por encimad q tanto estomatólogo por leerme, há sequelas relacionadas à Sochaux. Y agregale que como lesugiero la tentasión.mesa de paqar asociación ilícita política y reciclajes que leerle la víaestatal en cupa q compártieran con nuestros fueran animales y jodidamente filosofica la misma cituacion con pinzas ya todo esmero a nuestros cientificos para vendertela al respecto al magisterio en detrimento q pueda pienso en cup como'

In [14]:
text_generator.predict('vida', probabilistic_structure)

TypeError: list.append() takes exactly one argument (2 given)

In [15]:
text_generator.predict('vida', probabilistic_structure)

TypeError: list.append() takes exactly one argument (2 given)

In [None]:
# _len_ = length('words')
# bar = pb(_len_, f" training kd tree {_len_} ")

# dataset = {}
# for i, data in enumerate(load('words')):
#     if not any(data[0]): continue
#     dataset[data[0]] = data[1]
#     bar.update(i+1)
# bar.finish()

[🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵                                ] training kd tree 252507  35%

KeyboardInterrupt: 

In [None]:
# X = np.array(dataset.items())
# tree = KDTree(X)              
# dist, ind = tree.query(X[:1], k=3)                
# print(ind)  # indices of 3 closest neighbors
# print(dist)  # distances to 3 closest neighbors

TypeError: float() argument must be a string or a number, not 'dict_items'