# Creando un (simple) modelo de lenguaje

In [None]:
import string

import nltk
import numpy as np
from nltk import sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer

In [None]:
# Fetch the NLTK resources this notebook reads later: the 'stopwords' corpus
# (loaded through nltk.corpus.stopwords) and the 'punkt' models (presumably
# what sent_tokenize relies on — confirm for the installed NLTK version).
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Load the corpus: one entry per line of the file, with surrounding whitespace
# stripped. The encoding is passed explicitly so accented characters are
# decoded the same way on every platform instead of depending on the locale
# default. NOTE(review): assumes conferencias.txt is UTF-8 — confirm.
with open("conferencias.txt", encoding="utf-8") as r:
    dialogos = [line.strip() for line in r]

In [None]:
# Shared word-level tokenizer instance; tokenize_dialogo() reads this global.
tt = ToktokTokenizer()

In [None]:
# Lookup sets: NLTK's Spanish stopword list, and ASCII punctuation extended
# with the inverted Spanish marks and the ellipsis character.
stopword_es = {*nltk.corpus.stopwords.words('spanish')}
punctuation = {*string.punctuation, *'¡¿…'}

In [None]:
def tokenize_dialogo(dialogo, language='spanish'):
    """Split a text into a flat list of lowercased word tokens.

    The text is first split into sentences, and each sentence is then
    tokenized with the module-level Toktok tokenizer ``tt``. Nothing is
    filtered out: stopwords and punctuation tokens stay in the result.

    Args:
        dialogo: The raw text to tokenize.
        language: Name of the sentence-splitting model handed to
            ``sent_tokenize``. Defaults to ``'spanish'`` to match the
            Spanish corpus; NLTK's own default is English.

    Returns:
        A list of lowercased token strings in order of appearance.
    """
    return [
        token.lower()
        for sentence in sent_tokenize(dialogo, language=language)
        for token in tt.tokenize(sentence)
    ]

In [None]:
# Try the tokenizer on a short two-sentence sample and print the resulting
# tokens on one line, separated by " - ".
diálogo = ("Y también nos planteó que se atendiera a jóvenes en casas especiales para terapias y apoyo a personas "
           "con discapacidad. También, ya se buscó una alternativa y ya tenemos una respuesta.")
tokens = tokenize_dialogo(diálogo)
print(*tokens, sep=" - ")

In [None]:
from itertools import chain

# One token list per corpus line, in the same order as `dialogos`.
training_tokens = list(map(tokenize_dialogo, dialogos))

In [None]:
# Inspect the token list produced for the first line of the corpus.
training_tokens[0]

## Entrena un nuevo vocabulario

In [None]:
# IPython magics: load the autoreload extension and switch it to mode 2, so
# edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
    
from vocabulary import Vocabulary

In [None]:
# Fit a Vocabulary on the tokenized corpus (one token list per corpus line).
vocab = Vocabulary()
vocab.fit(training_tokens)

In [None]:
# Print ten entries of the fitted `tokenset_`, then display `numero_tokens_`
# (presumably the token count — confirm in vocabulary.py).
print(list(vocab.tokenset_)[:10])
vocab.numero_tokens_

In [None]:
# Show the ten most common entries of `single_token_counts_`
# (presumably a collections.Counter of individual tokens — confirm).
vocab.single_token_counts_.most_common(10)

## Crea un nuevo modelo de lenguaje

In [None]:
# IPython magics: repeats the autoreload setup already issued in the earlier
# cell that imports Vocabulary (load the extension, then select mode 2).
%load_ext autoreload
%autoreload 2
    
from add_k_trigram_lm import AddKTrigramLM

# Instantiate the trigram language model with k=0.1 (presumably the add-k
# smoothing constant — confirm in add_k_trigram_lm.py).
lm = AddKTrigramLM(k=0.1)

In [None]:
# Fit the language model on the corpus after passing it through
# vocab.transform().
lm.fit(vocab.transform(training_tokens))

In [None]:
# Display the value stored in `totals_` for the token 'muchas' under the
# context ('<p>', '<p>') (presumably a count after two padding/start tokens —
# confirm in add_k_trigram_lm.py).
lm.totals_[('<p>','<p>')]['muchas']

In [None]:
# Ask the model for the probability of 'muchas' as the next token given the
# context ['<p>', '<p>'].
lm.next_token_proba('muchas', ['<p>','<p>'])

In [None]:
class GeneradorSecuencias:
    """Generate token sequences by sampling from a language model."""

    def __init__(self, language_model, vocabulary):
        """Store the model and vocabulary used for generation.

        Args:
            language_model: Object exposing ``tokens_`` (the candidate
                tokens) and ``next_token_proba(token, context)``.
            vocabulary: Object exposing ``START_TOKEN`` and ``END_TOKEN``.
        """
        self.lm = language_model
        self.vocab = vocabulary

    def sample_next(self, *sequence):
        """Draw one token at random, weighted by the model's probabilities.

        Args:
            *sequence: The tokens generated so far. The whole tuple is
                forwarded to the model as context.
                NOTE(review): presumably the model only looks at the trailing
                tokens — confirm in the model implementation.

        Returns:
            One element of ``self.lm.tokens_``.

        Raises:
            ValueError: Raised by NumPy when the probabilities returned by
                the model do not form a valid distribution.
        """
        # Always query the model owned by this instance, never a global one.
        candidates = list(self.lm.tokens_)

        # This scores every candidate token, which can be slow if the
        # vocabulary is very large; we could do better.
        probs = [
            self.lm.next_token_proba(word, sequence) for word in candidates
        ]

        # Sample a position rather than the token itself so the caller gets
        # back the original token object instead of a NumPy scalar.
        chosen = np.random.choice(len(candidates), p=probs)
        return candidates[chosen]

    def genera_secuencia(self, *start, max_length=200):
        """Generate a sequence, optionally continuing from given tokens.

        Args:
            *start: Tokens placed right after the two start markers.
            max_length: Maximum number of tokens to sample.

        Returns:
            The start markers, the ``start`` tokens and the sampled tokens,
            joined by single spaces. Sampling stops early once the
            vocabulary's end token is drawn; that token is included.
        """
        # Two start markers open the sequence before any seed tokens.
        seq = [self.vocab.START_TOKEN, self.vocab.START_TOKEN, *start]
        for _ in range(max_length):
            next_token = self.sample_next(*seq)
            seq.append(next_token)
            # The end token closes the sequence.
            if next_token == self.vocab.END_TOKEN:
                break
        return " ".join(seq)

In [None]:
# Build a generator from the fitted model and vocabulary, then generate a
# sequence seeded with the tokens 'suprema' and 'corte'.
generador = GeneradorSecuencias(lm, vocab)
generador.genera_secuencia('suprema', 'corte')

In [None]:
# Display the model's private `_V` attribute (presumably the vocabulary
# size — confirm in add_k_trigram_lm.py).
lm._V