The libraries we are going to use:

In [107]:
import torch
import torch.nn as nn
import numpy as np
from collections import defaultdict, Counter
from itertools import chain
from nltk import ngrams

We define functions that we are going to need later:

In [108]:
#Functions to obtain char ngrams from a text:
def getngrams(text, n):
  ngrams_list=[]
  for w in text:
    aux=[]
    w="#"+w+"$"
    for ngram in ngrams(w, n):
      ngram_string="".join(ngram)
      aux.append(ngram_string)
    ngrams_list.append(aux)
  return(ngrams_list)  #Returns a list of char ngrams for each word

#Function to convert the input text to index numbers:
def text2number(text):
  index=0
  indexed_line=[]
  vocabulary={}
  for word in text:
    aux=[]
    for t in word: #for each ngram in word
      if not(t in vocabulary):
        vocabulary[t]=index
        #print(w, index)
        index=index+1
      aux.append(vocabulary[t])
        
    indexed_line.append(aux)

  return (vocabulary, indexed_line)  #Returns a dictionary of the index values, and the text converted to numeric indexes 

**Toy example**
First let see how to preprocess the text in order to prepare the input for the neural network.

1. First we get char trigrams:

In [109]:
toycorpus = 'The dog eats A boy eats The dog eats food'

text=toycorpus.split()
char_trigrams=getngrams(text,3)
print(char_trigrams)

[['#Th', 'The', 'he$'], ['#do', 'dog', 'og$'], ['#ea', 'eat', 'ats', 'ts$'], ['#A$'], ['#bo', 'boy', 'oy$'], ['#ea', 'eat', 'ats', 'ts$'], ['#Th', 'The', 'he$'], ['#do', 'dog', 'og$'], ['#ea', 'eat', 'ats', 'ts$'], ['#fo', 'foo', 'ood', 'od$']]


2. We represent the text (already converted to trigrams) as numeric indexes. This way the neural network will be able to process it.

In [110]:
out=text2number(char_trigrams) #it returns 2 objects, the dictionary with indexes, and the converted text
idx=out[0]  #the dictionary with indexes
indexed_corpus=out[1] #the converted text
print(idx)
print(indexed_corpus)

{'#Th': 0, 'The': 1, 'he$': 2, '#do': 3, 'dog': 4, 'og$': 5, '#ea': 6, 'eat': 7, 'ats': 8, 'ts$': 9, '#A$': 10, '#bo': 11, 'boy': 12, 'oy$': 13, '#fo': 14, 'foo': 15, 'ood': 16, 'od$': 17}
[[0, 1, 2], [3, 4, 5], [6, 7, 8, 9], [10], [11, 12, 13], [6, 7, 8, 9], [0, 1, 2], [3, 4, 5], [6, 7, 8, 9], [14, 15, 16, 17]]


3. Once we have lists of numeric indexes representing the char ngrams in words, we'll just format them as training pairs (x,y) for the neural network  

In [111]:
training_bigrams = [list(ngrams(word, 2)) for word in indexed_corpus]
print("Training_bigrams per word:")
print(training_bigrams)
flat_list = []
for sublist in training_bigrams:
    for item in sublist:
        flat_list.append(item)
print("Training_bigrams in a single list:")
print(flat_list)
#list(zip(*bigrams))
print("We make two lists, one with all x values, another one with Y values. These enter to the NN:")
print(list(zip(*flat_list)))

training_bigrams per word:
[[(0, 1), (1, 2)], [(3, 4), (4, 5)], [(6, 7), (7, 8), (8, 9)], [], [(11, 12), (12, 13)], [(6, 7), (7, 8), (8, 9)], [(0, 1), (1, 2)], [(3, 4), (4, 5)], [(6, 7), (7, 8), (8, 9)], [(14, 15), (15, 16), (16, 17)]]
Training_bigrams in a single list:
[(0, 1), (1, 2), (3, 4), (4, 5), (6, 7), (7, 8), (8, 9), (11, 12), (12, 13), (6, 7), (7, 8), (8, 9), (0, 1), (1, 2), (3, 4), (4, 5), (6, 7), (7, 8), (8, 9), (14, 15), (15, 16), (16, 17)]
We make two lists, one with all x values, another one with Y values. These enter to the NN:
[(0, 1, 3, 4, 6, 7, 8, 11, 12, 6, 7, 8, 0, 1, 3, 4, 6, 7, 8, 14, 15, 16), (1, 2, 4, 5, 7, 8, 9, 12, 13, 7, 8, 9, 1, 2, 4, 5, 7, 8, 9, 15, 16, 17)]


In [112]:
#Se pasan los datos a formato torch.tensor
x,y = torch.tensor(list(zip(*flat_list)), dtype=torch.long)

print(x,y)

tensor([ 0,  1,  3,  4,  6,  7,  8, 11, 12,  6,  7,  8,  0,  1,  3,  4,  6,  7,
         8, 14, 15, 16]) tensor([ 1,  2,  4,  5,  7,  8,  9, 12, 13,  7,  8,  9,  1,  2,  4,  5,  7,  8,
         9, 15, 16, 17])


In [113]:
#Se definen los parámetros
dim_in = len(idx)
dim = 300
dim_h = 100
dim_out = len(idx)

#Se define la arquitectura de la red (forward)
forward = nn.Sequential(nn.Embedding(dim_in,dim), nn.Linear(dim,dim_h), nn.Tanh(), nn.Linear(dim_h,dim_out), nn.Softmax(1))

#Se define la función de Riesgo (Entropía cruzada)
criterion = torch.nn.CrossEntropyLoss()

#Se define la forma de optimizar (Gradiente Descendiente Estocástico)
optimizer = torch.optim.SGD(forward.parameters(), lr=0.1)

#Numero de iteraciones
its = 100

#Se entrena el modelo
for epoch in range(its):
	#FORWARD
	y_pred = forward(x)

	#BACKWARD
	#Se calcula el eror
	loss = criterion(y_pred, y)
	#zero grad
	optimizer.zero_grad()
	#Backprop
	loss.backward()
	#Se actualizan los parametros
	optimizer.step()

In [114]:
forward(torch.tensor([4, 3, 5]))

tensor([[1.8087e-03, 1.8139e-03, 1.5639e-03, 1.4914e-03, 1.0422e-03, 9.7354e-01,
         2.3855e-03, 1.1801e-03, 1.1771e-03, 9.4962e-04, 2.3034e-03, 2.2108e-03,
         8.4550e-04, 2.2193e-03, 6.4067e-04, 2.2809e-03, 1.7790e-03, 7.6870e-04],
        [1.3503e-03, 1.1519e-03, 9.3013e-04, 2.4567e-03, 9.7826e-01, 1.0793e-03,
         8.9538e-04, 6.7726e-04, 2.6520e-04, 9.9960e-04, 1.5444e-03, 1.4471e-03,
         1.2683e-03, 1.6494e-03, 1.5218e-03, 1.0875e-03, 1.3320e-03, 2.0810e-03],
        [7.1858e-02, 1.3226e-01, 5.0552e-02, 4.5914e-02, 6.8726e-02, 1.1550e-01,
         2.7136e-02, 6.3509e-02, 3.5941e-02, 1.7463e-02, 3.3605e-02, 6.5861e-02,
         5.6442e-02, 3.7120e-02, 4.4536e-02, 4.2710e-02, 4.1959e-02, 4.8914e-02]],
       grad_fn=<SoftmaxBackward0>)

In [115]:
total_index_vocabulary = torch.tensor(list(out[0].values()))
print(total_index_vocabulary)
stochastic_matrix = forward(total_index_vocabulary).detach()
print(stochastic_matrix)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])
tensor([[1.8071e-03, 9.7669e-01, 2.1356e-04, 1.2520e-03, 1.1186e-03, 1.6606e-03,
         1.1259e-03, 2.4686e-03, 8.8755e-04, 1.2034e-03, 1.5772e-03, 2.0886e-03,
         1.7688e-03, 5.3331e-04, 9.6659e-04, 1.7569e-03, 1.7161e-03, 1.1700e-03],
        [1.3232e-03, 3.5071e-04, 9.7381e-01, 1.8180e-03, 1.1995e-03, 1.8354e-03,
         1.2123e-03, 1.0623e-03, 1.5120e-03, 3.9787e-03, 1.3511e-03, 1.3930e-03,
         1.7087e-03, 2.4099e-03, 9.0610e-04, 1.9463e-03, 5.9042e-04, 1.5969e-03],
        [6.1099e-02, 4.3456e-02, 9.1318e-02, 4.0740e-02, 4.9374e-02, 2.4417e-02,
         2.6835e-02, 5.8201e-02, 2.2128e-02, 4.2882e-02, 3.1887e-02, 4.5851e-02,
         1.7363e-01, 6.0953e-02, 4.8396e-02, 9.6602e-02, 4.9141e-02, 3.3093e-02],
        [1.3503e-03, 1.1519e-03, 9.3013e-04, 2.4567e-03, 9.7826e-01, 1.0793e-03,
         8.9538e-04, 6.7726e-04, 2.6520e-04, 9.9960e-04, 1.5444e-03, 1.4471e-03,
         1.2683e-03, 1.64

In [120]:
mu = (1/len(idx))*stochastic_matrix.sum(0)

entropy_rate = -(mu*(stochastic_matrix*np.log(stochastic_matrix)).sum(1)).sum(0)/np.log(len(idx))

print("Entropy rate:", float(entropy_rate))

Entropy rate: 0.4612385034561157
