## Entropy rate using a neural language model

The libraries we are going to use:

In [2]:
import torch         #For Neural Networks
import torch.nn as nn  #For Neural Networks
import numpy as np     
from nltk import ngrams   #Extracting ngrams from any string
import os #Reading file directories and other options
from tqdm import tqdm #Progress bar
import csv  #for writing output in csv

We define functions that we are going to need later:

In [3]:
#Function to obtain character ngrams from a text:
def get_char_ngrams(text, n):
  ngrams_list=[]
 
  for ngram in ngrams(text, n):
      ngram_string="".join(ngram)
      ngrams_list.append(ngram_string)
  return(ngrams_list)  #Returns a list of the char ngrams contained in the input text

#Function to convert the input text (already converted to char ngrams) to index numbers:
def text2number(char_ngrams):
  index=0
  indexed_line=[]
  vocabulary={}
  for t in char_ngrams: #for each ngram in the list
    if not(t in vocabulary): #If we didn't indexed it already
      vocabulary[t]=index
      #print(w, index)
      index=index+1
         
    indexed_line.append(vocabulary[t])

  return (vocabulary, indexed_line)  #Returns a dictionary of the index values, and the text (already converted to char ngrams) converted to numeric indexes 

**Toy example**
First let see how to preprocess the text in order to prepare the input for the neural network.

1. First we get char trigrams:

In [4]:
toycorpus = 'G I Z A E S K U B I D E E N A L D A R R I K A P E N U N I B E R T S A L A H I T Z A U R R E A K o n t u a n i z a n i k m u n'

text=toycorpus.lower().replace(" ", "")  #We lowercase and remove spaces
char_trigrams=get_char_ngrams(text,3)   #We obtain trigrams of characters
print(char_trigrams)

['giz', 'iza', 'zae', 'aes', 'esk', 'sku', 'kub', 'ubi', 'bid', 'ide', 'dee', 'een', 'ena', 'nal', 'ald', 'lda', 'dar', 'arr', 'rri', 'rik', 'ika', 'kap', 'ape', 'pen', 'enu', 'nun', 'uni', 'nib', 'ibe', 'ber', 'ert', 'rts', 'tsa', 'sal', 'ala', 'lah', 'ahi', 'hit', 'itz', 'tza', 'zau', 'aur', 'urr', 'rre', 'rea', 'eak', 'ako', 'kon', 'ont', 'ntu', 'tua', 'uan', 'ani', 'niz', 'iza', 'zan', 'ani', 'nik', 'ikm', 'kmu', 'mun']


2. We represent the text (already converted to trigrams) as numeric indexes. This way the neural network will be able to process it.

In [5]:
out=text2number(char_trigrams) #it returns 2 objects, the dictionary with indexes, and the converted text
idx=out[0]  #the dictionary with indexes
indexed_corpus=out[1] #the converted text
print(idx)
print(indexed_corpus)

{'giz': 0, 'iza': 1, 'zae': 2, 'aes': 3, 'esk': 4, 'sku': 5, 'kub': 6, 'ubi': 7, 'bid': 8, 'ide': 9, 'dee': 10, 'een': 11, 'ena': 12, 'nal': 13, 'ald': 14, 'lda': 15, 'dar': 16, 'arr': 17, 'rri': 18, 'rik': 19, 'ika': 20, 'kap': 21, 'ape': 22, 'pen': 23, 'enu': 24, 'nun': 25, 'uni': 26, 'nib': 27, 'ibe': 28, 'ber': 29, 'ert': 30, 'rts': 31, 'tsa': 32, 'sal': 33, 'ala': 34, 'lah': 35, 'ahi': 36, 'hit': 37, 'itz': 38, 'tza': 39, 'zau': 40, 'aur': 41, 'urr': 42, 'rre': 43, 'rea': 44, 'eak': 45, 'ako': 46, 'kon': 47, 'ont': 48, 'ntu': 49, 'tua': 50, 'uan': 51, 'ani': 52, 'niz': 53, 'zan': 54, 'nik': 55, 'ikm': 56, 'kmu': 57, 'mun': 58}
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 1, 54, 52, 55, 56, 57, 58]


3. Once we have lists of numeric indexes representing the char ngrams in words, we'll just format them as training pairs (x,y) for the neural network. 

These training pairs are used by the neural network to model p(wj|wi), i.e., the probability of a char trigram given another char trigram in that language. 

In [6]:
training_bigrams = list(ngrams(indexed_corpus, 2))  #We extract bigrams of char trigrams to form training pairs: (x,y)
print("Training_bigrams per word:")
print(training_bigrams)

print("We make two lists, one with all x values, another one with Y values. These will enter to the NN:")
print(list(zip(*training_bigrams)))

Training_bigrams per word:
[(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), (12, 13), (13, 14), (14, 15), (15, 16), (16, 17), (17, 18), (18, 19), (19, 20), (20, 21), (21, 22), (22, 23), (23, 24), (24, 25), (25, 26), (26, 27), (27, 28), (28, 29), (29, 30), (30, 31), (31, 32), (32, 33), (33, 34), (34, 35), (35, 36), (36, 37), (37, 38), (38, 39), (39, 40), (40, 41), (41, 42), (42, 43), (43, 44), (44, 45), (45, 46), (46, 47), (47, 48), (48, 49), (49, 50), (50, 51), (51, 52), (52, 53), (53, 1), (1, 54), (54, 52), (52, 55), (55, 56), (56, 57), (57, 58)]
We make two lists, one with all x values, another one with Y values. These will enter to the NN:
[(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 1, 54, 52, 55, 56, 57), (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 

**4. Training the Neural Network**

In [7]:
#We split the training pairs in two different lists (x and y), and we convert them to tensor data format. This is required by Pytorch:
x,y = torch.tensor(list(zip(*training_bigrams)), dtype=torch.long)
print(x,y)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
         1, 54, 52, 55, 56, 57]) tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
        37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,  1,
        54, 52, 55, 56, 57, 58])


In [21]:
#Neural Netowrk Parameters
dim_in = len(idx)  #The dimension of the input layer (the size of the vocabulary)
dim = 300     #The dimension of the embedding layer
dim_h = 100  #The dimension of the hidden layer
dim_out = len(idx)  #The dimension of the output layer (the size of the vocabulary)

#We define the architecture of the Neural Network (feed forward) layer by layer:
#The final layer (output) is a softmax from which we will retrieve the probabilities we are looking for (once is trained):
forward = nn.Sequential(nn.Embedding(dim_in,dim), nn.Linear(dim,dim_h), nn.Tanh(), nn.Linear(dim_h,dim_out), nn.Softmax(1)) 


#We define a loss function for the training:
criterion = torch.nn.CrossEntropyLoss()

#Choosing the optimization algorithm (Stochastic gradient descent) and the learning rate
optimizer = torch.optim.SGD(forward.parameters(), lr=0.1)

#Number of iterations
its = 10

#We train the model
for epoch in tqdm(range(its)):
  for x,y in training_bigrams:
      #FORWARD
    x_in=torch.tensor([x], dtype=torch.long)
    y_pred = forward(x_in)

     #BACKWARD
    #Error calculation
    y_targ=torch.tensor([y], dtype=torch.long)
    loss = criterion(target=y_targ, input=y_pred)
    #zero grad
    optimizer.zero_grad()
    #Backprop
    loss.backward()
    #We update the values
    optimizer.step()

100%|██████████| 10/10 [00:00<00:00, 30.42it/s]


Once is trained we extract the values from the output layer and we construct our stochastic matrix:

In [22]:
total_index_vocabulary = torch.tensor(list(idx.values()))
stochastic_matrix = forward(total_index_vocabulary).detach()


mu = (1/len(idx))*stochastic_matrix.sum(0)
entropy_rate = -(mu*(stochastic_matrix*np.log(stochastic_matrix)).sum(1)).sum(0)/np.log(len(idx))
 
print('Entropy rate:', entropy_rate)

Entropy rate: tensor(0.5690)


##Training on the corpus

In [None]:
directory="/content/processed/" #Data source (change it to your Data location)
all_files=[f for f in os.listdir(directory) if not f.startswith('.')] #listing all files in a directory (hidden files excluded)

f = open('/content/entropyRate(trigrams).csv', 'w') #Output CSV (change it to your Data location)
writer = csv.writer(f)  # create the csv writer
csvheader = ['filename', 'Hrate.trigrams']  #The header of your file
writer.writerow(csvheader)
size_char_ngrams=3

################################################################################

for n in all_files:
  print('Corpus:', n)
  inputcorpus=directory+n
  file=open(inputcorpus,'r', encoding="utf-8")
  text=file.read().lower().replace(" ", "")  #We lowercase and remove spaces
  char_ngrams=get_char_ngrams(text,size_char_ngrams)

  out=text2number(char_ngrams) #it returns 2 objects, the dictionary with indexes, and the converted text
  idx=out[0]  #the dictionary with indexes
  indexed_corpus=out[1] #the converted text

  training_bigrams = list(ngrams(indexed_corpus, 2))

  
  x,y = torch.tensor(list(zip(*training_bigrams)), dtype=torch.long)

  #Neural Network Specifications
  dim_in = len(idx)  #The dimension of the input layer (the size of the vocabulary)
  dim = 300     #The dimension of the embedding layer
  dim_h = 100  #The dimension of the hidden layer
  dim_out = len(idx)  #The dimension of the output layer (the size of the vocabulary)

  #We define the architecture of the Neural Network (feed forward) layer by layer:
  #The final layer (output) is a softmax from which we will retrieve the probabilities we are looking for (once is trained):
  forward = nn.Sequential(nn.Embedding(dim_in,dim), nn.Linear(dim,dim_h), nn.Tanh(), nn.Linear(dim_h,dim_out), nn.Softmax(1)) 


  #We define a loss function for the training:
  criterion = torch.nn.CrossEntropyLoss()

  #Choosing the optimization algorithm (Stochastic gradient descent) and the learning rate
  optimizer = torch.optim.SGD(forward.parameters(), lr=0.1)

  #Number of iterations
  its = 35

  #We train the model
  for epoch in tqdm(range(its)):
    for x,y in training_bigrams:
        #FORWARD
      x_in=torch.tensor([x], dtype=torch.long)
      y_pred = forward(x_in)

      #BACKWARD
      #Error calculation
      y_targ=torch.tensor([y], dtype=torch.long)
      loss = criterion(target=y_targ, input=y_pred)
      #zero grad
      optimizer.zero_grad()
      #Backprop
      loss.backward()
      #We update the values
      optimizer.step()

  total_index_vocabulary = torch.tensor(list(idx.values()))
  stochastic_matrix = forward(total_index_vocabulary).detach()


  mu = (1/len(idx))*stochastic_matrix.sum(0)
  entropy_rate = -(mu*(stochastic_matrix*np.log(stochastic_matrix)).sum(1)).sum(0)/np.log(len(idx))

  
  print('Entropy rate:', entropy_rate)

  csvrow = [n, float(entropy_rate)]
  writer.writerow(csvrow)
 
  
f.close() # close the CSV file