<a href="https://colab.research.google.com/github/flying-bear/kompluxternaya/blob/master/assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Assignment 1

Using text http://www.gutenberg.org/files/2600/2600-0.txt
1. Make text lowercase and remove all punctuation except spaces and dots.
2. Tokenize text by BPE with vocab_size = 100
3. Train 3-gram language model with laplace smoothing $\delta=1$
4. Using beam search with k=10 generate sequences of length=10 conditioned on provided inputs. Treat dots as terminal tokens.
5. Calculate perplexity of the language model for the first sentence.

In [0]:
import nltk
import numpy as np
import re

from collections import Counter
from google.colab import drive
from sklearn.base import TransformerMixin

In [86]:
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
root_path = 'gdrive/My Drive/studies/HSE/prog/kompluxternaya'
with open(root_path+'/'+'peace.txt', 'r', encoding='utf-8') as f:
  text = f.read()

In [96]:
def preprocess_text(text):
    # TODO
    # make lowercase
    text = text.lower()
    # replace all punctuation except dots with spaces
    pattern = '!|"|#|\$|%|&|\'|\(|\)|\*|\+|,|-|/|:|;|<|=|>|\?|@|\[|\\|\]|^|_|`|{|\||}|~|”|“|—|‘|’'
    text = re.sub(pattern, ' ', text)
    # collapse multiple spaces into one '   ' -> ' '
    text = re.sub('\s+', ' ', text)
    return text

text = preprocess_text(text)
#assert len(text) == 3141169
len(text)

3141171

In [0]:
text = text.split('.')
text = [x.strip() for x in text]

In [0]:
def list_to_bigrams(l):
  """
  takes a list of lists of integers or strings
  returns a counter of their pairs (tuples) across the lists
  """
  bigrams = Counter()
  for i in range(len(l)):
    bigrams.update((x, y) for x, y in zip(*[l[i][j:] for j in range(2)]))
  return bigrams

In [0]:
def update_token(text, token, token_id):
  """
  takes 
  - a text (list of lists of integers)
  - a token (integer tuple)
  - the id of the given token (integer)
  returns a modified text, where all instances of the token (element pairs) are replaced by token_id
  """
  text_new = [inner_list[:] for inner_list in text]
  for i, sent in enumerate(text):
    deletions = 0
    for j, (v, w) in enumerate(zip(sent[:-1], sent[1:])):
      if (v, w) == token:
        text_new[i][j-deletions] = token_id
        del text_new[i][j-deletions-1]
        deletions += 1
  return text_new

In [78]:
t = ['бля бля бл', ' ', 'бля']
test_itos = list(set(''.join(t))) # list letters
test_stoi = dict(zip(test_itos, range(len(test_itos)))) # dict of letters and their ids
t = [[test_stoi[x] for x in t[i]] for i in range(len(t))] # replace letters with thir ids (str to list)
vocab_size = 12
while len(test_itos) < vocab_size:
    bigrams = list_to_bigrams(t) # get text bigrams
    if bigrams.most_common(1):
      new_token = bigrams.most_common(1)[0][0] # find most common bigram
      new_id = len(test_itos)
      test_itos.append(new_token)
      test_stoi[new_token] = new_id
      # find occurences of the new_token in the text and replace them with new_id
      t = update_token(t, new_token, new_id)
    else:
      break
print(t)
print(test_itos)

[[8], [0], [5]]
[' ', 'б', 'я', 'л', (1, 3), (4, 3), (3, 4), (6, 4), (7, 6)]


In [61]:
itos = ['а', 'е', 'и', 'р', (1,2), (0,3), (4,0), (4,5)]
def recursive_token_lookup(tok): #tok int or tuple
  if type(tok) == int:
    content = itos[tok]
    if type(content) == str:
      return content
    else:
      return recursive_token_lookup(content)
  elif type(tok) == tuple:
    return recursive_token_lookup(tok[0]) + recursive_token_lookup(tok[1])
  
recursive_token_lookup(6)

'еиа'

In [0]:
class BPE(TransformerMixin):
    def __init__(self, vocab_size=100):
        super(BPE, self).__init__()
        self.vocab_size = vocab_size
        # index to token
        self.itos = []
        # token to index
        self.stoi = {}

    def fit(self, text):
        """
        takes a text (list of strings)
        fits 
        - self.itos (a list of strings (symbols) and integer tuples (most frequent bigrams while vocabulary does not exceed vocab_size
                      each symbol/bigram is indexed by its place in the list) )
        - self.stoi (a dict of symbols and bigrams to thir indicies in the self.stoi list)
        - text (symbols replaced by thir ids, bigrams of ids replaced by their ids)
        returns self
        """
        # tokenize text by symbols and fill in self.itos and self.stoi
        self.itos = list(set(''.join(text))) # list letters
        self.stoi = dict(zip(self.itos, range(len(self.itos)))) # dict of letters and their ids
        text = [[self.stoi[x] for x in text[i]] for i in range(len(text))] # replace letters with thir ids (str to list)
        
        while len(self.itos) < self.vocab_size:
            bigrams = list_to_bigrams(text) # get text bigrams
            if bigrams.most_common(1):
              new_token = bigrams.most_common(1)[0][0] # find most common bigram
              new_id = len(self.itos)
              self.itos.append(new_token)
              self.stoi[new_token] = new_id
              # find occurences of the new_token in the text and replace them with new_id
              text = update_token(text, new_token, new_id)
            else:
              break
        return self
    
    def transform(self, text):
        """
        takes a text (list of strings)
        convert text to a sequence of symbol ids then replaces bigrams of ids with their indicies in self.stoi
        returns modified text
        """
        text_in_vocabulary = [[symbol for symbol in sent if symbol in self.itos] for sent in text] # exclude out of vocabulary symbols
        text = [[self.stoi[letter] for letter in sent] for sent in text_in_vocabulary] # tokenize text by symbols using self.stoi
        for token_id, token in enumerate(self.itos): # find occurences of a complex token in the text and replace them with token_id
            text = update_token(text, token, token_id)    
        return text
    
    def decode_token(self, tok):
        """
        takes a tok (either an int - id, or a tuple - pair of ids)
        returns a text coded by the tok
        """
        def recursive_token_lookup(token): #token int or tuple
          if type(token) == int:
            content = itos[token]
            if type(content) == str:
              return content # only returns strings
            else:
              return recursive_token_lookup(content) # continue lookup on the tuple that was found
          elif type(token) == tuple:
            return recursive_token_lookup(token[0]) + recursive_token_lookup(token[1]) # concatenate string results
        return recursive_token_lookup(tok)
            
    def decode(self, text):
        """
        convert token ids into text
        """
        return ''.join(map(self.decode_token, text))
        
        
vocab_size = 100
bpe = BPE(vocab_size)
tokenized_text = bpe.fit_transform(text)

In [93]:
assert bpe.decode(tokenized_text[0]) == text[0]

AssertionError: ignored

In [0]:
start_token = vocab_size
end_token = vocab_size + 1
        
    
class LM:
    def __init__(self, vocab_size, delta=1):
        self.delta = delta
        self.vocab_size = vocab_size + 2
        self.proba = # TODO create array for storing 3-gram counters
        
    def infer(self, a, b, tau=1):
        """
        return vector of probabilities of size self.vocab for 3-grams which start with (a,b) tokens
        a: first token id
        b: second token id
        tau: temperature
        """
        result = # TODO
        return result
        
    def get_proba(self, a, b, c, tau=1):
        """
        get probability of 3-gram (a,b,c)
        a: first token id
        b: second token id
        c: third token id
        tau: temperature
        """
        result = # TODO approximate probability by counters
        return result
    
    def fit(self, text):
        """
        train language model on text
        text: list of lists
        """
        self.proba = # TODO count 3-grams in the text
        
        return self
    
lm = LM(vocab_size, 1).fit(tokenized_text)

In [0]:
def beam_search(input_seq, lm, max_len=10, k=5, tau=1):
    """
    generate sequence from language model *lm* conditioned on input_seq
    input_seq: sequence of token ids for conditioning
    lm: language model
    max_len: max generated sequence length
    k: size of beam
    tau: temperature
    """
    
    beam = # TODO store in beam tuples of current sequences and their log probabilities
    
    for i in range(max_len):
        candidates = []
        candidates_proba = []
        for snt, snt_proba in beam:
            if # TODO process terminal token
            else:    
                proba = # probability vector of the next token
                best_k = # top-k most probable tokens
                # TODO update candidates' sequences and corresponding probabilities
                
        beam = # select top-k most probable sequences from candidates
    return beam
    

In [0]:
input1 = 'horse '
input1 = bpe.transform([input1])[0]
result = beam_search(input1, lm, max_len=10, k=10, tau=0.1)
# TODO print decoded generated strings and their probabilities
    

In [0]:
input1 = 'her'
input1 = bpe.transform([input1])[0]
result = beam_search(input1, lm, max_len=10, k=10, tau=0.1)
# TODO print decoded generated strings and their probabilities

In [0]:
input1 = 'what'
input1 = bpe.transform([input1])[0]
result = beam_search(input1, lm, max_len=10, k=10, tau=1)
# TODO print decoded generated strings and their probabilities

In [0]:
input1 = 'gun '
input1 = bpe.transform([input1])[0]
result = beam_search(input1, lm, max_len=10, k=10, tau=0.1)
# TODO print decoded generated strings and their probabilities

In [0]:
def perplexity(snt, lm):
    """
    snt: sequence of token ids
    lm: language model
    """
    result = #TODO perplexity for the sentence
    return result

perplexity(tokenized_text[0], lm)