# Project 3 - Sequence models

igu011 and edj001

In [4]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from datetime import datetime
from torch.utils.data import DataLoader, TensorDataset

import numpy as np
import torchtext
from os import listdir
import re
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Fix the seed of torch's random number generator so that runs are repeatable
torch.manual_seed(123)
# Floating-point tensors created from here on default to float64
torch.set_default_dtype(torch.double)

# Everything in this notebook runs on the CPU
device = torch.device("cpu")
print(device)

cpu


## 2.1 Word embedding

In [5]:
# tokenizer splits a text into a list of tokens; judging by the vocabulary printed
# below, punctuation marks come out as separate tokens and words are lowercased
# (presumably done by the 'basic_english' normalisation - TODO confirm)
tokenizer = get_tokenizer('basic_english')

def read_files(datapath='./'):
    """
    Return a list of strings, one for each line in each .txt file in 'datapath'.

    Files are read in sorted filename order so that the order of the returned
    lines does not depend on the filesystem. 'datapath' may be given with or
    without a trailing path separator. Lines keep their trailing newline.
    """
    # Local import: the notebook only imports `listdir` from os at the top.
    import os

    # os.listdir returns entries in arbitrary order, so sort for reproducibility.
    txt_names = sorted(name for name in os.listdir(datapath) if name.endswith(".txt"))

    # Stores each line of each book in a list
    lines = []
    for name in txt_names:
        # os.path.join inserts the separator only when 'datapath' lacks one.
        # NOTE(review): the files are decoded with the platform default encoding,
        # as before - pass an explicit encoding here if the books are known UTF-8.
        with open(os.path.join(datapath, name)) as f:
            lines += f.readlines()
    return lines

# Every line of every .txt file found in the training directory, as one flat list
books = read_files(datapath='./data_train/')

# Raw strings are used so that '\w' and '\s' reach the regex engine unchanged
# instead of being treated as (invalid) Python string escapes.

# Match any word containing a digit
no_digits = r'\w*[0-9]+\w*'
# Match any word containing an uppercase letter
no_names = r'\w*[A-Z]+\w*'
# Match any run of one or more whitespace characters
no_spaces = r'\s+'


def tokenize(lines):
    """
    Run the tokenizer over every line and return all tokens as one flat list,
    in the order in which they appear
    """
    return [token for line in lines for token in tokenizer(line)]

def yield_tokens(lines):
    """
    Generate one list of tokens per line; words holding a digit or an uppercase
    letter are blanked out first so that they never reach the vocabulary
    """
    for raw_line in lines:
        # Replace unwanted words by a space, then squeeze the whitespace left behind
        filtered = re.sub(no_digits + "|" + no_names, ' ', raw_line)
        yield tokenizer(re.sub(no_spaces, ' ', filtered))

def count_freqs(data, vocab):
    """
    Count occurrences of each word in vocabulary in the data.

    'data' is an iterable of tokens and 'vocab' maps a token to its index
    (vocab[w]) and reports its size through len(vocab). Returns a 1-D
    torch.int tensor of length len(vocab) whose entry k is the number of
    tokens in 'data' mapped to index k.

    Indices are expected to lie in [0, len(vocab)).
    """
    # Look every token up once, then count all indices in a single vectorised
    # call instead of doing one in-place tensor update per token.
    # The explicit dtype keeps an empty 'data' valid (bincount needs integers).
    indices = torch.tensor([vocab[w] for w in data], dtype=torch.long)
    # minlength pads with zeros for vocabulary entries that never occur;
    # the cast keeps the torch.int dtype callers received before.
    return torch.bincount(indices, minlength=len(vocab)).to(torch.int)

# List of words contained in the dataset
list_words = tokenize(books)

# vocab contains the vocabulary found in the data, associating an index to each word
vocab = build_vocab_from_iterator(yield_tokens(books), min_freq=100, specials=["<unk>"])
# Since we removed all words with an uppercase when building the vocabulary, we skipped the word "I".
# append_token raises a RuntimeError for a token that is already present, so only
# add "i" when a lowercase "i" did not make it into the vocabulary on its own.
if "i" not in vocab:
    vocab.append_token("i")

# Value of default index. This index will be returned when OOV (Out Of Vocabulary) token is queried.
vocab.set_default_index(vocab["<unk>"])
vocab_size = len(vocab)

print("Total number of words in the dataset:   ", len(list_words))
print("Number of distinct words in the dataset:", len(set(list_words)))
print("Size the defined vocabular:             ", vocab_size)


# Frequencies are counted on the full token list, so every out-of-vocabulary
# word is counted under '<unk>'
freqs = count_freqs(list_words, vocab)
print("occurences:\n", [(f.item(), w) for (f, w)  in zip(freqs, vocab.lookup_tokens(range(vocab_size)))])

Total number of words in the dataset:    1368807
Number of distinct words in the dataset: 30374
Size the defined vocabular:              1050
occurences:
 [(251055, '<unk>'), (89904, ','), (71106, 'the'), (63121, '.'), (43426, 'and'), (33952, 'to'), (30061, 'of'), (23575, 'a'), (18657, 'in'), (20755, 'he'), (16814, 'that'), (15056, 'was'), (14400, 'his'), (13815, 'it'), (10997, 'with'), (10735, 'had'), (9430, 'her'), (9334, 'not'), (10562, 'you'), (9198, 'as'), (9152, 'at'), (8447, 'him'), (8457, 'is'), (8269, 'for'), (7824, 'on'), (7122, '!'), (6510, '?'), (8087, 'she'), (6477, 's'), (6223, 'be'), (5871, 'said'), (8103, 'but'), (6221, 'all'), (5689, 'have'), (5137, 'from'), (4752, 'which'), (4648, 'me'), (5138, 'so'), (4722, 'by'), (4476, 'were'), (4807, 'my'), (4989, 'this'), (4952, 'they'), (4399, 'one'), (4023, 'who'), (4656, 'what'), (3253, 'up'), (4196, 'there'), (3211, 'them'), (4125, 'we'), (3206, 'would'), (3258, 'an'), (3196, 'are'), (3091, 'been'), (3009, 'or'), (2941, 'out'

In [6]:
def create_dataset(
    text, vocab, context_size=3, one_hot=True,
):
    """
    Build a dataset of (context, target) pairs from a tokenized text.

    For every position i, the context is the 'context_size' consecutive words
    text[i:i + context_size] and the target is 1 if the word that follows is a
    known word (its vocabulary index is not 0, the index of '<unk>') and 0
    otherwise.

    Parameters
    ----------
    text : sequence of tokens
    vocab : mapping from token to index (vocab[w]) that supports len()
    context_size : number of words in each context, must be >= 1
    one_hot : if True (default) contexts are one-hot encoded and have shape
        (n_samples, context_size, len(vocab)); if False the raw indices of shape
        (n_samples, context_size) are returned, which is what nn.Embedding
        expects and takes len(vocab) times less memory.

    Returns
    -------
    TensorDataset(contexts, targets), both int64, with
    n_samples = max(len(text) - context_size, 0).

    Raises
    ------
    ValueError if context_size < 1.
    """
    if context_size < 1:
        raise ValueError("context_size must be a positive integer")

    n_vocab = len(vocab)

    # Transform the text into a 1-D tensor of vocabulary indices.
    txt = torch.tensor([vocab[w] for w in text], dtype=torch.long)
    n_samples = txt.numel() - context_size

    if n_samples <= 0:
        # Text too short to hold one context plus its following word:
        # return an empty dataset with the usual trailing dimensions.
        ctx_shape = (0, context_size, n_vocab) if one_hot else (0, context_size)
        return TensorDataset(
            torch.zeros(ctx_shape, dtype=torch.long),
            torch.zeros(0, dtype=torch.long),
        )

    # Row i of the unfolded view is txt[i:i + context_size]. The very last
    # window has no following word to predict, so it is dropped.
    windows = txt.unfold(0, context_size, 1)[:n_samples].contiguous()

    # true label = 'is the next word a known word (i.e. not '<unk>' token)?'
    targets = (txt[context_size:] != 0).long()

    # A single one_hot call on all windows replaces the per-window loop followed
    # by torch.stack, which kept two full copies of the encoded data alive.
    contexts = F.one_hot(windows, num_classes=n_vocab) if one_hot else windows
    return TensorDataset(contexts, targets)

# NOTE(review): with the ~1.37M tokens and the vocabulary of 1050 words reported
# above, the one-hot contexts hold about 1.37e6 * 3 * 1050 = 4.3e9 entries. As
# F.one_hot produces int64 values that is roughly 34 GB, which is presumably why
# the execution of this cell was cancelled - TODO confirm and shrink the dataset.
data = create_dataset(list_words, vocab)

Error: Canceled future for execute_request message before replies were done

## 2.2 Conjugating *be* and *have*

## 2.3 Text generation