# Word window classification

In [9]:
import torch
import torch.nn as nn
# import pprint
# pp = pprint.PrettyPrinter()
from pprint import pprint

## Data

In [3]:
# Toy training corpus of four raw sentences.
sentences = [
    "we 'll always have Paris",
    "I live in Germany",
    "He comes from Denmark",
    "The capital of Denmark is Copenhagen"
]
# Lower-case each sentence and split it on whitespace into tokens.
train_sents = [sentence.lower().split() for sentence in sentences]
# One 0/1 label per token. The 1s sit at the positions of "paris",
# "germany", "denmark" and "copenhagen" (presumably marking location words).
train_labels = [
    [0, 0, 0, 0, 1],
    [0, 0, 0, 1],
    [0, 0, 0, 1],
    [0, 0, 0, 1, 0, 1]
]
# zip() stops at the shorter input, so the per-sentence check below would not
# notice a missing or extra label row; compare the row counts explicitly.
assert len(train_sents) == len(train_labels), \
    "train_sents and train_labels must have the same number of rows"
assert all(len(tokens) == len(labels)
           for tokens, labels in zip(train_sents, train_labels)), \
    "every training sentence needs exactly one label per token"

In [4]:
# Single held-out sentence, tokenized the same way as the training data
# (lower-cased, split on whitespace), with one 0/1 label per token.
test_sents = [sentence.lower().split() for sentence in ["She comes from Paris"]]
test_labels = [[0, 0, 0, 1]]
# zip() stops at the shorter input, so also compare the row counts explicitly.
assert len(test_sents) == len(test_labels), \
    "test_sents and test_labels must have the same number of rows"
assert all(len(tokens) == len(labels)
           for tokens, labels in zip(test_sents, test_labels)), \
    "every test sentence needs exactly one label per token"

## Preprocess

### Tokenization

In [5]:
# Hand-written vocabulary: position in the list is the token id.
# Ids 0 and 1 are the special '<pad>' and '<unk>' tokens. Any token not
# listed here (e.g. "'ll" and "capital" from the training sentences) has no
# entry in word2id.
id2word = [
    "<pad>", "<unk>",
    "we", "always", "have", "paris",
    "i", "live", "in", "germany",
    "he", "comes", "from", "denmark",
    "the", "of", "is", "copenhagen",
]
# Inverse lookup: token string -> id.
word2id = {word: idx for idx, word in enumerate(id2word)}

In [8]:
# Use the first training sentence as the running example and display it.
instance = train_sents[0]
instance

['we', "'ll", 'always', 'have', 'paris']

In [7]:
def convert_tokens_to_inds(sentence, word2id):
    """Map each token of a sentence to its id in ``word2id``.

    Args:
        sentence: iterable of tokens (hashable values usable as dict keys).
        word2id: mapping from token to id; it must contain the key
            ``'<unk>'``.

    Returns:
        A new list with one id per token, in order. Tokens that are not in
        ``word2id`` receive the id stored under ``'<unk>'``.

    Raises:
        KeyError: if ``word2id`` has no ``'<unk>'`` entry.
    """
    # Looked up once, before any token is processed.
    fallback_id = word2id['<unk>']
    indices = []
    for token in sentence:
        indices.append(word2id.get(token, fallback_id))
    return indices

In [14]:
# Convert the example sentence to vocabulary ids and display them.
token_inds = convert_tokens_to_inds(instance, word2id)
token_inds

[2, 1, 3, 4, 5]

In [13]:
# Map the ids back to tokens; anything that was converted to the '<unk>' id
# comes back as '<unk>' rather than as the original word.
[id2word[idx] for idx in token_inds]

['we', '<unk>', 'always', 'have', 'paris']

### Padding

In [15]:
def pad_sentence_for_window(sent, window_size, pad_tok='<pad>'):
    """Surround a tokenized sentence with padding tokens.

    Args:
        sent: list of tokens. It is not modified.
        window_size: number of padding tokens to add on each side.
        pad_tok: the padding token to insert (default ``'<pad>'``).

    Returns:
        A new list: ``window_size`` copies of ``pad_tok``, then the tokens of
        ``sent``, then another ``window_size`` copies of ``pad_tok``.
    """
    padding = [pad_tok] * window_size
    return padding + sent + padding

In [16]:
# Passed to pad_sentence_for_window below: the number of pad tokens added on
# each side of a sentence.
window_size = 2

In [17]:
# Pad the example sentence. The input is read from train_sents[0] (the value
# `instance` was first bound to) rather than from `instance` itself, so that
# re-running this cell does not wrap the sentence in another layer of padding.
instance = pad_sentence_for_window(train_sents[0], window_size)
instance

['<pad>', '<pad>', 'we', "'ll", 'always', 'have', 'paris', '<pad>', '<pad>']

In [21]:
# Round-trip every training sentence: pad it, convert the padded tokens to
# ids, then map the ids back to words and print them. Tokens that are not in
# word2id are converted to the '<unk>' id and therefore print as '<unk>'.
for sent in train_sents:
    toks = pad_sentence_for_window(sent, window_size)
    tok_idxs = convert_tokens_to_inds(toks, word2id)
    print([id2word[tok_idx] for tok_idx in tok_idxs])

['<pad>', '<pad>', 'we', '<unk>', 'always', 'have', 'paris', '<pad>', '<pad>']
['<pad>', '<pad>', 'i', 'live', 'in', 'germany', '<pad>', '<pad>']
['<pad>', '<pad>', 'he', 'comes', 'from', 'denmark', '<pad>', '<pad>']
['<pad>', '<pad>', 'the', '<unk>', 'of', 'denmark', 'is', 'copenhagen', '<pad>', '<pad>']
