In [15]:
import spacy
import string

from spacy.tokens import Token

import torch
from torch import nn

from news_vec.cuda import itype, ftype

In [2]:
CURLY_STRAIGHT = (('“', '"'), ('”', '"'), ('‘', "'"), ('’', "'"))

def straighten_quotes(text):
    """Swap typographic (curly) quote characters for their ASCII forms.

    Every (curly, straight) pair listed in CURLY_STRAIGHT is applied; all
    other characters are passed through untouched.
    """
    # Each curly quote is a single character, so one translate() pass is
    # equivalent to replacing the pairs one after another.
    quote_table = str.maketrans(dict(CURLY_STRAIGHT))

    return text.translate(quote_table)

In [3]:
def clean_clf_token(token):
    """Normalized token text: lowercased, with curly quotes straightened.
    """
    lowered = token.text.lower()

    return straighten_quotes(lowered)

In [4]:
# Register `clf_text` as a getter-backed custom attribute on spaCy tokens, so
# the cleaned text is read as `token._.clf_text`. `force=True` is passed so
# the registration overwrites any earlier one (e.g. when this cell is re-run).
Token.set_extension('clf_text', getter=clean_clf_token, force=True)

In [5]:
# Load the English pipeline with the parser and NER components disabled.
# NOTE(review): the 'en' shortcut name is not available in spaCy 3+ -- confirm
# the installed spaCy version, or switch to a full model package name.
nlp = spacy.load('en', disable=['parser', 'ner'])

In [6]:
# Sample headline with mixed case and curly quotes, to exercise `clf_text`.
doc = nlp('AP ALERT: “President” Trump leaves office')

In [7]:
# Space-join the cleaned per-token text to eyeball the normalization.
' '.join([t._.clf_text for t in doc])

'ap alert : " president " trump leaves office'

In [20]:
class CharEmbedding(nn.Embedding):
    """Character embedding table over ASCII letters, digits and punctuation.

    Index 0 is reserved for padding and index 1 for characters outside the
    vocab; vocab characters are numbered from 2.
    """

    PAD_IDX = 0
    UNK_IDX = 1

    def __init__(self, embed_dim=15):
        """Set vocab, map s->i.

        Args:
            embed_dim (int): Size of each character vector.
        """
        vocab = (
            string.ascii_letters +
            string.digits +
            string.punctuation
        )

        # Initialize the module before attaching any attributes to it. Row 0
        # is registered as the padding row, so it is zeroed and is not
        # updated by gradients.
        super().__init__(len(vocab)+2, embed_dim, padding_idx=self.PAD_IDX)

        self.vocab = vocab

        # <PAD> -> 0, <UNK> -> 1
        self._ctoi = {s: i+2 for i, s in enumerate(self.vocab)}

    def ctoi(self, c):
        """Map one character to its index; unknown characters -> UNK_IDX.
        """
        return self._ctoi.get(c, self.UNK_IDX)

    def chars_to_idxs(self, chars):
        """Map characters to embedding indexes.

        Args:
            chars (str)

        Returns: 1-d index tensor, one entry per character, cast to `itype`.
        """
        idxs = [self.ctoi(c) for c in chars]

        return torch.LongTensor(idxs).type(itype)

    def forward(self, texts):
        """Batch-embed token chars.

        Strings shorter than the longest one in the batch are right-padded
        with PAD_IDX, so the batch may mix lengths.

        Args:
            texts (list<str>)

        Returns: tensor of size (len(texts), longest string, embed_dim).
        """
        texts = list(texts)

        max_len = max((len(t) for t in texts), default=0)

        # Map chars -> indexes, right-padding every row to the same width.
        rows = [
            [self.ctoi(c) for c in t] + [self.PAD_IDX] * (max_len - len(t))
            for t in texts
        ]

        # Explicit reshape keeps the index tensor 2-d even when it is empty.
        x = torch.LongTensor(rows).reshape(len(texts), max_len).type(itype)

        return super().forward(x)

In [21]:
# Build the character embedding with the default embed_dim.
ce = CharEmbedding()

In [23]:
# Shape check: two 3-character strings -> (2 strings, 3 chars, 15 dims).
ce(['hl1', 'hl2']).shape

torch.Size([2, 3, 15])