# Spacy Tokenizer 

This document is here for me to show basic functionality of spacy as it relates to my project.

In [18]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

import torch
from torchtext.vocab import GloVe, vocab, Vocab

#### Create Tokenizer

In [19]:
nlp = English()
# Create a blank Tokenizer with just the English vocab
tokenizer = Tokenizer(nlp.vocab)

#### Examples of Tokenization
Encoding

In [20]:
tokens = tokenizer("This is a sentence")

print(type(tokens))
tokens

<class 'spacy.tokens.doc.Doc'>


This is a sentence

Decoding

In [21]:
decoded_text = " ".join(token.text for token in tokens)

print(type(decoded_text))
print(decoded_text)

<class 'str'>
This is a sentence


### Import Glove Embeddings 

In [26]:
glove = GloVe(name='6B', dim=300)

vocab_list = glove.itos
vocab_emb_mapping_dict = glove.stoi
embeddings = glove.vectors

In [30]:
# Create a PyTorch Vocab object using the token frequencies
pytorch_vocab = vocab(
    ordered_dict=vocab_emb_mapping_dict,
    min_freq=1,
    specials=['<unk>', '<pad>', '<sos>', '<eos>'],
    special_first=True
)

### Load in Spacy english model

Add custom pipeline component with the vocab object we just created.

In [22]:
nlp = spacy.load('en_core_web_sm')

#### Example of tokenization process

In [39]:
unprocessed_text = "This is an example sentence."

# Tokenize the string using spacy
tokens = [token.text.lower() for token in nlp(unprocessed_text)]
print(tokens)

# Map each token to its integer index in the PyTorch Vocab object
token_indices = [pytorch_vocab.get_stoi()[token] for token in tokens]

print(token_indices)

['this', 'is', 'an', 'example', 'sentence', '.']
[40, 17, 32, 883, 2425, 5]
