In [1]:
import spacy

# StringStore

In [2]:
from spacy.strings import StringStore
string_store = StringStore(["apple", "orange"])
print(type(string_store))
print(len(string_store))

<class 'spacy.strings.StringStore'>
2


The basic methods available on a `StringStore` are listed below (every name in `dir(StringStore)` that does not start with a double underscore):

In [3]:
# Keep every attribute name that is not a dunder (single-underscore names are kept).
[name for name in dir(StringStore) if not name.startswith('__')]

['_map',
 '_reset_and_load',
 'add',
 'as_int',
 'as_string',
 'from_bytes',
 'from_disk',
 'to_bytes',
 'to_disk']

## `.add`: add a new string 


We can add a new string to a StringStore using `.add`.

When a string is added, an integer value is returned; this integer is the hash value of the added string.

In [4]:
hash_hello = string_store.add("hello")
print(hash_hello)
print(len(string_store))

5983625672228268878
3


One can retrieve strings from a `StringStore` by their hash value, as if it were a dict.

In [5]:
string_store[hash_hello]

'hello'

## `in`: check if a word is in a `StringStore`

In [6]:
'hello' in string_store

True

##  `.as_int`: hash value of a string

The hash value assigned to a string is provided by `.as_int` 

In [7]:
string_store.as_int("apple")

8566208034543834098

In [8]:
# Start again from a fresh store that holds only the two initial strings.
string_store = StringStore(["apple", "orange"])
# Indexing the store with a string yields that string's integer hash ...
apple_hash = string_store["apple"]
assert apple_hash == 8566208034543834098
# ... and indexing it with the integer hash yields the original string.
assert string_store[apple_hash] == "apple"

The integer assigned to a word is internally computed using `spacy.strings.hash_string`.

In [9]:
from spacy.strings import hash_string
assert hash_string("apple") == 8566208034543834098

## `.from_bytes`/`.to_bytes`: Load/Store the data from/to bytes

Allows loading/storing a `StringStore` from `bytes` data.

In [10]:
string_store = StringStore(["apple", "orange"])
# Serialize the store's contents into a bytes object ...
bytes_string_store = string_store.to_bytes()
# ... then load those bytes into a second store created without any strings.
string_store_recovered = StringStore()
# Last expression of the cell: the StringStore returned by from_bytes is echoed as the output.
string_store_recovered.from_bytes(bytes_string_store)

<spacy.strings.StringStore at 0x7faa105b7ea0>

In [11]:
[x for x in string_store_recovered]

['apple', 'orange']

## `.to_disk`/`.from_disk`: save to / load from disk

A `StringStore` can be saved to disk with `.to_disk` and loaded back with `.from_disk`.

In [12]:
string_store = StringStore(["apple", "orange"])
string_store.to_disk("string_store.txt")

In [13]:
!cat string_store.txt

[
  "apple",
  "orange"
]

In [14]:
string_store_recovered = StringStore()
string_store_recovered.from_disk('string_store.txt')

<spacy.strings.StringStore at 0x7faa105b4d10>

# `Token`

Token information

- `.text`: The original word text.
- `.lemma_`: The base form of the word.
- `.pos_`: The simple UPOS part-of-speech tag.
- `.tag_`: The detailed part-of-speech tag.
- `.dep_`: Syntactic dependency, i.e. the relation between tokens.
- `.shape_`: The word shape – capitalization, punctuation, digits.
- `.is_alpha`: Is the token an alpha character?
- `.is_stop`: Is the token part of a stop list, i.e. the most common words of the language?

In [15]:
# If spacy.load("en_core_web_sm") does not work, execute 
#!python -m spacy download en_core_web_sm

In [16]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
# Load the small English pipeline; every `nlp(...)` call below goes through this object
# until `nlp` is reassigned.
nlp = spacy.load("en_core_web_sm")

# Create a Tokenizer that shares the vocab of the pipeline loaded above.
# NOTE(review): `tokenizer` is not referenced by any cell visible in this section — confirm it is needed.
tokenizer = Tokenizer(nlp.vocab)

In [17]:
doc = nlp('hello there, I am David')
type(doc)

spacy.tokens.doc.Doc

In [18]:
# One output row per token. Columns, in order: text, lemma, coarse POS (pos_),
# detailed tag (tag_), dependency label, shape, is_alpha, is_stop.
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

hello hello INTJ UH intj xxxx True False
there there ADV RB advmod xxxx True True
, , PUNCT , punct , False False
I I PRON PRP nsubj X True True
am be AUX VBP ROOT xx True True
David David PROPN NNP attr Xxxxx True False


One can get the lemma of each token in the doc:

In [19]:
[t.lemma_ for t in doc]

['hello', 'there', ',', 'I', 'be', 'David']

## Identify if a string is a complete sentence

We can use part-of-speech tags to define templates of usable sub-sentences in an application. The Penn Treebank tag set (the values shown by `.tag_`) is described here:

https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [20]:
complete_sentence = nlp('this is cute')
incomplete_sentence = nlp('are high')

In [21]:
# One output row per token of `complete_sentence`. Columns, in order: text, lemma,
# coarse POS (pos_), detailed tag (tag_), dependency label, shape, is_alpha, is_stop.
for token in complete_sentence:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

this this PRON DT nsubj xxxx True True
is be AUX VBZ ROOT xx True True
cute cute ADJ JJ acomp xxxx True False


In [22]:
from collections import defaultdict

# Short phrases regarded as valid, usable snippets. They are the examples
# from which the part-of-speech prototypes are derived. The trailing
# comments name further phrases of the same kind.
valid_sentences = [
    'high quality',        # also: 'great game'
    'worked great',        # also: 'looks great'
    'are high quality',
    'tend to break',
    'it tastes good',
    'very durable',        # also: 'very cute', 'very comfortable'
    'breaks easily',       # also: 'works well', 'works perfectly'
    'great quality',
    'well made',
    'great puzzle',        # also: 'pretty cards'
    'very comfortable',
    'well written',
    'fun game',
    'great game',
    'great book',
    'looks great',         # also: 'works great'
    'love it',
]

def build_POS_prototypes(list_of_valid_items, pipeline=None):
    """Group example phrases by their sequence of coarse part-of-speech tags.

    Every phrase is processed by a spaCy-style pipeline; the ``pos_`` value of
    each resulting token is collected into a tuple (the "POS prototype") and
    the phrase is filed under that tuple.

    Parameters
    ----------
    list_of_valid_items : iterable of str
        Phrases to analyse. (The previous implementation ignored this argument
        and always iterated over the global ``valid_sentences``.)
    pipeline : callable, optional
        Callable mapping a string to an iterable of tokens that expose
        ``pos_``; ``str()`` of its result is what gets stored as the example.
        Defaults to the notebook-level ``nlp`` object, looked up at call time.

    Returns
    -------
    collections.defaultdict
        Maps each POS prototype (tuple of str) to the list of phrases that
        produced it, in input order.
    """
    # Resolve the global lazily so an explicitly passed pipeline never needs `nlp`.
    tagger = nlp if pipeline is None else pipeline

    POS_prototype_to_examples = defaultdict(list)
    for item in list_of_valid_items:
        parsed = tagger(item)
        pos_prototype = tuple(token.pos_ for token in parsed)
        POS_prototype_to_examples[pos_prototype].append(str(parsed))

    return POS_prototype_to_examples

In [23]:
build_POS_prototypes(valid_sentences)

defaultdict(list,
            {('ADJ', 'NOUN'): ['high quality',
              'great quality',
              'great puzzle',
              'great game'],
             ('VERB', 'ADJ'): ['worked great', 'looks great'],
             ('AUX', 'ADJ', 'NOUN'): ['are high quality'],
             ('VERB', 'PART', 'VERB'): ['tend to break'],
             ('PRON', 'VERB', 'ADJ'): ['it tastes good'],
             ('ADV', 'ADJ'): ['very durable', 'very comfortable'],
             ('VERB', 'ADV'): ['breaks easily'],
             ('INTJ', 'VERB'): ['well made'],
             ('ADV', 'VERB'): ['well written'],
             ('NOUN', 'NOUN'): ['fun game'],
             ('PROPN', 'PROPN'): ['great book'],
             ('VERB', 'PRON'): ['love it']})

In [24]:
# POS prototypes accepted as valid minimal sentences; each trailing comment
# gives example phrases for that prototype.
valid_minimalistic_sentences = {
    ('ADJ', 'NOUN'),           # high quality, great quality
    # NOTE(review): 'VBZ' is a fine-grained tag (as shown by token.tag_), not a
    # coarse token.pos_ value like the other entries — confirm it is intended.
    ('VBZ', 'ADJ'),            # worked great
    ('VERB', 'ADJ'),           # worked great
    ('AUX', 'ADJ', 'NOUN'),    # are high quality
    ('VERB', 'PART', 'VERB'),  # tend to break
    ('PRON', 'VERB', 'ADJ'),   # it tastes good
    ('ADV', 'ADJ'),            # very durable
    ('VERB', 'ADV'),           # breaks easily
    ('INTJ', 'VERB'),          # well made
    ('ADV', 'VERB'),           # well written
    ('NOUN', 'NOUN'),          # fun game
    ('PROPN', 'PROPN'),        # great book
    ('VERB', 'PRON'),          # love it
}

You might need to run the following in a terminal to download the `en_core_web_lg` and `en_core_web_sm` models before they can be loaded:
```
python -m spacy download en_core_web_lg
python -m spacy download en_core_web_sm
```

In [48]:
# NOTE(review): this rebinds the notebook-level `nlp` (previously "en_core_web_sm"), so anything
# executed after this cell that reads `nlp` — e.g. build_POS_prototypes — uses the large model.
nlp = spacy.load("en_core_web_lg")

# Two short phrases whose similarity is compared in the next cell.
d1 = nlp("smells good")
d2 = nlp("great smell")

In [49]:
d1.similarity(d2)

0.9211558682081242