In [40]:
import spacy

# StringStore

In [92]:
from spacy.strings import StringStore
string_store = StringStore(["apple", "orange"])
print(type(stringstore))
print(len(stringstore))

<class 'spacy.strings.StringStore'>
3


The basic methods we have available for a `StringStore` are 

In [93]:
[x for x in dir(StringStore) if x.startswith('__') is False]

['_map',
 '_reset_and_load',
 'add',
 'as_int',
 'as_string',
 'from_bytes',
 'from_disk',
 'to_bytes',
 'to_disk']

## `.add`: add a new string 


We can add a new string to a StringStore using `.add`.

When a string is added an integer value is returned, which corresponds to a hash value for the added string.

In [102]:
hash_hello = string_store.add("hello")
print(hash_hello)
print(len(stringstore))

5983625672228268878
3


One can retrieve strings from a `StringStore` by their hash value, as if it were a dict.

In [105]:
string_store[hash_hello]

'hello'

## `in`: check if a word is in a `StringStore`

In [108]:
'hello' in string_store

True

##  `.as_int`: hash value of a string

The hash value assigned to a string is provided by `.as_int` 

In [100]:
string_store.as_int("apple")

8566208034543834098

In [112]:
string_store = StringStore(["apple", "orange"])
apple_hash = string_store["apple"]
assert apple_hash == 8566208034543834098
assert string_store[apple_hash] == "apple"

The integer assigned to a word is internaly computed using `spacy.strings.hash_string`

In [113]:
from spacy.strings import hash_string
assert hash_string("apple") == 8566208034543834098

## `.from_bytes`/`.to_bytes`: Load/Store the data from/to bytes

Allows loading/storing a `StringStore` from `bytes` data.

In [149]:
string_store = StringStore(["apple", "orange"])
bytes_string_store = string_store.to_bytes()
string_store_recovered = StringStore()
string_store_recovered.from_bytes(bytes_string_store)

<spacy.strings.StringStore at 0x7ff281ead180>

In [151]:
[x for x in string_store_recovered]

['apple', 'orange']

## `.to_disk`: save  to disk

A StringStore can be saved to disk

In [156]:
string_store = StringStore(["apple", "orange"])
string_store.to_disk("string_store.txt")

In [157]:
!cat string_store.txt

[
  "apple",
  "orange"
]

In [158]:
string_store_recovered = StringStore()
string_store_recovered.from_disk('string_store.txt')

<spacy.strings.StringStore at 0x7ff281eafef0>

## `Token`

In [160]:
# Construction 1
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
nlp = English()
# Create a blank Tokenizer with just the English vocab
tokenizer = Tokenizer(nlp.vocab)


In [164]:
doc = nlp('hello there, I am David')

In [166]:
type(doc)

spacy.tokens.doc.Doc

In [175]:
type(doc[0])

spacy.tokens.token.Token

In [174]:
help(spacy.tokens.token.Token)

Help on class Token in module spacy.tokens.token:

class Token(builtins.object)
 |  An individual token – i.e. a word, punctuation symbol, whitespace,
 |  etc.
 |  
 |  DOCS: https://spacy.io/api/token
 |  
 |  Methods defined here:
 |  
 |  __bytes__(...)
 |      Token.__bytes__(self)
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __gt__(self, value, /)
 |      Return self>value.
 |  
 |  __hash__(self, /)
 |      Return hash(self).
 |  
 |  __le__(self, value, /)
 |      Return self<=value.
 |  
 |  __len__(...)
 |      The number of unicode characters in the token, i.e. `token.text`.
 |      
 |      RETURNS (int): The number of unicode characters in the token.
 |      
 |      DOCS: https://spacy.io/api/token#len
 |  
 |  __lt__(self, value, /)
 |      Return self<value.
 |  
 |  __ne__(self, value, /)
 |      Return self!=value.
 |  
 |  __reduce__(...)
 |      Token.__reduce__(self)
 |  
 |  __repr_