**SPACY TOKENIZATION**

In [4]:
import spacy

In [5]:
# Build a blank English pipeline and tokenize one sample sentence.
nlp=spacy.blank('en')
doc=nlp('Mr. Padmanabhan loves pav bhaji of mumbai as it costs only 2$ per plate.')
# Print every token on its own line. The loop variable `token` stays bound after
# the loop and is read again by the later `dir(token)` / `type(token)` cells.
for token in doc:
  print(token)

Mr.
Padmanabhan
loves
pav
bhaji
of
mumbai
as
it
costs
only
2
$
per
plate
.


Creating a blank language object with `spacy.blank('en')` gives you a tokenizer but an empty processing pipeline, as `pipe_names` shows below.

In [6]:
nlp.pipe_names

[]

Using index to grab tokens

In [7]:
doc[0]

Mr.

In [8]:
# Index into the Doc to pick out a single token, then show its string form via `.text`.
token1=doc[2]
token1.text

'loves'

In [9]:
# `token` is the loop variable left over from the tokenization loop above
# (still bound to the last token it printed); list the attributes it exposes.
dir(token)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [10]:
type(nlp)

spacy.lang.en.English

In [11]:
type(doc)

spacy.tokens.doc.Doc

In [12]:
type(token)

spacy.tokens.token.Token

**Token Attributes**

In [13]:
doc=nlp('Tony gave two $ to peter')

In [14]:
token=doc[0]
token

Tony

In [15]:

token.is_alpha

True

In [16]:

token2=doc[2]
token2

two

In [17]:
# `like_num` is True here even though the number is spelled out as a word ("two").
token2.like_num

True

In [18]:
token3=doc[3]
token3

$

In [19]:
token3.is_currency

True

In [20]:
# Show a handful of lexical flags for every token in the sentence, one token per line.
flag_names = ('is_alpha', 'is_punct', 'like_num', 'is_currency')
for token in doc:
  # Interleave each flag's name with its value so the printed line reads
  # "<token> ==> name value name value ...".
  labelled = [item for name in flag_names for item in (name, getattr(token, name))]
  print(token, '==>', *labelled)

Tony ==> is_alpha True is_punct False like_num False is_currency False
gave ==> is_alpha True is_punct False like_num False is_currency False
two ==> is_alpha True is_punct False like_num True is_currency False
$ ==> is_alpha False is_punct False like_num False is_currency True
to ==> is_alpha True is_punct False like_num False is_currency False
peter ==> is_alpha True is_punct False like_num False is_currency False


In [22]:
# Read the student roster as a list of lines (trailing newlines are kept).
# The encoding is passed explicitly so the result does not depend on the
# platform's default text encoding.
with open('students.txt', encoding='utf-8') as f:
  text=f.readlines()
text

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com\n',
 '\n',
 '\n',
 '\n']

In [23]:
# Merge the list of lines back into one string so it can be passed to `nlp` as a single document.
text=''.join(text)

In [24]:
text



In [25]:
# Tokenize the roster text and keep only the tokens flagged as e-mail-like.
doc = nlp(text)
emails = [tok.text for tok in doc if tok.like_email]
emails

['virat@kohli.com',
 'maria@sharapova.com',
 'serena@williams.com',
 'joe@root.com']

Customizing Tokenizer

In [26]:
from spacy.symbols import ORTH

In [28]:
# Start again from a fresh blank English pipeline and tokenize a sample order.
nlp = spacy.blank('en')
doc = nlp("gimme double cheese extra large healthy pizza")
# Collect the plain-text form of every token.
tokens = [tok.text for tok in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [29]:
# Register a tokenizer special case so that "gimme" is split into "gim" + "me".
gimme_pieces = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case('gimme', gimme_pieces)



In [30]:
# Tokenize the same sentence again to see the effect of the "gimme" special case.
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [tok.text for tok in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [31]:
nlp.pipeline

[]

In [34]:
# `doc.sents` needs sentence boundaries, and the blank pipeline shown above has no
# component that sets them. Add the rule-based sentencizer here (guarded so that
# re-running the cell does not try to add it twice) instead of relying on
# leftover kernel state.
if 'sentencizer' not in nlp.pipe_names:
    nlp.add_pipe('sentencizer')

doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")
# Print each detected sentence on its own line.
for sentence in doc.sents:
    print(sentence)

Dr. Strange loves pav bhaji of mumbai.
Hulk loves chat of delhi


In [40]:
# Sample passage containing several URLs. Note that one of them
# ("http://www.science." / "gov/") is split across a line break inside the string.
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/,
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''


In [42]:
# Run the passage through the pipeline and keep the tokens flagged as URL-like.
doc = nlp(text)
data_websites = [tok.text for tok in doc if tok.like_url]
data_websites

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

In [43]:
# Sample sentence with two amount/currency pairs; tokenize it for the extraction below.
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
doc=nlp(transactions)

Extracting transactions from this text with amount and currency

In [44]:
# Report every number-like token that is immediately followed by a currency symbol.
for token in doc:
  next_i = token.i + 1
  # Guard the look-ahead: a number-like token at the very end of the text has no
  # successor, and indexing past the end of the Doc would raise IndexError.
  if token.like_num and next_i < len(doc) and doc[next_i].is_currency:
    print(token.text, doc[next_i].text)

two $
500 €
