In [28]:
pip install spacy


Note: you may need to restart the kernel to use updated packages.


In [29]:
import spacy

In [30]:
# Start from a blank English pipeline and tokenize a short sentence.
nlp = spacy.blank("en")
doc = nlp("Let's go to ottawa.")

# One token per line; print() renders each Token as its text.
print(*doc, sep="\n")

Let
's
go
to
ottawa
.


In [31]:
# A sentence with an abbreviation and punctuation, tokenized with the same pipeline.
doc = nlp("Dr. strange visited two cities, first delhi, and then mumbai.")

# One token per line.
print(*doc, sep="\n")

Dr.
strange
visited
two
cities
,
first
delhi
,
and
then
mumbai
.


In [32]:
# Inspect the class of the pipeline object returned by spacy.blank("en").
type(nlp)

spacy.lang.en.English

In [33]:
# Inspect the class of the object produced by calling nlp on a string.
type(doc)

spacy.tokens.doc.Doc

In [34]:
# Inspect the class of a single element obtained by indexing the document.
type(doc[0])

spacy.tokens.token.Token

In [35]:
# List every attribute and method available on a token (long output).
dir(doc[0])

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [36]:
# doc[0] is "Dr." in the sentence tokenized above; the trailing period makes it non-alphabetic.
doc[0].is_alpha

False

In [37]:
# like_num flags tokens that look like a number; "Dr." does not.
doc[0].like_num

False

In [38]:
# Fourth token of the "Dr. strange ..." sentence.
doc[3]

two

In [51]:
# NOTE(review): this cell carries execution count 51, out of sequence with its
# neighbours, so the recorded result was presumably computed against a different `doc`.
# For the "Dr. strange ..." sentence doc[3] is "two", which like_num is expected to
# flag as True — re-run from a fresh kernel to confirm.
doc[3].like_num

False

## Extract emails from a text

In [45]:
# Read the roster line by line; the encoding is given explicitly so the
# result does not depend on the platform's default locale encoding.
with open("students.txt", encoding="utf-8") as f:
    text = f.readlines()

# Display the raw list of lines.
text

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com\n',
 '\n',
 '\n']

In [55]:
# Collapse the list of lines into one string so it can be tokenized as a single document.
# Note: this rebinds `text` from a list of lines to a str.
text = ''.join(text)
text



In [56]:
# Run the pipeline over the whole file contents as one document.
doc = nlp(text)

In [59]:
# Keep every token whose `like_email` flag is set.
emails = [token for token in doc if token.like_email]

In [60]:
# Show the collected matches (these are token objects taken from the document, not plain strings).
emails

[virat@kohli.com, maria@sharapova.com, serena@williams.com, joe@root.com]

In [61]:
# The same API works for other languages: build a blank Hindi pipeline and tokenize with it.
nlpHindi = spacy.blank("hi")
doc = nlpHindi("कैसे हो भ्ऐ, मेइन सोने ज रह हुन्")

# One token per line.
print(*doc, sep="\n")

कैसे
हो
भ्ऐ
,
मेइन
सोने
ज
रह
हुन्


### Customize Tokenizer - adding special rule in tokenizer

In [64]:
# Tokenize with the default rules, before any customization.
doc = nlp("gimme double cheese extra large burger")
tokens = list(doc)

In [65]:
# With the default rules "gimme" stays a single token.
tokens

[gimme, double, cheese, extra, large, burger]

In [68]:
from spacy.symbols import ORTH

# A special case can only split a string into pieces whose texts join back
# to the original ("gim" + "me" == "gimme"). It cannot rewrite the text,
# so turning "gimme" into "give", "me" is not possible here.
nlp.tokenizer.add_special_case(
    "gimme",
    [{ORTH: piece} for piece in ("gim", "me")],
)

In [69]:
# Re-tokenize the same sentence now that the special case is registered.
doc = nlp("gimme double cheese extra large burger")

tokens = list(doc)
tokens

[gim, me, double, cheese, extra, large, burger]

### Sentence Tokenization

In [84]:
# Start over with a fresh blank English pipeline. This rebinds `nlp`, so the
# "gimme" special case registered on the previous pipeline is not carried over.
nlp = spacy.blank("en")

doc = nlp("Dr. strange is a wizard. Hulk is a mutant.")

In [85]:
# Display the document; it renders as its original text.
doc

Dr. strange is a wizard. Hulk is a mutant.

In [86]:
# The pipeline is still blank at this point, so nothing has set sentence
# boundaries and iterating doc.sents raises ValueError [E030]. The error is
# the point of this cell: catch it and print the message so the notebook
# still runs from top to bottom.
try:
    for sent in doc.sents:
        print(sent)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [87]:
# List the components in the pipeline; a blank pipeline has none.
nlp.pipe_names

[]

### Since a blank pipeline was created, there are no components present in the nlp pipeline.
### Hence, we need to add a component to the pipeline.

In [88]:
# Add the rule-based sentencizer only if it is not already present:
# add_pipe raises when a component with the same name exists, which would
# make this cell fail on a second run.
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")

# Display the component, whether it was just added or already there.
nlp.get_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7fd7926f7d40>

In [89]:
# Confirm the sentencizer is now part of the pipeline.
nlp.pipe_names

['sentencizer']

In [90]:
# With the sentencizer in the pipeline, sentence boundaries are set and doc.sents can be iterated.
doc = nlp("Dr. strange is a wizard. Hulk is a mutant.")

# One sentence per line.
print(*doc.sents, sep="\n")

Dr. strange is a wizard.
Hulk is a mutant.
