<a href="https://colab.research.google.com/github/dinithprimal/NLP_Tutorials/blob/main/EX03_Tokenization_in_Spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import spacy
from spacy.symbols import ORTH

In [3]:
# Start from a blank English pipeline and tokenize one sample sentence.
nlp = spacy.blank("en")
sample_sentence = "Dr. Strange loves koththu roti of colombo as it cost 1.5$ per plate."
doc = nlp(sample_sentence)

# Iterating over the Doc yields its tokens in order; print one per line.
for token in doc:
    print(token)

Dr.
Strange
loves
koththu
roti
of
colombo
as
it
cost
1.5
$
per
plate
.


In [4]:
# Class of the pipeline object returned by spacy.blank("en").
type(nlp)

In [5]:
# Calling the pipeline on a string produces a Doc.
type(doc)

spacy.tokens.doc.Doc

In [6]:
# Inspect a token by index instead of relying on the `token` loop variable
# left over from the earlier printing loop.
type(doc[0])

spacy.tokens.token.Token

In [7]:
# Slicing a Doc gives a Span covering tokens 1 through 4 (end index exclusive).
span = doc[1:5]
type(span)

spacy.tokens.span.Span

In [9]:
# New sample sentence with a number word ("two") and a currency symbol ("$").
doc = nlp("Tony gave two $ to peter.")

In [10]:
# Pick out the first token of the sentence.
token0 = doc[0]
token0

Tony

In [11]:
# List every attribute and method name exposed by the token.
dir(token0)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [12]:
# is_alpha flag of the first token ("Tony").
token0.is_alpha

True

In [13]:
# like_num flag of the first token ("Tony").
token0.like_num

False

In [14]:
# Third token of the sentence; .text gives its plain string form.
token2 = doc[2]
token2.text

'two'

In [15]:
# like_num is True for the number word "two", not only for digit strings.
token2.like_num

True

In [16]:
# Fourth token of the sentence: the "$" symbol.
token3 = doc[3]
token3

$

In [17]:
# is_currency flag of the "$" token.
token3.is_currency

True

In [22]:
# One line per token: its index followed by a handful of lexical flags.
FLAG_NAMES = ("is_alpha", "is_punct", "like_num", "is_currency")

for token in doc:
    fields = [token, "==>", "index: ", token.i]
    for flag in FLAG_NAMES:
        # Label text keeps the leading ",\t " so the printed layout is unchanged.
        fields.append(",\t " + flag + ": ")
        fields.append(getattr(token, flag))
    print(*fields)

Tony ==> index:  0 ,	 is_alpha:  True ,	 is_punct:  False ,	 like_num:  False ,	 is_currency:  False
gave ==> index:  1 ,	 is_alpha:  True ,	 is_punct:  False ,	 like_num:  False ,	 is_currency:  False
two ==> index:  2 ,	 is_alpha:  True ,	 is_punct:  False ,	 like_num:  True ,	 is_currency:  False
$ ==> index:  3 ,	 is_alpha:  False ,	 is_punct:  False ,	 like_num:  False ,	 is_currency:  True
to ==> index:  4 ,	 is_alpha:  True ,	 is_punct:  False ,	 like_num:  False ,	 is_currency:  False
peter ==> index:  5 ,	 is_alpha:  True ,	 is_punct:  False ,	 like_num:  False ,	 is_currency:  False
. ==> index:  6 ,	 is_alpha:  False ,	 is_punct:  True ,	 like_num:  False ,	 is_currency:  False


## Read and Extract Data from a Text File

In [23]:
# Read the file as a list of lines (trailing newlines kept). The encoding is
# given explicitly so the result does not depend on the platform's default.
with open("students.txt", encoding="utf-8") as f:
    text = f.readlines()

text

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com\n',
 '\n',
 '\n',
 '\n']

In [24]:
# Collapse the list of lines into one string for the tokenizer.
# Guarded so that re-running this cell does not join the already-joined
# string character by character.
if not isinstance(text, str):
    text = ' '.join(text)
text



In [25]:
doc = nlp(text)

# Keep the text of every token the tokenizer flags as email-like.
emails = [tok.text for tok in doc if tok.like_email]
emails

['virat@kohli.com',
 'maria@sharapova.com',
 'serena@williams.com',
 'joe@root.com']

## Customizing Tokenization Rules

In [26]:
# Baseline: with the default rules, "gimme" comes out as a single token.
pizza_order = "gimme double cheese extra large healthy pizza"
doc = nlp(pizza_order)

tokens = [tok.text for tok in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [27]:
# Special-case rule: split the single token "gimme" into "gim" + "me".
# ORTH (imported in the notebook's import cell) is the key holding the
# verbatim text of each resulting token.
nlp.tokenizer.add_special_case("gimme", [
    {ORTH: "gim"},
    {ORTH: "me"}
])

doc = nlp("gimme double cheese extra large healthy pizza")

tokens = [token.text for token in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

## Sentence Tokenization (Segmentation)

In [28]:
doc = nlp("Dr. Strange loves koththu roti of colombo. Hulk loves parata of Matara")

# The pipeline has no component that sets sentence boundaries yet, so
# iterating doc.sents raises ValueError (E030). The error is the point of this
# cell, so catch and print it instead of letting it stop a full notebook run.
try:
    for sentence in doc.sents:
        print(sentence)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [30]:
# Adding a component whose name is already in the pipeline raises an error,
# so reuse the existing sentencizer when this cell is run more than once.
if 'sentencizer' in nlp.pipe_names:
    sentencizer = nlp.get_pipe('sentencizer')
else:
    sentencizer = nlp.add_pipe('sentencizer')
sentencizer

<spacy.pipeline.sentencizer.Sentencizer at 0x7fb9aba89840>

In [31]:
# Names of the components currently in the pipeline.
nlp.pipe_names

['sentencizer']

In [32]:
two_sentences = "Dr. Strange loves koththu roti of colombo. Hulk loves parata of Matara"
doc = nlp(two_sentences)

# Now that sentence boundaries are set, doc.sents yields the sentences in order.
for sent in doc.sents:
    print(sent)

Dr. Strange loves koththu roti of colombo.
Hulk loves parata of Matara
