### <center>Tokenization | Using Regex</center>

#### 1. Import the libs

In [1]:
import re

#### 2. Split on non-word characters

In [3]:
# Tokenize with a regex: every run of non-word characters (\W+) is a
# delimiter, so apostrophes and periods split tokens as well as spaces.
# The sentence ends in '!', which is why the result has a trailing ''.

text = "I'm with you for the entire life in U.K.!"
words = re.compile(r'\W+').split(text)

print(words[:100])

['I', 'm', 'with', 'you', 'for', 'the', 'entire', 'life', 'in', 'U', 'K', '']


#### 3. Remove punctuation

In [4]:
import string
import re

# Tokenize on whitespace first; punctuation stays attached to each token.
words = text.split()

# Character class that matches any single character from string.punctuation.
re_punc = re.compile(f'[{re.escape(string.punctuation)}]')

# Delete every punctuation character found inside each token.
stripped = [re_punc.sub('', token) for token in words]

print(stripped[:100])

['Im', 'with', 'you', 'for', 'the', 'entire', 'life', 'in', 'UK']


#### 4. Remove non-printable characters

In [5]:
# Negated character class: matches any character that is NOT in
# string.printable. Substituting '' removes those characters; the sample
# tokens contain none, so they come back unchanged.

re_print = re.compile(f'[^{re.escape(string.printable)}]')
result = [re_print.sub('', token) for token in words]

print(result)

["I'm", 'with', 'you', 'for', 'the', 'entire', 'life', 'in', 'U.K.!']


#### 5. Normalization 

In [6]:
# Whitespace tokenization; punctuation is left attached to the tokens.
words = text.split()

# Case-fold every token so that e.g. "I'm" and "i'm" compare equal.
words = list(map(str.lower, words))

print(words[:100])

["i'm", 'with', 'you', 'for', 'the', 'entire', 'life', 'in', 'u.k.!']


### <center>Tokenization | Using Spacy </center>

#### 6. Working with spacy

#### 1. Install the spacy lib

In [7]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel, and pin the version (3.4.1 is what the recorded run
# installed) so the notebook stays reproducible.
%pip install -U spacy==3.4.1

Collecting spacy
  Downloading spacy-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.5/6.5 MB 1.5 MB/s eta 0:00:00
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.8-py3-none-any.whl (17 kB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.6-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (130 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 130.1/130.1 kB 1.8 MB/s eta 0:00:00
Collecting typer<0.5.0,>=0.3.0
  Downloading typer-0.4.2-py3-none-any.whl (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.9
  Downloading spacy_legacy-3.0.9-py2.py3-none-any.whl (20 kB)
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
[2K     [

#### 2. Download the English model

In [10]:
# Download the large English pipeline using the interpreter that runs this
# kernel; a bare `python` may resolve to a different environment on PATH.
import sys
!{sys.executable} -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.0/en_core_web_lg-3.4.0-py3-none-any.whl (587.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:03[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.4.0

[notice] A new release of pip available: 22.1.2 -> 22.2.1
[notice] To update, run: pip install --upgrade pip
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


#### 3. Import spaCy and load the model

In [12]:
import spacy
# Load the downloaded 'en_core_web_lg' package once; the resulting `nlp`
# object is called on every example sentence in the cells below.
nlp = spacy.load('en_core_web_lg')

#### 4. Apply the tokenization

In [33]:
string = '"I\'m with you for the entire life in P.K.!"'
print(string)

"I'm with you for the entire life in P.K.!"


#### 5. Using the end pipe as a seperator

In [34]:
doc = nlp(string)
for token in doc:
    print(token.text, end=' | ')

" | I | 'm | with | you | for | the | entire | life | in | P.K. | ! | " | 

#### 6. Parsing the email 

In [35]:
# Sample text with a contraction, a hyphenated word, an e-mail address and a URL.
doc2 = nlp("We're here to help! Send snail-mail, email fahad@gmail.com or visit us at https://fahadhussaincs.blogspot.com/!")
for token in doc2:
    print(token)

We
're
here
to
help
!
Send
snail
-
mail
,
email
fahad@gmail.com
or
visit
us
at
https://fahadhussaincs.blogspot.com/
!


#### 7. Parsing special symbols

In [36]:
# Sample text with a number fused to its unit ("5km") and a currency amount.
doc3 = nlp('A 5km NYC cab ride costs $10.30')
for token in doc3:
    print(token)

A
5
km
NYC
cab
ride
costs
$
10.30


#### 8. Example 

In [37]:
# Sample text with abbreviations ("St.", "U.S.") next to a sentence-final period.
doc4 = nlp("Let's visit St. Louis in the U.S. next year.")
for token in doc4:
    print(token)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


#### 9. Compute the length

In [38]:
# Number of tokens in `doc` (13 in the recorded run, matching the 13
# pipe-separated tokens printed for the quoted sentence).
len(doc)

13

In [39]:
# Size of the vocabulary object attached to the document (845 in the recorded
# run). NOTE(review): presumably this counts entries seen so far, so the value
# may vary between sessions — confirm.
len(doc.vocab)

845

In [40]:
doc5 = nlp('It is better to give than to receive.')
# Indexing is zero-based, so position 2 is the third token.
doc5[2]

better

In [41]:
# Retrieve three tokens from the middle: the slice covers positions 2, 3
# and 4 (the stop index 5 is excluded).
doc5[2:5]

better to give

In [42]:
# Retrieve the last four tokens; the final period counts as a token of its
# own, so the result is "than to receive."
doc5[-4:]

than to receive.

In [23]:
# Two parsed sentences that differ only in their first word and their adjective.
doc6 = nlp('My dinner was horrible.')
doc7 = nlp('Your dinner was delicious.')

In [24]:
# Try to change "My dinner was horrible" to "My dinner was delicious".
# Doc objects reject item assignment, so the TypeError is the point of this
# cell; it is caught and printed so the notebook still runs top to bottom.
try:
    doc6[3] = doc7[3]
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

In [43]:
doc8 = nlp('Apple to build a Hong Kong factory for $6 million')

# First output line: the individual tokens, each followed by ' | '.
for tok in doc8:
    print(tok.text, end=' | ')

print('\n----')

# Then one line per named entity: its text, its label, and the
# description that spacy.explain returns for that label.
for entity in doc8.ents:
    print(f'{entity.text} - {entity.label_} - {spacy.explain(entity.label_)}')

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 
----
Apple - ORG - Companies, agencies, institutions, etc.
Hong Kong - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [44]:
# Number of named entities found in doc8 (3 in the recorded run:
# "Apple", "Hong Kong" and "$6 million").
len(doc8.ents)

3

#### 10. Example 2

In [45]:
doc9 = nlp('Autonomous cars shift insurance liability toward manufacturers.')

# Print each noun chunk of the sentence on its own line.
for noun_chunk in doc9.noun_chunks:
    print(noun_chunk.text)

Autonomous cars
insurance liability
manufacturers


In [46]:
doc10 = nlp('Red cars do not carry higher insurance rates.')

# Print each noun chunk of the sentence on its own line.
for noun_chunk in doc10.noun_chunks:
    print(noun_chunk.text)

Red cars
higher insurance rates


In [47]:
doc11 = nlp('He was a one-eyed, one-horned, flying, purple people-eater.')

# Print each noun chunk; in the recorded run the whole comma-separated
# description after "was" comes back as a single chunk.
for noun_chunk in doc11.noun_chunks:
    print(noun_chunk.text)

He
a one-eyed, one-horned, flying, purple people-eater


#### 11. Displacy

In [30]:
from spacy import displacy

# Render the dependency parse ('dep' style) inline in the notebook.
doc = nlp('Apple is going to build a U.K. factory for $6 million.')
displacy.render(
    doc,
    style='dep',
    jupyter=True,
    options={'distance': 110},
)

In [31]:
# Render the named entities ('ent' style) inline in the notebook.
doc = nlp('Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')
displacy.render(
    doc,
    style='ent',
    jupyter=True,
)

In [32]:
# Serve the dependency visualisation over HTTP instead of rendering it inline.
# NOTE(review): the recorded output shows a server on port 5000 handling
# requests until it was shut down, so this cell presumably does not return
# while the server is running — confirm before relying on "Run All".
doc = nlp('This is a sentence.')
displacy.serve(
    doc,
    style='dep',
)




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [29/Jul/2022 12:39:52] "GET / HTTP/1.1" 200 3395
127.0.0.1 - - [29/Jul/2022 12:39:53] "GET /favicon.ico HTTP/1.1" 200 3395


Shutting down server on port 5000.
