<a href="https://colab.research.google.com/github/gloriaconcepto/ML-NLP-LEARNING/blob/main/Tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
s1 = 'Apple is looking at buying U.K. startup for $1 billion !'
s2 = 'Hello all, We are here to help you! email support@udemy.com or visit us at http://www.udemy.com!'
s3 = '10km cab ride almost costs $20 in NYC'
s4 = "Let's watch a movie together."

In [2]:
import spacy

In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
document1=nlp(s1)


In [5]:
print(document1)
for token in document1:
   print(token)

Apple is looking at buying U.K. startup for $1 billion !
Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion
!


In [6]:
doc2 = nlp(s2)
print(s2)
for token in doc2:
  print(token)

Hello all, We are here to help you! email support@udemy.com or visit us at http://www.udemy.com!
Hello
all
,
We
are
here
to
help
you
!
email
support@udemy.com
or
visit
us
at
http://www.udemy.com
!


`Stemming and Lemmatization`

In [7]:
import nltk

In [8]:
words = ['run','runner','running','ran','runs','easily','fairly']

In [9]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import  SnowballStemmer

In [10]:
p_stemmer = PorterStemmer()
s_stemmer = SnowballStemmer(language='english')

In [11]:
for word in words:
  print(word + ' ------  ' +p_stemmer.stem(word))

run ------  run
runner ------  runner
running ------  run
ran ------  ran
runs ------  run
easily ------  easili
fairly ------  fairli


In [12]:
for word in words:
  print(word + ' ------  ' +s_stemmer.stem(word))

run ------  run
runner ------  runner
running ------  run
ran ------  ran
runs ------  run
easily ------  easili
fairly ------  fair


### Lemmatization

In [13]:
doc1 = nlp("The striped bats are hanging on their feet for best")

In [14]:
for token in doc1:
  print(token.text, '\t', token.lemma_)

The 	 the
striped 	 stripe
bats 	 bat
are 	 be
hanging 	 hang
on 	 on
their 	 their
feet 	 foot
for 	 for
best 	 good


#  Rule-Based Matching


In [15]:
# Import the Matcher library
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab) # created matcher object and pass nlp.vocab

In [16]:
pattern_1 = [{'LOWER': 'hello'}, {'LOWER': 'world'}]
pattern_2 = [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]

In [20]:
matcher.add('Hello World',[pattern_1, pattern_2])

In [22]:
# create a document
doc = nlp(" 'Hello World' are the first two printed words for most of the programmers, printing 'Hello-World' is most common for beginners")


In [23]:
find_matches = matcher(doc) # passin doc to matcher object and store this in a variable
print(find_matches)

[(8585552006568828647, 2, 4), (8585552006568828647, 19, 22)]


In [24]:
# define a function to find the matches

for match_id, start, end in find_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

8585552006568828647 Hello World 2 4 Hello World
8585552006568828647 Hello World 19 22 Hello-World


# 2 Phrase Matching

In [25]:
from spacy.matcher import PhraseMatcher
matcher_phrase = PhraseMatcher(nlp.vocab)

In [26]:
phrase_list = ["Barack Obama", "Angela Merkel", "Washington, D.C."]

In [27]:
# Convert each phrase to a document object
phrase_patterns = [nlp(text) for text in phrase_list] # to do that we are using list comprehension

In [28]:
# pass each doc object into the matcher
matcher_phrase.add("TerminologyList", None, *phrase_patterns)

In [29]:
doc_3 = nlp("German Chancellor Angela Merkel and US President Barack Obama "
          "converse in the Oval Office inside the White House in Washington, D.C.")

In [30]:
# define a function to find the matches

for match_id, start, end in find_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc_3[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

8585552006568828647 Hello World 2 4 Angela Merkel
8585552006568828647 Hello World 19 22 Washington, D.C.
