<a href="https://colab.research.google.com/github/chitreshkr/Natural-Language-Processing-Python/blob/main/Tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip uninstall spacy


Uninstalling spacy-2.2.4:
  Would remove:
    /usr/local/bin/spacy
    /usr/local/lib/python3.7/dist-packages/bin/*
    /usr/local/lib/python3.7/dist-packages/spacy-2.2.4.dist-info/*
    /usr/local/lib/python3.7/dist-packages/spacy/*
  Would not remove (might be manually added):
    /usr/local/lib/python3.7/dist-packages/bin/theano_cache.py
    /usr/local/lib/python3.7/dist-packages/bin/theano_nose.py
Proceed (y/n)? y
  Successfully uninstalled spacy-2.2.4


In [2]:
# Install spaCy pinned to version 2.2.4.
!pip install spacy==2.2.4

Collecting spacy==2.2.4
[?25l  Downloading https://files.pythonhosted.org/packages/37/ff/2a7c89f2069173a1ecbccd95d2a23fc42f89045b33f8a71ef57b360a3de4/spacy-2.2.4-cp37-cp37m-manylinux1_x86_64.whl (10.6MB)
[K     |████████████████████████████████| 10.6MB 5.6MB/s 
Installing collected packages: spacy
Successfully installed spacy-2.2.4


In [3]:
# Install the en_core_web_sm 2.2.0 model straight from its spacy-models release archive.
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz


Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz (12.0MB)
[K     |████████████████████████████████| 12.0MB 332kB/s 
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.2.0-cp37-none-any.whl size=12019125 sha256=03865b1369087b038da77778cbbfe65b2e7106d72e8662d3fc395c3a3147c8c1
  Stored in directory: /root/.cache/pip/wheels/48/5c/1c/15f9d02afc8221a668d2172446dd8467b20cdb9aef80a172a4
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
  Found existing installation: en-core-web-sm 2.2.5
    Uninstalling en-core-web-sm-2.2.5:
      Successfully uninstalled en-core-web-sm-2.2.5
Successfully installed en-core-web-sm-2.2.0


In [4]:
import spacy

In [6]:
# Load the en_core_web_sm pipeline; every later cell reuses this `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [8]:
#help(nlp)

In [9]:
# Sample sentence used for the tokenization, tagging and entity cells.
text = (
    'Apple is looking for buying UK startup '
    'for 1 billion dollars'
)

In [10]:
# Run the loaded pipeline on the sample sentence; the result is bound to `doc`.
doc = nlp(text)

In [12]:
#help(doc)

In [13]:
# Tokenization: show the text of every token, one per line.
for token in doc:
    print(token.text)

Apple
is
looking
for
buying
UK
startup
for
1
billion
dollars


# Part-of-Speech Tagging

In [19]:
# Token text left-aligned in a 15-character column, followed by its `pos_` tag.
for token in doc:
    print(f"{token.text:<15} {token.pos_}")

Apple           PROPN
is              AUX
looking         VERB
for             ADP
buying          VERB
UK              PROPN
startup         NOUN
for             ADP
1               NUM
billion         NUM
dollars         NOUN


# Visualization

In [20]:
from spacy import displacy

In [30]:
# Draw the dependency parse inline in the notebook with a custom `distance` setting.
displacy.render(
    doc,
    style="dep",
    jupyter=True,
    options={"distance": 120},
)

In [29]:
# Same dependency rendering, this time with the `compact` option switched on.
displacy.render(
    doc,
    style="dep",
    jupyter=True,
    options={"distance": 120, "compact": True},
)

# Named Entity Recognition

In [31]:
# List every named entity in the document together with its label.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

Apple ORG
UK GPE
1 billion dollars MONEY


In [32]:
# Highlight the named entities inline in the notebook. The `distance` option was
# dropped: it is a dependency-visualizer setting and has no effect on style='ent'.
displacy.render(doc, style='ent', jupyter=True)

# Sentence Segmentation

In [39]:
# Two-sentence sample for the sentence segmentation demo.
text = (
    "Apple is looking for buying a Uk startup in 2020. "
    "The government has given permission."
)

In [40]:
# Re-run the pipeline on the two-sentence sample; this rebinds `doc`.
doc = nlp(text)

In [41]:
# Print each detected sentence followed by one blank line.
for sent in doc.sents:
    print(sent, end="\n\n")

Apple is looking for buying a Uk startup in 2020.

The government has given permission.



# Rule-Based Phrase Matching

In [62]:
from spacy.matcher import Matcher
# NOTE(review): this binds the lowercase name `span`, which the match loop later in
# this notebook rebinds with `span = doc[start:end]`; the import looks unused here —
# confirm whether the `Span` class was intended.
from spacy.tokens import span

In [73]:
# Greeting written twice: once capitalised with punctuation, once lower-case without.
text = "Hello, Chitresh! hello chitresh"

In [74]:
# Run the pipeline on the greeting text; this rebinds `doc`.
doc = nlp(text)

In [75]:
# Show how the greeting was tokenized; punctuation comes out as separate tokens.
for token in doc:
    print(token)

Hello
,
Chitresh
!
hello
chitresh


In [77]:
# Token pattern: "hello" (any casing), an optional punctuation token,
# then "chitresh" (any casing).
pattern = [
    {"LOWER": "hello"},
    {"IS_PUNCT": True, "OP": "?"},
    {"LOWER": "chitresh"},
]

In [91]:
#https://stackoverflow.com/questions/58519650/spacy-rule-based-phrase-matching-for-hello-world

In [78]:
# Build a matcher on the pipeline's vocabulary and register the pattern under 'hw'.
# Patterns are passed as a list in the second position: spaCy 2.2.2+ accepts this
# form and spaCy 3 requires it, whereas `add(key, None, pattern)` fails on spaCy 3.
matcher = Matcher(nlp.vocab)
matcher.add('hw', [pattern])

In [79]:
# Apply the matcher to the document and keep the result in `matches`.
matches = matcher(doc)

In [80]:
# Display the raw matches; the loop in a later cell unpacks each as (match_id, start, end).
matches

[(17790654416186116455, 0, 3), (17790654416186116455, 4, 6)]

In [81]:
# Report each match: numeric id, its string key, token offsets and the matched text.
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # look the id up in the vocab's string store
    span = doc[start:end]                    # slice of the doc covered by this match
    print(f"{match_id} {string_id} {start} {end} {span.text}")

17790654416186116455 hw 0 3 Hello, Chitresh
17790654416186116455 hw 4 6 hello chitresh


# Regular Expression

In [82]:
# Sample string containing a short and a long digit run for the regex demos.
text = "my mobile number is 1224.The correct one is 1234566879."

In [83]:
import re

In [85]:
# First place in `text` where three consecutive digits occur.
re.search(r'\d{3}', text)

<re.Match object; span=(20, 23), match='122'>

In [87]:
# Every non-overlapping group of exactly three digits, scanning left to right.
re.findall(r'\d{3}', text)

['122', '123', '456', '687']

In [88]:
# Digit runs between 3 and 10 characters long (greedy, so the longest run wins).
re.findall(r'\d{3,10}', text)

['1224', '1234566879']

In [90]:
# Split the string into runs of word characters (letters, digits, underscore).
re.findall(r'\w+', text)

['my',
 'mobile',
 'number',
 'is',
 '1224',
 'The',
 'correct',
 'one',
 'is',
 '1234566879']

# Wildcard and Exclusion Matching

In [92]:
# Echo the current value of `text` before running the wildcard examples.
text

'my mobile number is 1224.The correct one is 1234566879.'

In [94]:
# 'mo' followed by any four characters ('.' matches anything except a newline).
re.findall(r'mo....', text)

['mobile']

In [97]:
# 'c' followed by at least five characters; the trailing '.+' is greedy and
# consumes the rest of the line.
re.findall(r'c.....+', text)

['correct one is 1234566879.']

In [98]:
# 'o', any single character, then 'e'.
re.findall(r'o.e', text)

['one']

In [99]:
# A '2' together with whichever single character follows it.
re.findall(r'2.', text)

['22', '23']

In [100]:
# From the first '2' through the end of the line ('.+' is greedy).
re.findall(r'2.+', text)

['224.The correct one is 1234566879.']

In [103]:
# Exclusion class: runs of characters that are NOT digits.
re.findall(r'[^\d]+', text)

['my mobile number is ', '.The correct one is ', '.']

In [104]:
# Excluding non-digits leaves only digits, so this matches the same runs as r'\d+'.
re.findall(r'[^\D]+', text)

['1224', '1234566879']

# Processing Pipeline in spaCy

In [117]:
# Two short finance-style sentences to stream through nlp.pipe.
# Note that `text` is now a list of strings rather than a single string.
text = [
    "net income was $9.4 million compared to the prior year of $2.7 million.",
    "revenue exceeds twelve billion dollars with a loss of 1b dollars",
]

In [123]:
%%timeit
docs = nlp.pipe(text,disable=['tagger','parser'])
for doc in docs:
  for ent in doc.ents:
    print(ent.text,ent.label_)
  print()

$9.4 million MONEY
the prior year DATE
$2.7 million MONEY

twelve billion dollars MONEY
1b dollars MONEY

$9.4 million MONEY
the prior year DATE
$2.7 million MONEY

twelve billion dollars MONEY
1b dollars MONEY

$9.4 million MONEY
the prior year DATE
$2.7 million MONEY

twelve billion dollars MONEY
1b dollars MONEY

$9.4 million MONEY
the prior year DATE
$2.7 million MONEY

twelve billion dollars MONEY
1b dollars MONEY

$9.4 million MONEY
the prior year DATE
$2.7 million MONEY

twelve billion dollars MONEY
1b dollars MONEY

$9.4 million MONEY
the prior year DATE
$2.7 million MONEY

twelve billion dollars MONEY
1b dollars MONEY

$9.4 million MONEY
the prior year DATE
$2.7 million MONEY

twelve billion dollars MONEY
1b dollars MONEY

$9.4 million MONEY
the prior year DATE
$2.7 million MONEY

twelve billion dollars MONEY
1b dollars MONEY

$9.4 million MONEY
the prior year DATE
$2.7 million MONEY

twelve billion dollars MONEY
1b dollars MONEY

$9.4 million MONEY
the prior year DATE
$2.7 mi

In [122]:
%%timeit

docs = nlp.pipe(text)
for doc in docs:
  for ent in doc.ents:
    print(ent.text,ent.label_)
  print()

$9.4 million MONEY
the prior year DATE
$2.7 million MONEY

twelve billion dollars MONEY
1b dollars MONEY

$9.4 million MONEY
the prior year DATE
$2.7 million MONEY

twelve billion dollars MONEY
1b dollars MONEY

$9.4 million MONEY
the prior year DATE
$2.7 million MONEY

twelve billion dollars MONEY
1b dollars MONEY

$9.4 million MONEY
the prior year DATE
$2.7 million MONEY

twelve billion dollars MONEY
1b dollars MONEY

$9.4 million MONEY
the prior year DATE
$2.7 million MONEY

twelve billion dollars MONEY
1b dollars MONEY

$9.4 million MONEY
the prior year DATE
$2.7 million MONEY

twelve billion dollars MONEY
1b dollars MONEY

$9.4 million MONEY
the prior year DATE
$2.7 million MONEY

twelve billion dollars MONEY
1b dollars MONEY

$9.4 million MONEY
the prior year DATE
$2.7 million MONEY

twelve billion dollars MONEY
1b dollars MONEY

$9.4 million MONEY
the prior year DATE
$2.7 million MONEY

twelve billion dollars MONEY
1b dollars MONEY

$9.4 million MONEY
the prior year DATE
$2.7 mi