# **POS Tagging**

In [None]:
import spacy

In [None]:
nlp  = spacy.load("en_core_web_sm")   #english core web sm, its predefined pipeline, POS and other tasks are integrated

In [None]:
text = "Bill gates founded Microsoft."

In [None]:
doc = nlp(text) # this prepares our data in the required format to be passed in the pipleine

In [None]:
doc[0].pos_

'PROPN'

In [None]:
spacy.explain('PROPN') # there are more than 8 POS available in in this pipeline. around 27

'proper noun'

In [None]:
# to see all the POS in our sentence
for token in doc:
    print(token, '=>',token.pos_, '=>',spacy.explain(token.pos_))

Bill => PROPN => proper noun
gates => NOUN => noun
founded => VERB => verb
Microsoft => PROPN => proper noun
. => PUNCT => punctuation


In [None]:
# to see all the types of POS
nlp.get_pipe("tagger").labels

('$',
 "''",
 ',',
 '-LRB-',
 '-RRB-',
 '.',
 ':',
 'ADD',
 'AFX',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'HYPH',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NFP',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 'XX',
 '_SP',
 '``')

In [None]:
type(nlp.get_pipe("tagger").labels) # applying for loop in tuple

tuple

In [None]:
# to know what are all these POS tags
for tag in nlp.get_pipe("tagger").labels:
    print(tag, '=>',spacy.explain(tag))

$ => symbol, currency
'' => closing quotation mark
, => punctuation mark, comma
-LRB- => left round bracket
-RRB- => right round bracket
. => punctuation mark, sentence closer
: => punctuation mark, colon or ellipsis
ADD => email
AFX => affix
CC => conjunction, coordinating
CD => cardinal number
DT => determiner
EX => existential there
FW => foreign word
HYPH => punctuation mark, hyphen
IN => conjunction, subordinating or preposition
JJ => adjective (English), other noun-modifier (Chinese)
JJR => adjective, comparative
JJS => adjective, superlative
LS => list item marker
MD => verb, modal auxiliary
NFP => superfluous punctuation
NN => noun, singular or mass
NNP => noun, proper singular
NNPS => noun, proper plural
NNS => noun, plural
PDT => predeterminer
POS => possessive ending
PRP => pronoun, personal
PRP$ => pronoun, possessive
RB => adverb
RBR => adverb, comparative
RBS => adverb, superlative
RP => adverb, particle
SYM => symbol
TO => infinitival "to"
UH => interjection
VB => verb, 

In [None]:
text = "$1 billion is the price for something"
doc = nlp(text)
for token in doc:
    print(token, '=>',token.pos_, '=>',spacy.explain(token.pos_))

$ => SYM => symbol
1 => NUM => numeral
billion => NUM => numeral
is => AUX => auxiliary
the => DET => determiner
price => NOUN => noun
for => ADP => adposition
something => PRON => pronoun


In [None]:
#evaluation metrics
'''since its an token classification task, clasification metrics are used
  accuracy, precision, recall , confusion metrics,f1'''

'since its an token classification task, clasification metrics are used\n  accuracy, precision, recall , confusion metrics,f1'

# **Named Entity Recognition (NER)**

In [None]:
text = "Bill gates founded Microsoft"

In [None]:
# elon musk is ceo of tesla   >>> here tesla is name of a organisation

# nikola tesla was a great inventor >> here tesla is the name of a person

In [None]:
# apple >> fruit
# apple >> organisation

In [None]:
import spacy

ner = spacy.load("en_core_web_sm") # same inbuilt pipeline is loaded

In [None]:
doc = ner(text)

In [None]:
doc[0]

Bill

In [None]:
doc[0].ents # this didnt worked becuase it work on spans not tokens, see below cells

AttributeError: 'spacy.tokens.token.Token' object has no attribute 'ents'

In [None]:
type(doc[0]) # type is token of spacy

spacy.tokens.token.Token

In [None]:
type(doc[0:3]) #here type is span, means, if we are taking more than one token at a time, than type becomes span

spacy.tokens.span.Span

In [None]:
doc.ents

(Bill, Microsoft)

In [None]:
doc.ents[0].label_

'PERSON'

In [None]:
doc.ents[1].label_

'ORG'

In [None]:
# together in for loop
for ent in doc.ents:
    print(ent, '=>',ent.label_)  # we can clearly see that NER is establised here

Bill => PERSON
Microsoft => ORG


In [None]:
# another method to display the same
from spacy import displacy

In [28]:
displacy.render(doc.ents, style="ent")

In [None]:
displacy.render(doc.ents, style="ent",jupyter=False) # here its in html format with jupyter = false

'<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Bill\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PERSON</span>\n</mark>\n </div>\n\n<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Microsoft\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n</div>'

In [None]:
# to see all the entities which it is trained on
ner.get_pipe('ner').labels

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [None]:
for ent in ner.get_pipe('ner').labels:
  print(ent, '=>',spacy.explain(ent))

CARDINAL => Numerals that do not fall under another type
DATE => Absolute or relative dates or periods
EVENT => Named hurricanes, battles, wars, sports events, etc.
FAC => Buildings, airports, highways, bridges, etc.
GPE => Countries, cities, states
LANGUAGE => Any named language
LAW => Named documents made into laws.
LOC => Non-GPE locations, mountain ranges, bodies of water
MONEY => Monetary values, including unit
NORP => Nationalities or religious or political groups
ORDINAL => "first", "second", etc.
ORG => Companies, agencies, institutions, etc.
PERCENT => Percentage, including "%"
PERSON => People, including fictional
PRODUCT => Objects, vehicles, foods, etc. (not services)
QUANTITY => Measurements, as of weight or distance
TIME => Times smaller than a day
WORK_OF_ART => Titles of books, songs, etc.


In [None]:
# sometimes, only these entities are not enough for our tasks
# like it can miss some words in domain soecific works like medical/finance, they have their own terminologies

text = 'ELon musk is a CEO of Tesla'

doc = ner(text)
for ent in doc.ents:
    print(ent, '=>',ent.label_)  # here only tesla is getting identified as Organisation but not Elon Musk

Tesla => ORG


In [None]:
text = 'We are learning in Velocity'

doc = ner(text)
for ent in doc.ents:
    print(ent, '=>',ent.label_)   # here we can see that it is not able to identify velocity as an ORG but gave wrong as GPE

Velocity => GPE


In [None]:
# so now we need to add Velocity as ORG tagged

### ways to build NER on custom entities

In [None]:
# ways to build NER on custom entities
# 1) dictionary
# tags = {"Velocity":"ORG"} # here problem is that if some Velocity as a Verb comes, than also it will tag it as an ORG, thats Problematic


# 2)Rule Based
# entityruler --> pattern based, similar to Regex

# 3)ML based
# finetuning of spacy3 model, make use of BERT model



## 1. Dictionary based

In [None]:
doc[4:5] # so the position of velocity is at span 4 to 5

Velocity

In [None]:
from spacy.tokens import Span

In [None]:
span1 = Span(doc, 4, 5,label="ORG")#here,we have given the tag(created a span) of ORG to the span of 4:5 that means, position of Velocity

In [None]:
#now  we have to set our entity in the doc
doc.set_ents([span1], default="unmodified") # we can set multiple span and pass in the dict

In [None]:
# now we will run on the same docs created earlier
for ent in doc.ents:
    print(ent, '=>',ent.label_)  # now we can see that its recognizing velocity as an ORG

Velocity => ORG


In [None]:
# this is like a hardcoded, works on specific documents related to specific domains

## 2.Entity Ruler

In [None]:
import spacy

In [None]:
from spacy.pipeline import EntityRuler

In [None]:
ner = spacy.load("en_core_web_sm")
# nlp = spacy.load("en_core_web_sm")

In [None]:
text = 'We are learning in Velocity, phone no is 9876543210'
# here we have a pattern of Phone no, we have to identify it as an Phone no

In [None]:
pattern = [{'label':'PHONE','pattern':[{'TEXT':{'REGEX':'\d{10}'}}]}]

In [None]:
ruler = ner.add_pipe("entity_ruler",before="ner")# here we are adding one more component to the ner pipeline loaded above

In [None]:
ruler.add_patterns(pattern) # here we have added our created pattern to the new component added just above

In [None]:
doc = ner(text) # converting into tokens

In [None]:
new_ner = ner(doc)

In [None]:
for ent in new_ner.ents:
    print(ent, '=>',ent.label_)

Velocity => GPE
9876543210 => PHONE


In [None]:
# Applications of Entity ruler
  #pattern based entity tagging


## 3.Finetuning

In [None]:
'''in our txt file, we have COVID-19 , a medical term, which we need to identify under NER.
For that reason, we have to label it
'''

In [1]:
'''Labelling tools
    1. Prodi.gy --> paid tool
    2.https://arunmozhi.in/ner-annotator/ --> open source
        we have to do labelling manually , visited this site, uploaded my text file, created labels,
        tagged them manually one by one, word by word. For ex, We created a Label named as VIRUS,
        and tagged COVID-19 as virus, if it is present 100 times, we have to do it manually 100 times.

        So by doing this manually , we saved ot and then exported it, after that a json file was downloaded
        We will be using the same downloaded file over here.Its available in the file folder left side.
        file name is "annotations.json"

'''

'Labelling tools\n    1. Prodi.gy --> paid tool\n    2.https://arunmozhi.in/ner-annotator/ --> open source\n        we have to do labelling manually , visited this site, uploaded my text file, created labels,\n        tagged them manually one by one, word by word. For ex, We created a Label named as VIRUS,\n        and tagged COVID-19 as virus, if it is present 100 times, we have to do it manually 100 times.\n\n'

In [2]:
import spacy
from spacy.tokens import DocBin   #  DocBin takes care of the format of the data for training

In [3]:
nlp = spacy.blank("en")  # blank pipeline
db = DocBin()  # empty db

In [5]:
import json

f = open("annotations.json",encoding="utf8")

TRAIN_DATA = json.load(f)

In [6]:
TRAIN_DATA

{'classes': ['VIRUS', 'DIESESE'],
 'annotations': [["The symptoms of COVID‑19 can vary but often include fever,[7] fatigue, cough, breathing difficulties, loss of smell, and loss of taste.[8][9][10] Symptoms may begin one to fourteen days after exposure to the virus. At least a third of people who are infected do not develop noticeable symptoms.[11][12] Of those who develop symptoms noticeable enough to be classified as patients, most (81%) develop mild to moderate symptoms (up to mild pneumonia), while 14% develop severe symptoms (dyspnea, hypoxia, or more than 50% lung involvement on imaging), and 5% develop critical symptoms (respiratory failure, shock, or multiorgan dysfunction).[13] Older people have a higher risk of developing severe symptoms. Some complications result in death. Some people continue to experience a range of effects (long COVID) for months or years after infection, and damage to organs has been observed.[14] Multi-year studies on the long-term effects are ongoing.

In [16]:
for text,annot in TRAIN_DATA['annotations']:
  doc = nlp.make_doc(text)
  ents = []
  for start,end,label in annot["entities"]:
    #eg --> [16, 24, 'VIRUS'] = [start,end,label] | 'contract' removes the white spaces in starting/ending if any
    span = doc.char_span(start,end,label,alignment_mode='contract')
    if span is None:
      print('none')
    else:
      ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk('training_data.spacy')

# now our dataset is ready with named 'training_data.spacy'

none


In [11]:
# Training

# for training, we need a config file in which all the parameters required for training is given

# we will be using a CLI command for training, as mostly these training happens on cloud servers
 # and there we mostly use CLI for suc tasks

 #if we have created this dataset (training_data.spacy), than if using server, we have to upload
 #it there and run the below two CLI command in terminal for starting the training
#after running only these two commands, the model will start its training




In [13]:
# command for downloading the config file
!python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

#after running this command, config file is downloaded, it have all the parameters required for training

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [17]:
#command for training after downloading the config file
!python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    210.50    0.00    0.00    0.00    0.00
  9     200       4692.78   6673.07   73.01   75.00   71.12    0.73
 18     400         99.74   1897.46   75.00   70.45   80.17    0.75
 27     600         97.67   1832.68   65.80   82.47   54.74    0.66
 36     800         81.39   1787.76   74.68   73.14   76.29    0.75
 45    1000         77.55   1761.96   73.45   75.45   71.55    0.73
 54    1200         64.78   1711.56   75.40   70.83   80.60    0.75
 63    1400         67.29   1692.38   71.63   77.78   66.38    0.72
 72    1600         68.97   1681.02   65.80   82.47   54.74    0.66
 81    1800         91.44   1701.87   74.68   73.14

In [18]:
# we can see that two models are saved, model-best and model-last
  #model-best --> model with best accuracy
  #model-last --> model with last iteration

In [None]:
#Deployment of this in Production
  #for deployment, we would be needing the whole folder
  #we will download it and upload it on the deployment server

In [19]:
## now making inferences on the trained model

In [20]:
trained_ner = spacy.load("model-best")  # same process which we did earlier for prebuilt spacy model

In [21]:
text = "COVID‑19 caused damage"

doc = trained_ner(text)

In [22]:
doc.ents # its not showing anything as the dataset was too small, but for larger data, it will work

()

## Transformers for NER

In [23]:
!pip install transformers



In [24]:
from transformers import pipeline

In [25]:
ner_transformer = pipeline(model = "dslim/bert-base-NER-uncased")

config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER-uncased were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


In [26]:
text = "bill gates founded microsoft"
ner_transformer(text)

[{'entity': 'B-PER',
  'score': np.float32(0.99582714),
  'index': 1,
  'word': 'bill',
  'start': 0,
  'end': 4},
 {'entity': 'I-PER',
  'score': np.float32(0.99281466),
  'index': 2,
  'word': 'gates',
  'start': 5,
  'end': 10},
 {'entity': 'B-ORG',
  'score': np.float32(0.98988754),
  'index': 4,
  'word': 'microsoft',
  'start': 19,
  'end': 28}]

In [None]:
"""
[{'entity': 'B-PER',
  'score': np.float32(0.99582714),
  'index': 1,
  'word': 'bill',
  'start': 0,
  'end': 4},
 {'entity': 'I-PER',
  'score': np.float32(0.99281466),
  'index': 2,
  'word': 'gates',
  'start': 5,
  'end': 10},
 {'entity': 'B-ORG',
  'score': np.float32(0.98988754),
  'index': 4,
  'word': 'microsoft',
  'start': 19,
  'end': 28}]
"""

# here we can see that the given entities are like "B-PER","I-PER","B-ORG"
  #this is a format of tagging named as IOB format
  # I --> inside
  # o --> outside
  # B --> begining

  # bill --> B-PER --> begining of the name of the person
  # gates --> I-PER -->inside the name of the person
  # founded --> this was not tagged as any we can see above, so it is assigned as outside

In [27]:
text = " narayan murthy founded infosys"
ner_transformer(text)
# earlier we saw that model was not able to identify this text but with Transformer , it is able to do

[{'entity': 'B-PER',
  'score': np.float32(0.99548846),
  'index': 1,
  'word': 'narayan',
  'start': 1,
  'end': 8},
 {'entity': 'I-PER',
  'score': np.float32(0.9973943),
  'index': 2,
  'word': 'mu',
  'start': 9,
  'end': 11},
 {'entity': 'I-PER',
  'score': np.float32(0.984729),
  'index': 3,
  'word': '##rth',
  'start': 11,
  'end': 14},
 {'entity': 'I-PER',
  'score': np.float32(0.98603475),
  'index': 4,
  'word': '##y',
  'start': 14,
  'end': 15},
 {'entity': 'B-ORG',
  'score': np.float32(0.99705446),
  'index': 6,
  'word': 'info',
  'start': 24,
  'end': 28},
 {'entity': 'I-ORG',
  'score': np.float32(0.9965019),
  'index': 7,
  'word': '##sy',
  'start': 28,
  'end': 30},
 {'entity': 'I-ORG',
  'score': np.float32(0.9972197),
  'index': 8,
  'word': '##s',
  'start': 30,
  'end': 31}]