<a href="https://colab.research.google.com/github/deoprakash/NLP_Tutorial/blob/main/PartOfSpeechTaggingNLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [10]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [12]:
doc = nlp("Elon flew to mars yesterday. He carried biryani masala with him.")

for token in doc:
  print(token,"|", token.pos_,"|", token.lemma_, "|", spacy.explain(token.pos_),"|", spacy.explain(token.tag_))

Elon | PROPN | Elon | proper noun | noun, proper singular
flew | VERB | fly | verb | verb, past tense
to | ADP | to | adposition | conjunction, subordinating or preposition
mars | NOUN | mar | noun | noun, plural
yesterday | NOUN | yesterday | noun | noun, singular or mass
. | PUNCT | . | punctuation | punctuation mark, sentence closer
He | PRON | he | pronoun | pronoun, personal
carried | VERB | carry | verb | verb, past tense
biryani | ADJ | biryani | adjective | adjective (English), other noun-modifier (Chinese)
masala | NOUN | masala | noun | noun, singular or mass
with | ADP | with | adposition | conjunction, subordinating or preposition
him | PRON | he | pronoun | pronoun, personal
. | PUNCT | . | punctuation | punctuation mark, sentence closer


In [13]:
earning_report = '''Microsoft Corp. today announced the following results for the quarter ended December 31, 2024, as compared to the corresponding period of last fiscal year:

·         Revenue was $69.6 billion and increased 12%

·         Operating income was $31.7 billion and increased 17% (up 16% in constant currency)

·         Net income was $24.1 billion and increased 10%

·         Diluted earnings per share was $3.23 and increased 10%

“This quarter Microsoft Cloud revenue was $40.9 billion, up 21% year-over-year,” said Amy Hood, executive vice president and chief financial officer of Microsoft. ”We remain committed to balancing operational discipline with continued investments in our cloud and AI infrastructure.”

'''

In [15]:
doc = nlp(earning_report)

for token in doc:
  print(token,"|", token.pos_,"|", token.lemma_, "|", spacy.explain(token.pos_),"|", spacy.explain(token.tag_))

Microsoft | PROPN | Microsoft | proper noun | noun, proper singular
Corp. | PROPN | Corp. | proper noun | noun, proper singular
today | NOUN | today | noun | noun, singular or mass
announced | VERB | announce | verb | verb, past tense
the | DET | the | determiner | determiner
following | VERB | follow | verb | verb, gerund or present participle
results | NOUN | result | noun | noun, plural
for | ADP | for | adposition | conjunction, subordinating or preposition
the | DET | the | determiner | determiner
quarter | NOUN | quarter | noun | noun, singular or mass
ended | VERB | end | verb | verb, past tense
December | PROPN | December | proper noun | noun, proper singular
31 | NUM | 31 | numeral | cardinal number
, | PUNCT | , | punctuation | punctuation mark, comma
2024 | NUM | 2024 | numeral | cardinal number
, | PUNCT | , | punctuation | punctuation mark, comma
as | SCONJ | as | subordinating conjunction | conjunction, subordinating or preposition
compared | VERB | compare | verb | verb,

In [17]:
filtered_token = []

for token in doc:
  if token.pos_ not in ["SPACE", "PUNCT"]:
    filtered_token.append(token)

In [18]:
filtered_token

[Microsoft,
 Corp.,
 today,
 announced,
 the,
 following,
 results,
 for,
 the,
 quarter,
 ended,
 December,
 31,
 2024,
 as,
 compared,
 to,
 the,
 corresponding,
 period,
 of,
 last,
 fiscal,
 year,
 Revenue,
 was,
 $,
 69.6,
 billion,
 and,
 increased,
 12,
 %,
 Operating,
 income,
 was,
 $,
 31.7,
 billion,
 and,
 increased,
 17,
 %,
 up,
 16,
 %,
 in,
 constant,
 currency,
 Net,
 income,
 was,
 $,
 24.1,
 billion,
 and,
 increased,
 10,
 %,
 Diluted,
 earnings,
 per,
 share,
 was,
 $,
 3.23,
 and,
 increased,
 10,
 %,
 This,
 quarter,
 Microsoft,
 Cloud,
 revenue,
 was,
 $,
 40.9,
 billion,
 up,
 21,
 %,
 year,
 over,
 year,
 said,
 Amy,
 Hood,
 executive,
 vice,
 president,
 and,
 chief,
 financial,
 officer,
 of,
 Microsoft,
 We,
 remain,
 committed,
 to,
 balancing,
 operational,
 discipline,
 with,
 continued,
 investments,
 in,
 our,
 cloud,
 and,
 AI,
 infrastructure]

In [19]:
count = doc.count_by(spacy.attrs.POS)
count

{96: 9,
 92: 28,
 100: 14,
 90: 4,
 85: 10,
 93: 17,
 97: 20,
 98: 1,
 84: 10,
 103: 10,
 87: 5,
 99: 5,
 89: 6,
 86: 2,
 95: 2}

In [22]:
doc.vocab[99].text

'SYM'

In [23]:
for k, v in count.items():
  print(doc.vocab[k].text, "|", v)

PROPN | 9
NOUN | 28
VERB | 14
DET | 4
ADP | 10
NUM | 17
PUNCT | 20
SCONJ | 1
ADJ | 10
SPACE | 10
AUX | 5
SYM | 5
CCONJ | 6
ADV | 2
PRON | 2


# **Exercise**

In [26]:
news_text = ''' 'Inflation rose again in April, continuing a climb that has pushed consumers to the brink and is threatening the economic expansion, the Bureau of Labor Statistics reported Wednesday.\n\nThe consumer price index, a broad-based measure of prices for goods and services, increased 8.3% from a year ago, higher than the Dow Jones estimate for an 8.1% gain. That represented a slight ease from Marchâ€™s peak but was still close to the highest level since the summer of 1982.\n\nRemoving volatile food and ene'''


In [28]:
doc = nlp(news_text)
numeral_token = []
noun_token = []

for token in doc:
  if token.pos_ == 'NOUN':
    noun_token.append(token)
  elif token.pos_ == 'NUM':
    numeral_token.append(token)

In [29]:
numeral_token

[8.3, 8.1, 1982]

In [30]:
noun_token

[Inflation,
 climb,
 consumers,
 brink,
 expansion,
 consumer,
 price,
 index,
 measure,
 prices,
 goods,
 services,
 %,
 year,
 estimate,
 %,
 gain,
 ease,
 Marchâ€,
 ™,
 peak,
 level,
 summer,
 food,
 ene]

In [31]:
count = doc.count_by(spacy.attrs.POS)
count

{103: 3,
 97: 10,
 92: 25,
 100: 9,
 86: 4,
 85: 11,
 96: 7,
 90: 12,
 95: 2,
 87: 3,
 89: 4,
 84: 6,
 93: 3,
 94: 1,
 98: 1}

In [32]:
for k,v in count.items():
  print(doc.vocab[k].text, "|",v )

SPACE | 3
PUNCT | 10
NOUN | 25
VERB | 9
ADV | 4
ADP | 11
PROPN | 7
DET | 12
PRON | 2
AUX | 3
CCONJ | 4
ADJ | 6
NUM | 3
PART | 1
SCONJ | 1
