In [1]:
# Bootstrap NLTK and fetch the Punkt tokenizer data.
import nltk
# The download is logged below; it lands in the user's nltk_data directory.
nltk.download('punkt')
# Sentence splitter used in a later cell.
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /Users/morgan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# Sample passage used throughout the notebook, laid out one sentence per
# literal. Adjacent string literals are concatenated by the parser, so the
# value is a single string with one space between sentences.
text = (
    "Millions of people across the UK and beyond have celebrated the coronation of King Charles III - a symbolic ceremony combining a religious service and pageantry. "
    "The ceremony was held at Westminster Abbey, with the King becoming the 40th reigning monarch to be crowned there since 1066. "
    "Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace. "
    "Here's how the day of splendour and formality, which featured customs dating back more than 1,000 years, unfolded."
)
text

"Millions of people across the UK and beyond have celebrated the coronation of King Charles III - a symbolic ceremony combining a religious service and pageantry. The ceremony was held at Westminster Abbey, with the King becoming the 40th reigning monarch to be crowned there since 1066. Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace. Here's how the day of splendour and formality, which featured customs dating back more than 1,000 years, unfolded."

In [4]:
# Split the passage into sentences (English is sent_tokenize's default
# language; it is spelled out here), then echo the list as the cell output.
sentences = sent_tokenize(text, language="english")
sentences

['Millions of people across the UK and beyond have celebrated the coronation of King Charles III - a symbolic ceremony combining a religious service and pageantry.',
 'The ceremony was held at Westminster Abbey, with the King becoming the 40th reigning monarch to be crowned there since 1066.',
 'Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace.',
 "Here's how the day of splendour and formality, which featured customs dating back more than 1,000 years, unfolded."]

In [5]:
# Punctuation removal
import re

# Every character that is not an ASCII letter or digit is swapped for a
# single space, so this strips more than punctuation (e.g. accented letters).
# NOTE: this rebinds `text`; from here on it holds only the cleaned third
# sentence, not the full passage.
non_alphanumeric = re.compile(r"[^a-zA-Z0-9]")
text = non_alphanumeric.sub(" ", sentences[2])
text


'Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace '

In [6]:
from nltk.tokenize import word_tokenize

In [7]:
# Break the cleaned sentence held in `text` into word-level tokens
# (English is word_tokenize's default language) and print them.
words = word_tokenize(text, language="english")
print(words)

['Queen', 'Camilla', 'was', 'crowned', 'alongside', 'him', 'before', 'a', 'huge', 'parade', 'back', 'to', 'Buckingham', 'Palace']


In [8]:
# Fetch the stop-word corpus (unzipped under corpora/ per the log below),
# then import the reader that exposes it.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/morgan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
# Stop-word removal.
# Build the English stop-word collection once: evaluating
# stopwords.words("english") inside the comprehension rebuilds the list for
# every token, and a set gives constant-time membership checks.
english_stopwords = set(stopwords.words("english"))

# NLTK's English stop words are lowercase, so compare on the lowercased token;
# otherwise capitalised stop words (e.g. a sentence-initial "The") slip through.
# The surviving tokens keep their original casing.
words = [w for w in words if w.lower() not in english_stopwords]
print(words)

['Queen', 'Camilla', 'crowned', 'alongside', 'huge', 'parade', 'back', 'Buckingham', 'Palace']


In [10]:
# WordNet data for the WordNetLemmatizer used in a later cell.
nltk.download('wordnet') # download for lemmatization
# NOTE(review): omw-1.4 looks like the Open Multilingual WordNet companion
# data for the lemmatizer — confirm it is required.
# As the last expression, this call's return value is echoed (True below).
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /Users/morgan/nltk_data...
[nltk_data] Downloading package omw-1.4 to /Users/morgan/nltk_data...


True

In [11]:
# Stemming
from nltk.stem.porter import PorterStemmer

# Create the stemmer once and reuse it for every token instead of
# constructing a new PorterStemmer per word.
stemmer = PorterStemmer()

# Reduce words to their stems. Stems are lowercased and need not be
# dictionary words (see 'alongsid', 'parad', 'palac' in the output).
stemmed = [stemmer.stem(w) for w in words]
print(stemmed)

['queen', 'camilla', 'crown', 'alongsid', 'huge', 'parad', 'back', 'buckingham', 'palac']


In [13]:
# Lemmatize
from nltk.stem.wordnet import WordNetLemmatizer

# Create the lemmatizer once and reuse it for every token instead of
# constructing a new WordNetLemmatizer per word.
lemmatizer = WordNetLemmatizer()

# Reduce words to their dictionary form. lemmatize() treats each word as a
# noun unless a `pos` argument is given, which is why the verb form 'crowned'
# comes back unchanged in the output.
lemmatized = [lemmatizer.lemmatize(w) for w in words]
print(lemmatized)

['Queen', 'Camilla', 'crowned', 'alongside', 'huge', 'parade', 'back', 'Buckingham', 'Palace']


In [14]:
# Another stemming and lemmatization example
words2 = ['wait', 'waiting' , 'studies', 'studying', 'computers']

# One stemmer and one lemmatizer for the whole cell, rather than a fresh
# object per word.
example_stemmer = PorterStemmer()
example_lemmatizer = WordNetLemmatizer()

# Stemming
# Reduce words to their stems (suffix stripping; results such as 'studi'
# are not necessarily real words).
stemmed = [example_stemmer.stem(w) for w in words2]
print("Stemming output: {}".format(stemmed))

# Lemmatization
# Reduce words to their dictionary form. With no `pos` argument every word is
# treated as a noun, so 'waiting' and 'studying' are returned unchanged while
# the plurals 'studies' and 'computers' are singularised.
lemmatized = [example_lemmatizer.lemmatize(w) for w in words2]
print("Lemmatization output: {}".format(lemmatized))

Stemming output: ['wait', 'wait', 'studi', 'studi', 'comput']
Lemmatization output: ['wait', 'waiting', 'study', 'studying', 'computer']


In [15]:
# Fetch the tagger and chunker models (unzipped under taggers/ and chunkers/
# per the log below); presumably these back pos_tag and ne_chunk in the
# following cells — confirm.
nltk.download('averaged_perceptron_tagger')
# As the last expression, this call's return value is echoed (True below).
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/morgan/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/morgan/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.


True

In [16]:
from nltk import pos_tag

In [17]:
# Part-of-speech tag the stop-word-filtered tokens; the list of
# (token, tag) pairs is shown as the cell output.
# NOTE(review): with the function words removed, 'crowned' is tagged VBD here,
# whereas the full sentence yields VBN in the later NER output — tagging
# before stop-word removal may be preferable.
pos_tag(words)

[('Queen', 'NNP'),
 ('Camilla', 'NNP'),
 ('crowned', 'VBD'),
 ('alongside', 'RB'),
 ('huge', 'JJ'),
 ('parade', 'NN'),
 ('back', 'RB'),
 ('Buckingham', 'NNP'),
 ('Palace', 'NNP')]

In [18]:
# Named-entity chunker entry point used in the following cells.
from nltk import ne_chunk
# Word-list corpus (unzipped under corpora/ per the log below); presumably a
# resource the NE chunker needs — confirm.
nltk.download('words')


[nltk_data] Downloading package words to /Users/morgan/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [19]:
# Named-entity recognition on the original (uncleaned, unfiltered) third
# sentence, one stage at a time: tokenize -> POS-tag -> chunk.
sentence_tokens = word_tokenize(sentences[2])
sentence_tags = pos_tag(sentence_tokens)
ner_tree = ne_chunk(sentence_tags)

# Printing the tree gives the bracketed text form; note in the output that
# 'Buckingham Palace' is labelled PERSON.
print(ner_tree)

(S
  (PERSON Queen/NNP)
  (PERSON Camilla/NNP)
  was/VBD
  crowned/VBN
  alongside/RB
  him/PRP
  before/IN
  a/DT
  huge/JJ
  parade/NN
  back/RB
  to/TO
  (PERSON Buckingham/NNP Palace/NNP)
  ./.)


In [20]:
# `text` was rebound to a single cleaned sentence by the punctuation-removal
# cell, so the full passage is assigned again before running NER on it.
text = "Millions of people across the UK and beyond have celebrated the coronation of King Charles III - a symbolic ceremony combining a religious service and pageantry. The ceremony was held at Westminster Abbey, with the King becoming the 40th reigning monarch to be crowned there since 1066. Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace. Here's how the day of splendour and formality, which featured customs dating back more than 1,000 years, unfolded."

# Tokenize -> POS-tag -> chunk over the whole passage, then print the tree.
passage_tokens = word_tokenize(text)
passage_tags = pos_tag(passage_tokens)
ner_tree = ne_chunk(passage_tags)
print(ner_tree)


(S
  Millions/NNS
  of/IN
  people/NNS
  across/IN
  the/DT
  (ORGANIZATION UK/NNP)
  and/CC
  beyond/IN
  have/VBP
  celebrated/VBN
  the/DT
  coronation/NN
  of/IN
  King/NNP
  (PERSON Charles/NNP III/NNP)
  -/:
  a/DT
  symbolic/JJ
  ceremony/NN
  combining/VBG
  a/DT
  religious/JJ
  service/NN
  and/CC
  pageantry/NN
  ./.
  The/DT
  ceremony/NN
  was/VBD
  held/VBN
  at/IN
  (ORGANIZATION Westminster/NNP Abbey/NNP)
  ,/,
  with/IN
  the/DT
  King/NNP
  becoming/VBG
  the/DT
  40th/CD
  reigning/VBG
  monarch/NN
  to/TO
  be/VB
  crowned/VBN
  there/RB
  since/IN
  1066/CD
  ./.
  (PERSON Queen/NNP Camilla/NNP)
  was/VBD
  crowned/VBN
  alongside/RB
  him/PRP
  before/IN
  a/DT
  huge/JJ
  parade/NN
  back/RB
  to/TO
  (PERSON Buckingham/NNP Palace/NNP)
  ./.
  Here/RB
  's/VBZ
  how/WRB
  the/DT
  day/NN
  of/IN
  splendour/NN
  and/CC
  formality/NN
  ,/,
  which/WDT
  featured/VBD
  customs/NNS
  dating/VBG
  back/RB
  more/JJR
  than/IN
  1,000/CD
  years/NNS
  ,/,
  unfolded/

In [21]:
# NOTE(review): this cell repeats the preceding cell verbatim (same passage,
# same pipeline, same output); it can likely be removed.
text = "Millions of people across the UK and beyond have celebrated the coronation of King Charles III - a symbolic ceremony combining a religious service and pageantry. The ceremony was held at Westminster Abbey, with the King becoming the 40th reigning monarch to be crowned there since 1066. Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace. Here's how the day of splendour and formality, which featured customs dating back more than 1,000 years, unfolded."

# Tokenize -> POS-tag -> chunk over the whole passage, then print the tree.
passage_tokens = word_tokenize(text)
passage_tags = pos_tag(passage_tokens)
ner_tree = ne_chunk(passage_tags)
print(ner_tree)


(S
  Millions/NNS
  of/IN
  people/NNS
  across/IN
  the/DT
  (ORGANIZATION UK/NNP)
  and/CC
  beyond/IN
  have/VBP
  celebrated/VBN
  the/DT
  coronation/NN
  of/IN
  King/NNP
  (PERSON Charles/NNP III/NNP)
  -/:
  a/DT
  symbolic/JJ
  ceremony/NN
  combining/VBG
  a/DT
  religious/JJ
  service/NN
  and/CC
  pageantry/NN
  ./.
  The/DT
  ceremony/NN
  was/VBD
  held/VBN
  at/IN
  (ORGANIZATION Westminster/NNP Abbey/NNP)
  ,/,
  with/IN
  the/DT
  King/NNP
  becoming/VBG
  the/DT
  40th/CD
  reigning/VBG
  monarch/NN
  to/TO
  be/VB
  crowned/VBN
  there/RB
  since/IN
  1066/CD
  ./.
  (PERSON Queen/NNP Camilla/NNP)
  was/VBD
  crowned/VBN
  alongside/RB
  him/PRP
  before/IN
  a/DT
  huge/JJ
  parade/NN
  back/RB
  to/TO
  (PERSON Buckingham/NNP Palace/NNP)
  ./.
  Here/RB
  's/VBZ
  how/WRB
  the/DT
  day/NN
  of/IN
  splendour/NN
  and/CC
  formality/NN
  ,/,
  which/WDT
  featured/VBD
  customs/NNS
  dating/VBG
  back/RB
  more/JJR
  than/IN
  1,000/CD
  years/NNS
  ,/,
  unfolded/