In [13]:
from urllib import request
import numpy as np
import re
import nltk
from nltk.stem import PorterStemmer, LancasterStemmer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/amarov/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/amarov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/amarov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/amarov/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/amarov/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

First we divide the articles into individual sentences by splitting the string by ".".

## Text normalisation

### Sentence tokenization


In [14]:
ballet_url = "https://raw.githubusercontent.com/boyko/text-analytics-script/main/data/ballet.txt"
url_handle = request.urlopen(ballet_url)
articles = url_handle.read().decode("utf-8")

In [15]:

naive_sentences = re.split(r"[.!?] ", articles)
sentences = nltk.sent_tokenize(articles)

print(sentences[:10])

['With apologies to James Brown, the hardest working people in show business may well be ballet dancers.', 'And at New York City Ballet, none work harder than the dancers in its lowest rank, the corps de ballet.', 'During the first week of the company’s winter season, Claire Kretzschmar, 24, a rising corps member, danced in all seven performances, appearing in five ballets, sometimes changing costumes at intermission to dance two roles in a night.', 'But her work onstage did not even begin to capture the stamina required to be in the corps.', 'Spending a week shadowing Ms. Kretzschmar was exhausting — she gave new meaning to the idea of being on your feet all day.', 'Twelve-hour days at the David H. Koch Theater, the company’s Lincoln Center home, were hardly unusual: Company class each morning was followed by back-to-back-to-back rehearsals, with occasional breaks for costume fittings or physical therapy, and then by the hair-makeup-costume-dance routine of daily performances.', 'Vide

In [16]:
## Compare with naive split on "."
print(
    f"Number of sentences from sent_tokenize: {len(sentences)}. Number of sentences from naive split: {len(naive_sentences)}."
)

Number of sentences from sent_tokenize: 91. Number of sentences from naive split: 116.



### Word tokenization

Splitting a sentence into tokens (words and punctuation).

In [17]:
words = [nltk.word_tokenize(w) for w in sentences]
words

[['With',
  'apologies',
  'to',
  'James',
  'Brown',
  ',',
  'the',
  'hardest',
  'working',
  'people',
  'in',
  'show',
  'business',
  'may',
  'well',
  'be',
  'ballet',
  'dancers',
  '.'],
 ['And',
  'at',
  'New',
  'York',
  'City',
  'Ballet',
  ',',
  'none',
  'work',
  'harder',
  'than',
  'the',
  'dancers',
  'in',
  'its',
  'lowest',
  'rank',
  ',',
  'the',
  'corps',
  'de',
  'ballet',
  '.'],
 ['During',
  'the',
  'first',
  'week',
  'of',
  'the',
  'company',
  '’',
  's',
  'winter',
  'season',
  ',',
  'Claire',
  'Kretzschmar',
  ',',
  '24',
  ',',
  'a',
  'rising',
  'corps',
  'member',
  ',',
  'danced',
  'in',
  'all',
  'seven',
  'performances',
  ',',
  'appearing',
  'in',
  'five',
  'ballets',
  ',',
  'sometimes',
  'changing',
  'costumes',
  'at',
  'intermission',
  'to',
  'dance',
  'two',
  'roles',
  'in',
  'a',
  'night',
  '.'],
 ['But',
  'her',
  'work',
  'onstage',
  'did',
  'not',
  'even',
  'begin',
  'to',
  'capture'

### Stemming and Lemmatization

The basic independent unit in natural languages are the _morphemes_. These arise by inflecting
a stem with some affix (e.g. prefixes, suffixes)

- *Stemming*: find the stem of the word
- *Lemmatization*: find the base form (lemma) of the word by removing affixes. The root form of the word must be present in the dictionary.

In [18]:
p_stemmer = PorterStemmer()
lc_stemmer = LancasterStemmer()

p_stemmer.stem("strange")
lc_stemmer.stem("went")

'went'

To reduce all words to their base form we can use a lemmatizer. The `nltk` package provides


In order for the lemmatizer to work it requires the POS tags of the words in the sentence.
The `WorldNetLemmatizer`

In [34]:
wn_lemmatizer = nltk.WordNetLemmatizer()
wn_lemmatizer.lemmatize('cars', 'n')

'car'

In [35]:
wn_lemmatizer.lemmatize('running', 'v')

'run'

The word is returned unchanged if no match is found. The default pos tag is "n" (noun).

In [36]:
wn_lemmatizer.lemmatize("gsdgfsdf")

'gsdgfsdf'

## What is WordNet?

WordNet is a manually constructed lexical database that groups words into set of synonyms (synsets). Furthermore, it describes a hierarchical relationships (is part of) between words. 

In [47]:
from nltk.corpus import wordnet as wn

synset = wn.synsets('lion')[0]

print("Name of the synset", synset.name())
print("Meaning of the synset : ", synset.definition())
print("Hypernyms ", synset.hypernyms())

Name of the synset lion.n.01
Meaning of the synset :  large gregarious predatory feline of Africa and India having a tawny coat with a shaggy mane in the male
Hypernyms  [Synset('big_cat.n.01')]


## Part of speech tagging

Part of speech (POS) tagging refers to determining the role of each word within a sentence, e.g.:

1. CD  Cardinal number
    e.g. 1, 20
2. DT  Determiner
    e.g. a/an, the, 2, some, many
3. JJ  Adjective
    e.g. good, big, red
4. JJR Adjective, comparative
5. MD  Modal
    e.g. can, must, may, might, will, would, should
6. NN  Noun, singular or mass
    e.g. day, cat
7. NNS Noun, plural
    e.g. cats, dogs
8. RB  Adverb
    e.g. quickly, silently, well, badly, very, really
9. RBR Adverb, comparative
    e.g. cheaper/more cheaply, slower/more slowly
10. RBS Adverb, superlative
    e.g. fastest, hardest
11. VB  Verb, base form
    e.g. be, have
12. WP  Wh-pronoun
    e.g. I, you, he, she, some
13. WP$ Possessive wh-pronoun
    e.g. mine, his, her, your, yours

for the full list of Penn Treebank POS tags see their [website](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html).

Here we use the default POS tagger from `nltk` to label each word.

In [37]:
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from collections import defaultdict

tag_map = defaultdict(lambda: wn.NOUN, J=wn.ADV, V=wn.VERB, R=wn.ADV)

sentence = "It is better to be hated for what you are than to be loved for what you are not."

tokens = nltk.word_tokenize(sentence)

for token, tag in nltk.pos_tag(tokens):
    lemma = wn_lemmatizer.lemmatize(token)
    print(f"{token:10} {tag:10} {lemma}")

It         PRP        It
is         VBZ        is
better     RBR        better
to         TO         to
be         VB         be
hated      VBN        hated
for        IN         for
what       WP         what
you        PRP        you
are        VBP        are
than       IN         than
to         TO         to
be         VB         be
loved      VBN        loved
for        IN         for
what       WP         what
you        PRP        you
are        VBP        are
not        RB         not
.          .          .


In [30]:
tag_map

defaultdict(<function __main__.<lambda>()>,
            {'J': 'r',
             'V': 'v',
             'R': 'r',
             'P': 'n',
             'T': 'n',
             'I': 'n',
             'W': 'n',
             '.': 'n'})

### Stop words

Natural language contains a lot of words that appear very frequently in all text but that contribute little
to the meaning of the text, for example: "and", "the", etc. Such words are commonly removed from the text in the process of normalization.


In [18]:
stopwords_en = nltk.corpus.stopwords.words("english")
stopwords_en

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

One way to remove the stop words from a list of words is to simply use list comprehension.

In [19]:
words = "A room without books is like a body without a soul. Cicero.".lower()
list_of_words = words.split(" ")

list_of_words_filtered = list([w for w in list_of_words if w not in stopwords_en])
list_of_words_filtered

['room', 'without', 'books', 'like', 'body', 'without', 'soul.', 'cicero.']

In [None]:
import re
import string
from nltk import tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

stemmer = PorterStemmer()
punct_table = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))


def tokenize_text(text):
    normalized_sentences = []
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()

    sents = tokenize.sent_tokenize(text)

    for sent in sents:
        # remove punctuation
        sent = sent.translate(punct_table)
        # strip leading/trailing whitespace
        sent = sent.strip()
        wrds = tokenize.word_tokenize(sent)
        # apply a stemmer
        wrds_without_stopwords = [stemmer.stem(wrd) for wrd in wrds if wrd not in stop_words]
        # new_sentence = ' '.join(filtered)
        normalized_sentences.append(wrds_without_stopwords)

    return normalized_sentences


def tokenize_corpus(corpus):
    sents = []
    for text in corpus:
        sents.extend(tokenize_text(text))

    return sents


## spaCy


In [2]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [17]:
doc = nlp(
    "The U.S. negotiated truce is set to end on November 28th, after four days of quiet meant to facilitate the exchange of 50 Israeli hostages held in Gaza for 150 Palestinian prisoners in Israeli jails.")

In [18]:
for token in doc:
    print(token.text, end=' | ')

The | U.S. | negotiated | truce | is | set | to | end | on | November | 28th | , | after | four | days | of | quiet | meant | to | facilitate | the | exchange | of | 50 | Israeli | hostages | held | in | Gaza | for | 150 | Palestinian | prisoners | in | Israeli | jails | . | 

In [19]:
for token in doc:
    print(f"{token.text:10} {token.pos_:10} {token.lemma_}")

The        DET        the
U.S.       PROPN      U.S.
negotiated VERB       negotiate
truce      NOUN       truce
is         AUX        be
set        VERB       set
to         PART       to
end        VERB       end
on         ADP        on
November   PROPN      November
28th       NOUN       28th
,          PUNCT      ,
after      ADP        after
four       NUM        four
days       NOUN       day
of         ADP        of
quiet      ADJ        quiet
meant      VERB       mean
to         PART       to
facilitate VERB       facilitate
the        DET        the
exchange   NOUN       exchange
of         ADP        of
50         NUM        50
Israeli    ADJ        israeli
hostages   NOUN       hostage
held       VERB       hold
in         ADP        in
Gaza       PROPN      Gaza
for        ADP        for
150        NUM        150
Palestinian ADJ        palestinian
prisoners  NOUN       prisoner
in         ADP        in
Israeli    ADJ        israeli
jails      NOUN       jail
.          PU

In [21]:
# Stopwords

print(nlp.vocab['dog'].is_stop)
print(nlp.vocab['the'].is_stop)
print(doc[0].is_stop)

False
True
True
