# NLTK

#### Install NLTK

In [1]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#### Download models or corpora

In [2]:
!python -m nltk.downloader

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> l
Packages:
  [ ] abc................. Australian Broadcasting Commission 2006
  [ ] alpino.............. Alpino Dutch Treebank
  [ ] averaged_perceptron_tagger Averaged Perceptron Tagger
  [ ] averaged_perceptron_tagger_ru Averaged Perceptron Tagger (Russian)
  [ ] basque_grammars..... Grammars for Basque
  [ ] biocreative_ppi..... BioCreAtIvE (Critical Assessment of Information
                           Extraction Systems in Biology)
  [ ] bllip_wsj_no_aux.... BLLIP Parser: WSJ Model
  [ ] book_grammars....... Grammars from NLTK Book
  [ ] brown............... Brown Corpus
  [ ] brown_tei........... Brown Corpus (TEI XML Version)
  [ ] cess_cat............ CESS-CAT Treebank
  [

#### Import and use

In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
tweet = "RT @lOR42wsOEFcv3f: I fall too fast, crash too hard, forgive too easily and care too much... :( #amiright"

In [5]:
# query = 'fast'
query = 'are'

In [6]:
tweet.find(query)

77

#### Tokenization

In [7]:
tweet.split()

['RT',
 '@lOR42wsOEFcv3f:',
 'I',
 'fall',
 'too',
 'fast,',
 'crash',
 'too',
 'hard,',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much...',
 ':(',
 '#amiright']

In [8]:
["fast" in tweet.split()]

[False]

In [9]:
nltk.word_tokenize(tweet)

['RT',
 '@',
 'lOR42wsOEFcv3f',
 ':',
 'I',
 'fall',
 'too',
 'fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':',
 '(',
 '#',
 'amiright']

In [10]:
[query in nltk.word_tokenize(tweet)]

[False]

In [11]:
nltk.word_tokenize(tweet, language='spanish')

['RT',
 '@',
 'lOR42wsOEFcv3f',
 ':',
 'I',
 'fall',
 'too',
 'fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':',
 '(',
 '#',
 'amiright']

In [12]:
nltk.word_tokenize("We aren't here")

['We', 'are', "n't", 'here']

In [13]:
from nltk.tokenize import RegexpTokenizer
custom_tokenizer = RegexpTokenizer('[a-zA-Z0-9]*', discard_empty=False)

In [14]:
custom_tokenizer.tokenize(tweet)

['RT',
 '',
 '',
 'lOR42wsOEFcv3f',
 '',
 '',
 'I',
 '',
 'fall',
 '',
 'too',
 '',
 'fast',
 '',
 '',
 'crash',
 '',
 'too',
 '',
 'hard',
 '',
 '',
 'forgive',
 '',
 'too',
 '',
 'easily',
 '',
 'and',
 '',
 'care',
 '',
 'too',
 '',
 'much',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'amiright',
 '']

In [15]:
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

In [16]:
tweet_tokenizer.tokenize(tweet)

['RT',
 ':',
 'I',
 'fall',
 'too',
 'fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(',
 '#amiright']

In [17]:
tweet_tokenizer.tokenize("We aren't here")

['We', "aren't", 'here']

In [18]:
from nltk.tokenize import MWETokenizer
mwe = MWETokenizer()
mwe.add_mwe(('too', 'fast'))
mwe.tokenize(tweet_tokenizer.tokenize(tweet))

['RT',
 ':',
 'I',
 'fall',
 'too_fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(',
 '#amiright']

In [19]:
# add_mwe() registers ONE multi-word expression per call (a tuple/list of word
# strings). Passing a tuple of tuples stores a single expression whose "words"
# are tuples, which can never match string tokens, so each expression is
# registered separately. Re-adding ('too', 'fast') is harmless.
for expression in (('too', 'fast'), ('too', 'hard')):
    mwe.add_mwe(expression)

In [20]:
query = 'fast'
query in mwe.tokenize(tweet_tokenizer.tokenize(tweet))

False

#### Normalization

In [21]:
tweet.lower()

'rt @lor42wsoefcv3f: i fall too fast, crash too hard, forgive too easily and care too much... :( #amiright'

In [22]:
import re
import string

def normalize_tokens(tokenized_text):
    """Lowercase tokens and discard hashtags and punctuation marks.

    Args:
        tokenized_text: iterable of string tokens.

    Returns:
        A new list with the surviving tokens, lowercased, in input order.
    """
    kept = []
    for raw_token in tokenized_text:
        token = raw_token.lower()
        # Hashtags are dropped entirely.
        if token.startswith('#'):
            continue
        # This is a substring test against string.punctuation: single marks
        # (and the empty string) are dropped, while multi-character tokens
        # such as '...' or ':(' are kept.
        if token in string.punctuation:
            continue
        kept.append(token)
    # Two further steps are intentionally left disabled here: keeping only
    # purely alphabetic tokens (regex '^[a-z]+$') and replacing accented
    # characters such as 'á' with 'a'.
    return kept

In [24]:
spanish_query = 'muy rápido'
normalize_tokens(tweet_tokenizer.tokenize(spanish_query))

['muy', 'rápido']

In [25]:
!pip install unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [26]:
import unidecode
unidecode.unidecode(spanish_query)

'muy rapido'

In [27]:
normalize_tokens(tweet_tokenizer.tokenize(tweet))

['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(']

#### Uniform normalization principle

In [28]:
query = 'TOO fast TOO furious'
tokenized_query = tweet_tokenizer.tokenize(query)
normalized_query = normalize_tokens(tokenized_query)
# normalized_query = tokenized_query
normalized_query

['too', 'fast', 'too', 'furious']

In [29]:
normalized_tweet = normalize_tokens(tweet_tokenizer.tokenize(tweet))
# normalized_tweet = normalize_tokens(tweet.split())
normalized_tweet

['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(']

In [30]:
common_words = set(normalized_query).intersection(normalized_tweet)
print(common_words)
print(len(common_words), "common word(s)")

{'fast', 'too'}
2 common word(s)


#### Stopwords

In [31]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [32]:
from nltk.corpus import stopwords
stopwords.words('romanian')

['a',
 'abia',
 'acea',
 'aceasta',
 'această',
 'aceea',
 'aceeasi',
 'acei',
 'aceia',
 'acel',
 'acela',
 'acelasi',
 'acele',
 'acelea',
 'acest',
 'acesta',
 'aceste',
 'acestea',
 'acestei',
 'acestia',
 'acestui',
 'aceşti',
 'aceştia',
 'adica',
 'ai',
 'aia',
 'aibă',
 'aici',
 'al',
 'ala',
 'ale',
 'alea',
 'alt',
 'alta',
 'altceva',
 'altcineva',
 'alte',
 'altfel',
 'alti',
 'altii',
 'altul',
 'am',
 'anume',
 'apoi',
 'ar',
 'are',
 'as',
 'asa',
 'asta',
 'astea',
 'astfel',
 'asupra',
 'atare',
 'atat',
 'atata',
 'atatea',
 'atatia',
 'ati',
 'atit',
 'atita',
 'atitea',
 'atitia',
 'atunci',
 'au',
 'avea',
 'avem',
 'aveţi',
 'avut',
 'aş',
 'aţi',
 'ba',
 'ca',
 'cam',
 'cand',
 'care',
 'careia',
 'carora',
 'caruia',
 'cat',
 'catre',
 'ce',
 'cea',
 'ceea',
 'cei',
 'ceilalti',
 'cel',
 'cele',
 'celor',
 'ceva',
 'chiar',
 'ci',
 'cind',
 'cine',
 'cineva',
 'cit',
 'cita',
 'cite',
 'citeva',
 'citi',
 'citiva',
 'cu',
 'cui',
 'cum',
 'cumva',
 'cât',
 'câte

In [33]:
blacklist_words = stopwords.words('english') + ['rt']

In [34]:
cleaned_tweet = [t for t in normalized_tweet if t not in blacklist_words]
print(cleaned_tweet)

['fall', 'fast', 'crash', 'hard', 'forgive', 'easily', 'care', 'much', '...', ':(']


#### Stemming / Lemmatization


In [35]:
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [36]:
# WordNet and OpenMultilingualWordnet necessary for lemmatization
nltk.download('wordnet')
nltk.download('omw-1.4')
# Trained tagger needed for POS-tagging:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [37]:
stemmer = PorterStemmer()

[stemmer.stem(t) for t in cleaned_tweet]

['fall',
 'fast',
 'crash',
 'hard',
 'forgiv',
 'easili',
 'care',
 'much',
 '...',
 ':(']

In [38]:
stemmer = SnowballStemmer(language='english')

[stemmer.stem(t) for t in cleaned_tweet]

['fall',
 'fast',
 'crash',
 'hard',
 'forgiv',
 'easili',
 'care',
 'much',
 '...',
 ':(']

In [39]:
lemmatizer = WordNetLemmatizer()

[lemmatizer.lemmatize(t) for t in cleaned_tweet]

['fall',
 'fast',
 'crash',
 'hard',
 'forgive',
 'easily',
 'care',
 'much',
 '...',
 ':(']

In [40]:
tagged_tweet = nltk.pos_tag(cleaned_tweet)
print(tagged_tweet)

[('fall', 'NN'), ('fast', 'RB'), ('crash', 'JJ'), ('hard', 'JJ'), ('forgive', 'NN'), ('easily', 'RB'), ('care', 'VB'), ('much', 'JJ'), ('...', ':'), (':(', 'NN')]


In [41]:
from nltk.corpus import wordnet as wn
tag_map = {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV, 'N': wn.NOUN}
def get_lemmas(tokenized_text):
    """Lemmatize tokens, using each token's POS tag to guide the lemmatizer.

    Args:
        tokenized_text: list of string tokens to tag and lemmatize.

    Returns:
        List of lemmas, one per input token, in the same order.
    """
    lemmas = []
    for word, pos_tag in nltk.pos_tag(tokenized_text):
        # The first letter of the tag selects the WordNet part of speech via
        # tag_map; tags with no entry fall back to noun.
        wordnet_pos = tag_map.get(pos_tag[0], wn.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, pos=wordnet_pos))
    return lemmas


In [42]:
query = "the fastest!"
normalized_query = normalize_tokens(tweet_tokenizer.tokenize(query))
print(normalized_query)

['the', 'fastest']


In [43]:
lemmatized_tweet = get_lemmas(normalized_tweet)
lemmatized_query = get_lemmas(normalized_query)
print(lemmatized_tweet)
print(lemmatized_query)

['rt', 'i', 'fall', 'too', 'fast', 'crash', 'too', 'hard', 'forgive', 'too', 'easily', 'and', 'care', 'too', 'much', '...', ':(']
['the', 'fast']


In [44]:
tweet = "I am so fast, I am the fastest!"
normalized_tweet = normalize_tokens(tweet_tokenizer.tokenize(tweet))
# normalized_tweet
# [lemmatizer.lemmatize(t) for t in normalized_tweet]
get_lemmas(normalized_tweet)

['i', 'be', 'so', 'fast', 'i', 'be', 'the', 'fast']

In [45]:
print("Common words:", set(lemmatized_tweet).intersection(set(lemmatized_query)))

Common words: {'fast'}


In [46]:
get_lemmas(cleaned_tweet)

['fall',
 'fast',
 'crash',
 'hard',
 'forgive',
 'easily',
 'care',
 'much',
 '...',
 ':(']

#### Vocabulary

In [47]:
from collections import Counter

Counter(get_lemmas(normalized_tweet)).most_common(5)

[('i', 2), ('be', 2), ('fast', 2), ('so', 1), ('the', 1)]

In [48]:
tweet = "I am so fast, I am the fastest!"
normalized_tweet = normalize_tokens(tweet_tokenizer.tokenize(tweet))
lemmatized_tweet = get_lemmas(normalized_tweet)
print(lemmatized_tweet)

['i', 'be', 'so', 'fast', 'i', 'be', 'the', 'fast']


In [49]:
print(Counter(normalized_tweet))
print(Counter(lemmatized_tweet))

Counter({'i': 2, 'am': 2, 'so': 1, 'fast': 1, 'the': 1, 'fastest': 1})
Counter({'i': 2, 'be': 2, 'fast': 2, 'so': 1, 'the': 1})


#### Sentence segmentation

In [50]:
query = "I am too fast. I am too furious."

In [51]:
from nltk.tokenize import sent_tokenize

In [52]:
sent_tokenize(query)

['I am too fast.', 'I am too furious.']

In [53]:
spanish_tokenizer = nltk.data.load('tokenizers/punkt/PY3/spanish.pickle')
spanish_query = 'Soy muy rápido! Estoy muy furioso!'
spanish_tokenizer.tokenize(spanish_query)

['Soy muy rápido!', 'Estoy muy furioso!']

In [54]:
sent_tokenize("J.K. Rowling is rich. I am not as rich as J.K.")

['J.K. Rowling is rich.', 'I am not as rich as J.K.']

In [55]:
from nltk.tokenize import PunktSentenceTokenizer
PunktSentenceTokenizer??

#### Numericalization

In [56]:
get_lemmas(normalize_tokens(custom_tokenizer.tokenize("STAY TUNED!")))

['stay', 'tune']

# Exercises

Find a tweet / text posted recently on social media in a language other than English. Write a function `normalize_tokens()` that receives a list of tokens as input and normalizes them appropriately for the input text (e.g. handling special characters, stopwords for the chosen language, etc.).

Split the text into sentences, tokenize it using one of the tokenizers introduced above, and then apply the function you implemented. Did the provided tools offer enough support? Which problems did you encounter for the specific language?

In [62]:
french_tweet = "Manifester chez les gens, c'est mal. " \
                + "Eh bien, les Français jettent des éviers de cuisine par la fenêtre de leur président quand " \
                + "ils augmentent le prix de l'essence de 10 cents, et ils ont aussi des soins de santé gratuits, " \
                + "alors je ne suis pas d'accord."

In [63]:
sentences = nltk.tokenize.sent_tokenize(french_tweet, language='french')
sentences

["Manifester chez les gens, c'est mal.",
 "Eh bien, les Français jettent des éviers de cuisine par la fenêtre de leur président quand ils augmentent le prix de l'essence de 10 cents, et ils ont aussi des soins de santé gratuits, alors je ne suis pas d'accord."]

In [65]:
text_tokenized = nltk.tokenize.word_tokenize(french_tweet, language='french')
text_tokenized

['Manifester',
 'chez',
 'les',
 'gens',
 ',',
 "c'est",
 'mal',
 '.',
 'Eh',
 'bien',
 ',',
 'les',
 'Français',
 'jettent',
 'des',
 'éviers',
 'de',
 'cuisine',
 'par',
 'la',
 'fenêtre',
 'de',
 'leur',
 'président',
 'quand',
 'ils',
 'augmentent',
 'le',
 'prix',
 'de',
 "l'essence",
 'de',
 '10',
 'cents',
 ',',
 'et',
 'ils',
 'ont',
 'aussi',
 'des',
 'soins',
 'de',
 'santé',
 'gratuits',
 ',',
 'alors',
 'je',
 'ne',
 'suis',
 'pas',
 "d'accord",
 '.']

In [67]:
french_stopwords = stopwords.words('french')
french_stopwords

['au',
 'aux',
 'avec',
 'ce',
 'ces',
 'dans',
 'de',
 'des',
 'du',
 'elle',
 'en',
 'et',
 'eux',
 'il',
 'ils',
 'je',
 'la',
 'le',
 'les',
 'leur',
 'lui',
 'ma',
 'mais',
 'me',
 'même',
 'mes',
 'moi',
 'mon',
 'ne',
 'nos',
 'notre',
 'nous',
 'on',
 'ou',
 'par',
 'pas',
 'pour',
 'qu',
 'que',
 'qui',
 'sa',
 'se',
 'ses',
 'son',
 'sur',
 'ta',
 'te',
 'tes',
 'toi',
 'ton',
 'tu',
 'un',
 'une',
 'vos',
 'votre',
 'vous',
 'c',
 'd',
 'j',
 'l',
 'à',
 'm',
 'n',
 's',
 't',
 'y',
 'été',
 'étée',
 'étées',
 'étés',
 'étant',
 'étante',
 'étants',
 'étantes',
 'suis',
 'es',
 'est',
 'sommes',
 'êtes',
 'sont',
 'serai',
 'seras',
 'sera',
 'serons',
 'serez',
 'seront',
 'serais',
 'serait',
 'serions',
 'seriez',
 'seraient',
 'étais',
 'était',
 'étions',
 'étiez',
 'étaient',
 'fus',
 'fut',
 'fûmes',
 'fûtes',
 'furent',
 'sois',
 'soit',
 'soyons',
 'soyez',
 'soient',
 'fusse',
 'fusses',
 'fût',
 'fussions',
 'fussiez',
 'fussent',
 'ayant',
 'ayante',
 'ayantes',


In [70]:
import unicodedata

def normalize_tokens(tokenized_text, stopwords=None):
  """Normalize tokens: lowercase, drop punctuation and stopwords, strip accents.

  Args:
    tokenized_text: iterable of string tokens.
    stopwords: optional collection of lowercase words to discard. Matching is
      done on the lowercased token *before* accents are stripped, so accented
      stopwords (e.g. 'été') must be supplied in their accented form.

  Returns:
    List of normalized ASCII-only tokens, in input order. Tokens that have no
    ASCII representation at all (e.g. the guillemets '«' and '»') are dropped
    instead of being returned as empty strings.
  """
  # None replaces the mutable [] default; a set gives O(1) membership tests
  # instead of scanning the stopword list once per token.
  stopword_set = set(stopwords) if stopwords is not None else set()

  tokens = []
  for raw_token in tokenized_text:
    token = raw_token.lower()
    # Substring test against string.punctuation: removes single punctuation
    # marks (and empty tokens) but keeps multi-character tokens like '...'.
    if token in string.punctuation or token in stopword_set:
      continue
    # NFKD splits an accented letter into base letter + combining mark; the
    # ASCII round-trip then discards the marks ('é' -> 'e').
    folded = unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('ascii')
    # Symbols without an ASCII decomposition fold to '' -- skip them so the
    # result contains no empty tokens.
    if folded:
      tokens.append(folded)

  return tokens

normalized_text = ' '.join(normalize_tokens(text_tokenized, french_stopwords))
print('Original text: ', french_tweet)
print('Preprocessed text: ', normalized_text)

Original text:  Manifester chez les gens, c'est mal. Eh bien, les Français jettent des éviers de cuisine par la fenêtre de leur président quand ils augmentent le prix de l'essence de 10 cents, et ils ont aussi des soins de santé gratuits, alors je ne suis pas d'accord.
Preprocessed text:  manifester chez gens , c'est mal . eh bien , francais jettent eviers cuisine fenetre president quand augmentent prix l'essence 10 cents , aussi soins sante gratuits , alors d'accord .


There are several low-resource languages, such as French, that lack adequate resources for NLP approaches. This is due to the fact that languages other than English often lack curated linguistic resources, which can only be developed by specialists and native speakers, and to the fact that building such resources takes a huge amount of annotated data, which is frequently expensive to acquire.