# NLTK

#### Install NLTK

In [1]:
%%bash
pip install nltk



#### Download models or corpora

In [3]:
%%bash
import nltk
python -m nltk.downloader # shows a window when graphical output available

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> 

bash: line 1: import: command not found
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/nltk/downloader.py", line 982, in _interactive_download
    DownloaderGUI(self).mainloop()
  File "/usr/local/lib/python3.7/dist-packages/nltk/downloader.py", line 1226, in __init__
    top = self.top = Tk()
  File "/usr/lib/python3.7/tkinter/__init__.py", line 2023, in __init__
    self.tk = _tkinter.create(screenName, baseName, className, interactive, wantobjects, useTk, sync, use)
_tkinter.TclError: no display name and no $DISPLAY environment variable

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.7/dist-packages/nltk/downloader.py", line 2278, in <module>
    halt_on_error=options.halt_on_erro

In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#### Tokenization

In [5]:
tweet = "RT @lOR42wsOEFcv3f: I fall too fast, crash too hard, forgive too easily and care too much... :( #amiright"

In [6]:
query = 'fast'

The naive way...

In [7]:
tweet.find(query)

31

In [8]:
tweet.split()

['RT',
 '@lOR42wsOEFcv3f:',
 'I',
 'fall',
 'too',
 'fast,',
 'crash',
 'too',
 'hard,',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much...',
 ':(',
 '#amiright']

In [11]:
[query in tweet.split()]

[False]

Correct tokenization: informed splitting of the text into tokens

In [12]:
nltk.word_tokenize(tweet)

['RT',
 '@',
 'lOR42wsOEFcv3f',
 ':',
 'I',
 'fall',
 'too',
 'fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':',
 '(',
 '#',
 'amiright']

In [13]:
[query in nltk.word_tokenize(tweet)]
# query

[True]

In [14]:
nltk.word_tokenize(tweet, language='spanish')

['RT',
 '@',
 'lOR42wsOEFcv3f',
 ':',
 'I',
 'fall',
 'too',
 'fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':',
 '(',
 '#',
 'amiright']

More options...

In [15]:
from nltk.tokenize import RegexpTokenizer
custom_tokenizer = RegexpTokenizer('[a-zA-Z0-9]*', discard_empty=False)

In [16]:
custom_tokenizer.tokenize(tweet)

['RT',
 '',
 '',
 'lOR42wsOEFcv3f',
 '',
 '',
 'I',
 '',
 'fall',
 '',
 'too',
 '',
 'fast',
 '',
 '',
 'crash',
 '',
 'too',
 '',
 'hard',
 '',
 '',
 'forgive',
 '',
 'too',
 '',
 'easily',
 '',
 'and',
 '',
 'care',
 '',
 'too',
 '',
 'much',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'amiright',
 '']

In [17]:
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

In [18]:
tweet_tokenizer.tokenize(tweet)

['RT',
 ':',
 'I',
 'fall',
 'too',
 'fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(',
 '#amiright']

In [19]:
from nltk.tokenize import MWETokenizer
mwe = MWETokenizer()
mwe.add_mwe(('too', 'fast'))
mwe.tokenize(tweet_tokenizer.tokenize(tweet))

['RT',
 ':',
 'I',
 'fall',
 'too_fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(',
 '#amiright']

In [20]:
mwe.add_mwe((('too', 'fast'), ('too', 'hard')))

In [21]:
query = 'fast'
query in mwe.tokenize(tweet_tokenizer.tokenize(tweet))

False

### Normalization

In [22]:
tweet.lower()

'rt @lor42wsoefcv3f: i fall too fast, crash too hard, forgive too easily and care too much... :( #amiright'

In [23]:
import re
import string

def normalize_tokens(tokenized_text):
    # Lowercase
    tokens = [t.lower() for t in tokenized_text]
    # Remove hashtags
    tokens = [t for t in tokens if not t.startswith('#')]
    # Remove punctuation
    tokens = [t for t in tokens if t not in string.punctuation]
    # Keep only letters
#     tokens = [t for t in tokens if re.match('^[a-z]+$', t)]
    # Normalize characters
#     tokens = [re.sub('á', 'a', t) for t in tokens]

    return tokens

In [24]:
spanish_query = 'muy rápido'
normalize_tokens(tweet_tokenizer.tokenize(spanish_query))

['muy', 'rápido']

In [25]:
!pip install unidecode
import unidecode
unidecode.unidecode(spanish_query)

Collecting unidecode
  Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)
[?25l[K     |█▍                              | 10 kB 28.3 MB/s eta 0:00:01[K     |██▉                             | 20 kB 29.4 MB/s eta 0:00:01[K     |████▏                           | 30 kB 19.5 MB/s eta 0:00:01[K     |█████▋                          | 40 kB 12.6 MB/s eta 0:00:01[K     |███████                         | 51 kB 5.4 MB/s eta 0:00:01[K     |████████▍                       | 61 kB 6.3 MB/s eta 0:00:01[K     |█████████▊                      | 71 kB 7.1 MB/s eta 0:00:01[K     |███████████▏                    | 81 kB 5.7 MB/s eta 0:00:01[K     |████████████▌                   | 92 kB 6.4 MB/s eta 0:00:01[K     |██████████████                  | 102 kB 6.9 MB/s eta 0:00:01[K     |███████████████▎                | 112 kB 6.9 MB/s eta 0:00:01[K     |████████████████▊               | 122 kB 6.9 MB/s eta 0:00:01[K     |██████████████████              | 133 kB 6.9 MB/s eta 0:00:0

'muy rapido'

In [26]:
normalize_tokens(tweet_tokenizer.tokenize(tweet))

['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(']

#### Uniform normalization principle

In [27]:
query = 'TOO fast TOO furious'
tokenized_query = tweet_tokenizer.tokenize(query)
normalized_query = normalize_tokens(tokenized_query)
# normalized_query = tokenized_query
normalized_query

['too', 'fast', 'too', 'furious']

In [28]:
normalized_tweet = normalize_tokens(tweet_tokenizer.tokenize(tweet))
# normalized_tweet = normalize_tokens(tweet.split())
normalized_tweet

['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(']

In [29]:
common_words = set(normalized_query).intersection(normalized_tweet)
print(common_words)
print(len(common_words), "common word(s)")

{'too', 'fast'}
2 common word(s)


#### Stemming / Lemmatization


In [30]:
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer


In [31]:
stemmer = PorterStemmer()

[stemmer.stem(t) for t in normalized_tweet]

['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgiv',
 'too',
 'easili',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(']

In [32]:
stemmer = nltk.LancasterStemmer() # is prone to overstemming
[stemmer.stem(t) for t in normalized_tweet]


['rt',
 'i',
 'fal',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forg',
 'too',
 'easy',
 'and',
 'car',
 'too',
 'much',
 '...',
 ':(']

In [33]:
stemmer = SnowballStemmer(language='english') # Porter2

[stemmer.stem(t) for t in normalized_tweet]

['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgiv',
 'too',
 'easili',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(']

In [34]:
print(stemmer.stem("running"))

print(stemmer.stem("runs"))

print(stemmer.stem("ran"))

print(stemmer.stem("darling"))

print(stemmer.stem("are"))

print(stemmer.stem("bring"))

print(stemmer.stem("being"))

print(stemmer.stem("Charles"))


run
run
ran
darl
are
bring
be
charl


In [35]:
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

[lemmatizer.lemmatize(t) for t in normalized_tweet]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(']

In [36]:
nltk.download('averaged_perceptron_tagger')


tagged_tweet = nltk.pos_tag(normalized_tweet)
print(tagged_tweet)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[('rt', 'NN'), ('i', 'NN'), ('fall', 'VBP'), ('too', 'RB'), ('fast', 'JJ'), ('crash', 'NN'), ('too', 'RB'), ('hard', 'JJ'), ('forgive', 'JJ'), ('too', 'RB'), ('easily', 'RB'), ('and', 'CC'), ('care', 'VB'), ('too', 'RB'), ('much', 'JJ'), ('...', ':'), (':(', 'NN')]


In [37]:
from nltk.corpus import wordnet as wn
tag_map = {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV, 'N': wn.NOUN}
def get_lemmas(tokenized_text):
    tagged_text = nltk.pos_tag(tokenized_text)
    return [lemmatizer.lemmatize(w, pos=tag_map.get(p[0], wn.NOUN)) for (w, p) in tagged_text]


In [38]:
query = "the fastest!"
normalized_query = normalize_tokens(tweet_tokenizer.tokenize(query))
print(normalized_query)

['the', 'fastest']


In [39]:
lemmatized_tweet = get_lemmas(normalized_tweet)
lemmatized_query = get_lemmas(normalized_query)
print(lemmatized_tweet)
print(lemmatized_query)


['rt', 'i', 'fall', 'too', 'fast', 'crash', 'too', 'hard', 'forgive', 'too', 'easily', 'and', 'care', 'too', 'much', '...', ':(']
['the', 'fast']


In [40]:
tweet = "I am so fast, I am the fastest!"
normalized_tweet = normalize_tokens(tweet_tokenizer.tokenize(tweet))
normalized_tweet


['i', 'am', 'so', 'fast', 'i', 'am', 'the', 'fastest']

In [41]:
[lemmatizer.lemmatize(t) for t in normalized_tweet]


['i', 'am', 'so', 'fast', 'i', 'am', 'the', 'fastest']

In [42]:
get_lemmas(normalized_tweet)

['i', 'be', 'so', 'fast', 'i', 'be', 'the', 'fast']

In [43]:
print("Common words:", set(lemmatized_tweet).intersection(set(lemmatized_query)))

Common words: {'fast'}


#### Stopwords

In [44]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [45]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [46]:
blacklist_words = stopwords.words('english') + ['rt']

In [47]:
cleaned_tweet = [t for t in normalized_tweet if t not in blacklist_words]
print(cleaned_tweet)

['fast', 'fastest']


#### Vocabulary

In [48]:
from collections import Counter

Counter(get_lemmas(normalized_tweet)).most_common(5)

[('i', 2), ('be', 2), ('fast', 2), ('so', 1), ('the', 1)]

In [49]:
tweet = "I am so fast, I am the fastest!"
normalized_tweet = normalize_tokens(tweet_tokenizer.tokenize(tweet))
lemmatized_tweet = get_lemmas(normalized_tweet)
print(lemmatized_tweet)

['i', 'be', 'so', 'fast', 'i', 'be', 'the', 'fast']


In [50]:
print(Counter(normalized_tweet))
print(Counter(lemmatized_tweet))

Counter({'i': 2, 'am': 2, 'so': 1, 'fast': 1, 'the': 1, 'fastest': 1})
Counter({'i': 2, 'be': 2, 'fast': 2, 'so': 1, 'the': 1})


#### Sentence segmentation

In [52]:
query = "I am too fast. I am too furious."

In [53]:
from nltk.tokenize import sent_tokenize

In [54]:
sent_tokenize(query)

['I am too fast.', 'I am too furious.']

In [55]:
spanish_tokenizer = nltk.data.load('tokenizers/punkt/PY3/spanish.pickle')
spanish_query = 'Soy muy rápido! Estoy muy furioso!'
spanish_tokenizer.tokenize(spanish_query)

['Soy muy rápido!', 'Estoy muy furioso!']

In [56]:
sent_tokenize("J.K. Rowling is rich. I am not as rich as J.K.")

['J.K. Rowling is rich.', 'I am not as rich as J.K.']

In [57]:
from nltk.tokenize import PunktSentenceTokenizer
PunktSentenceTokenizer??

#### Numeral conversion

In [58]:
!pip install word2number
!pip install num2word

Collecting word2number
  Downloading word2number-1.1.zip (9.7 kB)
Building wheels for collected packages: word2number
  Building wheel for word2number (setup.py) ... [?25l[?25hdone
  Created wheel for word2number: filename=word2number-1.1-py3-none-any.whl size=5582 sha256=e04c813555acf342b58dd89f5a46806f61679fda98124a00071a9c628437aab5
  Stored in directory: /root/.cache/pip/wheels/4b/c3/77/a5f48aeb0d3efb7cd5ad61cbd3da30bbf9ffc9662b07c9f879
Successfully built word2number
Installing collected packages: word2number
Successfully installed word2number-1.1
Collecting num2word
  Downloading num2word-1.0.1-py3-none-any.whl (9.5 kB)
Installing collected packages: num2word
Successfully installed num2word-1.0.1


In [64]:
import word2number
from word2number import w2n
w2n.word_to_num("eleven")

11

In [65]:
w2n.word_to_num("twenty three")

23

In [70]:
!pip install num2words

import num2words
num2words.num2words(12)

Collecting num2words
  Downloading num2words-0.5.10-py3-none-any.whl (101 kB)
[?25l[K     |███▎                            | 10 kB 30.6 MB/s eta 0:00:01[K     |██████▌                         | 20 kB 30.4 MB/s eta 0:00:01[K     |█████████▊                      | 30 kB 13.7 MB/s eta 0:00:01[K     |█████████████                   | 40 kB 10.7 MB/s eta 0:00:01[K     |████████████████▏               | 51 kB 5.9 MB/s eta 0:00:01[K     |███████████████████▍            | 61 kB 7.0 MB/s eta 0:00:01[K     |██████████████████████▋         | 71 kB 7.6 MB/s eta 0:00:01[K     |█████████████████████████▉      | 81 kB 6.2 MB/s eta 0:00:01[K     |█████████████████████████████   | 92 kB 6.9 MB/s eta 0:00:01[K     |████████████████████████████████| 101 kB 5.1 MB/s 
Installing collected packages: num2words
Successfully installed num2words-0.5.10


'twelve'

In [71]:
num2words.num2words(101)

'one hundred and one'

In [73]:
num2words.num2words(2020)

'two thousand and twenty'

In [74]:
w2n.word_to_num("Twelve o'clock!")

12

### Exercise

Find a recent news article online.
Read it in a python variable (input it manually or read from a file).

Write a function that normalizes the text and splits it into tokens. Add flags to customize the different preprocessing choices (which stemmer/lemmatizer to use, whether to lowercase, whether to convert numbers, whether to remove stopwords, ...). 

Store the vocabulary of unique tokens found in the text.

Compare the number of unique tokens ("types") with different preprocessing settings.


The data is pre-processed in multiple steps as follows:

*   Emoticons and emojis are replaced with the corresponding words.
*   The URL addresses are replaced by the <URL> token. Hyperlinks and HTML tags are removed, as well as the old-style for highlighting redistributed tweets
*   We eliminate all email addresses.
*   The text is decoded and then normalized, i.e., the data is transformed from complex symbols into simple characters. Characters can be subjected to various forms of encoding, such as Latin, ISO/IEC 8859-1, etc. Therefore, for better analysis, it is necessary to keep the data in a standard encoding format. For this requirement, we choose UTF-8 encoding because it is widely accepted and often recommended.
*   Bounded words are separated by inserting a space if the user wants to keep them, otherwise they will be deleted along with the hashtags. Most posts on social networks such as Facebook, Twitter, or Instagram contain one or more words without spaces and are preceded by the # sign such as #MentalHealthAwarenessWeek or #BeautifulDay, called a hashtag. A hashtag is a tag that makes it easy to find posts in a specific category or with certain content. Therefore, the words in the hashtags provide essential information about the general feeling of the user, predominant topic etc. 
*   Lower case
*   Tokenization
*   Any letter repeated more than three times in a row is replaced by two repetitions of the same letter as the usual rules of English spelling forbid triple letters (for example "cooool" is replaced by "cool").
*   The user can decide whether the stopwords should be removed or not.
*   The user can decide whether punctuation and/or numeric characters should be removed. Also, in the case of keeping numeric characters, the user can choose to convert them into specific words.
*   The user can choose to perform the stemmatization or lemmatization process.


In [85]:
!pip install bs4
!pip install emoji

Collecting emoji
  Downloading emoji-1.7.0.tar.gz (175 kB)
[K     |████████████████████████████████| 175 kB 7.4 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.7.0-py3-none-any.whl size=171046 sha256=15e2c4e2a0c190266ae6a9cf8defda06397e03ad12f4da80246068f137986d69
  Stored in directory: /root/.cache/pip/wheels/8a/4e/b6/57b01db010d17ef6ea9b40300af725ef3e210cb1acfb7ac8b6
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-1.7.0


In [124]:
from bs4 import BeautifulSoup
from nltk import word_tokenize
from emoji.core import demojize

import string, re, unicodedata
import word2number 

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [116]:
mail_reg = r'^(\w|\.|\_|\-)+[@](\w|\_|\-|\.)+[.]\w{2,3}$'
url_addresses_reg = r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?|http:/\W"|http:\/\/\w\W\.\W'
retweets_reg = r'^rt[\s]+|rt '

In [118]:
# remove HTML tags
def remove_html(data):
  bs = BeautifulSoup(data, 'html.parser')
  return bs.get_text()

# remove URL addresses
def remove_url(data):
  return re.sub(url_addresses_reg, ' ', data)

# remove email addresses
def remove_email_addresses(data):
  return re.sub(mail_reg, ' ', data)

In [86]:
EMOTICONS = [
    ("Laughing", r':[‑,-]?\){2,}'),
    ("Rolling_on_the_floor_laughing", r'\=\){2,}|\=\]'),
    ("Heart", r'<3'),
    ("Broken_heart", r'<\\3'),
    ('Very_happy', r':[‑,-]?D'),
    ('Happy_face_or_smiley', r'[:,8,=][‑,-,o,O]?\)|\(\^[v,u,o,O]\^\)|:[‑,-]?3'),
    ('Happy', r'=]'),
    ('Mischievous_smile', r':[‑,-]?>'),
    ('Sticking_tongue_out_playfulness_or_cheekiness', r':P|:[‑,-]P|;P|:b|:-b'),
    ('Kiss', r':[‑,-]?[\*,X,x]'),
    ('Joy', r' uwu | UwU '),
    ('Surprised_or_shock', r':[‑,-]?[o|O|0]|o_O|o_0'),
    ('Sad_frown_andry_or_pouting', r':[‑,-]?\('),
    ('Very_sad', r':[(]{2,}'),
    ('Crying', r':[‑,-]?\'\('),
    ('Straight_face_no_expression_dissaproval_or_not_funny', r':[‑,-]?\|'),
    ('Annoyed_or_hesitant', r'>?[:][\\|\/]|\=\/|=\\'),
    ('Angel_saint_or_innocent', r'[0,O,o]:[‑,-]?[\),3]'),
    ('Embarrassed_or_blushing', r':\$'),
    ('Sad_or_crying', r';_;|\(;_;\)|\(\'_\'\)|Q_Q|\(;_:\)|\(:_;\)'),
    ('Evil_or_devilish', r'[>|}|3]:[‑,-]?\)'),
    ('Laughing_big_grin_or_laugh_with_glasses', r'[:,8,X,=][-,‑]?[D,3]|B\^D'),
    ('Tears_of_happiness', r':[\',\`][‑,-]?\)'),
    ('Horror', r'D[-,‑]\''),
    ('Great_dismay', r'D[8,;,=]'),
    ('Tongue_in_cheek', r':[-,‑]J'),
    ('Yawn', r'8[‑,-]0|>:O'),
    ('Sadness', r'D:'),
    ('Disgust', r'D:<'),
    ('Cool', r'\|;[‑,-]\)'),
    ('Drunk_or_confused', r'%[-,‑]?\)'),
    ('Sealed_lips_or_wearing_braces_or_tongue_tied', r':[-,‑]?[x,#,&]'),
    ('Skeptical_annoyed_undecided_uneasy_or_hesitant', r':[-,‑]?[.,/]|:[L,S]|=[/,L]'),
    ('Scepticism_disbelief_or_disapproval', r'\',:-\||\',:[-,-]'),
    ('Party_all_night', r'#‑\)'),
    ('Headphones_listening_to_music', r'\(\(d\[-_-\]b\)\)'),
    ('Bored', r'\|‑O'),
    ('Dump', r'<:‑\|'),
    ('Being_sick', r':-?#{2,3}..'),
    ('Amazed', r'\(\*_\*\)|\(\+_\+\)|\(\@_\@\)'),
    ('Confusion', r'\(\?_\?\)|\(\・\・?'),
    ('Wink_or_smirk', r';[-,‑]?[\),D,\]]|\*[-,‑]?\)|;\^\)|:‑,|;3'),
    ('Exciting', r'\\\(\^o\^\)\/|\\\(\^o\^\)\／|ヽ\(\^o\^\)丿|\(\*^0^\*\)|＼\(-o-\)／|＼\(~o~\)\／'),
    ('Giggling_with_hand_covering_mouth', r'\^m\^'),
    ('Joyful', r'\(\^_\^\)/|\(\^[O,o]\^\)／|\(°o°\)'),
    ('Tired', r'\(=_=\)'),
    ('Shame', r'\(-_-\)|\(一_一\)'),
    ('Surprised', r'\(o\.o\)'),
    ('Sleeping', r'\(-_-\)zzz'),
    ('Kowtow_as_a_sign_of_respect_or_dogeza_for_apology', r'\(__\)|_\(\._\.\)_|<\(_ _\)>|m\(_ _\)m|m\(__\)m|<m\(__\)m>|_\(_\^_\)_'),
    ('Troubled', r'\(>_<\)>?'),
    ('Nervous__Embarrassed_Troubled_Shy_Sweat_drop', r'\(-_-;\)|\(\^_\^;\)|\(-_-;\)|\(~_~;\)|\(・.・;\)|\(・_・;\)'),
    ('Wink', r'\(\^_-\)'),
    ('Normal_laugh', r'>\^_\^<|<\^!\^>|\(\^\.\^\)|\(\^J\^\)|\(\*\^[_,.]\^\*\)|\(\^<\^\)|\(\^\.\^\)|\(#\^\.\^#\)'),
    ('STH_ELSE', r'.')
]

emoticons_tokens = '|'.join('(?P<%s>%s)' % emoticon for emoticon in EMOTICONS)

def replace_emoticons(text):
    new_text = ""
    for match in re.finditer(emoticons_tokens, text):
        emoticon_name = match.lastgroup
        emoticon = match.group(emoticon_name)
        if emoticon_name == 'STH_ELSE':
            new_text += emoticon
        else:
            new_text += emoticon_name
    return new_text

In [87]:
def normalize_text(x):
    x = unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore') # remove accented chars
    return x

In [166]:
# any letter repeated more than three times in a row is replaced by two repetitions of the same letter
def remove_multiple_occurences(text):
    n = len(text)

    if n < 3:
        return text

    i, count = 0, 0
    while i < n - 1:
        i += 1
        if text[i] != text[i-1]:
            count = 0
        else:
            count += 1
            if count >= 2:
                text = text[:i] + text[i+1:]
                n -= 1
                i -= 1

    return text

In [181]:
stemmer = PorterStemmer()
lemmatizaer = WordNetLemmatizer()

def preprocessing_text(text, language="english", remove_stopwords=False, remove_punctuation=False, remove_nums=False, convert_nums=True, stemming=False, lemmatization=False):
  # Remove URL addresses, tags, retweets, and email addresses
  preprocessed_text = remove_url(text)
  preprocessed_text = remove_html(preprocessed_text)
  preprocessed_text = remove_email_addresses(preprocessed_text)
  preprocessed_text = re.sub(retweets_reg, ' ', preprocessed_text)
  # Lowecasing
  preprocessed_text = preprocessed_text.lower()
  # Tokenize words
  tokens = word_tokenize(preprocessed_text)
  # Conversion of emojis to words
  tokens = [demojize(token) for token in tokens]
  # Conversion of emoticons to words
  tokens = [replace_emoticons(token) for token in tokens]
  # Normalize text
  preprocessed_text = unicodedata.normalize('NFKD', preprocessed_text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
  # Remove repeated letters from words
  tokens = [remove_multiple_occurences(token) for token in tokens]
  # Remove STOPWORDS
  if remove_stopwords:
    tokens = [token for token in tokens if token is not None and token not in stopwords.words(language)]
  # Remove punctuations
  if remove_punctuation:
    tokens = [token for token in tokens if token is not None and token not in string.punctuation]
  # Remove numerical_characters
  if remove_nums:
    tokens = [token for token in tokens if token is not None and token not in string.digits]
  else:
    if convert_nums:
      # Conversion of numerical characters to words
      tokens = [num2words.num2words(float(token)) if (token.isdigit() or token.isdecimal()) else token for token in tokens]

  if stemming:
    tokens = [stemmer.stem(token) for token in tokens]

  if lemmatization:
    tokens = [lemmatizaer.lemmatize(token) for token in tokens]

  return tokens

In [148]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [154]:
with open('/content/drive/MyDrive/news_article.txt') as f:
    text = f.read()
    print(text)

The Yankee
From Wikipedia, the free encyclopedia
Jump to navigationJump to search
This article is about a literary magazine published 1828–1829. For the magazine founded in 1935, see Yankee (magazine).
The Yankee
Stained white paper with words printed in three columns in blank ink below the magazine title and motto
First page of the first issue: January 1, 1828
Editor	John Neal
Categories	Literature, gymnastics, New England, England, art, theater, politics, utilitarianism, women's rights
Frequency	Weekly (January 1, 1828 – July 3, 1829)
Monthly (July–December 1829)
Founder	John Neal
First issue	January 1, 1828
Final issue	December 1829
Based in	Portland, Maine, US
The Yankee (later retitled The Yankee and Boston Literary Gazette) was one of the first cultural publications in the US, founded and edited by John Neal (1793–1876), and published in Portland, Maine. The magazine was unique at the time for its independent journalism. Neal used creative control of the magazine to improve his s

In [190]:
from copy import deepcopy

text_preprocessed1 = preprocessing_text(deepcopy(text), language="english")
text_preprocessed2 = preprocessing_text(deepcopy(text), language="english", remove_stopwords=True, remove_punctuation=True, remove_nums=True)
text_preprocessed3 = preprocessing_text(deepcopy(text), language="english", remove_stopwords=True, remove_punctuation=True, remove_nums=False, convert_nums=True)
text_preprocessed4 = preprocessing_text(deepcopy(text), language="english", convert_nums=True)
text_preprocessed5 = preprocessing_text(deepcopy(text), language="english", convert_nums=True, lemmatization=True)

In [191]:
from collections import Counter 

vocabulary1 = Counter(text_preprocessed1)
vocabulary2 = Counter(text_preprocessed2)
vocabulary3 = Counter(text_preprocessed3)
vocabulary4 = Counter(text_preprocessed4)
vocabulary5 = Counter(text_preprocessed5)

In [192]:
print('Number of tokens for the first type of preprocessing: ', len(vocabulary1))
print('Number of tokens for the first type of preprocessing: ', len(vocabulary2))
print('Number of tokens for the first type of preprocessing: ', len(vocabulary3))
print('Number of tokens for the first type of preprocessing: ', len(vocabulary4))
print('Number of tokens for the first type of preprocessing: ', len(vocabulary5))

Number of tokens for the first type of preprocessing:  999
Number of tokens for the first type of preprocessing:  888
Number of tokens for the first type of preprocessing:  897
Number of tokens for the first type of preprocessing:  999
Number of tokens for the first type of preprocessing:  960
