# **Tokenization**

In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('punkt')

sent1 = "The saddest aspect of life right now is that science gathers knowledge faster than society gathers wisdom."
sent2 = "Life is pleasant. Death is peaceful. It's the transition that's troublesome." 
tokens1 = word_tokenize(sent1)
tokens2 = sent_tokenize(sent2)
print(tokens1)
print(tokens2)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
['The', 'saddest', 'aspect', 'of', 'life', 'right', 'now', 'is', 'that', 'science', 'gathers', 'knowledge', 'faster', 'than', 'society', 'gathers', 'wisdom', '.']
['Life is pleasant.', 'Death is peaceful.', "It's the transition that's troublesome."]


# **N-GRAMS**

In [2]:
unigram = list(nltk.ngrams(tokens1, 1))
bigram = list(nltk.ngrams(tokens1, 2))
print(unigram[:5])
print(bigram[:5])

[('The',), ('saddest',), ('aspect',), ('of',), ('life',)]
[('The', 'saddest'), ('saddest', 'aspect'), ('aspect', 'of'), ('of', 'life'), ('life', 'right')]


In [3]:
from nltk import FreqDist

print('Most common unigrams: ', FreqDist(unigram).most_common(5))
print('Most common bigrams: ', FreqDist(bigram).most_common(5))

Most common unigrams:  [(('gathers',), 2), (('The',), 1), (('saddest',), 1), (('aspect',), 1), (('of',), 1)]
Most common bigrams:  [(('The', 'saddest'), 1), (('saddest', 'aspect'), 1), (('aspect', 'of'), 1), (('of', 'life'), 1), (('life', 'right'), 1)]


# **Stemming**

In [4]:
from nltk.stem import PorterStemmer, SnowballStemmer
words = ["fight", "fighting", "fighter", "cows", "created"]
words_ru = ['корова', 'мальчики', 'мужчины', 'столом', 'убежала']

ps = PorterStemmer()
print(list(map(ps.stem, words)))

ss = SnowballStemmer(language='russian')
print(list(map(ss.stem, words_ru)))

['fight', 'fight', 'fighter', 'cow', 'creat']
['коров', 'мальчик', 'мужчин', 'стол', 'убежа']


# **Lemmatization**

In [5]:
import spacy

raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government.  Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""

nlp = spacy.load('en')
doc = nlp(raw)
print(' '.join([token.lemma_ for token in doc]))

denni : listen , strange woman lie in pond distribute sword 
 be no basis for a system of government .   Supreme executive power derive from 
 a mandate from the masse , not from some farcical aquatic ceremony .


In [6]:
[(token.lemma_, token.pos_) for token in doc[:7]]

[('denni', 'NOUN'),
 (':', 'PUNCT'),
 ('listen', 'VERB'),
 (',', 'PUNCT'),
 ('strange', 'ADJ'),
 ('woman', 'NOUN'),
 ('lie', 'VERB')]

In [7]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [8]:
ent1 = "The saddest aspect of life right now is that science gathers knowledge faster than society gathers wisdom."

sentences = nltk.sent_tokenize(sent1)   
for sent in sentences:
    print(nltk.pos_tag(sent.split()))

[('The', 'DT'), ('saddest', 'JJS'), ('aspect', 'NN'), ('of', 'IN'), ('life', 'NN'), ('right', 'NN'), ('now', 'RB'), ('is', 'VBZ'), ('that', 'IN'), ('science', 'NN'), ('gathers', 'NNS'), ('knowledge', 'VBP'), ('faster', 'JJR'), ('than', 'IN'), ('society', 'NN'), ('gathers', 'NNS'), ('wisdom.', 'VBP')]


In [9]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [10]:
word_tag = nltk.pos_tag(sent.split())
new_word_tag = [(word, nltk.map_tag('en-ptb', 'universal', tag)) for word, tag in word_tag]
print(new_word_tag)

[('The', 'DET'), ('saddest', 'ADJ'), ('aspect', 'NOUN'), ('of', 'ADP'), ('life', 'NOUN'), ('right', 'NOUN'), ('now', 'ADV'), ('is', 'VERB'), ('that', 'ADP'), ('science', 'NOUN'), ('gathers', 'NOUN'), ('knowledge', 'VERB'), ('faster', 'ADJ'), ('than', 'ADP'), ('society', 'NOUN'), ('gathers', 'NOUN'), ('wisdom.', 'VERB')]


# **Named Entity Recognition**

In [11]:
doc = nlp('Apple is looking at buying U.K. startup for $1 billion')

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


# **RegExp**

In [13]:
import re

word = 'supercalifragilisticexpialidocious'
re.findall('[aeiou]|super', word)

['super', 'a', 'i', 'a', 'i', 'i', 'i', 'e', 'i', 'a', 'i', 'o', 'i', 'o', 'u']

In [14]:
re.findall('\d{1,2}', 'There is some numbers: 49 and 432')

['49', '43', '2']

In [15]:
re.sub('[,\.?!]','','How, to? split. text!')

'How to split text'

In [16]:
re.sub('[^A-z]',' ','I 123 can 45 play 67 football').split()

['I', 'can', 'play', 'football']

# **Byte Pair Encoding**

<img src="https://alexanderdyakonov.files.wordpress.com/2019/11/bpe.jpg">

In [17]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [18]:
list(newsgroups_train.target_names)

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [19]:
cats = ['alt.atheism', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)

In [20]:
list(newsgroups_train.target_names)

['alt.atheism', 'sci.space']

In [21]:
print(newsgroups_train.data[0])

From: bil@okcforum.osrhe.edu (Bill Conner)
Subject: Re: Not the Omni!
Nntp-Posting-Host: okcforum.osrhe.edu
Organization: Okcforum Unix Users Group
X-Newsreader: TIN [version 1.1 PL6]
Lines: 18

Charley Wingate (mangoe@cs.umd.edu) wrote:
: 
: >> Please enlighten me.  How is omnipotence contradictory?
: 
: >By definition, all that can occur in the universe is governed by the rules
: >of nature. Thus god cannot break them. Anything that god does must be allowed
: >in the rules somewhere. Therefore, omnipotence CANNOT exist! It contradicts
: >the rules of nature.
: 
: Obviously, an omnipotent god can change the rules.

When you say, "By definition", what exactly is being defined;
certainly not omnipotence. You seem to be saying that the "rules of
nature" are pre-existant somehow, that they not only define nature but
actually cause it. If that's what you mean I'd like to hear your
further thoughts on the question.

Bill



In [22]:
newsgroups_train.target[:10]

array([0, 1, 1, 1, 0, 1, 1, 0, 0, 0])

In [24]:
!pip install youtokentome

Collecting youtokentome
[?25l  Downloading https://files.pythonhosted.org/packages/a3/65/4a86cf99da3f680497ae132329025b291e2fda22327e8da6a9476e51acb1/youtokentome-1.0.6-cp36-cp36m-manylinux2010_x86_64.whl (1.7MB)
[K     |████████████████████████████████| 1.7MB 3.9MB/s 
Installing collected packages: youtokentome
Successfully installed youtokentome-1.0.6


In [27]:
import youtokentome as yttm
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))
nltk.download('wordnet')
wnl = nltk.WordNetLemmatizer()

def preproc1(text):
    return ' '.join([wnl.lemmatize(word) for word in word_tokenize(text.lower()) if word not in stopWords])

def train_bpe(records, preproc, model_path, model_type="bpe", vocab_size=10000, lower=True):
    temp_file_name = "temp.txt"
    with open(temp_file_name, "w") as temp:
        for text in records:
            temp.write(preproc(text) + "\n")

    yttm.BPE.train(data=temp_file_name, vocab_size=vocab_size, model=model_path)

train_bpe(records=newsgroups_train.data, preproc=preproc1, model_path="BPE_model.bin")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [28]:
bpe_processor = yttm.BPE('BPE_model.bin')
bpe_processor.vocab()[::1000]

['<PAD>',
 '▁gra',
 '▁observatory',
 '▁roll',
 '▁575-3539',
 '▁originator',
 '▁fred.mccall',
 'graph',
 '▁psilink-dos',
 '▁carrying']