## 5. How to clean Text manually with NLTK



In [1]:
import re

file = open('alice.txt', 'rt')
text = file.read()
file.close()

words = re.split(r'\W+', text)
print(words[:100])

['', 'ALICE', 'S', 'ADVENTURES', 'IN', 'WONDERLAND', 'Lewis', 'Carroll', 'THE', 'MILLENNIUM', 'FULCRUM', 'EDITION', '3', '0', 'CHAPTER', 'I', 'Down', 'the', 'Rabbit', 'Hole', 'Alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank', 'and', 'of', 'having', 'nothing', 'to', 'do', 'once', 'or', 'twice', 'she', 'had', 'peeped', 'into', 'the', 'book', 'her', 'sister', 'was', 'reading', 'but', 'it', 'had', 'no', 'pictures', 'or', 'conversations', 'in', 'it', 'and', 'what', 'is', 'the', 'use', 'of', 'a', 'book', 'thought', 'Alice', 'without', 'pictures', 'or', 'conversation', 'So', 'she', 'was', 'considering', 'in', 'her', 'own', 'mind', 'as', 'well', 'as', 'she', 'could', 'for', 'the', 'hot', 'day', 'made', 'her', 'feel', 'very', 'sleepy', 'and']


### 5.1. Hashing with HashingVectorizer

convert vocabulary into integer, because large size required large memory to store, low down the algorithm. The vector spare array values are normalized word counts by default. 

In [11]:
from sklearn.feature_extraction.text import HashingVectorizer

vectorizer = HashingVectorizer(n_features=20)

vector = vectorizer.transform(words)
display(vector.shape, list(vector[:10].toarray()))

(27346, 20)

[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]),
 array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]),
 array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]),
 array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0., -1.,  0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0.]),
 array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0., -1.,  0.]),
 array([ 0.,  0.,  0.,  0.,  0., -1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
     

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
# build vocab and tokenize
vectorizer.fit(words)

print(vectorizer.vocabulary_)
# encode document
vector = vectorizer.transform(words)
print(vector.shape, vector[:10].toarray())

(27346, 2564) [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [18]:
## Keras text processing
from tensorflow.keras.preprocessing.text import text_to_word_sequence
# split by space, filters out punctuation, to lowercase
words = text_to_word_sequence(text)
display(len(words), words[:30])

27788

["alice's",
 'adventures',
 'in',
 'wonderland',
 'lewis',
 'carroll',
 'the',
 'millennium',
 'fulcrum',
 'edition',
 '3',
 '0',
 'chapter',
 'i',
 'down',
 'the',
 'rabbit',
 'hole',
 'alice',
 'was',
 'beginning',
 'to',
 'get',
 'very',
 'tired',
 'of',
 'sitting',
 'by',
 'her',
 'sister']

In [17]:
## encoding with one-hot
from tensorflow.keras.preprocessing.text import one_hot

vocab_size = len(words)
result = one_hot(text, round(vocab_size * 1.3))
display(len(result), result[:20])

27788

[14980,
 24267,
 18694,
 13264,
 16527,
 10074,
 19898,
 35735,
 21217,
 17909,
 29363,
 25196,
 30929,
 11234,
 4198,
 19898,
 35875,
 13323,
 11226,
 12707]

## 8 Bag-of-Words Model
Modeling text with ML algorithms, to extract features in the text. The bag-of-words is a representation of text that describes the occurrence of words within a document. involves vocab and measure presented words 

### 8.1. Limitation of BoW
#### Vocabulary
requires careful design, the size impacts the sparsity of the document.

#### Sparsity
Sparse representations are harder to model both for computation (sparse and time complexity) and also for information reasons. 

#### Meaning
word meaning (semantics) does not contribute in the BoW. Context and meaning can offer a lot to the model, that if model can tell the difference between words, such as synonyms, position,...  

In [33]:
from nltk import sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from collections import defaultdict
import nltk
import re

In [27]:
re_punc = re.compile('[%s]' % re.escape(punctuation))
sentences = sent_tokenize(text)
tokens = [word_tokenize(sent) for sent in sentences]
tokens = [[w.lower() for w in sent] for sent in tokens]
tokens = [[re_punc.sub('', w) for w in sent] for sent in tokens]
tokens = [[w for w in sent if not w in set(stopwords.words('english'))] for sent in tokens]
tokens = [[w for w in sent if w.isalpha()] for sent in tokens]
tokens = [[w for w in sent if len(w) > 2] for sent in tokens]
display(len(tokens))

1616

In [28]:
print(tokens[:5])

[['alice', 'adventures', 'wonderland', 'lewis', 'carroll', 'millennium', 'fulcrum', 'edition', 'chapter', 'rabbithole', 'alice', 'beginning', 'get', 'tired', 'sitting', 'sister', 'bank', 'nothing', 'twice', 'peeped', 'book', 'sister', 'reading', 'pictures', 'conversations', 'use', 'book', 'thought', 'alice', 'without', 'pictures', 'conversation'], ['considering', 'mind', 'well', 'could', 'hot', 'day', 'made', 'feel', 'sleepy', 'stupid', 'whether', 'pleasure', 'making', 'daisychain', 'would', 'worth', 'trouble', 'getting', 'picking', 'daisies', 'suddenly', 'white', 'rabbit', 'pink', 'eyes', 'ran', 'close'], ['nothing', 'remarkable', 'alice', 'think', 'much', 'way', 'hear', 'rabbit', 'say', 'dear'], ['dear'], ['shall', 'late']]


In [29]:
## stemming words
porter = PorterStemmer()
stemm_tokens = [[porter.stem(w) for w in sent] for sent in tokens]
print(stemm_tokens[:5])

[['alic', 'adventur', 'wonderland', 'lewi', 'carrol', 'millennium', 'fulcrum', 'edit', 'chapter', 'rabbithol', 'alic', 'begin', 'get', 'tire', 'sit', 'sister', 'bank', 'noth', 'twice', 'peep', 'book', 'sister', 'read', 'pictur', 'convers', 'use', 'book', 'thought', 'alic', 'without', 'pictur', 'convers'], ['consid', 'mind', 'well', 'could', 'hot', 'day', 'made', 'feel', 'sleepi', 'stupid', 'whether', 'pleasur', 'make', 'daisychain', 'would', 'worth', 'troubl', 'get', 'pick', 'daisi', 'suddenli', 'white', 'rabbit', 'pink', 'eye', 'ran', 'close'], ['noth', 'remark', 'alic', 'think', 'much', 'way', 'hear', 'rabbit', 'say', 'dear'], ['dear'], ['shall', 'late']]


In [31]:
## words frequency 
freq = defaultdict(int)
for sent in stemm_tokens:
    for w in sent:
        freq[w] += 1
        
freq = dict([(k, v) for k, v in freq.items() if v > 1])
keywords = freq.keys()
display(len(keywords))

1130

In [32]:
## BoW
stemm_tokens = [[w for w in sent if w in keywords] for sent in stemm_tokens]
print(stemm_tokens[:10])

[['alic', 'adventur', 'wonderland', 'edit', 'chapter', 'rabbithol', 'alic', 'begin', 'get', 'tire', 'sit', 'sister', 'bank', 'noth', 'twice', 'peep', 'book', 'sister', 'read', 'pictur', 'convers', 'use', 'book', 'thought', 'alic', 'without', 'pictur', 'convers'], ['consid', 'mind', 'well', 'could', 'hot', 'day', 'made', 'feel', 'sleepi', 'stupid', 'whether', 'pleasur', 'make', 'would', 'worth', 'troubl', 'get', 'pick', 'suddenli', 'white', 'rabbit', 'eye', 'ran', 'close'], ['noth', 'remark', 'alic', 'think', 'much', 'way', 'hear', 'rabbit', 'say', 'dear'], ['dear'], ['shall', 'late'], ['thought', 'afterward', 'occur', 'ought', 'wonder', 'time', 'seem', 'quit', 'natur', 'rabbit', 'took', 'watch', 'pocket', 'look', 'hurri', 'alic', 'start', 'feet', 'across', 'mind', 'never', 'seen', 'rabbit', 'either', 'watch', 'take', 'burn', 'curios', 'ran', 'across', 'time', 'see', 'larg', 'rabbithol', 'hedg'], ['anoth', 'moment', 'went', 'alic', 'never', 'consid', 'world', 'get'], ['rabbithol', 'went

In [35]:
## words sentiment with word classes
stemm_tokens = [[w for w in nltk.pos_tag(sent)] for sent in stemm_tokens]
print(stemm_tokens[:10])

[[('alic', 'JJ'), ('adventur', 'NN'), ('wonderland', 'NN'), ('edit', 'NN'), ('chapter', 'NN'), ('rabbithol', 'NN'), ('alic', 'JJ'), ('begin', 'NN'), ('get', 'VB'), ('tire', 'JJ'), ('sit', 'NN'), ('sister', 'NN'), ('bank', 'NN'), ('noth', 'DT'), ('twice', 'JJ'), ('peep', 'NN'), ('book', 'NN'), ('sister', 'NN'), ('read', 'VBP'), ('pictur', 'NN'), ('convers', 'NNS'), ('use', 'VBP'), ('book', 'NN'), ('thought', 'VBN'), ('alic', 'RB'), ('without', 'IN'), ('pictur', 'NN'), ('convers', 'NNS')], [('consid', 'NN'), ('mind', 'NN'), ('well', 'RB'), ('could', 'MD'), ('hot', 'VB'), ('day', 'NN'), ('made', 'VBN'), ('feel', 'NN'), ('sleepi', 'JJ'), ('stupid', 'JJ'), ('whether', 'IN'), ('pleasur', 'JJ'), ('make', 'NN'), ('would', 'MD'), ('worth', 'VB'), ('troubl', 'JJ'), ('get', 'VB'), ('pick', 'JJ'), ('suddenli', 'NN'), ('white', 'JJ'), ('rabbit', 'NN'), ('eye', 'NN'), ('ran', 'VBD'), ('close', 'RB')], [('noth', 'DT'), ('remark', 'NN'), ('alic', 'JJ'), ('think', 'VBP'), ('much', 'JJ'), ('way', 'NN'),

In [36]:
features = [[w for w in nltk.pos_tag(sent)] for sent in tokens]
print(features[:10])

[[('alice', 'NN'), ('adventures', 'NNS'), ('wonderland', 'VBP'), ('lewis', 'JJ'), ('carroll', 'NN'), ('millennium', 'NN'), ('fulcrum', 'JJ'), ('edition', 'NN'), ('chapter', 'NN'), ('rabbithole', 'JJ'), ('alice', 'NN'), ('beginning', 'VBG'), ('get', 'VB'), ('tired', 'JJ'), ('sitting', 'VBG'), ('sister', 'JJ'), ('bank', 'NN'), ('nothing', 'NN'), ('twice', 'RB'), ('peeped', 'VBD'), ('book', 'NN'), ('sister', 'NN'), ('reading', 'NN'), ('pictures', 'NNS'), ('conversations', 'NNS'), ('use', 'VBP'), ('book', 'NN'), ('thought', 'NN'), ('alice', 'NN'), ('without', 'IN'), ('pictures', 'NNS'), ('conversation', 'NN')], [('considering', 'VBG'), ('mind', 'NN'), ('well', 'NN'), ('could', 'MD'), ('hot', 'VB'), ('day', 'NN'), ('made', 'VBN'), ('feel', 'NN'), ('sleepy', 'JJ'), ('stupid', 'JJ'), ('whether', 'IN'), ('pleasure', 'NN'), ('making', 'NN'), ('daisychain', 'NN'), ('would', 'MD'), ('worth', 'VB'), ('trouble', 'NN'), ('getting', 'VBG'), ('picking', 'VBG'), ('daisies', 'NNS'), ('suddenly', 'RB'), 

In [37]:
required_class = ['N']
features = [[w[0] for w in sent if w[1][0] in required_class] for sent in features]
print(features[:10])

[['alice', 'adventures', 'carroll', 'millennium', 'edition', 'chapter', 'alice', 'bank', 'nothing', 'book', 'sister', 'reading', 'pictures', 'conversations', 'book', 'thought', 'alice', 'pictures', 'conversation'], ['mind', 'well', 'day', 'feel', 'pleasure', 'making', 'daisychain', 'trouble', 'daisies', 'rabbit', 'eyes'], ['nothing', 'alice', 'way', 'rabbit', 'dear'], ['dear'], [], ['thought', 'afterwards', 'time', 'rabbit', 'watch', 'waistcoat', 'pocket', 'alice', 'feet', 'mind', 'rabbit', 'waistcoatpocket', 'watch', 'burning', 'curiosity', 'field', 'time', 'rabbithole', 'hedge'], ['moment', 'world', 'get'], ['way', 'moment'], ['time', 'wonder'], ['look', 'dark', 'anything', 'sides', 'cupboards', 'bookshelves', 'maps', 'pictures', 'pegs']]


In [43]:
freq_features = defaultdict(int)
for sent in features:
    for w in sent:
        freq_features[w] += 1
print(freq_features)



In [44]:
freq_features = dict([(k, v) for k, v in freq_features.items() if v > 1])
freq_features = freq_features.keys() 
display(len(freq_features))

610

In [45]:
features = [[(w, (w in freq_features)) for w in sent] for sent in tokens]
print(features[:10])

[[('alice', True), ('adventures', True), ('wonderland', False), ('lewis', False), ('carroll', False), ('millennium', False), ('fulcrum', False), ('edition', False), ('chapter', True), ('rabbithole', False), ('alice', True), ('beginning', False), ('get', True), ('tired', False), ('sitting', False), ('sister', True), ('bank', True), ('nothing', True), ('twice', False), ('peeped', False), ('book', True), ('sister', True), ('reading', False), ('pictures', True), ('conversations', False), ('use', True), ('book', True), ('thought', True), ('alice', True), ('without', False), ('pictures', True), ('conversation', True)], [('considering', False), ('mind', True), ('well', True), ('could', False), ('hot', False), ('day', True), ('made', False), ('feel', True), ('sleepy', False), ('stupid', False), ('whether', False), ('pleasure', False), ('making', False), ('daisychain', False), ('would', False), ('worth', False), ('trouble', True), ('getting', False), ('picking', False), ('daisies', False), ('su

In [47]:
features = [dict(sent) for sent in features]
print(features)



In [50]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.model_selection import train_test_split

train, test = train_test_split(features, test_size=0.2)

display(len(train), len(test))

1292

324

In [1]:
# svc_classifier = SklearnClassifier(SVC())
# svc_classifier.train(train)
# print(f'Test acc= {nltk.classify.accuracy(svc_classifier, test)}')
### failed execution ==> features, cat to classify