# Text Preprocessing 
### Convert text features into numerical features that ML models can work with

## 1. General text pre-processing methods

In [1]:
# Sample message containing leading/trailing whitespace, an HTML tag, punctuation and repeated spaces
text = "   This is a message to be cleaned. It may involve some things like: <br>, ?, :, ''  adjacent spaces and tabs     .  "

### Get rid of leading/trailing whitespace and convert to lowercase

In [7]:
# strip() removes leading/trailing whitespace; lower() additionally converts the text to lowercase
text = text.strip().lower()
print(text)

this is a message to be cleaned. it may involve some things like: <br>, ?, :, ''  adjacent spaces and tabs     .


### Remove HTML tags and markup

In [17]:
import re

# Delete anything that looks like a tag: "<", the shortest possible run of characters, then ">"
text = re.sub('<.*?>', '', text)
print(text)

 This message be cleaned. It may involve some things like: , ?, :, '' adjacent spaces tabs . 


### Remove punctuation

In [18]:
import string

# Character class matching any single ASCII punctuation mark; re.escape protects
# the characters that are special inside a regex. Matches are deleted.
text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
print(text)

 This message be cleaned It may involve some things like     adjacent spaces tabs  


### Remove extra spaces and tabs

In [19]:
# Collapse every run of whitespace (spaces, tabs, newlines) into a single space.
# The pattern is a raw string so that "\s" reaches the regex engine unchanged;
# in a plain string literal "\s" is an invalid escape sequence and Python warns about it.
text = re.sub(r'\s+', ' ', text)
print(text)

 This message be cleaned It may involve some things like adjacent spaces tabs 


## 2. Lexicon-based text pre-processing
### normalize sentences in the dataset so that sentences are in a similar format

### Stop word removal
#### remove words in our sentences that occur very frequently and don't contribute too much to the overall meaning of the sentences.

In [23]:
stop_words = ["a", "an", "the", "this", "that", "is", "it", "to", "and"]
words = text.lower().split(' ')
# Keep, in their original order, only the tokens that are not stop words
filtered_sentence = [w for w in words if w not in stop_words]
print(filtered_sentence)

['', 'message', 'be', 'cleaned', 'may', 'involve', 'some', 'things', 'like', 'adjacent', 'spaces', 'tabs', '']


In [32]:
# Re-join the remaining tokens into a single space-separated string
text = ' '.join(filtered_sentence)

In [33]:
# Show the sentence after stop-word removal
print(text)

 message be cleaned may involve some things like adjacent spaces tabs 


### Stemming: Stemming is a rule-based system to convert words into their root form.
#### It removes suffixes from words. This helps us enhance similarities (if any) between sentences.

In [27]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 6.0 MB/s eta 0:00:00
Collecting joblib
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
     ---------------------------------------- 298.0/298.0 KB ? eta 0:00:00
Collecting tqdm
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)


You should consider upgrading via the 'C:\Users\cinna\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


     ---------------------------------------- 78.5/78.5 KB 4.3 MB/s eta 0:00:00
Collecting click
  Downloading click-8.1.3-py3-none-any.whl (96 kB)
     ---------------------------------------- 96.6/96.6 KB ? eta 0:00:00
Collecting regex>=2021.8.3
  Downloading regex-2022.10.31-cp310-cp310-win_amd64.whl (267 kB)
     ------------------------------------- 267.7/267.7 KB 16.1 MB/s eta 0:00:00
Installing collected packages: tqdm, regex, joblib, click, nltk
Successfully installed click-8.1.3 joblib-1.2.0 nltk-3.8.1 regex-2022.10.31 tqdm-4.64.1


In [34]:
import nltk
from nltk.stem import SnowballStemmer

# Initialize the stemmer for English
snow = SnowballStemmer('english')
words = text.split(' ')
# Stem each token in turn, then stitch the sentence back together
stemmed_sentence = [snow.stem(w) for w in words]
text = ' '.join(stemmed_sentence)
print(text)

 messag be clean may involv some thing like adjac space tab 


## 3. Feature extraction - Bag of words
### Steps:
* Create vocabulary of known words
* Measure presence of the known words in sentences

In [39]:
# Install the packages listed in requirements.txt into the kernel's environment (-q keeps pip quiet)
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
from sklearn.feature_extraction.text import CountVectorizer

sentences = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?'
]

# binary=True marks presence/absence of each word instead of counting repeats
cv = CountVectorizer(binary=True)
# Learn the vocabulary from the sentences and encode them in one step
X = cv.fit_transform(sentences)

In [4]:
# The sentences above contain 9 unique words, which CountVectorizer indexes from 0 to 8
# (e.g. "and" is at 0, "document" is at 1, "first" is at 2).
# Each row of the array shows, for every vocabulary word, whether that sentence contains it.
print(cv.vocabulary_)
print(X.toarray())

{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 1 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]


In [6]:
# Words that appear only in the test set are not counted by the fitted CountVectorizer,
# so each row has the same length as the rows produced for the training sentences.
test_sentences = [
    "this document has some new words",
    "this one is new too",
]
X_test = cv.transform(test_sentences)
print(X_test.toarray())

[[0 1 0 0 0 0 0 0 1]
 [0 0 0 1 1 0 0 0 1]]


## 4. Full Example of text pre-processing steps 

In [22]:
import re
import string
from nltk.stem import SnowballStemmer
# English Snowball stemmer instance used by the cleaning cells below
snowball = SnowballStemmer('english')
# Stop words to drop from every sentence
stop_words = ["a", "an", "the", "this", "that", "is", "it", "to", "and"]

#function that carries out all the general text pre-processing steps: lower case, remove HTML tags, remove punctuation, collapse extra white space, strip leading/trailing whitespace
def preProcess(text):
    """Normalize a raw string with the general (non-lexicon) cleaning steps.

    Steps, in order: lowercase, replace HTML tags with a space, delete
    punctuation, collapse whitespace runs to a single space, and strip
    leading/trailing whitespace.

    Args:
        text: the raw input string.

    Returns:
        The cleaned string: words are separated by exactly one space and
        there is no leading or trailing whitespace.
    """
    text = text.lower()
    # Tags become a space (not '') so that "one<br>two" does not fuse into "onetwo".
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Raw string: "\s" in a plain string literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)
    # Strip last: deleting tags/punctuation at either end can expose new edge whitespace.
    return text.strip()

#function that carries out lexicon based text pre-processing steps, remove stop words, stemming
def lexiconProcess(text, stop_words, stemmer):
    """Remove stop words from `text` and stem the remaining words.

    Args:
        text: string of words separated by whitespace.
        stop_words: collection of words to drop; membership is tested with `in`.
        stemmer: object exposing a `stem(word)` method, e.g. an NLTK SnowballStemmer.

    Returns:
        The stemmed, non-stop words joined by single spaces.
    """
    # split() with no argument discards the empty tokens that split(' ') yields
    # for leading, trailing or repeated spaces.
    # The stemmer passed in by the caller is used, not the notebook-level `snowball`.
    return ' '.join(stemmer.stem(w) for w in text.split() if w not in stop_words)
    
#function that runs the full cleaning pipeline: general pre-processing followed by lexicon-based processing
def cleanSentence(text, stop_words, stemmer):
    """Clean one sentence by chaining preProcess and lexiconProcess.

    Args:
        text: the raw input string.
        stop_words: forwarded unchanged to lexiconProcess.
        stemmer: forwarded unchanged to lexiconProcess.

    Returns:
        The result of lexiconProcess applied to the pre-processed text.
    """
    normalized = preProcess(text)
    return lexiconProcess(normalized, stop_words, stemmer)

In [15]:
# Same sample message as in section 1, used here to exercise the full pipeline
text = "   This is a message to be cleaned. It may involve some things like: <br>, ?, :, ''  adjacent spaces and tabs     .  "

In [26]:
# Run the full cleaning pipeline on the sample message and show the result
print(cleanSentence(text, stop_words, snowball))

messag be clean may involv some thing like adjac space tab 


In [27]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with binary features; max_features=50 limits the vocabulary size
cv = CountVectorizer(binary=True, max_features=50)

In [29]:
# A text feature with four samples to clean and vectorize
text_feature = [
    "I liked the material, color and overall how it looks.<br /><br />",
    "Worked okay first two times I used it, but third time burned my face.",
    "I am not sure about this product.",
    "I never thought I would pay so much for a hair dryer.",
]

# Run every sample through the text pre-processing pipeline
text_clean = [cleanSentence(sample, stop_words, snowball) for sample in text_feature]

# Fit the vectorizer on the cleaned text and encode it as bag-of-words features
text_vectorized = cv.fit_transform(text_clean)
print('Vocabulary \n', cv.vocabulary_)
print('Bag of Words binary features: \n', text_vectorized.toarray())

print(text_vectorized.shape)

Vocabulary 
 {'like': 11, 'materi': 13, 'color': 4, 'overal': 19, 'how': 10, 'look': 12, 'work': 29, 'okay': 18, 'first': 7, 'two': 27, 'time': 26, 'use': 28, 'but': 3, 'third': 24, 'burn': 2, 'my': 15, 'face': 6, 'am': 1, 'not': 17, 'sure': 23, 'about': 0, 'product': 21, 'never': 16, 'thought': 25, 'would': 30, 'pay': 20, 'so': 22, 'much': 14, 'for': 8, 'hair': 9, 'dryer': 5}
Bag of Words binary features: 
 [[0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 1 1 1 1 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 1]]
(4, 31)
