<h1>2 Text Preprocessing</h1>

In [2]:
import nltk

<h1>Preprocessing Is Comprised Mainly of Three Steps:</h1>
<ul>
    <li>Noise Removal</li>
    <li>Lexicon Normalization</li>
    <li>Object Standardization</li>
</ul>

<h1>2.1 Noise Removal</h1>

<h1>Remove Noisy Words from Text Using Standard Methods</h1>

In [3]:
noise_list = ["is", "a", "this"]
sample_text = "This is some sample text."

def remove_noise(input_text, noise_words=None):
    """Return ``input_text`` lowercased, with noise words removed.

    The text is split on whitespace, every token is lowercased, and tokens
    that match a noise word are dropped. The surviving tokens are re-joined
    with single spaces, so runs of whitespace in the input are collapsed.

    Punctuation is not stripped: a token such as ``"this."`` does not match
    the noise word ``"this"`` and is therefore kept.

    Args:
        input_text: The string to clean.
        noise_words: Optional iterable of words to remove. Matching is
            case-insensitive. Defaults to the module-level ``noise_list``.

    Returns:
        The cleaned, lowercased string.
    """
    if noise_words is None:
        noise_words = noise_list
    # A set gives constant-time membership checks; lowercasing the noise
    # words makes the comparison case-insensitive on both sides.
    blocked = {noise.lower() for noise in noise_words}
    lowered_tokens = (token.lower() for token in input_text.split())
    return " ".join(token for token in lowered_tokens
                    if token not in blocked)

In [4]:
# Demo: run the word-list filter on the sample sentence defined with remove_noise.
remove_noise(sample_text)

'some sample text.'

<h1>Remove Noisy Words from Text Using Regular Expressions</h1>

In [5]:
import re
# Raw string so that "\w" reaches the regex engine untouched instead of
# being treated as an (invalid) string escape sequence.
reg_ex = r"#[\w]*"
sample_text = "This is a sentence with a #hashtag"

def remove_noise_regex(input_text, regex_pattern):
    """Return ``input_text`` with every match of ``regex_pattern`` removed.

    All non-overlapping matches are deleted in a single pass. Text that
    contains no match is returned unchanged. Whitespace surrounding a
    removed match is left in place.

    Args:
        input_text: The string to clean.
        regex_pattern: A regular expression (pattern string or compiled
            pattern) describing the noise to remove.

    Returns:
        The input string with all matches replaced by the empty string.
    """
    # Substitute with the pattern itself rather than with each matched
    # substring: matched text may contain regex metacharacters and must not
    # be re-interpreted as a pattern.
    return re.sub(regex_pattern, '', input_text)
    

In [6]:
# Demo: strip the hashtag from the sample sentence; the space before it is kept.
remove_noise_regex(sample_text, reg_ex)

'This is a sentence with a '

<h1>2.2 Lexicon Normalization</h1>
<p>The same word can have multiple representations, Ex: "play", "player", "played", "plays", "playing".</p>
<p>Normalization: convert all variants of a word into a single normalized form (aka lemma).</p>
<p>This converts high-dimensional features (one per variant) into a low-dimensional space (1 feature).</p>
<h3>Common Lexicon Normalization Practices:</h3>
<ul>
    <li>Stemming: a rule-based process of stripping suffixes ("ing", "ly", "es", "s", etc.) from a word.</li>
    <li>Lemmatization: an organized, step-by-step procedure for obtaining the root form of a word. It uses a vocabulary (dictionary lookup of words) and morphological analysis (word structure and grammar relations).</li>
</ul>

<h1>NLTK lemmatization and stemming</h1>

In [7]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# NOTE: OPTIONAL - this is just me goofing around
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Shared lemmatizer and stemmer instances, reused by the Lemmatize and Stem
# cells further down.
# NOTE(review): WordNetLemmatizer presumably needs the NLTK "wordnet" corpus
# to be downloaded before first use - confirm the environment provides it.
lem = WordNetLemmatizer()
stem = PorterStemmer()



In [8]:
# Four surface forms of "climb", reused by the lemmatize and stem demos.
climbing, climbs, climber, climbers = (
    "climbing", "climbs", "climber", "climbers")

# Optional aside: POS-tag the word forms to see how NLTK labels each one.
climb_ls = ' '.join((climbing, climbs, climber, climbers))
climb_tokenized = pos_tag(word_tokenize(climb_ls))
climb_tokenized


[('climbing', 'VBG'),
 ('climbs', 'NNS'),
 ('climber', 'NN'),
 ('climbers', 'NNS')]

In [9]:
# Four surface forms of "multiply", reused by the lemmatize and stem demos.
multiplying, multiplies, multiplier, multipliers = (
    "multiplying", "multiplies", "multiplier", "multipliers")

# Optional aside: POS-tag the word forms to see how NLTK labels each one.
mult_ls = ' '.join((multiplying, multiplies, multiplier, multipliers))
mult_tokenized = pos_tag(word_tokenize(mult_ls))
mult_tokenized


[('multiplying', 'VBG'),
 ('multiplies', 'NNS'),
 ('multiplier', 'JJR'),
 ('multipliers', 'NNS')]

<h1>Lemmatize</h1>

In [10]:
# Lemmatize each word form, passing the part of speech it is treated as
# ("v" = verb, "n" = noun). Each family covers all four forms, including
# the plural noun.
print(lem.lemmatize(climbing, "v"))
print(lem.lemmatize(climbs, "v"))
print(lem.lemmatize(climber, "n"))
print(lem.lemmatize(climbers, "n"))
print("+++++++++++++++++++++++++++++++++++")
print(lem.lemmatize(multiplying, "v"))
print(lem.lemmatize(multiplies, "v"))
print(lem.lemmatize(multiplier, "n"))
print(lem.lemmatize(multipliers, "n"))

climb
climb
climber
climber
+++++++++++++++++++++++++++++++++++
multiply
multiply
multiplier
multiplier


<h1>Stem</h1>

In [11]:
# Stem each word form with the Porter stemmer. Each family covers all four
# forms, including the plural noun, for comparison with the lemmatizer.
print(stem.stem(climbing))
print(stem.stem(climbs))
print(stem.stem(climber))
print(stem.stem(climbers))
print("+++++++++++++++++++++++++++++++++++")
print(stem.stem(multiplying))
print(stem.stem(multiplies))
print(stem.stem(multiplier))
print(stem.stem(multipliers))

climb
climb
climber
climber
+++++++++++++++++++++++++++++++++++
multipli
multipli
multipli
multipli


<h1>2.3 Object Standardization</h1>
<p>Text data can contain words or phrases that aren't in standard lexical dictionaries - these words will often not be recognized.</p>
<p>Examples:</p>
<ul>
    <li>acronyms</li>
    <li>hashtags</li>
    <li>colloquial slang</li>
</ul>
    

In [12]:
# Abbreviation -> standard form, used to normalize informal tokens.
lookup_dict = {
    "rt": "retweet",
    "dm": "direct message",
    "awsm": "awesome",
    "luv": "love",
}
sample_text = "I'm going to rt that dm because it was awsm I loved it"

def lookup_words(input_text):
    """Expand known abbreviations in ``input_text`` via ``lookup_dict``.

    The text is split on whitespace and each token is looked up by its
    lowercased form. Tokens found in ``lookup_dict`` are replaced by the
    mapped value; all other tokens are kept exactly as written. Tokens are
    re-joined with single spaces.

    Args:
        input_text: The string to standardize.

    Returns:
        The string with recognized abbreviations expanded.
    """
    expanded = [lookup_dict.get(token.lower(), token)
                for token in input_text.split()]
    return " ".join(expanded)
    

In [13]:
# Demo: expand the abbreviations found in the sample sentence.
lookup_words(sample_text)

"I'm going to retweet that direct message because it was awesome I loved it"