### The spaCy library can handle many NLP tasks, including tokenization, lemmatization, stop words, and more
- The first step is to turn a text string into a spaCy doc object

In [3]:
import pandas as pd

# create a list of sentences
data = [
    "When life gives you lemons, make lemonade! 🙂",
    "She bought 2 lemons for $1 at Maven Market.",
    "A dozen lemons will make a gallon of lemonade. [AllRecipes]",
    "lemon, lemon, lemons, lemon, lemon, lemons",
    "He's running to the market to get a lemon — there's a great sale today.",
    "Does Maven Market carry Eureka lemons or Meyer lemons?",
    "An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",
    "iced tea is my favorite"
]

# expand the column width to see the full sentences
pd.set_option('display.max_colwidth', None)

# turn it into a dataframe
data_df = pd.DataFrame(data, columns=["sentence"])

# create a test series of sentences for later on
test = [
    "We're going to start this course with traditional NLP applications.",
    "Then we'll move on to modern NLP theory.",
    "Finally, we'll wrap things up with modern NLP applications."
]

test_series = pd.Series(test)

# make a copy of the data in case we mess up later on
df = data_df.copy()

# lowercase text
df['sentence_clean'] = df['sentence'].str.lower()

# remove text between brackets, including the brackets
# ChatGPT: use str.replace with regex=true on a series to replace all text within brackets including the brackets, with an empty string
df['sentence_clean'] = df['sentence_clean'].str.replace(r'\[.*?\]', '', regex=True)

# remove punctuation
# ChatGPT: use str.replace with regex=true on a series to replace all punctuation with an empty string
df['sentence_clean'] = df['sentence_clean'].str.replace(r'[^\w\s]', '', regex=True)

# put all text preprocessing steps into a function to better organize our code
def lower_replace(series):
    output = series.str.lower()
    output = output.str.replace(r'\[.*?\]', '', regex=True) # remove words in brackets
    output = output.str.replace(r'[^\w\s]', '', regex=True) # remove punctuation
    return output

# try it out on our test series
lower_replace(test_series)

# use the lower_remove function - the output has lowercased letters, no words in brackets and no punctuation
lower_replace(df.sentence)

0                              when life gives you lemons make lemonade 
1                              she bought 2 lemons for 1 at maven market
2                         a dozen lemons will make a gallon of lemonade 
3                                  lemon lemon lemons lemon lemon lemons
4    hes running to the market to get a lemon  theres a great sale today
5                  does maven market carry eureka lemons or meyer lemons
6                       an arnold palmer is half lemonade half iced tea 
7                                                iced tea is my favorite
Name: sentence, dtype: object

### Text Preprocessing with spaCy

In [4]:
# view our dataframe once again
df

Unnamed: 0,sentence,sentence_clean
0,"When life gives you lemons, make lemonade! 🙂",when life gives you lemons make lemonade
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for 1 at maven market
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade
3,"lemon, lemon, lemons, lemon, lemon, lemons",lemon lemon lemons lemon lemon lemons
4,He's running to the market to get a lemon — there's a great sale today.,hes running to the market to get a lemon theres a great sale today
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",an arnold palmer is half lemonade half iced tea
7,iced tea is my favorite,iced tea is my favorite


In [19]:
import spacy
nlp = spacy.load("en_core_web_sm")
print("Model loaded successfully!")

Model loaded successfully!


In [5]:
# load the spacy english model
# run this code in the command line if you get an error: python -m spacy download en_core_web_sm

import spacy
nlp = spacy.load('en_core_web_sm')

In [6]:
# look at just one phrase
phrase = df.sentence_clean[0]
phrase

'when life gives you lemons make lemonade '

In [7]:
# turn the phrase into a spacy document
doc = nlp(phrase)
doc

when life gives you lemons make lemonade 

### Tokenization lets you break text up into smaller units, like words
- Text strings are often split by whitespace to make tokens
- Sentence tokenization, word tokenization

In [8]:
# break up the text into tokens
[token.text for token in doc]

['when', 'life', 'gives', 'you', 'lemons', 'make', 'lemonade']

### Tokenization using nltk toolkit - install nltk
- !pip install nltk
- import nltk
  nltk.download('popular')
  nltk.download('all')

In [20]:
import nltk
print("NLTK installed successfully!")

NLTK installed successfully!


In [16]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
#nltk.download('punkt')
# Sample text for tokenization
txt = "NLTK provides powerful tools for tokenization. It includes word tokenization and sentence tokenization"

In [17]:
# Word tokenization
words = word_tokenize(txt)
print(words)

['NLTK', 'provides', 'powerful', 'tools', 'for', 'tokenization', '.', 'It', 'includes', 'word', 'tokenization', 'and', 'sentence', 'tokenization']


In [18]:
# Word tokenization
sent = sent_tokenize(txt)
print(sent)

['NLTK provides powerful tools for tokenization.', 'It includes word tokenization and sentence tokenization']


In [21]:
from nltk.tokenize import word_tokenize

input_str = "hello how are$ you!!"
tokens = word_tokenize(input_str)
print(tokens)

['hello', 'how', 'are', '$', 'you', '!', '!']


In [22]:
import nltk

#nltk.download('punkt')

from nltk.tokenize import word_tokenize
import string


input_str = "hello how are$ you!!"

# Tokenize
tokens = nltk.word_tokenize(input_str)

# Remove the special characters
clean_tokens = [token for token in tokens if token.isalnum()]

clean_str = ' '.join(clean_tokens)

print(clean_str)

hello how are you


### Lemmatization reduces words to their base form
- spaCy uses a combination of linguistic rules and statistical models to lemmatize text

In [9]:
# in addition, lemmatize the tokens to their root form
[token.lemma_ for token in doc]

['when', 'life', 'give', 'you', 'lemon', 'make', 'lemonade']

### Stop words are words without any significant meaning
- You can view the full stop word list in spaCy with the code print(nlp.Defaults.stop_words)

### Stop words are words that are filtered out before or after processing natural language data because they are deemed to have little semantic value or are otherwise insignificant for the task at hand. 

Why Remove Stop Words?
- Noise Reduction: Removing stop words helps reduce the amount of irrelevant data, allowing models to focus on more meaningful words. 
innovationyourself.com

- Improved Computational Efficiency: Eliminating stop words reduces the size of the dataset, leading to faster processing and lower computational costs.

- Enhanced Model Performance: By focusing on content-rich words, models can achieve better accuracy in tasks like text classification and sentiment analysis.

In [10]:
# in addition, remove the stop words
norm = [token.lemma_ for token in doc if not token.is_stop]
norm

['life', 'give', 'lemon', 'lemonade']

In [11]:
# side note: view the spacy stop word list
list(nlp.Defaults.stop_words)[:10]

['anything',
 'somewhere',
 'hence',
 'became',
 'front',
 'toward',
 'but',
 'thru',
 'along',
 'amongst']

In [12]:
# convert the list into a string for easier analysis later on
' '.join(norm)

'life give lemon lemonade'

In [24]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Sample Sentence
sentence = "This is a sample sentence, showing off the stop words filtration"
# Tokenize the Sentence
#nltk.download('punkt')
#nltk.download('stopwords')
words = word_tokenize(sentence)

# Filter out stopwords
new_sentence = [word for word in words if word.lower() not in stopwords.words('english')]
# Print the final sentence
print(sentence)
print(new_sentence)

This is a sample sentence, showing off the stop words filtration
['sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration']


### Create a token_lemma_nonstop function

In [13]:
# put all text preprocessing steps into a function to better organize our code
def token_lemma_nonstop(text):
    doc = nlp(text)
    output = [token.lemma_ for token in doc if not token.is_stop] # tokenize, lemmatize and remove stop words
    output = ' '.join(output) # convert list into string
    return output

In [14]:
# try it out on our test series, this time using .apply
test_series.apply(token_lemma_nonstop)

0    go start course traditional NLP application .
1                              modern NLP theory .
2    finally , wrap thing modern NLP application .
dtype: object

In [15]:
# apply the function on a column of text - the output is tokenized, lemmatized and has no stop words
lower_replace(df.sentence).apply(token_lemma_nonstop)

0                       life give lemon lemonade
1                     buy 2 lemon 1 maven market
2                    dozen lemon gallon lemonade
3            lemon lemon lemon lemon lemon lemon
4        s run market lemon   s great sale today
5    maven market carry eureka lemon meyer lemon
6       arnold palmer half lemonade half ice tea
7                               ice tea favorite
Name: sentence, dtype: object

### Parts of speech (POS) tagging lets you label nouns, verbs, etc. within text data
- This is optional, but is sometimes used as a filtering technique to only look at nouns and pronouns for analysis

In [25]:
# look at just one phrase
phrase2 = lower_replace(df.sentence).apply(token_lemma_nonstop)[0]
phrase2

'life give lemon lemonade'

In [26]:
# turn the phrase into a spacy document
doc2 = nlp(phrase2)
doc2

life give lemon lemonade

In [27]:
# view the parts of speech tags
pos = [(token.text, token.pos_) for token in doc2]
pos

[('life', 'NOUN'), ('give', 'VERB'), ('lemon', 'NOUN'), ('lemonade', 'PROPN')]

In [28]:
# filter on just the nouns and proper nouns
nouns = [(token.text) for token in doc2 if token.pos_ in ['NOUN', 'PROPN']]
nouns

['life', 'lemon', 'lemonade']

In [29]:
# convert the list into a string for easier analysis later on
' '.join(nouns)

'life lemon lemonade'

### Create a filter_pos function

In [31]:
# filter by parts of speech
def filter_pos(text, pos_list=['NOUN', 'PROPN']):
    doc = nlp(text)
    output = [(token.text) for token in doc if token.pos_ in pos_list] # return on the nouns and pronouns
    output = ' '.join(output) # convert list into string
    return output

In [32]:
# try it out on our test data
test_series.apply(filter_pos)

0    course NLP applications
1                 NLP theory
2    things NLP applications
dtype: object

In [33]:
# apply the function on a column of text - the output only includes nouns and proper nouns
lower_replace(df.sentence).apply(token_lemma_nonstop).apply(filter_pos)

0                    life lemon lemonade
1                     lemon maven market
2            dozen lemon gallon lemonade
3    lemon lemon lemon lemon lemon lemon
4              s market lemon sale today
5        maven market eureka lemon lemon
6           palmer lemonade half ice tea
7                       ice tea favorite
Name: sentence, dtype: object

In [34]:
# notice that by switching the order of the normalization steps, we get different results
lower_replace(df.sentence).apply(filter_pos).apply(token_lemma_nonstop)

0              life lemon lemonade
1               lemon maven market
2      dozen lemon gallon lemonade
3    lemon lemon lemon lemon lemon
4          market lemon sale today
5        market eureka lemon lemon
6              palmer lemonade tea
7                              tea
Name: sentence, dtype: object

## Create an NLP Pipeline

In [36]:
# copy down the helper functions
def lower_replace(series):
    output = series.str.lower()
    output = output.str.replace(r'\[.*?\]', '', regex=True)
    output = output.str.replace(r'[^\w\s]', '', regex=True)
    return output

def token_lemma_nonstop(text):
    doc = nlp(text)
    output = [token.lemma_ for token in doc if not token.is_stop]
    output = ' '.join(output)
    return output 

def filter_pos(text, pos_list=['NOUN', 'PROPN']):
    doc = nlp(text)
    output = [(token.text) for token in doc if token.pos_ in pos_list]
    output = ' '.join(output)
    return output

In [37]:
# create an nlp pipeline
def nlp_pipeline(series):
    output = lower_replace(series)
    output = output.apply(token_lemma_nonstop)
    output = output.apply(filter_pos)
    return output

In [38]:
# view the cleaned, normalized and filtered test sentences
nlp_pipeline(test_series)

0          nlp application
1               nlp theory
2    thing nlp application
dtype: object

In [39]:
# view the original sentences
df.sentence

0                               When life gives you lemons, make lemonade! 🙂
1                                She bought 2 lemons for $1 at Maven Market.
2                A dozen lemons will make a gallon of lemonade. [AllRecipes]
3                                 lemon, lemon, lemons, lemon, lemon, lemons
4    He's running to the market to get a lemon — there's a great sale today.
5                     Does Maven Market carry Eureka lemons or Meyer lemons?
6              An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]
7                                                    iced tea is my favorite
Name: sentence, dtype: object

In [40]:
# view the cleaned and normalized sentences
text_clean = nlp_pipeline(df.sentence)
text_clean

0                    life lemon lemonade
1                     lemon maven market
2            dozen lemon gallon lemonade
3    lemon lemon lemon lemon lemon lemon
4              s market lemon sale today
5        maven market eureka lemon lemon
6           palmer lemonade half ice tea
7                       ice tea favorite
Name: sentence, dtype: object

### Save your cleaned data as a pickle file

In [41]:
# save the output as a pickle file to load into a notebook later on
pd.to_pickle(text_clean, '../data/text_clean.pkl')