# Stop Words
Stop words are those words that do not contribute to the deeper meaning of the phrase. They are the most common words, such as "the", "a", and "is". For some applications, like document classification, it may make sense to remove stop words. NLTK provides a list of commonly agreed-upon stop words for a variety of languages, such as English.

In [1]:
# Perform standard imports and load the spaCy model 'en_core_web_sm'.
# The resulting `nlp` object is used below for its default stop words
# (nlp.Defaults.stop_words) and its vocabulary (nlp.vocab):
import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
# Fetch NLTK's 'stopwords' corpus so that stopwords.words(...) can be used later
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Print spaCy's default stop-word set (sets are unordered, so the order
# shown can differ from run to run):
print(nlp.Defaults.stop_words)

{'n’t', 'say', 'behind', 'wherever', 'throughout', 'each', 'about', 'every', 'almost', 'been', 'give', 'after', 'did', 'elsewhere', 'regarding', 'itself', 'n‘t', '’m', 'nobody', 'though', 'others', 'during', 'can', 'everything', '’ve', 'anyone', 'if', 'unless', 'at', 'call', 'where', 'for', 'go', 'same', 'again', 'most', 'sometimes', 'thru', 'he', 'there', 'whatever', 'by', 'than', 'ever', 'towards', 'therefore', 'even', 'would', 'among', 'moreover', 'someone', 'none', "'ll", 'my', 'side', 'everyone', 'his', 'against', 'thereafter', 'upon', 'full', 'while', 'under', 'least', 'until', 'whoever', 'keep', 'still', 'our', 'thus', 'themselves', 'well', 'on', 'somewhere', 'from', '‘ve', 'third', 'those', 'fifteen', 'all', 'herself', 'due', 'this', 'amongst', 'get', 'is', 'via', 'what', 'very', 'hereby', 'serious', 'its', '’re', 'enough', 'hence', "n't", 'your', 'and', 'seems', 'her', 'bottom', 'latter', 'not', 'quite', 'since', 'something', 'whence', 'using', 'forty', 'cannot', 'has', 'seemi

In [4]:
# Number of entries in spaCy's default stop-word set
len(nlp.Defaults.stop_words)

326

## To see if a word is a stop word

In [5]:
# Look the word up in the vocabulary and read its is_stop flag
nlp.vocab['myself'].is_stop

True

In [6]:
# Same check for a word that is not flagged as a stop word by default
nlp.vocab['mystery'].is_stop

False

In [7]:
# Add the word to the default stop-word set. Use lowercase!
# The lexeme's is_stop flag is set separately below.
nlp.Defaults.stop_words.add('mystery')

In [8]:
# Set the is_stop flag on the word's lexeme (its entry in the vocabulary)
nlp.vocab['mystery'].is_stop = True

In [10]:
# Re-count the stop words after the addition
len(nlp.Defaults.stop_words)

327

In [11]:
# Confirm that 'mystery' is now flagged as a stop word
nlp.vocab['mystery'].is_stop

True

## To remove a stop word
Alternatively, you may decide that `'beyond'` should not be considered a stop word.

In [12]:
# Remove the word from the set of stop words
# (set.remove raises KeyError if the word is absent, e.g. when this cell is re-run)
nlp.Defaults.stop_words.remove('beyond')

# Clear the is_stop flag on the word's lexeme so lookups agree with the set
nlp.vocab['beyond'].is_stop = False

In [13]:
# Re-count the stop words after the removal
len(nlp.Defaults.stop_words)

326

In [14]:
# Confirm that 'beyond' is no longer flagged as a stop word
nlp.vocab['beyond'].is_stop

False

In [15]:
# Standard-library helpers used for punctuation stripping below
import string
import re
# NLTK tokenizers and stop-word corpus (nltk was already imported in an earlier cell)
import nltk
# Download the 'punkt' tokenizer data before tokenizing
nltk.download('punkt')
# NOTE(review): sent_tokenize is imported but not used in the visible cells
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
# load data: a one-sentence sample that is run through the cleaning steps below
text = 'The Quick brown fox jump over the lazy dog!'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [16]:
# split into tokens; punctuation such as '!' becomes a token of its own
tokens = word_tokenize(text)
print(tokens)

['The', 'Quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '!']


In [17]:
# normalise case: lower-case every token so later comparisons are case-insensitive
tokens = list(map(str.lower, tokens))
print(tokens)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '!']


In [18]:
# build a character-class regex that matches any single character from
# string.punctuation (escaped so each one is taken literally)
re_punc = re.compile('[{}]'.format(re.escape(string.punctuation)))
print(re_punc)

re.compile('[\\!\\"\\#\\$\\%\\&\\\'\\(\\)\\*\\+\\,\\-\\.\\/\\:\\;\\<\\=\\>\\?\\@\\[\\\\\\]\\^_\\`\\{\\|\\}\\~]')


In [19]:
# strip punctuation characters out of every token
# (a token made only of punctuation becomes an empty string)
stripped = [re_punc.sub('', token) for token in tokens]
print(stripped)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '']


In [20]:
# keep only purely alphabetic tokens; this also drops the empty strings
# left behind by the punctuation-stripping step
words = list(filter(str.isalpha, stripped))
print(words)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog']


In [21]:
# filter out stop words: keep only the tokens that are NOT in NLTK's
# English stop-word list (built as a set so each membership test is O(1))
stop_words = set(stopwords.words('english'))
words = [w for w in words if w not in stop_words]
print(words)

['quick', 'brown', 'fox', 'jump', 'lazy', 'dog']


In [23]:
# Cross-check with spaCy: 'brown' survived the NLTK filter above, and
# spaCy's vocabulary does not flag it as a stop word either
nlp.vocab['brown'].is_stop

False