# NLP Demystified | Preprocessing: Case_folding, Stop words Removal, Stemming, Lemmetization

In [1]:
import spacy

In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


You should consider upgrading via the 'c:\Users\chiou\OneDrive\Bureau\Projects_CDL\NLP demystified\NLPdymenv\Scripts\python.exe -m pip install --upgrade pip' command.


In [3]:
nlp = spacy.load('en_core_web_sm')

In [5]:
s = "He told Dr. Lovato that he was done with the tests and would post the results shortly."
doc = nlp(s)

### Case-Folding

In [8]:
print([t.lower_ for t in doc])

['he', 'told', 'dr.', 'lovato', 'that', 'he', 'was', 'done', 'with', 'the', 'tests', 'and', 'would', 'post', 'the', 'results', 'shortly', '.']


In [9]:
print([t.lower_ if not t.is_sent_start else t for t in doc])

[He, 'told', 'dr.', 'lovato', 'that', 'he', 'was', 'done', 'with', 'the', 'tests', 'and', 'would', 'post', 'the', 'results', 'shortly', '.']


### Stop words removal

In [10]:
print(nlp.Defaults.stop_words)

{'becomes', 'yourself', 'already', 'among', 'though', 'thereby', 'whose', 'whereby', 'anyway', 'whether', 're', 'eleven', 'quite', 'still', 'are', '’s', 'doing', 'although', 'any', 'through', 'ten', 'once', 'or', 'serious', 'latterly', 'seeming', 'over', 'i', 'say', 'third', 'these', 'next', 'otherwise', 'it', 'would', 'such', 'his', 'several', 'within', 'out', 'made', 'thus', 'not', 'our', "'m", '’ll', 'nobody', 'whereas', "'ll", 'go', 'same', 'because', 'six', 'up', 'hers', 'nine', 'them', 'moreover', 'to', 'her', 'thereafter', 'myself', 'whatever', 'itself', 'its', 'so', 'also', 'really', 'seems', 'where', '‘re', 'indeed', 'whence', 'twenty', 'he', 'must', 'were', 'has', 'something', 'toward', 'during', 'an', '’d', 'twelve', 'ours', 'than', 'everywhere', 'beforehand', 'via', 'noone', 'anyhow', 'anything', 'never', "'s", '’re', 'alone', 'upon', 'less', 'give', '’ve', 'n’t', 'ourselves', 'now', 'regarding', 'onto', 'do', 'fifty', 'under', 'your', "n't", 'thereupon', 'somewhere', 'bott

In [11]:
print(len(nlp.Defaults.stop_words))

326


In [16]:
print([t.lower_ if not t.sent_start else t for t in doc if not t.is_stop ])

[told, Dr., Lovato, tests, post, results, shortly, .]


### Lemmetization

In [17]:
print([(t.text, t.lemma_) for t in doc])

[('He', 'he'), ('told', 'tell'), ('Dr.', 'Dr.'), ('Lovato', 'Lovato'), ('that', 'that'), ('he', 'he'), ('was', 'be'), ('done', 'do'), ('with', 'with'), ('the', 'the'), ('tests', 'test'), ('and', 'and'), ('would', 'would'), ('post', 'post'), ('the', 'the'), ('results', 'result'), ('shortly', 'shortly'), ('.', '.')]


### Exercices 

In [32]:
# EXERCISE: Find out how to intialize the SnowballStemmer, then tokenize and stem the sentence below.

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TreebankWordTokenizer
s = 'He told Dr. Lovato that he was done with the tests and would post the results shortly.'

# Initialize the stemmer here.
stemmer = SnowballStemmer("english")

# Tokenize, stem, and print the tokens.
tokenizer = TreebankWordTokenizer()
doc = tokenizer.tokenize(s)
stemmed_tokens = [stemmer.stem(token)  for token in doc]
print(stemmed_tokens)


['he', 'told', 'dr.', 'lovato', 'that', 'he', 'was', 'done', 'with', 'the', 'test', 'and', 'would', 'post', 'the', 'result', 'short', '.']


In [68]:
## Adding custom stop words : 

# EXERCISE: Find out how to add and remove your own stop words in spaCy. Add the 
# word 'told' as a stop word, test that it works, then remove it from 
# the stop word list.

from spacy.lang.en.stop_words import STOP_WORDS

STOP_WORDS.add('told')
print(STOP_WORDS)
print(len(STOP_WORDS))

{'becomes', 'yourself', 'already', 'among', 'though', 'thereby', 'whose', 'whereby', 'anyway', 'whether', 're', 'eleven', 'quite', 'still', 'are', '’s', 'doing', 'although', 'any', 'through', 'ten', 'once', 'or', 'serious', 'latterly', 'seeming', 'over', 'i', 'say', 'third', 'these', 'next', 'otherwise', 'it', 'would', 'such', 'his', 'several', 'within', 'out', 'made', 'thus', 'not', 'our', "'m", '’ll', 'nobody', 'whereas', "'ll", 'go', 'same', 'because', 'six', 'up', 'hers', 'told', 'nine', 'them', 'moreover', 'to', 'her', 'thereafter', 'myself', 'whatever', 'itself', 'its', 'so', 'also', 'really', 'seems', 'where', '‘re', 'indeed', 'whence', 'twenty', 'he', 'must', 'were', 'has', 'something', 'toward', 'during', 'an', '’d', 'twelve', 'ours', 'than', 'everywhere', 'beforehand', 'via', 'noone', 'anyhow', 'anything', 'never', "'s", '’re', 'alone', 'upon', 'less', 'give', '’ve', 'n’t', 'ourselves', 'now', 'regarding', 'onto', 'do', 'fifty', 'under', 'your', "n't", 'thereupon', 'somewhere

In [69]:
nlp = spacy.load("en_core_web_sm")
s = "I told you we needed more glitter very old lame meme"
doc = nlp(s)
print([t  for t in doc if not t.is_stop])

[needed, glitter, old, lame, meme]


In [70]:
STOP_WORDS.remove('told')
print(STOP_WORDS)
print(len(STOP_WORDS))

{'becomes', 'yourself', 'already', 'among', 'though', 'thereby', 'whose', 'whereby', 'anyway', 'whether', 're', 'eleven', 'quite', 'still', 'are', '’s', 'doing', 'although', 'any', 'through', 'ten', 'once', 'or', 'serious', 'latterly', 'seeming', 'over', 'i', 'say', 'third', 'these', 'next', 'otherwise', 'it', 'would', 'such', 'his', 'several', 'within', 'out', 'made', 'thus', 'not', 'our', "'m", '’ll', 'nobody', 'whereas', "'ll", 'go', 'same', 'because', 'six', 'up', 'hers', 'nine', 'them', 'moreover', 'to', 'her', 'thereafter', 'myself', 'whatever', 'itself', 'its', 'so', 'also', 'really', 'seems', 'where', '‘re', 'indeed', 'whence', 'twenty', 'he', 'must', 'were', 'has', 'something', 'toward', 'during', 'an', '’d', 'twelve', 'ours', 'than', 'everywhere', 'beforehand', 'via', 'noone', 'anyhow', 'anything', 'never', "'s", '’re', 'alone', 'upon', 'less', 'give', '’ve', 'n’t', 'ourselves', 'now', 'regarding', 'onto', 'do', 'fifty', 'under', 'your', "n't", 'thereupon', 'somewhere', 'bott

In [71]:
nlp = spacy.load("en_core_web_sm")
s = "I told you we needed more glitter very old lame meme"
doc = nlp(s)
print([t  for t in doc if not t.is_stop])

[told, needed, glitter, old, lame, meme]
