# Tips : Some of the text preprocessing techniques,

- Tokenization
- Lemmatization
- Removing Punctuations and Stopwords
- Part of Speech Tagging
- Entity Recognition

## Tokenization:

In [7]:
#importing libraries
import spacy
#instantiating English module
nlp = spacy.load("../Dependency Parsing Project/en_core_web_sm/en_core_web_sm-3.4.1")
#sample
x = "Embracing and analyzing self failures (of however multitude) is a virtue of nobelmen."
#creating doc object containing our token features
doc = nlp(x)
#Creating and updating our list of tokens using list comprehension 
tokens = [token.text for token in doc]
print(tokens)

['Embracing', 'and', 'analyzing', 'self', 'failures', '(', 'of', 'however', 'multitude', ')', 'is', 'a', 'virtue', 'of', 'nobelmen', '.']


In [11]:
nlp = spacy.load("../Dependency Parsing Project/en_core_web_sm/en_core_web_sm-3.4.1")
#Creating the pipeline 'sentencizer' component
sbd = nlp.create_pipe('sentencizer')
# Adding the component to the pipeline
nlp.add_pipe(sbd)
x = "Embracing and analyzing self failures (of however multitude) is a virtue of nobelmen. And nobility is a treasure few possess."
#creating doc object carring our sentence tokens
doc = nlp(x)
#Creating and updating our list of tokens using list comprehension 
tokens = [token for token in doc.sents]
print(tokens)

ValueError: [E966] `nlp.add_pipe` now takes the string name of the registered component factory, not a callable component. Expected string, but got <spacy.pipeline.sentencizer.Sentencizer object at 0x0000020665DEBA00> (name: 'None').

- If you created your component with `nlp.create_pipe('name')`: remove nlp.create_pipe and call `nlp.add_pipe('name')` instead.

- If you passed in a component like `TextCategorizer()`: call `nlp.add_pipe` with the string name instead, e.g. `nlp.add_pipe('textcat')`.

- If you're using a custom component: Add the decorator `@Language.component` (for function components) or `@Language.factory` (for class components / factories) to your custom component and assign it a name, e.g. `@Language.component('your_name')`. You can then run `nlp.add_pipe('your_name')` to add it to the pipeline.

## Lemmatization

In [12]:
#sample
x = "Running down the street with my best buddy."
#creating doc object containing our token features
doc = nlp(x)
#Creating and updating our list of tokens using list comprehension 
tokens = [[token.text,token.lemma_] for token in doc]
print(tokens)

[['Running', 'run'], ['down', 'down'], ['the', 'the'], ['street', 'street'], ['with', 'with'], ['my', 'my'], ['best', 'good'], ['buddy', 'buddy'], ['.', '.']]


## Removing Stop Words

In [13]:
from spacy.lang.en.stop_words import STOP_WORDS
stop = STOP_WORDS
print(stop)

{'once', 'enough', 'be', 'first', 'herself', 'against', 'many', 'whence', 'here', 'nowhere', 'most', 'hundred', 'somewhere', 'out', 'she', 'of', 'via', '’m', 'amount', 'latter', 'or', 'i', 'empty', "'d", 'not', 'just', 'everything', 'several', 'whereby', 'less', 'two', 'those', 'this', 'an', 'you', 'every', 'both', 'everywhere', 'thereupon', 'well', 'her', 'hers', 'then', 'forty', 'until', 'seeming', 'is', 'do', 'mine', 'themselves', 'whom', 'seems', 'can', 'few', 'yourself', "n't", 'itself', 'could', 'either', 'top', 'rather', 'moreover', 'whatever', 'after', 'although', 'which', 'them', 'again', 'anyway', 'since', '‘s', 'where', 'alone', 'however', 'by', "'re", 'bottom', 'does', 'along', 'without', 'something', 'from', 'because', 'therein', 'n‘t', 'part', 'used', 'why', 'has', 'thence', 'thereafter', 'twelve', 'therefore', 'four', 'name', 'was', 'show', 'whereas', 'seem', 'in', 'unless', 'more', 'throughout', 'up', 'one', 'really', 'these', 'whole', 'between', 'eight', 'everyone', 'b

In [14]:
#sample
x = "Running down the street with my best buddy."
#creation of doc object containing our token features
doc = nlp(x)
#Creating and updating our list of tokens using list comprehension 
tokens = [token.text for token in doc]
print(tokens)
#Creating and updating our list of filtered tokens using list comprehension 
filtered = [token.text for token in doc if token.is_stop == False]
print(filtered)

['Running', 'down', 'the', 'street', 'with', 'my', 'best', 'buddy', '.']
['Running', 'street', 'best', 'buddy', '.']


In [15]:
#sample 
x = "BLIMEY!! Such an exhausting day, I can't even describe."
#creation of doc object containing our token features
doc = nlp(x)
#Unfiltered tokens 
tokens = [token.text for token in doc]
print(tokens)
#Filtering our tokens
filtered = [token.text for token in doc if token.is_stop == False and       
token.text.isalpha() == True]
print(filtered)

['BLIMEY', '!', '!', 'Such', 'an', 'exhausting', 'day', ',', 'I', 'ca', "n't", 'even', 'describe', '.']
['BLIMEY', 'exhausting', 'day', 'describe']


## Part-of-Speech Tagging (POS)

In [16]:
#sample
x = "Robin is an astute programmer"
#Creating doc object
doc = nlp(x)
#Extracting POS
pos = [[token.text,token.pos_] for token in doc]
print (pos)

[['Robin', 'PROPN'], ['is', 'AUX'], ['an', 'DET'], ['astute', 'ADJ'], ['programmer', 'NOUN']]
