In [21]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import spacy

In [2]:
# Fetch every NLTK resource this notebook relies on.
# Re-running is cheap: packages that are already present are reported as up-to-date.
nltk.download('punkt') # punkt -> sentence and word tokenizer model
nltk.download('punkt_tab') # tokenizer tables looked up by newer NLTK releases
nltk.download('stopwords') # all the common stop words: 'a', 'in', etc.
nltk.download('wordnet') # WordNet is a lexical database used to find the base form of words
# pos_tag() loads the averaged-perceptron tagger model; without it the POS-tagging
# cell raises LookupError on a machine that has never downloaded the model.
nltk.download('averaged_perceptron_tagger') # resource name used by older NLTK releases
nltk.download('averaged_perceptron_tagger_eng') # resource name used by newer NLTK releases

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Two-sentence sample used by the tokenization, stop-word and lemmatization cells.
text = (
    "Natural Language Processing is very interesting subject. "
    "This is another sentence"
)

In [5]:
# Tokenization: first the sentence-level split, then the word-level split,
# each printed on its own line.
print(sent_tokenize(text), word_tokenize(text), sep="\n")

['Natural Language Processing is very interesting subject.', 'This is another sentence']
['Natural', 'Language', 'Processing', 'is', 'very', 'interesting', 'subject', '.', 'This', 'is', 'another', 'sentence']


In [6]:
# Remove stop words.
# Matching is case-insensitive (tokens are lowercased only for the lookup),
# and punctuation tokens such as '.' are not stop words, so they are kept.
stop_words = set(stopwords.words('english'))
filtered_words = list(
    filter(lambda token: token.lower() not in stop_words, word_tokenize(text))
)
print(filtered_words)

['Natural', 'Language', 'Processing', 'interesting', 'subject', '.', 'another', 'sentence']


In [17]:
# Lemmatization: reduce each remaining token to its base form.
# Tokens are lowercased first, and pos='v' makes WordNet treat every token as a
# verb (e.g. 'interesting' -> 'interest').
lemmatizer = WordNetLemmatizer()
lemmatized = list(
    map(lambda token: lemmatizer.lemmatize(token.lower(), pos='v'), filtered_words)
)

In [18]:
# Show the lemmas produced from `filtered_words` by the lemmatization cell.
print(lemmatized)

['natural', 'language', 'process', 'interest', 'subject', '.', 'another', 'sentence']


In [22]:
# POS tagging: label each token with its grammatical role.
# pos_tag returns (token, tag) pairs, which are printed as a list.
tokens = word_tokenize("Apple is opening a new office in India.")
tagged_tokens = pos_tag(tokens)
print(tagged_tokens)

[('Apple', 'NNP'), ('is', 'VBZ'), ('opening', 'VBG'), ('a', 'DT'), ('new', 'JJ'), ('office', 'NN'), ('in', 'IN'), ('India', 'NNP'), ('.', '.')]


In [26]:
# Simplify things using spaCy: a single pipeline call produces the `doc` object
# from which the following cells read POS tags (token.pos_) and named entities (doc.ents).
# NOTE: the "en_core_web_sm" pipeline must already be installed for spacy.load to find it
# (typically via `python -m spacy download en_core_web_sm` -- confirm for your environment).
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is planning to open a new office in India.")

In [27]:
# One line per token: its surface text followed by its coarse POS label.
for token in doc:
    print(f"{token.text} {token.pos_}")

Apple PROPN
is AUX
planning VERB
to PART
open VERB
a DET
new ADJ
office NOUN
in ADP
India PROPN
. PUNCT


In [30]:
# One line per named entity found in `doc`: entity text -> entity label.
for entity in doc.ents:
    print(f"{entity.text} -> {entity.label_}")

Apple -> ORG
India -> GPE
