In [None]:
pip install -q -U nltk

In [None]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer, LancasterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag, pos_tag_sents


In [None]:
# Fetch every corpus/model the cells below rely on.
# NLTK 3.9 renamed several resources (pickle-free formats), and the install
# cell upgrades to the newest release, so both the legacy ids and their
# replacements are requested. nltk.download() reports an unknown id and
# returns False instead of raising, so the extra ids are harmless on any version.
NLTK_RESOURCES = (
    "stopwords",
    "punkt",
    "punkt_tab",                        # tokenizers on NLTK >= 3.9
    "tagsets",
    "tagsets_json",                     # nltk.help.upenn_tagset on NLTK >= 3.9
    "wordnet",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",   # pos_tag on NLTK >= 3.9
    "maxent_ne_chunker",
    "maxent_ne_chunker_tab",            # ne_chunk on NLTK >= 3.9
    "words",
)
for resource_id in NLTK_RESOURCES:
    nltk.download(resource_id)

In [5]:
# Sample passage analysed by the tokenization cells below. Written as adjacent
# string literals so the embedded line breaks (and the trailing spaces before
# them) are explicit.
text = (
    "Natural Language Processing (NLP) is a field of AI that uses \n"
    "Machine Learning (ML) to help computers understand and generate human \n"
    "language. By applying algorithms to data, NLP enables tasks like text analysis, translations, \n"
    "and question answering. ML improves these systems by allowing them to learn and adapt from examples."
)

In [None]:
# Split the passage into sentences, then show the container type, the
# sentences themselves and how many were found.
tokenized_sent = sent_tokenize(text, language="english")
print(type(tokenized_sent), tokenized_sent, len(tokenized_sent), sep="\n")

In [None]:
# Split the passage into word-level tokens, then show the container type,
# the tokens themselves and how many were produced.
tokens = word_tokenize(text, language="english")
print(type(tokens), tokens, len(tokens), sep="\n")

In [None]:
# Load NLTK's English stop-word list: very common words that mostly add
# noise to frequency-based results. Printed for inspection.
stops = stopwords.words("english")
print(stops)

In [None]:
# Remove the stop words from the token list.
# NLTK's English stop-word list is all lowercase, so each token is lowercased
# for the comparison; otherwise capitalised stop words such as "By" at the
# start of a sentence slip through. A set makes each membership test O(1).
stop_set = set(stops)
tokens = [p for p in tokens if p.lower() not in stop_set]
print(tokens)

In [None]:
# Remove punctuation tokens.
# string.punctuation is a single string, so `p not in string.punctuation`
# is a substring test: it only drops tokens that happen to appear verbatim
# inside that string and keeps multi-character punctuation tokens such as
# "''", "..." or "--". Instead, drop a token when every one of its characters
# is a punctuation character.
punctuation_chars = set(string.punctuation)
tokens_without_punct = [
    p for p in tokens if not all(ch in punctuation_chars for ch in p)
]
print(tokens_without_punct)

In [None]:
# Count how often each remaining token occurs; the bare last expression
# displays the resulting frequency distribution as the cell output.
tokens_freq = nltk.FreqDist(tokens_without_punct)
tokens_freq

In [None]:
# The most frequent tokens, as (token, count) pairs in descending order.
TOP_N = 3
most_common = tokens_freq.most_common(TOP_N)
most_common

In [None]:
# Stem every token with the Porter algorithm and print the tokens and their
# stems on consecutive lines for comparison.
porter_stemmer = PorterStemmer()
porter_stemmed = list(map(porter_stemmer.stem, tokens_without_punct))
print(tokens_without_punct, porter_stemmed, sep="\n")

In [None]:
# Stem every token with the English Snowball stemmer and print the tokens and
# their stems on consecutive lines for comparison.
snowball_stemmer = SnowballStemmer("english")
snowball_stemmed = list(map(snowball_stemmer.stem, tokens_without_punct))
print(tokens_without_punct, snowball_stemmed, sep="\n")

In [None]:
# Stem every token with the Lancaster stemmer and print the tokens and their
# stems on consecutive lines for comparison.
lancaster_stemmer = LancasterStemmer()
lancaster_stemmed = list(map(lancaster_stemmer.stem, tokens_without_punct))
print(tokens_without_punct, lancaster_stemmed, sep="\n")

In [None]:
# Print NLTK's reference for the Penn Treebank tagset, so the part-of-speech
# tags produced in the following cells can be looked up.
nltk.help.upenn_tagset()

In [None]:
# Tag each remaining token with its part of speech; the result is a list of
# (token, tag) pairs.
pos = pos_tag(tokens_without_punct)
print(pos)

In [None]:
# Lemmatize words (sometimes better than stemming, because the result is a
# dictionary form rather than a truncated stem).
lemmatizer = WordNetLemmatizer()

# WordNetLemmatizer.lemmatize() assumes pos="n" (noun) unless told otherwise,
# which leaves verbs and adjectives such as "applying" unreduced. Tag the
# tokens first and translate the first letter of each Penn Treebank tag into
# the matching WordNet POS code, falling back to noun for anything else.
PENN_TO_WORDNET = {"J": "a", "V": "v", "N": "n", "R": "r"}

# Work on the punctuation-free tokens, like the stemmer cells do, so the
# outputs are directly comparable.
lemma_tags = pos_tag(tokens_without_punct)
lemmatized = [
    lemmatizer.lemmatize(token, pos=PENN_TO_WORDNET.get(tag[:1], "n"))
    for token, tag in lemma_tags
]
print(tokens_without_punct)
print(lemmatized)

In [None]:
# Named entities are the places, trademarks and names mentioned in a text.
txt_en = "Apple plans to open a new store in New York City next to Central Park by the end of October."

# Pipeline: tokenize -> part-of-speech tag -> chunk.
en_token = word_tokenize(txt_en, language="english")
tags = pos_tag(en_token)

# This is where the named entities are recognised: ne_chunk works on the
# POS-tagged tokens, and the printed result shows the entities it found.
en = nltk.ne_chunk(tags, binary=False)
print(en)