In [3]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages the steps below rely on:
#   - 'punkt' / 'punkt_tab': Punkt models used by word_tokenize. Recent NLTK
#     releases load the tokenizer from 'punkt_tab' instead of the pickled
#     'punkt' package and raise LookupError if it is missing, so both are
#     requested to keep the cell working across NLTK versions.
#   - 'stopwords': the English stop-word list.
#   - 'wordnet': the lexical database behind WordNetLemmatizer.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Sample passage about natural language processing, used as the input to the
# tokenization, stop-word removal, stemming and lemmatization steps below.
# Note: the literal starts and ends with a newline and keeps its hard line
# breaks; word_tokenize treats them as ordinary whitespace.
text = """
Natural language processing (NLP) is a subfield of linguistics, computer 
science, and artificial intelligence. 
It involves the interactions between computers and humans using the natural 
language. The ultimate objective 
of NLP is to read, decipher, understand, and make sense of the human language in
a valuable way. It started 
in the 1950s, although work can be found from earlier periods. In 1950, Alan 
Turing published an article titled 
"Computing Machinery and Intelligence" which proposed what is now called the 
Turing test as a criterion of 
intelligence, a task that involves the automated interpretation and generation 
of natural language, but at the 
time not articulated as a problem separate from artificial intelligence. The 
premise of symbolic NLP is 
well-summarized by John Searle's Chinese room experiment: Given a collection of 
rules (e.g., a Chinese phrasebook, 
with questions and matching answers), the computer emulates natural language 
understanding (or other NLP tasks) 
by applying those rules to the data it is confronted with. 2023 is the year when
NLP got its major breakthrough.
"""

# Task 1: Tokenization
def tokenize_text(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize``.

    Args:
        text: The raw string to tokenize.

    Returns:
        The list of tokens produced by ``word_tokenize``; punctuation marks
        come back as tokens of their own.
    """
    word_tokens = word_tokenize(text)
    return word_tokens

# Task 2: Stop Word Removal
def remove_stop_words(tokens):
    """Drop English stop words from ``tokens``.

    The comparison is done on the lowercased token, so "The" and "the" are
    both removed; tokens that survive keep their original casing and order.
    Only stop-list entries are filtered, so punctuation tokens pass through.

    Args:
        tokens: An iterable of string tokens.

    Returns:
        A new list containing the tokens that are not English stop words.
    """
    english_stop_words = set(nltk.corpus.stopwords.words('english'))
    kept = []
    for candidate in tokens:
        if candidate.lower() in english_stop_words:
            continue
        kept.append(candidate)
    return kept

# Task 3: Stemming
def perform_stemming(filtered_tokens):
    """Stem every token with NLTK's ``PorterStemmer``.

    Args:
        filtered_tokens: An iterable of string tokens (typically with stop
            words already removed).

    Returns:
        A list with the stem of each input token, in the same order.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, filtered_tokens))

# Task 4: Lemmatization
def perform_lemmatization(filtered_tokens):
    """Lemmatize every token with NLTK's ``WordNetLemmatizer``.

    No part-of-speech argument is passed to ``lemmatize``, so the
    lemmatizer's default part of speech is applied to every token.

    Args:
        filtered_tokens: An iterable of string tokens (typically with stop
            words already removed).

    Returns:
        A list with the lemma of each input token, in the same order.
    """
    wordnet_lemmatizer = nltk.WordNetLemmatizer()
    lemmas = []
    for word in filtered_tokens:
        lemmas.append(wordnet_lemmatizer.lemmatize(word))
    return lemmas

# Now use the functions to process the text
# Step 1: split the passage into tokens.
tokens = tokenize_text(text)

# Step 2: discard English stop words.
filtered_tokens = remove_stop_words(tokens)

# Steps 3 and 4 both start from the stop-word-filtered tokens, so the stemmed
# and lemmatized lists line up position by position.
stemmed_tokens = perform_stemming(filtered_tokens)
lemmatized_tokens = perform_lemmatization(filtered_tokens)

# Report the result of each stage.
print(f"Tokenized tokens: {tokens}")
print(f"Filtered tokens (after stop word removal): {filtered_tokens}")
print(f"Stemmed tokens: {stemmed_tokens}")
print(f"Lemmatized tokens: {lemmatized_tokens}")


[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...


Tokenized tokens: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'linguistics', ',', 'computer', 'science', ',', 'and', 'artificial', 'intelligence', '.', 'It', 'involves', 'the', 'interactions', 'between', 'computers', 'and', 'humans', 'using', 'the', 'natural', 'language', '.', 'The', 'ultimate', 'objective', 'of', 'NLP', 'is', 'to', 'read', ',', 'decipher', ',', 'understand', ',', 'and', 'make', 'sense', 'of', 'the', 'human', 'language', 'in', 'a', 'valuable', 'way', '.', 'It', 'started', 'in', 'the', '1950s', ',', 'although', 'work', 'can', 'be', 'found', 'from', 'earlier', 'periods', '.', 'In', '1950', ',', 'Alan', 'Turing', 'published', 'an', 'article', 'titled', "''", 'Computing', 'Machinery', 'and', 'Intelligence', "''", 'which', 'proposed', 'what', 'is', 'now', 'called', 'the', 'Turing', 'test', 'as', 'a', 'criterion', 'of', 'intelligence', ',', 'a', 'task', 'that', 'involves', 'the', 'automated', 'interpretation', 'and', 'generation', 'of'