In [3]:
import nltk
# pip install nltk
import spacy
# pip install spacy

In [4]:
import nltk # Download necessary NLTK resources - 
            # NLTK is a leading platform for building Python programs to work with human language data.
            # NLTK full form is Natural Language Toolkit.
from nltk.tokenize import word_tokenize, sent_tokenize # Tokenization
from nltk.corpus import stopwords # Stopwords, common words to ignore
from nltk.stem import PorterStemmer, WordNetLemmatizer # Stemming and Lemmatization

In [5]:
import nltk
nltk.download('punkt')  # Download sentence tokenizer models
# punkt is a pre-trained model for tokenizing text into sentences and words.
# punkt means "punctuation" and is used to identify sentence boundaries.
nltk.download('punkt_tab') # punkt_tab is a tab-separated version of the punkt tokenizer, useful for specific applications.
nltk.download('stopwords') # Download stopwords corpus
nltk.download('wordnet') # Download WordNet corpus for lemmatization

# only need to run once

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
import nltk
save_path = r'E:\nltk_data'
nltk.data.path.append(save_path)

nltk.download('punkt', download_dir=save_path)
nltk.data.path.append(save_path)

# Example 2: Sentence tokenization
from nltk.tokenize import sent_tokenize

text = "Hello there! How are you doing today?"
sentences = sent_tokenize(text)
print(sentences)


['Hello there!', 'How are you doing today?']


[nltk_data] Downloading package punkt to E:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Difference between nltk and spacy:
# # NLTK is a comprehensive library for NLP tasks, while spaCy is designed for production use with a focus on speed and efficiency.
# NLTK provides more linguistic data and tools, while spaCy offers pre-trained models and pipelines for quick deployment.

# What is common between nltk and spacy:
# Both NLTK and spaCy are popular libraries for natural language processing in Python, 
# providing tools for tokenization, part-of-speech tagging, named entity recognition, and more.
# They can be used together in a project to leverage the strengths of both libraries.


import spacy
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
print([token.text for token in doc])

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
['Hello', 'there', '!', 'How', 'are', 'you', 'doing', 'today', '?']


In [8]:
# Example 3: Word tokenization
words = word_tokenize(text)
print(words)

['Hello', 'there', '!', 'How', 'are', 'you', 'doing', 'today', '?']


In [9]:
# Example 4: Remove stopwords
stop_words = set(stopwords.words('english'))
# Stopwords are common words that are often ignored in text processing, such as "the", "is", "in", etc.
# They are removed to focus on the more meaningful words in the text.
print(stop_words)
filtered_words = [w for w in words if w.lower() not in stop_words]


# # Example 4: Remove stopwords
# stop_words = set(stopwords.words('english'))
# # Stopwords are common words that are often ignored in text processing, such as "the", "is", "in", etc.
# # They are removed to focus on the more meaningful words in the text.
# print(stop_words)
# # filtered_words = [w for w in stop_words if w.lower() not in stop_words]
# # print(filtered_words)
# # print(filtered_words)

{'yours', "didn't", 'did', 'herself', "mightn't", 'most', 'll', "shouldn't", 'are', 'ma', 'few', "won't", "he's", 'whom', 'when', 'against', 'of', "wouldn't", 'no', "he'd", 'its', "she'd", "hadn't", "you'd", "wasn't", 'all', 'weren', 'until', 'themselves', "should've", 'was', 'more', 'into', "shan't", 'himself', 'be', 'by', 'other', "it'd", 'up', 'don', 'couldn', 'out', 'm', "isn't", 'very', 'too', 'aren', 'each', 'nor', 'shan', 'down', 'it', 'than', 'is', 'such', 'from', 'these', 'theirs', 'while', 'isn', 's', "aren't", "it's", "that'll", 'at', "they'll", "doesn't", 'o', 'them', 'as', 'hasn', 'been', 'their', 'own', 'only', 'both', 'hadn', 'my', 'there', 'not', 'were', 'can', 'i', 'with', 'yourselves', 'or', 'who', 'a', "haven't", 'am', 'needn', "we'd", 'mustn', 'shouldn', 'being', 'where', 'does', 'had', 'about', 'will', 'her', "you've", 'and', 'what', 'that', 'which', 'won', 'doesn', 'ain', "we're", 'me', 'd', 'so', "she'll", 'haven', 'hers', 'didn', 'again', 'here', "you're", 'migh

In [10]:


# Example 5: Stemming
ps = PorterStemmer()
stemmed_words = [ps.stem(w) for w in filtered_words]
print(stemmed_words)

['hello', '!', 'today', '?']


In [11]:
# Example 6: Lemmatization
# Lemmatization is the process of reducing a word to its base or root form.
# It considers the context and converts the word to its meaningful base form.
# For example, "running" becomes "run", "better" becomes "good".
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(w) for w in filtered_words]
print(lemmatized_words)

['Hello', '!', 'today', '?']


In [None]:
# Example 7: POS tagging
nltk.download('averaged_perceptron_tagger_eng')
pos_tags = nltk.pos_tag(words)
print(pos_tags)

# [('Hello', 'NNP'), ('there', 'EX'), ('!', '.'), ('How', 'WRB'), ('are', 'VBP'), ('you', 'PRP'), ('doing', 'VBG'), ('today', 'NN'), ('?', '.')]
# NNP: Proper noun, singular
# EX: Existential there
# WRB: Wh-adverb (e.g., where, when)
# VBP: Verb, non-3rd person singular present
# PRP: Personal pronoun
# VBG: Verb, gerund or present participle
# NN: Noun, singular or mass
# .: Punctuation mark


[('Hello', 'NNP'), ('there', 'EX'), ('!', '.'), ('How', 'WRB'), ('are', 'VBP'), ('you', 'PRP'), ('doing', 'VBG'), ('today', 'NN'), ('?', '.')]


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [None]:
# Example 8: Named Entity Recognition (NER)
# NER is the process of identifying and classifying named entities in text, such as people, organizations, locations, etc.
# Example: "Barack Obama was the 44th President of the United States." 
# NER would identify "Barack Obama" as a person and "United States" as a location.

# NLTK provides a named entity chunker that can be used for this purpose.
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
from nltk import ne_chunk
tree = ne_chunk(pos_tags)
print(tree)

# The output will be a tree structure where named entities are labeled 
# with their types (e.g., PERSON, ORGANIZATION, GPE for geopolitical entity).

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


(S
  (GPE Hello/NNP)
  there/EX
  !/.
  How/WRB
  are/VBP
  you/PRP
  doing/VBG
  today/NN
  ?/.)


In [18]:
# Example 9: Frequency distribution of words
freq_dist = nltk.FreqDist(words)
print(freq_dist.most_common(5))

[('Hello', 1), ('there', 1), ('!', 1), ('How', 1), ('are', 1)]


In [15]:
# Example 10: Concordance
text_obj = nltk.Text(words)
text_obj.concordance("today") # Finds occurrences of the word "today" in the text


Displaying 1 of 1 matches:
     Hello there ! How are you doing today ?
