In [1]:
# Import NLTK and download the 'punkt' and 'punkt_tab' packages
# (tokenizer data, per the "tokenizers/..." paths in the download log).

import nltk
nltk.download('punkt')
# This call is the cell's last expression, so its return value is what the
# notebook echoes as the cell output.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
from nltk.tokenize import word_tokenize, sent_tokenize
# Sample paragraph (three sentences) used by the tokenization cells below.
# Adjacent string literals are concatenated by Python, so the value is one
# single-line string.
text = (
    "Artificial Intelligence (AI) is the technology that enables machines "
    "to mimic human thinking and decision-making. "
    "It learns from data, identifies patterns, and improves performance over time. "
    "AI helps automate tasks, enhance accuracy, and solve complex problems "
    "across many industries."
)

In [3]:
# Word tokenization: break the sample text into word and punctuation tokens.
words = word_tokenize(text)
print(f"Word Tokenization: {words}")

Word Tokenization: ['Artificial', 'Intelligence', '(', 'AI', ')', 'is', 'the', 'technology', 'that', 'enables', 'machines', 'to', 'mimic', 'human', 'thinking', 'and', 'decision-making', '.', 'It', 'learns', 'from', 'data', ',', 'identifies', 'patterns', ',', 'and', 'improves', 'performance', 'over', 'time', '.', 'AI', 'helps', 'automate', 'tasks', ',', 'enhance', 'accuracy', ',', 'and', 'solve', 'complex', 'problems', 'across', 'many', 'industries', '.']


In [4]:
# Sentence tokenization: split the sample text into its individual sentences.
sentences = sent_tokenize(text)
print(f"Sentence Tokenization: {sentences}")

Sentence Tokenization: ['Artificial Intelligence (AI) is the technology that enables machines to mimic human thinking and decision-making.', 'It learns from data, identifies patterns, and improves performance over time.', 'AI helps automate tasks, enhance accuracy, and solve complex problems across many industries.']


In [5]:
# Report how many word tokens and sentences the two tokenizers produced.
word_count = len(words)
sentence_count = len(sentences)
print(f"Number of Words {word_count}")
print(f"Number of Sentences {sentence_count}")

Number of Words 48
Number of Sentences 3


In [6]:
# Tokenize sentence by sentence and print every token on its own line.
for sent in sent_tokenize(text):
    for token in word_tokenize(sent):
        print(f"word_tokenization {token}")

word_tokenization Artificial
word_tokenization Intelligence
word_tokenization (
word_tokenization AI
word_tokenization )
word_tokenization is
word_tokenization the
word_tokenization technology
word_tokenization that
word_tokenization enables
word_tokenization machines
word_tokenization to
word_tokenization mimic
word_tokenization human
word_tokenization thinking
word_tokenization and
word_tokenization decision-making
word_tokenization .
word_tokenization It
word_tokenization learns
word_tokenization from
word_tokenization data
word_tokenization ,
word_tokenization identifies
word_tokenization patterns
word_tokenization ,
word_tokenization and
word_tokenization improves
word_tokenization performance
word_tokenization over
word_tokenization time
word_tokenization .
word_tokenization AI
word_tokenization helps
word_tokenization automate
word_tokenization tasks
word_tokenization ,
word_tokenization enhance
word_tokenization accuracy
word_tokenization ,
word_tokenization and
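word_tokenization solve
word_tokenization complex
word_tokenization problems
word_tokenization across
word_tokenization many
word_tokenization industries
word_tokenization .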

# Stemming

In [7]:
from nltk.stem import PorterStemmer

In [8]:
# Reduce each sample word to its Porter stem and show the result.
stemmer = PorterStemmer()
words = ["running", "jump", "easily", "fairly", "studies"]

# Apply the stemmer to every word, preserving the input order.
stems = list(map(stemmer.stem, words))
print(f"stems: {stems}")

stems: ['run', 'jump', 'easili', 'fairli', 'studi']


# Lemmatization

In [9]:
# Download the 'averaged_perceptron_tagger_eng' tagger and the 'wordnet'
# corpus, then import the lemmatizer, the WordNet corpus object and the
# tagging/tokenizing helpers used by the lemmatization cells.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
# The cell ends on import statements rather than an expression, so no return
# value is echoed as cell output.
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag, word_tokenize

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [10]:
# Sample words to lemmatize (the same list the stemming cell uses).
words = [
    "running",
    "jump",
    "easily",
    "fairly",
    "studies",
]

# Lemmatizer instance shared by the lemmatization cell further down.
Lemmatizer = WordNetLemmatizer()

In [11]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own (as a one-element list) with ``pos_tag``,
    and only the upper-cased first character of the resulting tag is used:

    * ``J`` -> ``wordnet.ADJ``
    * ``V`` -> ``wordnet.VERB``
    * ``R`` -> ``wordnet.ADV``
    * ``N`` or anything else -> ``wordnet.NOUN``
    """
    # pos_tag returns a list of (token, tag) pairs; take the tag of the only pair.
    tag_for_word = pos_tag([word])[0][1]
    initial = tag_for_word[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised initial both map to noun.
    return wordnet.NOUN

In [12]:
# Lemmatize each sample word, passing the part of speech inferred for it.
lemmas = [Lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in words]
print(f"lemmas: {lemmas}")

lemmas: ['run', 'jump', 'easily', 'fairly', 'study']


# Remove Stop Words

In [13]:
# Download the 'stopwords' corpus and import what the stop-word filtering
# cell below uses.
nltk.download('stopwords')
from nltk.corpus import stopwords
# word_tokenize was already imported in In [2] and In [9]; this repeats it.
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
# Stop-word removal: tokenize the sentence, then keep only the tokens whose
# lower-cased form is not in NLTK's English stop-word list. Punctuation
# tokens are not stop words, so they are kept.
text = "This is an example sentence that showing off stop word filteration."
words = word_tokenize(text)

# Build a set once so the membership test in the comprehension is cheap.
stop_words = set(stopwords.words("english"))

# `x not in y` is the idiomatic spelling of the membership test
# (equivalent to the former `not x in y`).
filtered_sentence = [w for w in words if w.lower() not in stop_words]

print("Orginal sentence", words)
print("Filtered sentence", filtered_sentence)

Orginal sentence ['This', 'is', 'an', 'example', 'sentence', 'that', 'showing', 'off', 'stop', 'word', 'filteration', '.']
Filtered sentence ['example', 'sentence', 'showing', 'stop', 'word', 'filteration', '.']


# NER (Named Entity Recognition)

In [15]:
import spacy
from spacy.util import is_package

# Name of the spaCy pipeline package loaded by the NER cell below.
SPACY_MODEL = "en_core_web_sm"

# Download the model only when it is not already installed, so that
# "Restart & Run All" does not re-download and reinstall it on every run.
if not is_package(SPACY_MODEL):
    spacy.cli.download(SPACY_MODEL)

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [16]:
# Named-entity recognition: run the en_core_web_sm pipeline over one sentence.
nlp = spacy.load("en_core_web_sm")
text = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(text)

# Pair the text of every detected entity with its label.
entities = [(span.text, span.label_) for span in doc.ents]
print(f"Entities: {entities}")


Entities: [('Apple', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY')]
