***TOKENIZATION***

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('punkt_tab')

text = """Artificial Intelligence (AI) is performing industries worldwide. With its applications in healthcare, finance, and education, AI provides innovative solutions."""


# Sentence and word tokenization
def tokenize_text(text):
    """Print the sentence-level and word-level tokenization of ``text``.

    Args:
        text: The string to tokenize.

    Returns:
        None. The banner, both labels and both token lists are printed.
    """
    print("--- Tokenization ---")

    # sent_tokenize (nltk.tokenize) splits the input into a list of sentences.
    print("Sentence Tokenizaation:")
    sentences = sent_tokenize(text)
    print(sentences)

    # word_tokenize (nltk.tokenize) splits the input into a list of
    # word/punctuation tokens.
    print("\nWord Tokenization:")
    words = word_tokenize(text)
    print(words)


# The call must sit at cell level. When it was indented into the function
# body the function was never invoked by the cell, and any external call
# would have recursed without end.
tokenize_text(text)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


***STEMMING***

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

text = "Artificial Intelligence is transforming industries worldwide."
tokenlist = word_tokenize(text)


# Perform stemming
def stem_text(tokenlist):
    """Print the Porter stem of every token in ``tokenlist``.

    Args:
        tokenlist: Iterable of token strings to stem.

    Returns:
        None. A banner, a label and the list of stems are printed.
    """
    print("--- Stemming ---")
    porter = PorterStemmer()

    # Stem the tokens one by one, keeping the input order.
    stems = []
    for token in tokenlist:
        stems.append(porter.stem(token))

    print("Stemmed words:", stems, sep="\n")


stem_text(tokenlist)

--- Stemming ---
Stemmed words:
['artifici', 'intellig', 'is', 'transform', 'industri', 'worldwid', '.']


***LEMMATIZATION***

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

nltk.download('wordnet')

text = "Artificial Intelligence is transforming industries worldwide."
words = word_tokenize(text)


# Perform Lemmatization
def lemmatize_text(words):
    """Print the WordNet lemma of every token in ``words``.

    Args:
        words: Iterable of token strings to lemmatize.

    Returns:
        None. A banner, a label and the list of lemmas are printed.
    """
    print("--- Lemmatization ---")
    wordnet_lemmatizer = WordNetLemmatizer()

    # lemmatize() receives only the token (no POS argument); in the recorded
    # run only 'industries' changes (to 'industry').
    lemmas = list(map(wordnet_lemmatizer.lemmatize, words))

    print("Lemmatized Words:", lemmas, sep="\n")


lemmatize_text(words)

[nltk_data] Downloading package wordnet to /root/nltk_data...


--- Lemmatization ---
Lemmatized Words:
['Artificial', 'Intelligence', 'is', 'transforming', 'industry', 'worldwide', '.']


***REMOVE STOP WORDS***

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

nltk.download('stopwords')

text = "Artificial Intelligence is transforming industries worldwide."
words = word_tokenize(text)


# Identify and remove stop words
def identify_stop_words(words):
    """Print the tokens of ``words`` that are not English stop words.

    Args:
        words: Iterable of token strings to filter.

    Returns:
        None. A banner, a label and the filtered list are printed.
    """
    print("--- Stop Words ---")

    # English stop-word list from nltk.corpus.stopwords, held as a set for
    # membership tests.
    english_stop_words = set(stopwords.words("english"))

    # The comparison uses the lowercased token; the token itself is kept
    # with its original casing.
    kept_tokens = []
    for token in words:
        if token.lower() in english_stop_words:
            continue
        kept_tokens.append(token)

    print("Filtered Words (without stop words):", kept_tokens, sep="\n")


identify_stop_words(words)

--- Stop Words ---
Filtered Words (without stop words):
['Artificial', 'Intelligence', 'transforming', 'industries', 'worldwide', '.']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


***POS TAGGING***

In [None]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('averaged_perceptron_tagger_eng')
# The Tokenization cell fetches 'punkt_tab' for word_tokenize; request the
# same resource here so this cell does not rely on the legacy 'punkt'
# package alone. 'punkt' is still downloaded for older NLTK installs.
nltk.download('punkt_tab')
nltk.download('punkt')

text = "Artificial Intelligence is transforming industries worldwide."
words = word_tokenize(text)


# Perform POS tagging
def pos_tagging(words):
    """Print the part-of-speech tag assigned to every token in ``words``.

    Args:
        words: List of token strings, passed to ``nltk.pos_tag``.

    Returns:
        None. A banner, a label and the list of (token, tag) pairs are
        printed.
    """
    print("--- POS Taggging ---")
    pos_tags = nltk.pos_tag(words)
    print("POS Tags:")
    print(pos_tags)


pos_tagging(words)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


--- POS Taggging ---
POS Tags:
[('Artificial', 'JJ'), ('Intelligence', 'NNP'), ('is', 'VBZ'), ('transforming', 'VBG'), ('industries', 'NNS'), ('worldwide', 'RB'), ('.', '.')]


***TEXT PARSING AND BUILDING THE PARSE TREE***

In [None]:
import spacy

# Load the English language model once; both the token listing and the
# rendered tree below use it.
nlp = spacy.load("en_core_web_sm")
text = "Artificial Intelligence is transforming industries worldwide."


# Perform dependency parsing
def dependency_parsing(text):
    """Print the dependency label, head and POS of every token in ``text``.

    Args:
        text: The string to parse with the module-level ``nlp`` pipeline.

    Returns:
        None. A banner and one line per token are printed.
    """
    print("--- Dependency Parsing ---")
    doc = nlp(text)
    for token in doc:
        print(f"Word: {token.text}, Dependency: {token.dep_}, Head: {token.head.text}, POS: {token.pos_}")


dependency_parsing(text)


# To produce the parse tree
from spacy import displacy

# Process the text (same pipeline and input as above)
doc = nlp(text)

# Display the dependency tree in the notebook
displacy.render(doc, style="dep", jupyter=True)

--- Dependency Parsing ---
Word: Artificial, Dependency: compound, Head: Intelligence, POS: PROPN
Word: Intelligence, Dependency: nsubj, Head: transforming, POS: PROPN
Word: is, Dependency: aux, Head: transforming, POS: AUX
Word: transforming, Dependency: ROOT, Head: transforming, POS: VERB
Word: industries, Dependency: dobj, Head: transforming, POS: NOUN
Word: worldwide, Dependency: advmod, Head: transforming, POS: ADV
Word: ., Dependency: punct, Head: transforming, POS: PUNCT


***NAMED ENTITY RECOGNITION***

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")
text = "Artificial Intelligence is transforming industries worldwide."


# Perform Named Entity Recognition
def named_entity_recognition(text):
    """Print ``text`` as parsed by spaCy, then each named entity found in it.

    Args:
        text: The string to run through the module-level ``nlp`` pipeline.

    Returns:
        None. A banner, the Doc, and one line per entity are printed.
    """
    print ("--- Named Entity Recognition ---")

    # Run the pipeline; the resulting Doc is printed as-is.
    parsed = nlp(text)
    print(parsed)

    # Build one report line per entity exposed by the Doc's `ents`, then
    # print them in order.
    entity_lines = [f"Entity: {ent.text}, label: {ent.label_}" for ent in parsed.ents]
    for line in entity_lines:
        print(line)


named_entity_recognition(text)


# **3. TASKS**

In [None]:
import nltk
from nltk.corpus import gutenberg, stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist

# Step 1: Load and Explore the Dataset
nltk.download('gutenberg')
# The Tokenization cell fetches 'punkt_tab' for sent_tokenize/word_tokenize;
# request it here as well so Step 2 does not rely on the legacy 'punkt'
# package alone. 'punkt' is still downloaded for older NLTK installs.
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')

print("--- Available Texts in Gutenberg Corpus ---")
print(gutenberg.fileids())

# Select a text: the raw contents of 'austen-emma.txt' as one string
text = gutenberg.raw('austen-emma.txt')
print("\n--- FIrst 500 CHaracters of the text ---")
print(text[:500])

# Step 2: Tokenization
# Split the full text into sentences and into word/punctuation tokens.
sentences = sent_tokenize(text)
words = word_tokenize(text)

# Report only the counts; the token lists themselves are not printed.
print("\n--- Number of Sentence ---")
print(len(sentences))
print("--- Number of Words ---")
print(len(words))

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


--- Available Texts in Gutenberg Corpus ---
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']

--- FIrst 500 CHaracters of the text ---
[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.

She was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister's marriage,
been mistress of his house from a very early period.  Her mother
had 