<a href="https://colab.research.google.com/github/gitanjali16122004/NLP-Mastery/blob/main/Tokenization_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Tokenization

In [None]:
import nltk
import spacy

from nltk.tokenize import word_tokenize,sent_tokenize
# Download NLTK's 'punkt_tab' tokenizer data (presumably what word_tokenize /
# sent_tokenize load at call time -- confirm for the installed NLTK version).
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Baseline: split on whitespace only. Punctuation is not separated, so the
# final token keeps its trailing period ('week.').
text = "Elon Musk announced a new AI project last week."
tokens = text.split()
print(tokens)

['Elon', 'Musk', 'announced', 'a', 'new', 'AI', 'project', 'last', 'week.']


In [None]:
# Naive sentence splitting: cut on every "." and keep the trimmed, non-empty pieces.
text = "Elon Musk announced a new AI project. It was a breakthrough."
sentences = [piece for piece in map(str.strip, text.split(".")) if piece]
print(sentences)

['Elon Musk announced a new AI project', 'It was a breakthrough']


In [None]:
# Splitting on "." does not handle abbreviations such as "Dr." or "U.S.":
# the title "Dr" is cut off and reported as a sentence of its own.
text = "Dr. Elon Musk announced a new AI project. It was a breakthrough."
sentences = [piece for piece in map(str.strip, text.split(".")) if piece]
print(sentences)

['Dr', 'Elon Musk announced a new AI project', 'It was a breakthrough']


In [None]:
# word_tokenize separates punctuation into its own tokens even when there is
# no space around it (the "," after "Hii" and the final "?").
word_tokenize("Hii,How are you?")

['Hii', ',', 'How', 'are', 'you', '?']

In [None]:
# Sample sentences used as input for the NLTK tokenization loop further down.
corpus = [
    "Elon Musk, the CEO of Tesla and SpaceX, announced a new AI project last week.",
    "The Eiffel Tower in Paris attracts millions of tourists every year.",
    "NASA successfully landed the Perseverance rover on Mars in 2021.",
    "The stock market saw a sharp decline due to economic instability.",
    "COVID-19 had a significant impact on global healthcare and economies.",
    "The Amazon rainforest plays a crucial role in Earth's climate system.",
    "Artificial Intelligence and Machine Learning are transforming industries worldwide."
]



In [None]:
def nltk_tokenization(text):
    """Tokenize ``text`` with NLTK at both word and sentence level.

    Args:
        text: The string to tokenize.

    Returns:
        A ``(words, sentences)`` tuple: the result of ``word_tokenize(text)``
        followed by the result of ``sent_tokenize(text)``.
    """
    return word_tokenize(text), sent_tokenize(text)

# Tokenize each corpus sentence and print its word- and sentence-level tokens.
for i, text in enumerate(corpus):
    nltk_words, nltk_sentences = nltk_tokenization(text)

    # One print call; sep="\n" puts each part on its own line.
    print(
        f"\nSentence {i+1}: {text}",
        f"NLTK Word Tokens: {nltk_words}",
        f"NLTK Sentence Tokens: {nltk_sentences}",
        sep="\n",
    )


Sentence 1: Elon Musk, the CEO of Tesla and SpaceX, announced a new AI project last week.
NLTK Word Tokens: ['Elon', 'Musk', ',', 'the', 'CEO', 'of', 'Tesla', 'and', 'SpaceX', ',', 'announced', 'a', 'new', 'AI', 'project', 'last', 'week', '.']
NLTK Sentence Tokens: ['Elon Musk, the CEO of Tesla and SpaceX, announced a new AI project last week.']

Sentence 2: The Eiffel Tower in Paris attracts millions of tourists every year.
NLTK Word Tokens: ['The', 'Eiffel', 'Tower', 'in', 'Paris', 'attracts', 'millions', 'of', 'tourists', 'every', 'year', '.']
NLTK Sentence Tokens: ['The Eiffel Tower in Paris attracts millions of tourists every year.']

Sentence 3: NASA successfully landed the Perseverance rover on Mars in 2021.
NLTK Word Tokens: ['NASA', 'successfully', 'landed', 'the', 'Perseverance', 'rover', 'on', 'Mars', 'in', '2021', '.']
NLTK Sentence Tokens: ['NASA successfully landed the Perseverance rover on Mars in 2021.']

Sentence 4: The stock market saw a sharp decline due to economic

## More Tokenizers (spaCy, TweetTokenizer) and Stopword Removal

In [None]:
import spacy
# Load the "en_core_web_sm" spaCy pipeline by name.
nlp = spacy.load("en_core_web_sm")

text = "Hello, world! How are you?"
# Run the pipeline on the text; iterating the result yields token objects.
doc = nlp(text)
# Keep only the raw string of each token.
tokens = [token.text for token in doc]
print(tokens)

['Hello', ',', 'world', '!', 'How', 'are', 'you', '?']


In [None]:
from nltk.tokenize import TweetTokenizer

# TweetTokenizer with default settings: the @mention, the emoji and the
# #hashtag each come out as a single token (see the printed result).
tokenizer = TweetTokenizer()
text = "Hello @user! 😊 #HappyDay"
tokens = tokenizer.tokenize(text)
print(tokens)

['Hello', '@user', '!', '😊', '#HappyDay']


In [None]:

from nltk.corpus import stopwords
# Download the NLTK 'stopwords' corpus, which stopwords.words(...) reads from.

nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:


# Input for the stopword-removal loop below. These are the same seven
# sentences as the `corpus` list defined in the tokenization section.
corpus = [
    "Elon Musk, the CEO of Tesla and SpaceX, announced a new AI project last week.",
    "The Eiffel Tower in Paris attracts millions of tourists every year.",
    "NASA successfully landed the Perseverance rover on Mars in 2021.",
    "The stock market saw a sharp decline due to economic instability.",
    "COVID-19 had a significant impact on global healthcare and economies.",
    "The Amazon rainforest plays a crucial role in Earth's climate system.",
    "Artificial Intelligence and Machine Learning are transforming industries worldwide."
]

In [None]:
# Load NLTK's English stopword list into a set; remove_stopwords tests
# membership against it for every token.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return the tokens of ``text`` that are not stopwords.

    The text is split with ``word_tokenize``. Each token is lower-cased before
    being looked up in the module-level ``stop_words`` collection, so matching
    ignores case. Surviving tokens keep their original casing and order.
    Punctuation tokens are kept unless they appear in ``stop_words``.

    Args:
        text: The sentence to filter.

    Returns:
        A list of the tokens that were not found in ``stop_words``.
    """
    def is_content_token(token):
        # Compare case-insensitively against the stopword collection.
        return token.lower() not in stop_words

    return list(filter(is_content_token, word_tokenize(text)))

# Filter stopwords out of every corpus sentence and show it next to the original.
for i, text in enumerate(corpus):
    filtered_tokens = remove_stopwords(text)

    # One print call; sep="\n" puts each part on its own line, ending with an
    # 80-character rule between entries.
    print(
        f"\nOriginal Sentence {i+1}: {text}",
        f"Filtered Tokens (Without Stopwords): {filtered_tokens}",
        "-" * 80,
        sep="\n",
    )


Original Sentence 1: Elon Musk, the CEO of Tesla and SpaceX, announced a new AI project last week.
Filtered Tokens (Without Stopwords): ['Elon', 'Musk', ',', 'CEO', 'Tesla', 'SpaceX', ',', 'announced', 'new', 'AI', 'project', 'last', 'week', '.']
--------------------------------------------------------------------------------

Original Sentence 2: The Eiffel Tower in Paris attracts millions of tourists every year.
Filtered Tokens (Without Stopwords): ['Eiffel', 'Tower', 'Paris', 'attracts', 'millions', 'tourists', 'every', 'year', '.']
--------------------------------------------------------------------------------

Original Sentence 3: NASA successfully landed the Perseverance rover on Mars in 2021.
Filtered Tokens (Without Stopwords): ['NASA', 'successfully', 'landed', 'Perseverance', 'rover', 'Mars', '2021', '.']
--------------------------------------------------------------------------------

Original Sentence 4: The stock market saw a sharp decline due to economic instability.
Fi