In [1]:
# NLP - Natural Language Processing

In [2]:
import nltk
# Fetch the "punkt_tab" tokenizer resource before any tokenizing is done.
# The older "punkt" resource name is deprecated, so it is no longer requested.
nltk.download("punkt_tab")

def print_divider():
    """Print a rule of 50 dashes, then leave an empty line after it."""
    rule = "-" * 50
    # Matches print(rule, "\n"): the separator space, the newline argument,
    # and finally print's own line ending.
    print(f"{rule} \n")

# Sample text: three short sentences, including "!" and a contraction
text = "I love coding! Python is fun. Let's learn NLP."


# Word tokenization: split the text into word-level tokens.
# In the recorded output, punctuation marks become separate tokens and
# "Let's" is split into 'Let' and "'s".
word_tokens = nltk.word_tokenize(text)
print("Word Tokens:", word_tokens)


# Sentence tokenization: split the same text into its three sentences
sentence_tokens = nltk.sent_tokenize(text)
print("Sentence Tokens:", sentence_tokens)


# Download the POS tagger model.
# The legacy 'averaged_perceptron_tagger' resource name is deprecated;
# the '_eng' variant below is the one to request.
nltk.download('averaged_perceptron_tagger_eng')

# Sample text (rebinds the notebook-level `text` used for tokenization above)
text = "I love coding in Python"
tokens = nltk.word_tokenize(text)

# POS tagging: yields (token, tag) pairs, e.g. ('I', 'PRP') in the recorded output
pos_tags = nltk.pos_tag(tokens)
print("POS Tags:", pos_tags)

# Quick legend for a few common tags; this is a fixed string, not derived
# from the tagger, so tags such as VBP/VBG/IN/NNP seen above are not listed.
print("\nCommon POS Tags:")
print("NN: Noun, VB: Verb, JJ: Adjective, PRP: Pronoun")
print_divider()


Word Tokens: ['I', 'love', 'coding', '!', 'Python', 'is', 'fun', '.', 'Let', "'s", 'learn', 'NLP', '.']
Sentence Tokens: ['I love coding!', 'Python is fun.', "Let's learn NLP."]
POS Tags: [('I', 'PRP'), ('love', 'VBP'), ('coding', 'VBG'), ('in', 'IN'), ('Python', 'NNP')]

Common POS Tags:
NN: Noun, VB: Verb, JJ: Adjective, PRP: Pronoun
-------------------------------------------------- 



[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\khaai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\khaai\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [3]:
# Download the data needed below: the stop-word lists (used via
# nltk.corpus.stopwords) and WordNet (used by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

# Note: `nltk` itself must already be imported by an earlier cell.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Sample text (rebinds the notebook-level `text` and `tokens`)
text = "The quick brown foxes are running faster"
tokens = nltk.word_tokenize(text.lower()) # Lowercase before tokenizing

# Stop-word removal: drop every token found in NLTK's English stop-word list.
# A set is used so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word not in stop_words]
print("After Stop Words Removal:", filtered_tokens)

# Stemming: Porter stemmer applied to each remaining token
# (recorded output: 'foxes' -> 'fox', 'running' -> 'run').
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("After Stemming:", stemmed_tokens)

# Lemmatization: applied to the filtered tokens, not to the stems.
# lemmatize() is called without a POS argument; in the recorded output
# 'foxes' becomes 'fox' while 'running' is left unchanged.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("After Lemmatization:", lemmatized_tokens)
print_divider()



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khaai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\khaai\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


After Stop Words Removal: ['quick', 'brown', 'foxes', 'running', 'faster']
After Stemming: ['quick', 'brown', 'fox', 'run', 'faster']
After Lemmatization: ['quick', 'brown', 'fox', 'running', 'faster']
-------------------------------------------------- 



In [None]:
# Install scikit-learn if needed (%pip installs into the running kernel's environment)
# %pip install scikit-learn

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

# Sample documents: a three-document toy corpus
docs = [
 "I love coding in Python",
 "Python is great for NLP",
 "Coding is fun"
]

# Bag of Words: one row per document, one column per vocabulary term,
# each cell holding a term count. In the recorded output the vocabulary is
# lowercased and sorted, and the one-letter word "I" does not appear in it.
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(docs)
print("BoW Vocabulary:", bow_vectorizer.get_feature_names_out())
print("BoW Matrix:\n", bow_matrix.toarray())

# TF-IDF: same documents and (per the recorded output) the same vocabulary,
# but cells hold fractional weights instead of raw counts.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(docs)
print("\nTF-IDF Vocabulary:", tfidf_vectorizer.get_feature_names_out())
print("TF-IDF Matrix:\n", tfidf_matrix.toarray())
# Simple TextRank Summarizer
def textrank_summary(text, num_sentences=2):
    """Return an extractive summary built from the highest-scoring sentences.

    Each sentence is scored by summing its TF-IDF dot-product similarity to
    every sentence in ``text``. This is a simple centrality shortcut: no
    iterative PageRank is run. The ``num_sentences`` best-scoring sentences
    are returned in their original order, joined by single spaces.

    Args:
        text: Text to summarize; split into sentences with
            ``nltk.sent_tokenize``.
        num_sentences: Maximum number of sentences to keep.

    Returns:
        The summary string. ``""`` when ``num_sentences`` is not positive,
        and ``text`` unchanged when it contains no more than
        ``num_sentences`` sentences.
    """
    # Guard first: slicing with [-0:] would select *every* sentence, and a
    # negative count would slice from the wrong end.
    if num_sentences <= 0:
        return ""

    sentences = nltk.sent_tokenize(text)
    if len(sentences) <= num_sentences:
        return text

    # One TF-IDF row per sentence.
    tfidf = TfidfVectorizer().fit_transform(sentences)
    # Sentence-by-sentence similarity. `@` is always a matrix product,
    # whereas `*` is element-wise for scipy sparse *arrays*.
    similarity_matrix = (tfidf @ tfidf.T).toarray()
    # Score = total similarity of a sentence to all sentences.
    scores = similarity_matrix.sum(axis=1)
    # argsort is ascending, so the last entries are the best-scoring ones.
    top_indices = np.argsort(scores)[-num_sentences:]
    # Emit the chosen sentences in document order.
    return ' '.join(sentences[i] for i in sorted(top_indices))

# Sample text for summarization: four sentences (rebinds the notebook-level `text`)
text = "Natural language processing is a field of AI. It enables computers to understand human language. Python is widely used in NLP. NLTK is a popular library for NLP tasks."
# num_sentences is left at its default of 2, so two of the four sentences are kept
summary = textrank_summary(text)
print("\nTextRank Summary:", summary)
print_divider()



BoW Vocabulary: ['coding' 'for' 'fun' 'great' 'in' 'is' 'love' 'nlp' 'python']
BoW Matrix:
 [[1 0 0 0 1 0 1 0 1]
 [0 1 0 1 0 1 0 1 1]
 [1 0 1 0 0 1 0 0 0]]

TF-IDF Vocabulary: ['coding' 'for' 'fun' 'great' 'in' 'is' 'love' 'nlp' 'python']
TF-IDF Matrix:
 [[0.42804604 0.         0.         0.         0.5628291  0.
  0.5628291  0.         0.42804604]
 [0.         0.49047908 0.         0.49047908 0.         0.37302199
  0.         0.49047908 0.37302199]
 [0.51785612 0.         0.68091856 0.         0.         0.51785612
  0.         0.         0.        ]]

TextRank Summary: Python is widely used in NLP. NLTK is a popular library for NLP tasks.


In [5]:
from sklearn.decomposition import LatentDirichletAllocation

# Toy corpus: two documents about programming, two about sports
docs = [
    "Python is great for coding and NLP",
    "Football is a popular sport",
    "Coding in Python is fun",
    "Sports like football are exciting",
]

# Turn the documents into term counts, dropping English stop words
vectorizer = CountVectorizer(stop_words='english')
doc_matrix = vectorizer.fit_transform(docs)

# Fit a two-topic LDA model; the fixed random_state keeps the fit repeatable
lda = LatentDirichletAllocation(n_components=2, random_state=42).fit(doc_matrix)

# Show, per topic, the five highest-weighted vocabulary terms.
# argsort is ascending, so each list runs from weakest to strongest term.
words = vectorizer.get_feature_names_out()
for i, topic in enumerate(lda.components_):
    strongest = topic.argsort()[-5:]
    top_words = [words[j] for j in strongest]
    print(f"Topic {i+1}: {top_words}")


Topic 1: ['fun', 'nlp', 'great', 'coding', 'python']
Topic 2: ['sport', 'like', 'sports', 'exciting', 'football']
