In [None]:
# How can you perform word tokenization using NLTK?
import nltk
from nltk.tokenize import word_tokenize

# Download the Punkt tokenizer data used by word_tokenize().
# Older NLTK releases load the pickled 'punkt' package, while NLTK 3.9+
# looks up the 'punkt_tab' package instead; fetching both keeps this cell
# working on a fresh environment regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Example sentence
sentence = "Natural Language Processing is amazing!"

# Tokenize the sentence into a list of word tokens
words = word_tokenize(sentence)
print(words)


In [None]:
# How can you perform sentence tokenization using NLTK?
from nltk.tokenize import sent_tokenize

# Sample text made of two sentences
text = "Natural Language Processing is amazing! It has many applications."

# Break the text into a list of sentences (English is the default
# language; it is spelled out here for clarity) and display the list
sentences = sent_tokenize(text, language="english")
print(sentences)


In [None]:
# How can you remove stopwords from a sentence?
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the stopword lists (relies on `nltk` imported in an earlier cell)
nltk.download('stopwords')

# Sentence to clean up
sentence = "This is an example sentence with stopwords."

# English stopwords as a set, so membership checks are fast
stop_words = set(stopwords.words('english'))

# Split the sentence into tokens
words = word_tokenize(sentence)

# Keep only the tokens whose lowercase form is not a stopword
filtered_words = list(
    filter(lambda token: token.lower() not in stop_words, words)
)
print(filtered_words)


In [None]:
# How can you perform stemming on a word?
from nltk.stem import PorterStemmer

# Word to reduce to its stem
word = "running"

# Instantiate the Porter stemmer and apply it to the word
stemmer = PorterStemmer()
stemmed_word = stemmer.stem(word)

print(stemmed_word)


In [None]:
# How can you perform lemmatization on a word?
import nltk
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer reads the WordNet corpus, which is not bundled with
# NLTK; without this download lemmatize() raises LookupError on a fresh
# install. 'omw-1.4' is fetched as well because some NLTK releases load it
# together with WordNet.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Create a WordNet Lemmatizer object
lemmatizer = WordNetLemmatizer()

# Example word
word = "better"

# Perform lemmatization; the part of speech must be given explicitly
# because lemmatize() treats words as nouns by default
lemmatized_word = lemmatizer.lemmatize(word, pos='a')  # 'a' stands for adjective
print(lemmatized_word)


In [None]:
# How can you normalize a text by converting it to lowercase and removing punctuation?
import string

# Translation table that maps every ASCII punctuation character to None,
# which makes str.translate() delete it
strip_punctuation = str.maketrans(dict.fromkeys(string.punctuation))

# Sample text
text = "Hello! How are you today?"

# Lowercase first, then drop the punctuation in a single chained call
text = text.lower().translate(strip_punctuation)

print(text)


In [None]:
# How can you create a co-occurrence matrix for words in a corpus?
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Example corpus
corpus = [
    "Natural Language Processing is amazing.",
    "Machine learning is a subfield of artificial intelligence.",
    "Text processing includes tokenization, lemmatization, and stemming."
]

# Create a CountVectorizer object
vectorizer = CountVectorizer()

# Fit the model and transform the corpus into a sparse document-term matrix
# (rows = documents, columns = vocabulary terms)
X = vectorizer.fit_transform(corpus)

# Term-by-term matrix: entry (i, j) accumulates count_i * count_j over all
# documents. Use the explicit matrix-product operator `@`; `*` is only a
# matrix product for the legacy scipy.sparse *matrix* classes and is
# element-wise for sparse *arrays*.
co_occurrence_matrix = (X.T @ X).toarray()

# The diagonal holds each term's product with itself, which is not a
# co-occurrence with another word, so zero it out.
np.fill_diagonal(co_occurrence_matrix, 0)

# Get the words (features) in the matrix; row/column k corresponds to words[k]
words = vectorizer.get_feature_names_out()

print(words)
print(co_occurrence_matrix)


In [None]:
# How can you apply a regular expression to extract all email addresses from a text?
import re

# Example text
text = "You can reach out to us at support@example.com or sales@company.org."

# Regular expression for email addresses:
#   local part  : letters, digits and . _ % + -
#   domain      : letters, digits, dots and hyphens
#   TLD         : two or more letters. No upper bound, because many valid
#                 TLDs are longer than four letters (e.g. .museum); an
#                 upper bound of 4 would silently truncate such addresses.
email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

# Extract every non-overlapping match, in order of appearance
emails = re.findall(email_pattern, text)
print(emails)


In [None]:
# How can you perform word embedding using Word2Vec?
from gensim.models import Word2Vec

# Example sentences (corpus): one list of tokens per sentence
sentences = [["natural", "language", "processing", "is", "fun"],
             ["machine", "learning", "is", "cool"],
             ["text", "mining", "is", "interesting"]]

# Train a Word2Vec model.
# min_count=1 keeps every word, which is needed for a corpus this small.
# Training is stochastic: fix the seed and use a single worker thread so
# the printed vector is repeatable between runs (full reproducibility
# across interpreter launches additionally requires a fixed PYTHONHASHSEED).
model = Word2Vec(sentences, min_count=1, seed=42, workers=1)

# Get the word vector for a specific word
vector = model.wv['language']
print(vector)


In [None]:
# How can you use Doc2Vec to embed documents?
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

# Example corpus with tags for each document
documents = [
    TaggedDocument(words=["natural", "language", "processing"], tags=["doc1"]),
    TaggedDocument(words=["machine", "learning", "is", "great"], tags=["doc2"]),
    TaggedDocument(words=["text", "mining", "is", "fun"], tags=["doc3"])
]

# Train a Doc2Vec model.
# Training and inference are stochastic: fix the seed and use a single
# worker thread (instead of 4) so results are repeatable between runs;
# parallelism gives no benefit on a three-document corpus anyway.
model = Doc2Vec(documents, vector_size=20, window=2, min_count=1,
                workers=1, seed=42)

# Infer a vector for a list of tokens. Note that this runs a fresh
# inference pass for the given words rather than looking up the vector
# that was learned for the tag "doc2" during training.
vector = model.infer_vector(["machine", "learning", "is", "great"])
print(vector)


In [None]:
# How can you perform part-of-speech tagging?
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# pos_tag() needs the averaged-perceptron tagger model, which is not
# bundled with NLTK; without it the call raises LookupError on a fresh
# install. Older NLTK releases load 'averaged_perceptron_tagger', NLTK 3.9+
# loads 'averaged_perceptron_tagger_eng', so fetch both.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Example sentence
sentence = "Natural language processing is fascinating."

# Tokenize the sentence
words = word_tokenize(sentence)

# Perform POS tagging: yields a list of (token, tag) pairs
tagged_words = pos_tag(words)
print(tagged_words)


In [None]:
# How can you find the similarity between two sentences using cosine similarity?
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# The two sentences to compare
sentence1 = "Natural language processing is amazing."
sentence2 = "NLP is a field of artificial intelligence."

# Bag-of-words vectorizer; fitting on both sentences gives them a shared
# vocabulary, so their count vectors are directly comparable
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform([sentence1, sentence2])

# One row of the matrix per sentence
first_vector, second_vector = vectors[0], vectors[1]

# Cosine similarity between the two rows (returned as a 1x1 array)
similarity = cosine_similarity(first_vector, second_vector)
print(similarity)


In [None]:
# How can you extract named entities from a sentence?
import spacy

# Sentence to analyse
sentence = "Apple is looking to buy a startup in the UK for $1 billion."

# Load the small English spaCy pipeline and run it over the sentence
nlp = spacy.load("en_core_web_sm")
doc = nlp(sentence)

# Collect a (text, label) pair for every entity spaCy detected
named_entities = list(map(lambda ent: (ent.text, ent.label_), doc.ents))
print(named_entities)


In [None]:
# How can you split a large document into smaller chunks of text?
def split_text(text, chunk_size=500):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` characters.

    Chunks are cut purely by character position, so a chunk may end in the
    middle of a word. Every chunk has exactly ``chunk_size`` characters
    except possibly the last one, and joining the chunks reproduces ``text``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of substrings of ``text`` in original order. An empty
        ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative step would make range() empty and silently drop all of the
    # text, and a zero step fails with an opaque range() error, so reject
    # both up front with a clear message.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

# Build a long sample document by repeating one sentence 100 times
document = 100 * "This is a very long document. "

# Cut the document into chunks using the default chunk size
chunks = split_text(document)

# Show only the first two chunks to keep the output short
print(chunks[0:2])


In [None]:
# How can you calculate the TF-IDF (Term Frequency - Inverse Document Frequency) for a set of documents?
from sklearn.feature_extraction.text import TfidfVectorizer

# Small sample collection: one string per document
documents = [
    "Natural language processing is amazing.",
    "Machine learning is a subfield of artificial intelligence.",
    "Text processing includes tokenization, lemmatization, and stemming."
]

# Learn the vocabulary and IDF weights, then produce the sparse TF-IDF
# matrix (one row per document)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Densify the sparse matrix so the weights can be printed
tfidf_array = tfidf_matrix.toarray()
print(tfidf_array)


In [None]:

# How can you apply tokenization, stopword removal, and stemming in one go?
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download necessary resources.
# word_tokenize() loads 'punkt' on older NLTK releases and 'punkt_tab' on
# NLTK 3.9+, so fetch both to avoid a LookupError on a fresh environment.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Example sentence
sentence = "This is an example sentence with stopwords."

# Tokenize the sentence
words = word_tokenize(sentence)

# Remove stopwords (compared case-insensitively; punctuation tokens are kept)
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words]

# Apply stemming to each remaining token
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in filtered_words]

print(stemmed_words)



In [None]:
# How can you visualize the frequency distribution of words in a sentence?
import matplotlib.pyplot as plt
from nltk import FreqDist
from nltk.tokenize import word_tokenize

# Text whose token frequencies will be charted
sentence = "Natural language processing is amazing. NLP is fun!"

# Split into tokens, then count how often each token occurs
words = word_tokenize(sentence)
freq_dist = FreqDist(words)

# Draw the frequency plot and make sure the figure is rendered
freq_dist.plot()
plt.show()
