In [None]:
# 1. How do you perform word tokenization using NLTK and plot a word frequency distribution?
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Make sure the Punkt tokenizer data required by word_tokenize is installed.
# Older NLTK releases look up "punkt" while newer ones look up "punkt_tab",
# so both are checked; the download only happens when the data is missing.
for resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{resource}")
    except LookupError:
        nltk.download(resource, quiet=True)

# Sample text
text = "This is a simple text for tokenization and word frequency analysis."

# Split the text into tokens (punctuation marks come back as separate tokens)
tokens = word_tokenize(text)

# Count how many times each token occurs
fdist = FreqDist(tokens)

# Plot the frequency distribution; the title lets the figure stand on its own
fdist.plot(title="Token frequency distribution")
plt.show()


In [None]:
# 2. How do you use SpaCy for dependency parsing of a sentence?
import spacy

# Small English pipeline shipped by SpaCy (must be installed beforehand)
nlp = spacy.load("en_core_web_sm")

# Sentence to analyse
sentence = "The quick brown fox jumps over the lazy dog."

# Run the full pipeline on the sentence
doc = nlp(sentence)

# One line per token: the token, its dependency label, and its syntactic head
for token in doc:
    print(f"{token.text} {token.dep_} {token.head.text}")


In [None]:
# 3. How do you use TextBlob for performing text classification based on polarity?
from textblob import TextBlob

# Text whose sentiment we want to score
text = "I love programming, it's so much fun!"

# Wrap the text in a TextBlob and read the polarity of its sentiment
blob = TextBlob(text)
polarity = blob.sentiment.polarity
print("Polarity:", polarity)

# Map the sign of the polarity onto a label: above zero is positive,
# below zero is negative, anything else is neutral
print(
    "Positive sentiment" if polarity > 0
    else "Negative sentiment" if polarity < 0
    else "Neutral sentiment"
)


In [None]:
# 4. How do you extract named entities from a text using SpaCy?
import spacy

# Small English pipeline shipped by SpaCy (must be installed beforehand)
nlp = spacy.load("en_core_web_sm")

# Text to scan for named entities
text = "Apple is looking to buy a startup in the UK for $1 billion."

# Run the pipeline; recognised entities are exposed on doc.ents
doc = nlp(text)

# One line per entity: its surface text followed by its label
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")


In [None]:
# 5. How can you calculate TF-IDF scores for a given text using Scikit-learn?
from sklearn.feature_extraction.text import TfidfVectorizer

# Three short documents used as the corpus
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
]

# Vectorizer with default settings
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the corpus and compute the TF-IDF matrix
# (one row per document, one column per vocabulary term)
X = vectorizer.fit_transform(corpus)

# Show the dense TF-IDF matrix first, then the terms that label its columns
print(X.toarray(), vectorizer.get_feature_names_out(), sep="\n")


In [None]:
# 1. How do you create a custom text classifier using NLTK's Naive Bayes classifier?
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize

# Labelled training examples as (text, label) tuples:
# the positive sentences come first, followed by the negative ones
train_data = [
    (sentence, "positive")
    for sentence in ("I love programming", "This is a great book")
] + [
    (sentence, "negative")
    for sentence in ("I hate this movie", "The weather is bad today")
]

# Bag-of-words presence features
def extract_features(text):
    """Tokenize ``text`` with ``word_tokenize`` and return a dict mapping each token to ``True``."""
    return dict.fromkeys(word_tokenize(text), True)

# Turn every (text, label) pair into a (feature dict, label) pair
train_set = [(extract_features(sentence), tag) for sentence, tag in train_data]

# Fit the Naive Bayes classifier on the featurized examples
classifier = NaiveBayesClassifier.train(train_set)

# Classify a new sentence and show the predicted label
test_sentence = "I love this book"
predicted_label = classifier.classify(extract_features(test_sentence))
print(predicted_label)


In [None]:
# 2. How do you use a pre-trained model from Hugging Face for text classification?
from transformers import pipeline

# Build a sentiment-analysis pipeline; no model is named here, so the
# library picks the checkpoint for this task
classifier = pipeline(task="sentiment-analysis")

# Run the pipeline on one sentence and show the raw result
text = "I absolutely love this product!"
result = classifier(text)
print(result)


In [None]:
# 3. How do you perform text summarization using Hugging Face transformers?
from transformers import pipeline

# Build a summarization pipeline; no model is named here, so the
# library picks the checkpoint for this task
summarizer = pipeline(task="summarization")

# Input to summarize, written as explicit pieces so the leading newline,
# the line break and the trailing whitespace are all visible
text = (
    "\n"
    "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. \n"
    "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.\n"
)

# Summarize with sampling disabled and the summary length bounded to 25-50
summary = summarizer(text, min_length=25, max_length=50, do_sample=False)
print(summary)


In [None]:
# 4. How can you create a simple RNN for text classification using Keras?
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Embedding, GlobalAveragePooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Example text data
texts = ["I love programming", "This is a great book", "I hate this movie", "The weather is bad today"]
# The targets are a NumPy array rather than a plain Python list: model.fit()
# raises a ValueError when NumPy inputs are paired with a list of ints.
labels = np.array([1, 1, 0, 0])  # 1 for positive, 0 for negative

# Map words to integer ids, then pad every sequence (at the end) to a common length
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X, padding='post')

# Build RNN model: embedding -> SimpleRNN -> single sigmoid unit for the binary label
model = Sequential()
# input_dim is the vocabulary size plus one because word_index ids start at 1 and 0 is the padding value
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=8, input_length=X.shape[1]))
model.add(SimpleRNN(32))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, labels, epochs=5)


In [None]:
# 5. How do you train a Bidirectional LSTM for text classification?
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Bidirectional, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Example text data
texts = ["I love programming", "This is a great book", "I hate this movie", "The weather is bad today"]
# The targets are a NumPy array rather than a plain Python list: model.fit()
# raises a ValueError when NumPy inputs are paired with a list of ints.
labels = np.array([1, 1, 0, 0])  # 1 for positive, 0 for negative

# Map words to integer ids, then pad every sequence (at the end) to a common length
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X, padding='post')

# Build Bidirectional LSTM model: embedding -> LSTM read in both directions -> sigmoid unit
model = Sequential()
# input_dim is the vocabulary size plus one because word_index ids start at 1 and 0 is the padding value
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=8, input_length=X.shape[1]))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, labels, epochs=5)


In [None]:

# 6. How do you implement GRU (Gated Recurrent Unit) for text classification?
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Example text data
texts = ["I love programming", "This is a great book", "I hate this movie", "The weather is bad today"]
# The targets are a NumPy array rather than a plain Python list: model.fit()
# raises a ValueError when NumPy inputs are paired with a list of ints.
labels = np.array([1, 1, 0, 0])  # 1 for positive, 0 for negative

# Map words to integer ids, then pad every sequence (at the end) to a common length
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X, padding='post')

# Build GRU model: embedding -> GRU -> single sigmoid unit for the binary label
model = Sequential()
# input_dim is the vocabulary size plus one because word_index ids start at 1 and 0 is the padding value
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=8, input_length=X.shape[1]))
model.add(GRU(32))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, labels, epochs=5)


In [None]:
# 7. How do you implement a text generation model using LSTM with Keras?
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation
import numpy as np

# Example text data for text generation (simple)
text = "hello world this is a simple text generation model"

# Character vocabulary in sorted order, plus lookup tables in both directions
chars = sorted(set(text))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
idx_to_char = {idx: ch for ch, idx in char_to_idx.items()}

# Prepare input-output pairs: every window of seq_length characters is an
# input, and the character that immediately follows it is the target
seq_length = 5
sequences = []
next_chars = []
for i in range(len(text) - seq_length):
    sequences.append(text[i:i+seq_length])
    next_chars.append(text[i+seq_length])

# One-hot encode the windows and their targets.
# The builtin bool is used as dtype because the np.bool alias was removed
# from NumPy (accessing it raises AttributeError on current releases).
# X has shape (num_sequences, seq_length, vocab_size); y has shape (num_sequences, vocab_size)
X = np.zeros((len(sequences), seq_length, len(chars)), dtype=bool)
y = np.zeros((len(sequences), len(chars)), dtype=bool)

for i, seq in enumerate(sequences):
    for t, char in enumerate(seq):
        X[i, t, char_to_idx[char]] = 1
    y[i, char_to_idx[next_chars[i]]] = 1

# Character-level generator: one LSTM layer reads the one-hot window, then a
# dense layer with one unit per character is turned into probabilities by softmax
model = Sequential([
    LSTM(128, input_shape=X.shape[1:]),
    Dense(len(chars)),
    Activation('softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Fit on the (window, next character) pairs
model.fit(X, y, epochs=50)
