<a href="https://colab.research.google.com/github/charansanthosh1675/Home-Assignment-3_charan_santhosh_gudiseva/blob/main/home_assignment_3_charan_santhosh_gudiseva.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
# Task 1: RNN for Text Generation
# Build LSTM model to learn character sequences

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical

# Sample input text
text = "The Little Prince is a novella by Antoine de Saint-Exupéry."

# Create character mappings
chars = sorted(set(text))
char2idx = {c: i for i, c in enumerate(chars)}
idx2char = {i: c for c, i in char2idx.items()}
vocab_size = len(chars)

# Prepare input sequences and target characters
seq_length = 20
step = 3
X, y = [], []
for i in range(0, len(text) - seq_length, step):
    seq = text[i:i + seq_length]
    target = text[i + seq_length]
    X.append([char2idx[c] for c in seq])
    y.append(char2idx[target])

# One-hot encode inputs and targets
X = to_categorical(X, num_classes=vocab_size)
y = to_categorical(y, num_classes=vocab_size)

# Define LSTM model
model = Sequential([
    LSTM(128, input_shape=(seq_length, vocab_size)),
    Dense(vocab_size, activation='softmax')
])
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.fit(X, y, batch_size=4, epochs=10)

# Function to sample next character based on temperature
def sample(preds, temperature=1.0):
    preds = np.log(preds + 1e-10) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return np.random.choice(len(preds), p=preds)

# Function to generate text using trained model
def generate_text(seed, length=100, temp=1.0):
    result = seed
    for _ in range(length):
        x_pred = to_categorical([[char2idx.get(c, 0) for c in result[-seq_length:]]], num_classes=vocab_size)
        pred = model.predict(x_pred, verbose=0)[0]
        next_char = idx2char[sample(pred, temp)]
        result += next_char
    return result

# Generate and print new text
print(generate_text("The Little Prince ", 100, temp=0.8))


Epoch 1/10


  super().__init__(**kwargs)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - loss: 3.3167
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 3.2449
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - loss: 3.1548
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 2.9803
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 2.6825
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 2.2103
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 2.2148
Epoch 8/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 2.1979
Epoch 9/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 2.0596
Epoch 10/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 2.0752
The Little Prince   en  -nyiei

In [24]:
# Task 2: NLP Preprocessing
# Tokenize, remove stopwords, apply stemming

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')  # Download English stopwords

# Basic tokenizer using regex (avoids nltk punkt issue)
def simple_tokenize(text):
    return re.findall(r'\b\w+\b', text)

# Preprocessing pipeline function
def preprocess(sentence):
    tokens = simple_tokenize(sentence)  # Step 1: Tokenization
    stop_words = set(stopwords.words('english'))
    tokens_no_stop = [word for word in tokens if word.lower() not in stop_words]  # Step 2: Remove stopwords
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(word) for word in tokens_no_stop]  # Step 3: Stemming
    print("Original Tokens:", tokens)
    print("Without Stopwords:", tokens_no_stop)
    print("Stemmed:", stemmed)

# Run preprocessing on example sentence
preprocess("NLP techniques are used in virtual assistants like Alexa and Siri.")


Original Tokens: ['NLP', 'techniques', 'are', 'used', 'in', 'virtual', 'assistants', 'like', 'Alexa', 'and', 'Siri']
Without Stopwords: ['NLP', 'techniques', 'used', 'virtual', 'assistants', 'like', 'Alexa', 'Siri']
Stemmed: ['nlp', 'techniqu', 'use', 'virtual', 'assist', 'like', 'alexa', 'siri']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
# Task 3: Named Entity Recognition using SpaCy
# Extract named entities and display their details

import spacy
nlp = spacy.load("en_core_web_sm")  # Load small English model

# Sample sentence for NER
sentence = "Barack Obama served as the 44th President of the United States and won the Nobel Peace Prize in 2009."
doc = nlp(sentence)

# Print entity text, type, and position
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}, Start: {ent.start_char}, End: {ent.end_char}")


Entity: Barack Obama, Label: PERSON, Start: 0, End: 12
Entity: 44th, Label: ORDINAL, Start: 27, End: 31
Entity: the United States, Label: GPE, Start: 45, End: 62
Entity: the Nobel Peace Prize, Label: WORK_OF_ART, Start: 71, End: 92
Entity: 2009, Label: DATE, Start: 96, End: 100


In [26]:
# Task 4: Scaled Dot-Product Attention
# Implement attention mechanism manually

import numpy as np
from scipy.special import softmax

# Function to compute scaled dot-product attention
def scaled_dot_product_attention(Q, K, V):
    d_k = Q.shape[-1]  # Dimension of key
    scores = np.dot(Q, K.T) / np.sqrt(d_k)  # Dot product + scaling
    weights = softmax(scores, axis=1)       # Softmax to get attention weights
    output = np.dot(weights, V)             # Multiply weights with V
    print("Attention Weights:\n", weights)
    print("Output:\n", output)

# Define sample Q, K, V matrices
Q = np.array([[1, 0, 1, 0], [0, 1, 0, 1]])
K = np.array([[1, 0, 1, 0], [0, 1, 0, 1]])
V = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])

# Run attention function
scaled_dot_product_attention(Q, K, V)


Attention Weights:
 [[0.73105858 0.26894142]
 [0.26894142 0.73105858]]
Output:
 [[2.07576569 3.07576569 4.07576569 5.07576569]
 [3.92423431 4.92423431 5.92423431 6.92423431]]


In [27]:
# Task 5: Sentiment Analysis using HuggingFace Transformers
# Load pretrained sentiment classifier and analyze text

from transformers import pipeline

# Load sentiment analysis model
classifier = pipeline("sentiment-analysis")

# Example input sentence
text = "Despite the high price, the performance of the new MacBook is outstanding."

# Run classifier and print result
result = classifier(text)[0]
print(f"Sentiment: {result['label']}")
print(f"Confidence Score: {result['score']:.4f}")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Sentiment: POSITIVE
Confidence Score: 0.9998
