# 1. Implementing an RNN for Text Generation

In [8]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import get_file

# 1) Grab the Shakespeare text
url  = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
path = get_file('shakespeare.txt', origin=url)
text = open(path, 'r', encoding='utf-8').read().lower()
print(f'Corpus length: {len(text)} chars')

# 2) Build the char ↔ integer mappings
chars      = sorted(set(text))
vocab_size = len(chars)
char2idx   = {c:i for i, c in enumerate(chars)}
idx2char   = np.array(chars)
text_ids   = np.array([char2idx[c] for c in text])

# 3) Prepare (input, target) sequence pairs
seq_length = 100
ds = tf.data.Dataset.from_tensor_slices(text_ids)
sequences = ds.batch(seq_length+1, drop_remainder=True)

def split_xy(chunk):
    return chunk[:-1], chunk[1:]

dataset = (
    sequences
    .map(split_xy)
    .shuffle(10_000)
    .batch(64, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

# 4) Build a stateless LSTM model (no batch_size or stateful)
embedding_dim = 64
rnn_units     = 256

model = Sequential([
    # only specify input_shape, not batch_input_shape
    Embedding(vocab_size, embedding_dim, input_shape=(None,)),
    LSTM(rnn_units, return_sequences=True),
    Dense(vocab_size)
])

model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
)
model.summary()

# 5) Train
model.fit(dataset, epochs=20)

# 6) Text-generation util (stateless sampling one step at a time)
def sample_with_temperature(probs, temperature=1.0):
    # probs: 1D softmaxed logits
    logits = np.log(probs + 1e-8) / temperature
    exp    = np.exp(logits)
    p      = exp / exp.sum()
    return np.random.choice(len(p), p=p)

def generate_text(start_string, num_chars=500, temperature=1.0):
    # Turn start_string into ID sequence
    input_ids = [char2idx[s] for s in start_string.lower()]
    text_out  = []

    for _ in range(num_chars):
        # feed the **entire** sequence so far:
        input_array = np.expand_dims(input_ids, 0)      # shape (1, len)
        preds = model.predict(input_array, verbose=0)   # (1, len, vocab_size)
        last_preds = preds[0, -1]                       # (vocab_size,)

        next_id = sample_with_temperature(
            tf.nn.softmax(last_preds).numpy(),
            temperature
        )
        # append and continue
        input_ids.append(next_id)
        text_out.append(idx2char[next_id])

    return start_string + ''.join(text_out)

# 7) Try it!
print(generate_text("To be, or not to be: ", temperature=0.7))


Corpus length: 1115394 chars


  super().__init__(**kwargs)


Epoch 1/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 501ms/step - loss: 2.9429
Epoch 2/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 464ms/step - loss: 2.1729
Epoch 3/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 463ms/step - loss: 1.9809
Epoch 4/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 467ms/step - loss: 1.8549
Epoch 5/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 463ms/step - loss: 1.7624
Epoch 6/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 463ms/step - loss: 1.6947
Epoch 7/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 462ms/step - loss: 1.6419
Epoch 8/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 464ms/step - loss: 1.6011
Epoch 9/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 514ms/step - loss: 1.5634
Epoch 10/20
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [9]:
# 7) Try it!
print(generate_text("To be, or not to be: ", temperature=0.7))

To be, or not to be: no lands,
which he call him to be a prething to the duke of roin:
leave him to the sence to longoment,
and find you have sorrow from both a cause.

hastings:
come to your richmost will apperate,
and take it hast she may be hastings
into the caperous art the searing and scarce?

angelo:
i take thy cast to fall of the earth.

gremio:
my lords, thy breath and i see the world
it is more to my strout of such a time that thou makes
which he shall be thine traitor'd home; and i live thee,
then, follow 


# 2. NLP Preprocessing Pipeline

In [None]:
import nltk
import nltk
nltk.download('punkt_tab')

# Download tokenizer and stopwords the first time you run
nltk.download('punkt')
nltk.download('stopwords')

In [13]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def nlp_preprocess(sentence: str):
    # 1. Tokenize
    original_tokens = word_tokenize(sentence)

    # 2. Remove stopwords
    stops = set(stopwords.words('english'))
    tokens_no_stop = [tok for tok in original_tokens if tok.lower() not in stops]

    # 3. Stem
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(tok.lower()) for tok in tokens_no_stop]

    # Print results
    print("Original Tokens:       ", original_tokens)
    print("Tokens Without Stopwords:", tokens_no_stop)
    print("Stemmed Words:         ", stemmed_tokens)

if __name__ == "__main__":
    text = "NLP techniques are used in virtual assistants like Alexa and Siri."
    nlp_preprocess(text)


Original Tokens:        ['NLP', 'techniques', 'are', 'used', 'in', 'virtual', 'assistants', 'like', 'Alexa', 'and', 'Siri', '.']
Tokens Without Stopwords: ['NLP', 'techniques', 'used', 'virtual', 'assistants', 'like', 'Alexa', 'Siri', '.']
Stemmed Words:          ['nlp', 'techniqu', 'use', 'virtual', 'assist', 'like', 'alexa', 'siri', '.']


# 3. Named Entity Recognition with SpaCy

In [14]:
import spacy

def extract_entities(text: str):
    # load the small English model
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    # iterate over the detected entities
    for ent in doc.ents:
        print(f"Entity: \"{ent.text}\"")
        print(f"  Label: {ent.label_}")
        print(f"  Start: {ent.start_char}, End: {ent.end_char}")
        print()

if __name__ == "__main__":
    sentence = (
        "Barack Obama served as the 44th President of the United States "
        "and won the Nobel Peace Prize in 2009."
    )
    extract_entities(sentence)

Entity: "Barack Obama"
  Label: PERSON
  Start: 0, End: 12

Entity: "44th"
  Label: ORDINAL
  Start: 27, End: 31

Entity: "the United States"
  Label: GPE
  Start: 45, End: 62

Entity: "the Nobel Peace Prize"
  Label: WORK_OF_ART
  Start: 71, End: 92

Entity: "2009"
  Label: DATE
  Start: 96, End: 100



# 4. Scaled Dot-Product Attention

In [15]:
import numpy as np

def scaled_dot_product_attention(Q, K, V):
    # 1) Dot product Q @ K^T
    scores = Q @ K.T

    # 2) Scale by sqrt(d)
    d = Q.shape[1]
    scaled_scores = scores / np.sqrt(d)

    # 3) Softmax along last axis
    exp_scores = np.exp(scaled_scores - np.max(scaled_scores, axis=1, keepdims=True))
    attention_weights = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    # 4) Weighted sum with V
    output = attention_weights @ V

    return attention_weights, output

# Example inputs
Q = np.array([[1, 0, 1, 0],
              [0, 1, 0, 1]])
K = np.array([[1, 0, 1, 0],
              [0, 1, 0, 1]])
V = np.array([[1,  2,  3,  4],
              [5,  6,  7,  8]])

attn_weights, attn_output = scaled_dot_product_attention(Q, K, V)
print("Attention Weights:\n", attn_weights)
print("\nAttention Output:\n", attn_output)

Attention Weights:
 [[0.73105858 0.26894142]
 [0.26894142 0.73105858]]

Attention Output:
 [[2.07576569 3.07576569 4.07576569 5.07576569]
 [3.92423431 4.92423431 5.92423431 6.92423431]]


# 5.Sentiment Analysis using HuggingFace Transformers

In [16]:
from transformers import pipeline

def analyze_sentiment(text: str):
    # 1) Load the pre-trained sentiment-analysis pipeline
    classifier = pipeline("sentiment-analysis")

    # 2) Analyze the input sentence
    result = classifier(text)[0]

    # 3) Print label and confidence
    label = result["label"]
    score = result["score"]
    print(f"Sentiment: {label}")
    print(f"Confidence Score: {score:.4f}")

if __name__ == "__main__":
    sentence = "Despite the high price, the performance of the new MacBook is outstanding."
    analyze_sentiment(sentence)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


Sentiment: POSITIVE
Confidence Score: 0.9998
