In [1]:
# If needed (first run):
# !pip install -q datasets

# ===== 1) Load and prepare text =====
from datasets import load_dataset
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Load a small quotes dataset
ds = load_dataset("Abirate/english_quotes")
quotes = ds["train"]["quote"]

# (Optional) you can filter very short quotes to reduce noise
quotes = [q.strip() for q in quotes if isinstance(q, str) and len(q.split()) >= 3]

# ===== 2) Tokenize and build n-gram training sequences =====
# Cap the vocab to keep the model small; increase if you like
VOCAB_CAP = 20000
tokenizer = Tokenizer(num_words=VOCAB_CAP, oov_token="<OOV>")
tokenizer.fit_on_texts(quotes)

sequences = []
for q in quotes:
    token_list = tokenizer.texts_to_sequences([q])[0]
    # build n-grams: [w1,w2] -> label w3 ; [w1,w2,w3] -> label w4 ; ...
    for i in range(1, len(token_list)):
        ngram = token_list[: i + 1]  # includes the label at the end
        sequences.append(ngram)

# Pad to the longest sequence length
max_seq_len = max(len(s) for s in sequences)
sequences = pad_sequences(sequences, maxlen=max_seq_len, padding="pre")

# Inputs are all tokens except last; labels are last token
X = sequences[:, :-1]
y = sequences[:, -1]  # integers
vocab_size = min(VOCAB_CAP, len(tokenizer.word_index) + 1)

print(f"Num sequences: {len(sequences):,} | Max seq len: {max_seq_len} | Vocab size: {vocab_size:,}")

# ===== 3) Build & train the LSTM next-word model =====
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_seq_len - 1),
    LSTM(256),
    Dense(vocab_size, activation="softmax"),
])

model.compile(
    loss="sparse_categorical_crossentropy",  # y is integer class id
    optimizer="adam",
    metrics=["sparse_categorical_accuracy"],
)

callbacks = [
    EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True),
    ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, verbose=1),
]

history = model.fit(
    X, y,
    validation_split=0.1,
    epochs=5,            # bump up to 20+ for better results
    batch_size=256,
    shuffle=True,
    callbacks=callbacks,
    verbose=1
)

# ===== 4) Text generation with temperature sampling =====
index_to_word = tokenizer.index_word  # maps token id -> word

def sample_from_probs(probs, temperature: float = 1.0, rng: np.random.Generator | None = None):
    """Sample an index from a probability vector with temperature."""
    if temperature <= 0:
        return int(np.argmax(probs))
    logits = np.log(np.maximum(probs, 1e-9)) / float(temperature)
    exp = np.exp(logits - np.max(logits))
    p = exp / np.sum(exp)
    rng = rng or np.random.default_rng()
    return int(rng.choice(len(p), p=p))

def generate_text(seed_text: str, num_words: int, temperature: float = 0.8):
    """
    Greedy/temperature sampling next-word generation.
    - seed_text: starting prompt
    - num_words: how many words to add
    - temperature: lower = greedier, higher = more random (e.g. 0.7–1.2)
    """
    text = seed_text.strip()
    for _ in range(num_words):
        # encode and pad
        seq = tokenizer.texts_to_sequences([text])[0]
        if not seq:
            # if all words OOV, start fresh with OOV token
            seq = [tokenizer.word_index.get("<OOV>", 1)]
        seq = pad_sequences([seq], maxlen=max_seq_len - 1, padding="pre")

        # predict next-token distribution
        preds = model.predict(seq, verbose=0)[0]  # (vocab_size,)
        next_id = sample_from_probs(preds, temperature=temperature)

        # map id->word; if missing, skip
        word = index_to_word.get(next_id, "")
        if not word:
            # fallback to argmax if sampled OOV or unknown index
            next_id = int(np.argmax(preds))
            word = index_to_word.get(next_id, "")
            if not word:
                break
        text += " " + word
    return text

# ===== 5) Demo generation =====
print("\n--- Samples ---")
print(generate_text("Once upon a time", 20, temperature=0.8))
print(generate_text("The meaning of life is", 20, temperature=0.9))
print(generate_text("Happiness comes from", 20, temperature=0.7))


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Generating train split: 100%|██████████| 2508/2508 [00:00<00:00, 338350.42 examples/s]


Num sequences: 77,981 | Max seq len: 750 | Vocab size: 8,109
Epoch 1/5




[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m376s[0m 1s/step - loss: 6.6885 - sparse_categorical_accuracy: 0.0393 - val_loss: 6.5493 - val_sparse_categorical_accuracy: 0.0403 - learning_rate: 0.0010
Epoch 2/5
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m369s[0m 1s/step - loss: 6.2049 - sparse_categorical_accuracy: 0.0618 - val_loss: 6.4125 - val_sparse_categorical_accuracy: 0.0736 - learning_rate: 0.0010
Epoch 3/5
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m371s[0m 1s/step - loss: 5.9568 - sparse_categorical_accuracy: 0.0834 - val_loss: 6.2936 - val_sparse_categorical_accuracy: 0.0913 - learning_rate: 0.0010
Epoch 4/5
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 1s/step - loss: 5.7315 - sparse_categorical_accuracy: 0.1028 - val_loss: 6.2472 - val_sparse_categorical_accuracy: 0.1032 - learning_rate: 0.0010
Epoch 5/5
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m376s[0m 1s/step - loss: 5.5412 - sparse_cate

In [15]:
import textwrap

def generate_text(seed_text: str, num_words: int, temperature: float = 0.8, wrap_width: int = 80):
    """
    Greedy/temperature sampling next-word generation.
    - seed_text: starting prompt
    - num_words: how many words to add
    - temperature: lower = greedier, higher = more random (e.g. 0.7–1.2)
    - wrap_width: max characters per line for output formatting
    """
    text = seed_text.strip()
    for _ in range(num_words):
        # encode and pad
        seq = tokenizer.texts_to_sequences([text])[0]
        if not seq:
            seq = [tokenizer.word_index.get("<OOV>", 1)]
        seq = pad_sequences([seq], maxlen=max_seq_len - 1, padding="pre")

        # predict next-token distribution
        preds = model.predict(seq, verbose=0)[0]
        next_id = sample_from_probs(preds, temperature=temperature)

        word = index_to_word.get(next_id, "")
        if not word:
            next_id = int(np.argmax(preds))
            word = index_to_word.get(next_id, "")
            if not word:
                break
        text += " " + word

    # Format output
    print("Starting text:")
    print(textwrap.fill(seed_text.strip(), width=wrap_width))
    print("\nGenerated text:")
    generated_only = text[len(seed_text.strip()):].strip()
    print(textwrap.fill(generated_only, width=wrap_width))
    print("\n")

    return


In [17]:
generate_text("Once upon a time", 50, temperature=0.9)
generate_text("The meaning of life is", 50, temperature=0.8)
generate_text("Happiness comes from", 50, temperature=0.7)

Starting text:
Once upon a time

Generated text:
of so all that does to something jace like you can find are the own minded what
me her sit nor have so right so not as everything we have be mind ” sam ” when
that 'i running down to apollo and much myth was happy ” lies teaches


Starting text:
The meaning of life is

Generated text:
a things will be ” the pursuing if you mean it ” their way you can take my
single day it down in breathlessness how ” joy is the best tasted not can even ”
into a lease or your one natural when taken to wrap the defeats ” the


Starting text:
Happiness comes from

Generated text:
men ” love that i make someone reading i feel our dna ” with losing the sure ”
that nothing is to rather and someone to me if i speak that i say anything to be
half ” ” ” ” in the own way ” i can be no


