<a href="https://colab.research.google.com/github/dhawan98/AI_ExplorerHub/blob/main/NLP_from_scratch_HF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# 🧱 NLP From Scratch — Next‑Word Model on a Real Hugging Face Dataset

This notebook **starts from scratch** and trains a tiny next‑word language model on a **real dataset from the Hugging Face Hub** (e.g., `ag_news`, `imdb`, or `wikitext-2-raw-v1`).  
It replaces any hard‑coded paragraphs or CSVs.

> **Notes**
> - The first cell installs `datasets` if needed. You need internet access to download datasets.
> - Default dataset is **`ag_news`**; switch it in the **Config** cell.
> - For quick runs, the dataset is **subsampled**; increase `MAX_ROWS` to train longer.
> - Model is a small LSTM baseline meant to be simple & fast; you can scale it up later.


In [1]:

# %%capture
# If running locally/Colab and you don't have these installed, uncomment:
# !pip install -q datasets tensorflow==2.*


In [2]:

# ============ Config ============
DATASET_NAME = "ag_news"          # examples: "ag_news", "imdb", "wikitext", "wikitext-2-raw-v1"
SPLIT = "train"                   # which split to train on initially; eval is built from a random 20%
TEXT_FIELDS_PRIORITY = [          # fields to search for text in the loaded dataset (in order)
    "text", "content", "review", "body", "document", "article", "sentence", "question", "title", "description"
]
MAX_ROWS = 10_000                 # subsample for speed; set None to use full split
APPLY_STRIP_FIRST_PARAGRAPH = False  # usually not needed for news/reviews
RANDOM_SEED = 42
EMBED_DIM = 64
LSTM_UNITS = 150
EPOCHS = 20
BATCH_SIZE = 64
TOP_K = 5                         # for next-word suggestions
MIN_SEQ_LEN = 3                   # skip rows that tokenize shorter than this


In [3]:

import re, random, math
import numpy as np
import pandas as pd
from datasets import load_dataset

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

def normalize_whitespace(s: str) -> str:
    if not isinstance(s, str):
        return ""
    return re.sub(r"\s+", " ", s).strip()

def strip_first_paragraph(text: str) -> str:
    if not isinstance(text, str):
        return ""
    parts = re.split(r"\n\s*\n", text.strip(), maxsplit=1)
    return parts[1] if len(parts) > 1 else text

print(tf.__version__)


2.19.0


In [4]:

# ============ Load dataset from the HF Hub ============
ds = load_dataset(DATASET_NAME, split=SPLIT)

# Convert to DataFrame for convenience
df = pd.DataFrame(ds)

# Find a usable text field
text_col = None
for c in TEXT_FIELDS_PRIORITY:
    if c in df.columns:
        text_col = c
        break

# Common composite fields for some datasets
if text_col is None:
    if {"title", "description"}.issubset(df.columns):
        df["text"] = (df["title"].fillna("") + " " + df["description"].fillna("")).map(normalize_whitespace)
        text_col = "text"
    elif {"question", "answer"}.issubset(df.columns):
        df["text"] = (df["question"].fillna("") + " " + df["answer"].fillna("")).map(normalize_whitespace)
        text_col = "text"
    else:
        # Last resort: try the first string-like column
        for c in df.columns:
            if df[c].dtype == object:
                text_col = c
                break

if text_col is None:
    raise ValueError("Could not find a usable text field. Try changing TEXT_FIELDS_PRIORITY or pick another dataset.")

# Keep only needed column
df = df[[text_col]].rename(columns={text_col: "text"})
df["text"] = df["text"].fillna("").map(normalize_whitespace)

if APPLY_STRIP_FIRST_PARAGRAPH:
    df["text"] = df["text"].map(strip_first_paragraph)

if MAX_ROWS is not None:
    df = df.sample(n=min(MAX_ROWS, len(df)), random_state=RANDOM_SEED).reset_index(drop=True)

print("Dataset:", DATASET_NAME, "| Split:", SPLIT, "| Text column:", "text")
print("Rows:", len(df))
df.head()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Dataset: ag_news | Split: train | Text column: text
Rows: 10000


Unnamed: 0,text
0,"BBC set for major shake-up, claims newspaper L..."
1,Marsh averts cash crunch Embattled insurance b...
2,"Jeter, Yankees Look to Take Control (AP) AP - ..."
3,Flying the Sun to Safety When the Genesis caps...
4,Stocks Seen Flat as Nortel and Oil Weigh NEW Y...


In [5]:

# ============ Train/Test split (holdout) ============
texts = df.loc[df["text"].str.len() > 0, "text"].tolist()
train_texts, test_texts = train_test_split(texts, test_size=0.2, random_state=RANDOM_SEED)
len(train_texts), len(test_texts)


(8000, 2000)

In [6]:

# ============ Tokenizer (fit on train only) ============
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

train_seqs = tokenizer.texts_to_sequences(train_texts)
test_seqs  = tokenizer.texts_to_sequences(test_texts)

def build_ngram_sequences(sequences, min_len=MIN_SEQ_LEN):
    out = []
    for seq in sequences:
        if len(seq) < min_len:
            continue
        for i in range(1, len(seq)):
            out.append(seq[:i+1])
    return out

train_ngrams = build_ngram_sequences(train_seqs)
test_ngrams  = build_ngram_sequences(test_seqs)

if not train_ngrams:
    raise ValueError("No training n-grams produced. Increase data size or pick a different dataset.")

max_len = max(len(s) for s in train_ngrams)
X_train = pad_sequences(train_ngrams, maxlen=max_len, padding="pre")[:, :-1]
y_train = pad_sequences(train_ngrams, maxlen=max_len, padding="pre")[:, -1]

if test_ngrams:
    X_test = pad_sequences(test_ngrams, maxlen=max_len, padding="pre")[:, :-1]
    y_test = pad_sequences(test_ngrams, maxlen=max_len, padding="pre")[:, -1]
else:
    X_test = np.zeros((0, max_len-1), dtype=np.int32)
    y_test = np.zeros((0,), dtype=np.int32)

vocab_size = len(tokenizer.word_index) + 1
max_len, vocab_size, X_train.shape, X_test.shape


(181, 23109, (306311, 180), (77411, 180))

In [7]:

# ============ Define tiny LSTM next-word model ============
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=EMBED_DIM, input_length=X_train.shape[1]),
    LSTM(LSTM_UNITS),
    Dense(vocab_size, activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()




In [None]:

# ============ Train (quick demo) ============
if X_train.shape[0] > 0:
    history = model.fit(
        X_train, y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_split=0.1,
        verbose=1
    )
else:
    print("Not enough training data; adjust MAX_ROWS or dataset choice.")


Epoch 1/20
[1m4308/4308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 17ms/step - accuracy: 0.0583 - loss: 7.7749 - val_accuracy: 0.1061 - val_loss: 7.0520
Epoch 2/20
[1m4308/4308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 18ms/step - accuracy: 0.1144 - loss: 6.5843 - val_accuracy: 0.1279 - val_loss: 6.7815
Epoch 3/20
[1m4308/4308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 18ms/step - accuracy: 0.1424 - loss: 5.9837 - val_accuracy: 0.1380 - val_loss: 6.7491
Epoch 4/20
[1m4308/4308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 18ms/step - accuracy: 0.1613 - loss: 5.5002 - val_accuracy: 0.1408 - val_loss: 6.7963
Epoch 5/20
[1m4308/4308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 18ms/step - accuracy: 0.1821 - loss: 5.0783 - val_accuracy: 0.1457 - val_loss: 6.8957
Epoch 6/20
[1m4308/4308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 17ms/step - accuracy: 0.2094 - loss: 4.7174 - val_accuracy: 0.1477 - val_loss: 7.0085
Epoc

In [None]:

# ============ Evaluate on test set ============
if X_test.shape[0] > 0:
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
    print({"test_loss": float(test_loss), "test_accuracy": float(test_acc)})
else:
    print("No test sequences available — increase MAX_ROWS or choose a longer-text dataset.")


In [None]:

# ============ Inference helper ============
index_word = {v:k for k,v in tokenizer.word_index.items()}

def predict_next_word(prompt: str, top_k: int = TOP_K):
    seq = tokenizer.texts_to_sequences([prompt])[0]
    if not seq:
        return []
    seq = pad_sequences([seq], maxlen=X_train.shape[1], padding="pre")
    probs = model.predict(seq, verbose=0)[0]
    top_idx = np.argsort(probs)[-top_k:][::-1]
    return [(index_word.get(int(i), "<UNK>"), float(probs[i])) for i in top_idx]

# Demo:
print(predict_next_word("the stock market"))


In [None]:

# ============ Save artifacts ============
import json, os, pickle

OUT_DIR = "artifacts"
os.makedirs(OUT_DIR, exist_ok=True)

# Save tokenizer
with open(os.path.join(OUT_DIR, "tokenizer_word_index.json"), "w") as f:
    json.dump(tokenizer.word_index, f)

# Save model
model.save(os.path.join(OUT_DIR, "next_word_lstm.h5"))
print("Saved:", os.listdir(OUT_DIR))
