# 0) One-time setup (run first)

In [None]:
# Colab setup: install the third-party libraries used by the cells below.
# Each package is constrained with a wildcard version specifier (e.g. spacy==3.*),
# not an exact release, so the resolved versions can differ between runs.
# Optional: uncomment the next line to upgrade pip itself first on a fresh runtime.
# !python -m pip install -U pip
!pip install spacy==3.* transformers==4.* tokenizers==0.* sentencepiece==0.* \
               beautifulsoup4==4.* lxml==5.* regex==2024.* unidecode==1.*

# Download the small English spaCy model that later cells load with
# spacy.load("en_core_web_sm"). The command's own output advises restarting the
# runtime afterwards so the newly installed packages are picked up.
!python -m spacy download en_core_web_sm


Collecting unidecode==1.*
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.4.0
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m151.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart r

# Classical tokenization (whitespace + simple rules)

In [None]:
import re

# Sample sentence containing contractions, a URL and an e-mail address.
text = (
    "I can't believe it's not butter! "
    "Visit https://example.org, or email info@example.com."
)

# Baseline: cut on runs of whitespace only, so punctuation stays glued to the
# neighbouring word (e.g. 'butter!' or 'https://example.org,').
ws_tokens = text.split()
print("Whitespace:", ws_tokens)

# Optional rule-based variant (left disabled): punctuation becomes its own token
# while contractions such as "can't" stay whole. Uncomment both lines to compare.
# rule_tokens = re.findall(r"[A-Za-z]+(?:'[A-Za-z]+)?|[0-9]+|[^\sA-Za-z0-9]", text)
# print("Rule-based:", rule_tokens)


Whitespace: ['I', "can't", 'believe', "it's", 'not', 'butter!', 'Visit', 'https://example.org,', 'or', 'email', 'info@example.com.']


# Compare tokenizers (spaCy, BERT, GPT-2, T5)

In [None]:
import spacy
from transformers import AutoTokenizer

text = "I can’t believe it’s not butter!"  # note: curly apostrophes


def _token_strings(tokenizer, sentence):
    """Encode `sentence` without special tokens and map the ids back to token strings."""
    ids = tokenizer.encode(sentence, add_special_tokens=False)
    return tokenizer.convert_ids_to_tokens(ids)


# spaCy
nlp = spacy.load("en_core_web_sm")
spacy_toks = [token.text for token in nlp(text)]

# Hugging Face tokenizers, loaded by checkpoint name
bert_tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # BERT (WordPiece)
gpt2_tok = AutoTokenizer.from_pretrained("gpt2")               # GPT-2 (byte-level BPE)
t5_tok = AutoTokenizer.from_pretrained("t5-small")             # T5 (SentencePiece / Unigram LM)

bert_toks = _token_strings(bert_tok, text)
gpt2_toks = _token_strings(gpt2_tok, text)
t5_toks = _token_strings(t5_tok, text)

# Print the four tokenizations one per line for side-by-side comparison.
for label, toks in (
    ("spaCy:", spacy_toks),
    ("BERT:", bert_toks),
    ("GPT-2:", gpt2_toks),
    ("T5:", t5_toks),
):
    print(label, toks)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

spaCy: ['I', 'ca', 'n’t', 'believe', 'it', '’s', 'not', 'butter', '!']
BERT: ['i', 'can', '’', 't', 'believe', 'it', '’', 's', 'not', 'butter', '!']
GPT-2: ['I', 'Ġcan', 'âĢ', 'Ļ', 't', 'Ġbelieve', 'Ġit', 'âĢ', 'Ļ', 's', 'Ġnot', 'Ġbutter', '!']
T5: ['▁I', '▁can', '’', 't', '▁believe', '▁it', '’', 's', '▁not', '▁butter', '!']


In [None]:
# Tokenize a second sentence with the BERT tokenizer created in the comparison cell.
apple_ids = bert_tok.encode("I just bought my Apple watch!", add_special_tokens=False)
bert_toks2 = bert_tok.convert_ids_to_tokens(apple_ids)
print(bert_toks2)


['i', 'just', 'bought', 'my', 'apple', 'watch', '!']


# Unicode normalization (NFC vs NFKC)

In [None]:
import unicodedata

s1 = "café"                     # composed: 'é' U+00E9
s2 = "cafe\u0301"               # decomposed: 'e' + COMBINING ACUTE


def show_norm(s):
    """Return `s` together with its four Unicode normalization forms.

    The result maps "orig" to the untouched input, followed by one entry per
    form name ("NFC", "NFD", "NFKC", "NFKD") in that order.
    """
    forms = {"orig": s}
    for form in ("NFC", "NFD", "NFKC", "NFKD"):
        forms[form] = unicodedata.normalize(form, s)
    return forms


for sample in (s1, s2):
    print(show_norm(sample))

# The two spellings differ as raw code-point sequences but agree once both are
# brought to the same normalization form.
nfc_s1 = unicodedata.normalize("NFC", s1)
nfc_s2 = unicodedata.normalize("NFC", s2)
print("Equality (raw):", s1 == s2)
print("Equality (NFC):", nfc_s1 == nfc_s2)


{'orig': 'café', 'NFC': 'café', 'NFD': 'café', 'NFKC': 'café', 'NFKD': 'café'}
{'orig': 'café', 'NFC': 'café', 'NFD': 'café', 'NFKC': 'café', 'NFKD': 'café'}
Equality (raw): False
Equality (NFC): True


# Regex cleaning (remove emails/URLs/HTML)

In [None]:
import re
from bs4 import BeautifulSoup

raw = """
<p>Contact us at <b>info@example.com</b>! Visit https://example.org or <a href="https://example.com">our site</a>.
<div>New iPhones!!! <em>Limited</em> stock.</div>
"""

# Step 1: let BeautifulSoup (lxml parser) drop the markup; the extracted text
# pieces are joined with a single space.
soup = BeautifulSoup(raw, "lxml")
text = soup.get_text(separator=" ")

# Step 2: delete e-mail addresses first, then URLs.
for pattern in (
    r"\S+@\S+\.\S+",      # emails
    r"http[s]?://\S+",    # urls
):
    text = re.sub(pattern, "", text)

# Optional step 3 (left disabled): collapse extra whitespace.
# text = re.sub(r"\s+", " ", text).strip()

print(text)


Contact us at   ! Visit  or  our site .
 New iPhones!!!  Limited  stock. 



# Regex extraction (dates, simple IDs)

In [None]:
import re

doc = "Conference dates: 2024-10-12 and 2025-01-05. Ticket IDs: A-12345, B-987."

# Dates: four digits, two digits, two digits, dash-separated, on word boundaries.
date_pattern = re.compile(r"\b\d{4}-\d{2}-\d{2}\b")
# Ticket IDs: one capital letter, a dash, then three to six digits.
ticket_pattern = re.compile(r"\b[A-Z]-\d{3,6}\b")

dates = date_pattern.findall(doc)
ticket_ids = ticket_pattern.findall(doc)

for label, matches in (("Dates:", dates), ("Ticket IDs:", ticket_ids)):
    print(label, matches)


Dates: ['2024-10-12', '2025-01-05']
Ticket IDs: ['A-12345', 'B-987']


# Stopwords & punctuation choices (spaCy)

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

sent = "This is really not great!!! Honestly, I’m shocked."
doc = nlp(sent)

# Four views of the same parsed sentence, from unfiltered to most aggressive.
keep_all = [tok.text for tok in doc]
no_stop = [tok.text for tok in doc if not tok.is_stop]
no_punct = [tok.text for tok in doc if not tok.is_punct]
lemma_lower_no_stop_punct = [
    tok.lemma_.lower()
    for tok in doc
    if not (tok.is_stop or tok.is_punct)
]

views = {
    "Original:": keep_all,
    "No stopwords:": no_stop,
    "No punctuation:": no_punct,
    "Lemma+lower, no stop/punct:": lemma_lower_no_stop_punct,
}
for label, tokens in views.items():
    print(label, tokens)


Original: ['This', 'is', 'really', 'not', 'great', '!', '!', '!', 'Honestly', ',', 'I', '’m', 'shocked', '.']
No stopwords: ['great', '!', '!', '!', 'Honestly', ',', 'shocked', '.']
No punctuation: ['This', 'is', 'really', 'not', 'great', 'Honestly', 'I', '’m', 'shocked']
Lemma+lower, no stop/punct: ['great', 'honestly', 'shocked']


# End-to-end preprocessing pipeline (spaCy + regex)

In [None]:
import re, unicodedata, spacy

nlp = spacy.load("en_core_web_sm")

# Match only real http:// or https:// URLs, up to the next whitespace. A looser
# pattern such as r"http\S+" would also delete ordinary words that merely start
# with "http" (e.g. "httpd" or the bare word "https").
URL_RE = re.compile(r"https?://\S+")
# Match one HTML tag non-greedily; re.S lets "." cross newlines so a tag that is
# wrapped over several lines is still matched as a whole.
HTML_RE = re.compile(r"<.*?>", flags=re.S)

def preprocess(text: str) -> list[str]:
    """Clean raw text and reduce it to a list of lowercase lemmas.

    Steps: NFKC-normalize, strip HTML tags, strip URLs, run the spaCy pipeline,
    then keep the lowercased lemma of every token that is not a stop word,
    punctuation or whitespace.

    Args:
        text: Raw input string; may contain HTML markup and URLs.

    Returns:
        The non-empty lowercased lemmas, in document order.
    """
    # 1) Unicode normalization (NFKC)
    text = unicodedata.normalize("NFKC", text)

    # 2) Remove HTML, then URLs. Tags must go first: the URL pattern runs to the
    #    next whitespace, so for <a href="https://x.org">our site</a> it would
    #    swallow the '">' that closes the tag plus the first word of the link
    #    text. The broken remainder would then be matched by the HTML pattern
    #    up to the closing tag's '>', deleting the visible link text as well.
    text = HTML_RE.sub("", text)
    text = URL_RE.sub("", text)

    # 3) spaCy processing
    doc = nlp(text)

    # 4) Normalize: case-fold + lemmatize, filter stop/punct/space
    out = []
    for tok in doc:
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        lemma = tok.lemma_.lower().strip()
        if lemma:  # never emit an empty string for a token whose lemma strips to nothing
            out.append(lemma)
    return out

sample = "Apple is releasing the new iPhone!!! Visit https://apple.com for details."
print(preprocess(sample))
# Expected-style output: ['apple', 'release', 'new', 'iphone', 'visit', 'detail']


['apple', 'release', 'new', 'iphone', 'visit', 'detail']


In [None]:
# Tiny helper to show each pipeline stage
def show_stages(text: str):
    """Print `text` after every step of the cleaning pipeline.

    Printed stages, in order: the raw input, the NFKC-normalized text, the text
    with HTML tags and then URLs removed, the spaCy token texts, and finally the
    lowercased lemmas of tokens that are not stop words, punctuation or
    whitespace. The function is self-contained: it imports its dependencies and
    loads the spaCy model on each call. Returns None.
    """
    import re, unicodedata, spacy

    pipeline = spacy.load("en_core_web_sm")
    url_pattern = re.compile(r"http\S+")
    tag_pattern = re.compile(r"<.*?>", flags=re.S)

    print("RAW:", text)

    normalized = unicodedata.normalize("NFKC", text)
    print("NFKC:", normalized)

    # Tags are stripped before URLs.
    without_tags = tag_pattern.sub("", normalized)
    cleaned = url_pattern.sub("", without_tags)
    print("CLEAN:", cleaned)

    parsed = pipeline(cleaned)
    print("TOKENS:", [tok.text for tok in parsed])

    kept = []
    for tok in parsed:
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        kept.append(tok.lemma_.lower())
    print("LEMMA+FILTERED:", kept)


show_stages("Apple is releasing the new iPhone!!! Visit https://apple.com for details.")


RAW: Apple is releasing the new iPhone!!! Visit https://apple.com for details.
NFKC: Apple is releasing the new iPhone!!! Visit https://apple.com for details.
CLEAN: Apple is releasing the new iPhone!!! Visit  for details.
TOKENS: ['Apple', 'is', 'releasing', 'the', 'new', 'iPhone', '!', '!', '!', 'Visit', ' ', 'for', 'details', '.']
LEMMA+FILTERED: ['apple', 'release', 'new', 'iphone', 'visit', 'detail']


In [None]:
import re, unicodedata, spacy
nlp = spacy.load("en_core_web_sm")

# Match only real http:// or https:// URLs; a bare r"http\S+" would also replace
# ordinary words that merely start with "http" (e.g. "httpd").
URL_RE = re.compile(r"https?://\S+")
URL_PLACEHOLDER = "<URL>"

# By default the tokenizer splits "<URL>" into '<', 'URL', '>' and none of the
# pieces is filtered out, so the placeholder would surface as three tokens.
# Registering it as a special case keeps the exact string as a single token.
nlp.tokenizer.add_special_case(URL_PLACEHOLDER, [{"ORTH": URL_PLACEHOLDER}])


# NOTE: this redefines `preprocess` from the earlier pipeline cell; this variant
# replaces URLs with a placeholder token rather than deleting them.
def preprocess(t):
    """Turn raw text into lowercase lemmas, replacing each URL by '<url>'.

    Args:
        t: Raw input string.

    Returns:
        A list of strings: the lowercased lemma of every token that is not a
        stop word, punctuation or whitespace, with '<url>' standing in for
        each URL found in the input.
    """
    t = unicodedata.normalize("NFKC", t)
    # Pad with spaces so the placeholder is always its own whitespace-delimited
    # chunk (the special case only applies to an exact match); the extra
    # whitespace tokens are dropped by the is_space filter below.
    t = URL_RE.sub(f" {URL_PLACEHOLDER} ", t)
    doc = nlp(t)

    out = []
    for tok in doc:
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        if tok.text == URL_PLACEHOLDER:
            # Emit the placeholder directly so the result does not depend on
            # what lemma the pipeline assigns to it.
            out.append(URL_PLACEHOLDER.lower())
        else:
            out.append(tok.lemma_.lower())
    return out

print(preprocess("Apple is releasing the new iPhone!!! Visit https://apple.com"))
# → ['apple', 'release', 'new', 'iphone', 'visit', '<url>']


['apple', 'release', 'new', 'iphone', 'visit', '<', 'url', '>']
