In [None]:
"""
Project: Real-World Sentiment Analysis with DistilBERT
Description: Fine-tunes DistilBERT on IMDB (pos/neg) with emoji + HTML cleanup.
"""

# --- 1. Setup ---
!pip -q install transformers datasets emoji tf-keras

import re
import emoji
import numpy as np
import tensorflow as tf
import tf_keras  # IMPORTANT: use tf_keras optimizer with HF TF models
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Configuration
MODEL_NAME = "distilbert-base-uncased"
BATCH_SIZE = 16
MAX_LENGTH = 128
LEARNING_RATE = 5e-5
EPOCHS = 2

# --- 2. Preprocessor ---
def clean_text(text: str) -> str:
    text = emoji.demojize(text, delimiters=(" ", " "))
    text = re.sub(r"<.*?>", "", text)
    text = text.strip().lower()
    return text

# --- 3. Load & Prepare Data ---
print("Loading IMDB Dataset...")
dataset = load_dataset("imdb")

train_dataset = dataset["train"].shuffle(seed=42).select(range(2000))
test_dataset  = dataset["test"].shuffle(seed=42).select(range(500))

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    cleaned_texts = [clean_text(t) for t in examples["text"]]
    return tokenizer(
        cleaned_texts,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )

print("Tokenizing data...")
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test  = test_dataset.map(tokenize_function, batched=True)

# --- 4. Convert to TensorFlow datasets ---
# IMPORTANT: label_cols as a STRING to get (inputs, labels) cleanly
tf_train = tokenized_train.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols="label",
    shuffle=True,
    batch_size=BATCH_SIZE,
)

tf_test = tokenized_test.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols="label",
    shuffle=False,
    batch_size=BATCH_SIZE,
)

# Quick sanity check (labels must NOT be None)
x_batch, y_batch = next(iter(tf_train))
print("Batch keys:", x_batch.keys(), "| labels shape:", y_batch.shape)

# --- 5. Model ---
print(f"Downloading {MODEL_NAME}...")
model = TFAutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    use_safetensors=False,  # helps avoid safetensors edge cases in some TF setups
)

# IMPORTANT: use tf_keras optimizer, and DO NOT pass loss=model.compute_loss
optimizer = tf_keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer, metrics=["accuracy"])

# --- 6. Training ---
print("Starting Fine-Tuning...")
model.fit(tf_train, epochs=EPOCHS, validation_data=tf_test)

# --- 7. Evaluation ---
print("\nEvaluating on Test Set...")
results = model.evaluate(tf_test, return_dict=True)
print({k: float(v) for k, v in results.items()})

# --- 8. Inference Engine ---
def predict_sentiment(text: str) -> str:
    clean = clean_text(text)
    inputs = tokenizer(clean, return_tensors="tf", truncation=True, max_length=MAX_LENGTH)

    outputs = model(inputs)
    logits = outputs.logits
    probs = tf.nn.softmax(logits, axis=-1)[0].numpy()  # [neg, pos]

    neg_score, pos_score = float(probs[0]), float(probs[1])

    if pos_score > 0.6:
        label = "Positive üòä"
        conf = pos_score
    elif pos_score < 0.4:
        label = "Negative üò†"
        conf = neg_score
    else:
        label = "Neutral üòê"
        conf = max(pos_score, neg_score)

    return (
        f"Text: '{text}'\n"
        f"Prediction: {label}\n"
        f"Scores -> Negative: {neg_score:.2%}, Positive: {pos_score:.2%}\n"
        f"Confidence (chosen): {conf:.2%}\n"
    )

# --- 9. Demo ---
print("-" * 30)
print("LIVE DEMO")
print("-" * 30)
print(predict_sentiment("This movie was absolute trash üóëÔ∏è"))
print(predict_sentiment("I literally died laughing, best comedy ever üíÄ"))
print(predict_sentiment("The cinematography was okay, but the plot was boring."))


[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[90m‚ï∫[0m[90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m307.2/608.4 kB[0m [31m9.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m [32m604.2/608.4 kB[0m [31m14.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m608.4/608.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hLoading IMDB Dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(‚Ä¶):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizing data...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Batch keys: dict_keys(['input_ids', 'attention_mask']) | labels shape: (16,)
Downloading distilbert-base-uncased...


tf_model.h5:   0%|          | 0.00/363M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'activation_13', 'vocab_transform', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-

Starting Fine-Tuning...
Epoch 1/2
Epoch 2/2

Evaluating on Test Set...


TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


{'loss': 0.40068215131759644, 'accuracy': 0.8420000076293945}
------------------------------
LIVE DEMO
------------------------------
Text: 'This movie was absolute trash üóëÔ∏è'
Prediction: Negative üò†
Scores -> Negative: 98.86%, Positive: 1.14%
Confidence (chosen): 98.86%

Text: 'I literally died laughing, best comedy ever üíÄ'
Prediction: Positive üòä
Scores -> Negative: 5.56%, Positive: 94.44%
Confidence (chosen): 94.44%

Text: 'The cinematography was okay, but the plot was boring.'
Prediction: Negative üò†
Scores -> Negative: 98.89%, Positive: 1.11%
Confidence (chosen): 98.89%



In [None]:
import json
import os

# 1. Look at the top left of your screen for the name.
# If it says "Untitled0.ipynb", put that here.
actual_filename = "Untitled9.ipynb"  # <--- CHANGE THIS

in_path = f"/content/{Project-Real-World-Sentiment-Analysis-with-DistilBERT}"
out_path = f"/content/{actual_filename.replace('.ipynb', '_github.ipynb')}"

if os.path.exists(in_path):
    with open(in_path, "r", encoding="utf-8") as f:
        nb = json.load(f)

    # Clean metadata
    nb.get("metadata", {}).pop("widgets", None)

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(nb, f, ensure_ascii=False, indent=1)
    print(f"Success! Saved as {out_path}")
else:
    print(f"Error: Could not find '{in_path}'. Check the file name at the top left of Colab!")

In [2]:
import json
import os

# 1. Look at the top left of your screen for the name.
# If it says "Untitled0.ipynb", put that here.
actual_filename = "Untitled9.ipynb"  # <--- CHANGE THIS

in_path = f"/content/{Project-Real-World-Sentiment-Analysis-with-DistilBERT}"
out_path = f"/content/{Project-Real-World-Sentiment-Analysis-with-DistilBERT.replace('.ipynb', '_github.ipynb')}"

if os.path.exists(in_path):
    with open(in_path, "r", encoding="utf-8") as f:
        nb = json.load(f)

    # Clean metadata
    nb.get("metadata", {}).pop("widgets", None)

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(nb, f, ensure_ascii=False, indent=1)
    print(f"Success! Saved as {out_path}")
else:
    print(f"Error: Could not find '{in_path}'. Check the file name at the top left of Colab!")

SyntaxError: f-string: expecting '=', or '!', or ':', or '}' (ipython-input-4290509877.py, line 8)

In [3]:
import json
import os

# Put the notebook filename EXACTLY as shown in Colab (top-left)
actual_filename = "Untitled9.ipynb"  # <- change only if needed

in_path = f"/content/{actual_filename}"
out_path = f"/content/{actual_filename.replace('.ipynb', '_github.ipynb')}"

if os.path.exists(in_path):
    with open(in_path, "r", encoding="utf-8") as f:
        nb = json.load(f)

    # Remove widget metadata that breaks GitHub rendering
    nb.get("metadata", {}).pop("widgets", None)

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(nb, f, ensure_ascii=False, indent=1)

    print(f"Success! Saved as {out_path}")
else:
    print(f"Error: Could not find '{in_path}'. Check the notebook name in Colab.")


Error: Could not find '/content/Untitled9.ipynb'. Check the notebook name in Colab.


In [5]:
import glob
glob.glob("/content/*.ipynb")


[]

In [6]:
from google.colab import files
uploaded = files.upload()  # choose the .ipynb you just downloaded


Saving Untitled9.ipynb to Untitled9.ipynb


In [7]:
import glob
glob.glob("/content/*.ipynb")


['/content/Untitled9.ipynb']

In [8]:
from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


In [10]:
import json, glob

in_path = glob.glob("/content/drive.ipynb")[0]   # picks the first ipynb found
out_path = in_path.replace(".ipynb", "deshm084/Project-Real-World-Sentiment-Analysis-with-DistilBERT.ipynb")

with open(in_path, "r", encoding="utf-8") as f:
    nb = json.load(f)

nb.get("metadata", {}).pop("widgets", None)

with open(out_path, "w", encoding="utf-8") as f:
    json.dump(nb, f, ensure_ascii=False, indent=1)

print("Saved:", out_path)


IndexError: list index out of range

In [None]:
from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


In [None]:
import json, glob

in_path = glob.glob("/content/*.ipynb")[0]   # picks the first ipynb found
out_path = in_path.replace(".ipynb", "_github.ipynb")

with open(in_path, "r", encoding="utf-8") as f:
    nb = json.load(f)

nb.get("metadata", {}).pop("widgets", None)

with open(out_path, "w", encoding="utf-8") as f:
    json.dump(nb, f, ensure_ascii=False, indent=1)

print("Saved:", out_path)


Mounted at /content/drive
