# Snorkel on IMDb

In [None]:
# Install the libraries this notebook imports into the active kernel's environment.
%pip install snorkel datasets scikit-learn pandas

## 1. Load and Explore the IMDb Dataset

In [None]:
from datasets import load_dataset
import pandas as pd

# The IMDb splits are stored grouped by label, so taking the first N rows
# yields a single-class sample. Shuffle with a fixed seed first, then keep
# 2000 training and 500 test examples for speed.
SAMPLE_SEED = 42

imdb = load_dataset("imdb")
train = pd.DataFrame(imdb["train"].shuffle(seed=SAMPLE_SEED).select(range(2000)))
test = pd.DataFrame(imdb["test"].shuffle(seed=SAMPLE_SEED).select(range(500)))

# Both sentiment classes must be present, otherwise the later training and
# evaluation cells are meaningless (or fail outright).
assert train["label"].nunique() == 2, "training sample contains a single class"
assert test["label"].nunique() == 2, "test sample contains a single class"

print("Train size:", len(train), "Test size:", len(test))
train.head()

## 2. Preprocess Text

In [None]:
import re
def clean_text(text):
    """Normalize a raw review for the keyword labeling functions and TF-IDF.

    Steps:
      * replace HTML ``<br>`` tags with a space;
      * replace punctuation with a space (so "bad--really" does not fuse into
        one token), keeping apostrophes and "!";
      * pad every "!" with spaces so it becomes its own token: "great!!" still
        contains the word "great" and still counts two exclamation marks;
      * collapse whitespace and lowercase.

    The function is idempotent, so re-running the cell that applies it does
    not change the text further.

    Args:
        text: Raw review string.

    Returns:
        The cleaned, lowercased string.
    """
    text = re.sub(r"<br\s*/?>", " ", text)
    # Keep "!" -- lf_exclaim counts it -- and apostrophes; blank out the rest.
    text = re.sub(r"[^\w\s'!]", " ", text)
    # Detach "!" from neighbouring words so whitespace tokenization still
    # recovers the word itself.
    text = text.replace("!", " ! ")
    return re.sub(r"\s+", " ", text).strip().lower()

# Normalize the review text of both splits, overwriting the "text" column.
for split_df in (train, test):
    split_df["text"] = split_df["text"].apply(clean_text)

## 3. Define Labeling Functions (LFs)

Create simple heuristics:

+ **lf_positive**: labels text as positive if it contains a strong positive word.
+ **lf_negative**: labels text as negative if it contains a strong negative word.
+ **lf_exclaim**: labels text as positive if it contains more than two exclamation marks (“!”).

In [None]:
from snorkel.labeling import labeling_function, LFAnalysis
# LabelModel is exposed by the `snorkel.labeling.model` subpackage; importing it
# from `snorkel.labeling` raises ImportError on current Snorkel releases.
from snorkel.labeling.model import LabelModel

# Label values emitted by the labeling functions. NEG/POS follow the IMDb
# encoding used in the evaluation cells (0 = "neg", 1 = "pos"); ABSTAIN means
# "this labeling function has no opinion".
ABSTAIN, NEG, POS = -1, 0, 1

# Keyword lexicons matched against whitespace-separated, lowercased tokens.
positive_words = {"great","excellent","amazing","wonderful","best","fantastic"}
negative_words = {"bad","terrible","awful","worst","boring","poor"}

@labeling_function()
def lf_positive(x):
    """Vote POS when any token of x.text is in positive_words; otherwise abstain."""
    tokens = x.text.split()
    if positive_words.isdisjoint(tokens):
        return ABSTAIN
    return POS

@labeling_function()
def lf_negative(x):
    """Vote NEG when any token of x.text is in negative_words; otherwise abstain."""
    tokens = x.text.split()
    if negative_words.isdisjoint(tokens):
        return ABSTAIN
    return NEG

@labeling_function()
def lf_exclaim(x):
    """Vote POS when x.text holds more than two "!" characters; otherwise abstain.

    Note: this can only fire if "!" is still present in x.text; if punctuation
    is stripped upstream, this labeling function always abstains.
    """
    if x.text.count("!") > 2:
        return POS
    return ABSTAIN


# The labeling functions handed to the applier.
lfs = [lf_positive, lf_negative, lf_exclaim]

### Analyze LF Coverage & Conflicts

In [None]:
from snorkel.labeling import PandasLFApplier

# Run every labeling function over the training frame to build the label matrix.
applier = PandasLFApplier(lfs)
L_train = applier.apply(train)

# Display the per-labeling-function summary table as the cell output.
lf_report = LFAnalysis(L_train, lfs)
lf_report.lf_summary()

## 4. Train the LabelModel

In [None]:
# Fit the label model on the label matrix produced by the labeling functions.
fit_options = {"n_epochs": 500, "log_freq": 100, "seed": 42}
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, **fit_options)

# Probabilistic labels, followed by hard label predictions.
# NOTE(review): with the default tie-break policy, predict() is expected to
# return -1 for rows it cannot decide -- confirm against the Snorkel docs.
train_probs = label_model.predict_proba(L_train)
train_preds = label_model.predict(L_train)

## 5. Train an End-to-End Classifier

Use a simple logistic regression on TF-IDF features:

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Vectorize every training document. X_train keeps all rows so it can be
# reused with the gold labels for the fully supervised comparison.
vectorizer = TfidfVectorizer(max_features=5_000)
X_train = vectorizer.fit_transform(train["text"])
y_train = train_preds

# The label model's predictions contain ABSTAIN (-1) for rows it could not
# label. Fitting on them would teach the classifier a spurious third class,
# so train only on the rows that received a NEG/POS label.
labeled_mask = y_train != ABSTAIN
print(f"Training on {labeled_mask.sum()} of {len(y_train)} weakly labeled examples")

# Fit classifier on the weakly labeled subset
clf = LogisticRegression(max_iter=200)
clf.fit(X_train[labeled_mask], y_train[labeled_mask])

# Evaluate on test set against the gold labels
X_test = vectorizer.transform(test["text"])
y_test = test["label"]
preds = clf.predict(X_test)
# Pin `labels` so the report always has exactly the two rows named below.
print(classification_report(y_test, preds, labels=[NEG, POS], target_names=["neg","pos"]))

## 6. Evaluate Weak Supervision vs. Fully Supervised

For comparison, train the same classifier on the 2,000 true labels.

In [None]:
# Baseline: same model and TF-IDF features, fit on the gold training labels.
clf_fs = LogisticRegression(max_iter=200).fit(X_train, train["label"])
fs_preds = clf_fs.predict(X_test)

print("Fully supervised performance:")
fs_report = classification_report(y_test, fs_preds, target_names=["neg","pos"])
print(fs_report)

**Next steps:**

+ Visualize confusion matrices and coverage–accuracy trade-off curves.