# Snorkel on IMDB

## 1. Load and Explore IMDb Dataset

In [None]:
from datasets import load_dataset
import pandas as pd

# Load IMDb dataset
imdb = load_dataset("stanfordnlp/imdb")

# Select subsets for faster experimentation.
# The IMDb splits are ordered by label, so taking the first N rows without
# shuffling yields reviews from a single class only. Shuffle with a fixed
# seed first so both classes are present and the subset is reproducible.
train = pd.DataFrame(imdb["train"].shuffle(seed=42).select(range(2000)))
test = pd.DataFrame(imdb["test"].shuffle(seed=42).select(range(500)))

print("Train size:", len(train), "Test size:", len(test))
train.head()

## 2. Preprocess Text

In [None]:
import re

def clean_text(text):
    """Normalize a raw review string for keyword matching and TF-IDF.

    Steps, in order:
      1. Replace HTML line breaks (``<br>``, ``<br/>``, ``<br />``) with a space.
      2. Surround every ``!`` with spaces so it survives as its own token.
      3. Replace all other punctuation except apostrophes with a space.
      4. Lowercase the result.

    Args:
        text: Raw review text.

    Returns:
        The cleaned, lowercased text. Whitespace is not collapsed; callers
        that tokenize with ``str.split()`` are unaffected by repeated spaces.
    """
    text = re.sub(r"<br\s*/?>", " ", text)
    # Keep "!" (as a standalone token) so heuristics that count exclamation
    # marks still have something to count, while "great!" still yields the
    # token "great" for whole-word keyword matching.
    text = text.replace("!", " ! ")
    # Substitute a space rather than the empty string: deleting punctuation
    # fuses neighbours ("boring...terrible" -> "boringterrible") and hides
    # both words from whole-token matching.
    text = re.sub(r"[^\w\s'!]", " ", text)
    return text.lower()

# Normalize the review text of both splits in place.
for split_df in (train, test):
    split_df["text"] = split_df["text"].apply(clean_text)


## 3. Define Labeling Functions (LFs)

We’ll define simple keyword heuristics that each vote positive, vote negative, or abstain; their votes serve as weak labels.

In [None]:
# LabelModel is exported from the `snorkel.labeling.model` subpackage, not
# from `snorkel.labeling`, in current Snorkel releases.
from snorkel.labeling import LFAnalysis, PandasLFApplier, labeling_function
from snorkel.labeling.model import LabelModel

# Label values used by the labeling functions. ABSTAIN means "no vote".
ABSTAIN, NEG, POS = -1, 0, 1

# Keyword lexicons matched against whole lowercase tokens of a review.
positive_words = {"great", "excellent", "amazing", "wonderful", "best", "fantastic"}
negative_words = {"bad", "terrible", "awful", "worst", "boring", "poor"}

@labeling_function()
def lf_positive(x):
    """Vote POS if any whitespace-delimited token of ``x.text`` is a positive keyword.

    Returns:
        POS when at least one token is in ``positive_words``, else ABSTAIN.
    """
    # Tokenize once and test set overlap, instead of re-splitting the text
    # and scanning the token list once per keyword.
    return ABSTAIN if positive_words.isdisjoint(x.text.split()) else POS

@labeling_function()
def lf_negative(x):
    """Vote NEG if any whitespace-delimited token of ``x.text`` is a negative keyword.

    Returns:
        NEG when at least one token is in ``negative_words``, else ABSTAIN.
    """
    # Tokenize once and test set overlap, instead of re-splitting the text
    # and scanning the token list once per keyword.
    return ABSTAIN if negative_words.isdisjoint(x.text.split()) else NEG

@labeling_function()
def lf_exclaim(x):
    """Vote POS when ``x.text`` contains more than two "!" characters, else abstain.

    NOTE(review): this can only fire if "!" is still present in ``x.text``
    when the LF is applied -- confirm the preprocessing step does not strip it.
    """
    if x.text.count("!") > 2:
        return POS
    return ABSTAIN

# Every labeling function to apply, collected in one list.
lfs = [
    lf_positive,
    lf_negative,
    lf_exclaim,
]


### Analyze LF Coverage & Conflicts

In [None]:
# Run every labeling function over the training frame to build the label matrix.
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=train)

# Per-LF summary table (shown as the cell output).
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

## 4. Train the Label Model

Snorkel's `LabelModel` combines the (possibly conflicting) LF votes into a single probabilistic label for each example.

In [None]:
# Two classes (NEG / POS), hence cardinality=2.
label_model = LabelModel(verbose=True, cardinality=2)
label_model.fit(L_train, seed=42, n_epochs=500, log_freq=100)

# Get weak labels: hard predictions and per-class probabilities.
train_preds = label_model.predict(L_train)
train_probs = label_model.predict_proba(L_train)


## 5. Train End-to-End Classifier on Weak Labels

We’ll use a simple TF-IDF + Logistic Regression classifier.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# TF-IDF vectorization. The vectorizer is fit on every training review so
# that X_train stays row-aligned with `train` (it is reused with the gold
# labels for the fully supervised comparison).
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train["text"])
y_train = train_preds

# The label model predicts ABSTAIN (-1) for examples it cannot decide
# (e.g. no LF voted, or the vote is tied). Training on those rows would
# teach the classifier a spurious third "-1" class, so drop them.
is_labeled = y_train != ABSTAIN

# Train classifier using weak labels (covered examples only)
clf = LogisticRegression(max_iter=200)
clf.fit(X_train[is_labeled], y_train[is_labeled])

# Evaluate
X_test = vectorizer.transform(test["text"])
y_test = test["label"]

preds = clf.predict(X_test)
print("Weakly Supervised Performance:")
# Pin `labels` so the report always has exactly the two rows named in
# `target_names`, even if a class is absent from the predictions.
print(classification_report(y_test, preds, labels=[NEG, POS], target_names=["neg", "pos"]))


## 6. Compare with Fully Supervised Training

In [None]:
# Baseline: same features and model family, but fit on the gold labels.
clf_fs = LogisticRegression(max_iter=200).fit(X_train, train["label"])
fs_preds = clf_fs.predict(X_test)

print("Fully Supervised Performance:")
fs_report = classification_report(y_test, fs_preds, target_names=["neg", "pos"])
print(fs_report)

## 7. Visualize Confusion Matrices

In [None]:
# Confusion matrices for the weakly supervised and fully supervised models.
cm_weak = confusion_matrix(y_test, preds)
cm_full = confusion_matrix(y_test, fs_preds)

# Draw both heatmaps side by side; each panel differs only in its data,
# title, and colormap.
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
panels = (
    (cm_weak, "Weakly Supervised", "Blues"),
    (cm_full, "Fully Supervised", "Greens"),
)
for axis, (matrix, title, cmap) in zip(ax, panels):
    sns.heatmap(matrix, annot=True, fmt="d", cmap=cmap, ax=axis)
    axis.set_title(title)
    axis.set_xlabel("Predicted")
    axis.set_ylabel("True")

plt.tight_layout()
plt.show()


## 8. Coverage–Accuracy Trade-Off

In [None]:
# lf_summary() only includes the "Emp. Acc." column when gold labels are
# supplied via Y; without it the scatterplot below has no such column to
# plot and raises.
summary = LFAnalysis(L_train, lfs).lf_summary(Y=train["label"].values)
sns.scatterplot(x="Coverage", y="Emp. Acc.", data=summary)
plt.title("LF Coverage vs Accuracy")
plt.show()