In [1]:
from datasets import load_dataset
import pandas as pd

# Load 2000 training and 500 test examples for speed

imdb = load_dataset("imdb")
train = pd.DataFrame(imdb["train"].select(range(2000)))
test  = pd.DataFrame(imdb["test"].select(range(500)))
print("Train size:", len(train), "Test size:", len(test))
train.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Train size: 2000 Test size: 500


Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [2]:
import re
def clean_text(text):
    text = re.sub(r"<br\s*/?>", " ", text)
    text = re.sub(r"[^\w\s']", "", text)
    return text.lower()

train["text"] = train["text"].apply(clean_text)
test["text"]  = test["text"].apply(clean_text)

In [5]:
pip install snorkel

Collecting snorkel
  Downloading snorkel-0.10.0-py3-none-any.whl.metadata (9.5 kB)
Collecting munkres>=1.0.6 (from snorkel)
  Downloading munkres-1.1.4-py2.py3-none-any.whl.metadata (980 bytes)
Downloading snorkel-0.10.0-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)
Installing collected packages: munkres, snorkel
Successfully installed munkres-1.1.4 snorkel-0.10.0


In [6]:
from snorkel.labeling import labeling_function, LFAnalysis
from snorkel.labeling.model import LabelModel

ABSTAIN, NEG, POS = -1, 0, 1
positive_words = {"great","excellent","amazing","wonderful","best","fantastic"}
negative_words = {"bad","terrible","awful","worst","boring","poor"}

@labeling_function()
def lf_positive(x):
    return POS if any(w in x.text.split() for w in positive_words) else ABSTAIN

@labeling_function()
def lf_negative(x):
    return NEG if any(w in x.text.split() for w in negative_words) else ABSTAIN

@labeling_function()
def lf_exclaim(x):
    return POS if x.text.count("!") > 2 else ABSTAIN
lfs = [lf_positive, lf_negative, lf_exclaim]

In [7]:
from snorkel.labeling import PandasLFApplier
applier = PandasLFApplier(lfs)
L_train = applier.apply(train)
LFAnalysis(L_train, lfs).lf_summary()

100%|██████████| 2000/2000 [00:00<00:00, 4079.07it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_positive,0,[1],0.321,0.1795,0.1795
lf_negative,1,[0],0.5635,0.1795,0.1795
lf_exclaim,2,[],0.0,0.0,0.0


In [8]:
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=42)

# Get probabilistic labels
train_probs = label_model.predict_proba(L_train)
train_preds = label_model.predict(L_train)

100%|██████████| 500/500 [00:00<00:00, 777.30epoch/s]


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np

# Vectorize

vectorizer = TfidfVectorizer(max_features=5_000)
X_train = vectorizer.fit_transform(train["text"])

# Filter out abstained predictions
valid_indices = np.where(train_preds != ABSTAIN)[0]
X_train_filtered = X_train[valid_indices]
y_train_filtered = train_preds[valid_indices]


# Fit classifier

clf = LogisticRegression(max_iter=200)
clf.fit(X_train_filtered, y_train_filtered)

# Evaluate on test set

X_test = vectorizer.transform(test["text"])
y_test = test["label"]
preds = clf.predict(X_test)
print(classification_report(y_test, preds, target_names=["neg","pos"], labels=[NEG, POS]))

              precision    recall  f1-score   support

         neg       1.00      1.00      1.00       500
         pos       0.00      0.00      0.00         0

    accuracy                           1.00       500
   macro avg       0.50      0.50      0.50       500
weighted avg       1.00      1.00      1.00       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
clf_fs = LogisticRegression(max_iter=200)

# Filter training data to include only samples with labels 0 or 1
train_filtered = train[train["label"].isin([0, 1])]

# Check if there is more than one unique class in the filtered training data
if len(train_filtered["label"].unique()) > 1:
    clf_fs.fit(X_train[train_filtered.index], train_filtered["label"])
    fs_preds = clf_fs.predict(X_test)
    print("Fully supervised performance:")
    print(classification_report(y_test, fs_preds, target_names=["neg","pos"]))
else:
    print("Filtered training data contains only one class. Cannot train fully supervised model with this subset.")

Filtered training data contains only one class. Cannot train fully supervised model with this subset.
