In [9]:
import numpy as np
import torch
from sklearn.metrics import recall_score, classification_report
from snorkel.labeling.model import LabelModel
from tqdm.auto import tqdm

In [10]:
# Load the train/validation label matrices and the gold validation labels.
L_train, L_valid, Y_valid = map(
    np.load,
    ("./data/L_train.npy", "./data/L_valid.npy", "./data/Y_valid.npy"),
)

In [11]:
# Sanity check: Y_valid is later indexed with a mask built from predictions on
# L_valid, so both must have the same number of rows.
L_valid.shape, Y_valid.shape

((3042, 7), (3042,))

In [12]:
# Number of distinct classes in the gold validation labels; passed to
# LabelModel as its cardinality.
num_labels = np.unique(Y_valid).size

In [13]:
# Sweep the L2 regularisation strength and keep the label model with the
# highest macro-averaged recall on the validation set.
best_score = 0
best_model = None
best_l2 = None  # l2 value that produced best_model, kept for reporting
l2_values = np.arange(0.0, 0.1, 0.01)

# Train on the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs end to end on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

for l2 in tqdm(l2_values):
    label_model = LabelModel(cardinality=num_labels, verbose=False, device=device)
    label_model.fit(
        L_train=L_train,
        n_epochs=2000,
        lr_scheduler="linear",
        lr_scheduler_config={"warmup_percentage": 0.1, "warmup_unit": "epochs"},
        optimizer="adam",
        l2=l2,
        class_balance=None,
        progress_bar=False,
        seed=42,
        lr=0.01,
    )
    preds = label_model.predict(L_valid)

    # -1 marks an abstention; score only the points that received a label.
    # A single mask keeps labels and predictions aligned regardless of the
    # order in which the two arrays are filtered.
    covered = preds != -1
    labels = Y_valid[covered]
    preds = preds[covered]
    # If the model abstained everywhere there is nothing to score.
    score = recall_score(labels, preds, average="macro") if covered.any() else 0.0

    # ">=" means ties are resolved in favour of the later (larger) l2 value.
    if score >= best_score:
        best_score = score
        best_model = label_model
        best_l2 = l2

  0%|          | 0/10 [00:00<?, ?it/s]

In [14]:
# Best macro-averaged recall found by the sweep, measured only on validation
# points where the model did not abstain (prediction != -1).
best_score

0.7506266145733256

In [15]:
# Label both splits with the selected model, then write the predictions to
# disk. The arrays are saved unfiltered, so they may still contain -1 entries.
y_pred_train, y_pred_valid = (
    best_model.predict(matrix) for matrix in (L_train, L_valid)
)
np.save(file="./data/snorkel_labels_train.npy", arr=y_pred_train)
np.save(file="./data/snorkel_labels_valid.npy", arr=y_pred_valid)

In [16]:
# Evaluate only the validation points that received a label (prediction != -1).
# Boolean indexing returns new arrays, so Y_valid itself is left untouched.
has_label = y_pred_valid != -1
y_true = Y_valid[has_label]
y_pred = y_pred_valid[has_label]

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.73      0.75      1201
           1       0.73      0.77      0.75      1170

    accuracy                           0.75      2371
   macro avg       0.75      0.75      0.75      2371
weighted avg       0.75      0.75      0.75      2371

