In [1]:
import json
import time
import cohere
import numpy as np
import pandas as pd
from tqdm.auto import tqdm, trange
from sklearn.metrics import pairwise
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from snorkel.labeling.model import LabelModel

In [2]:
# with open("./data/access_tokens.json") as f:
#     cohere_key = json.load(f)["cohere"]["api_key"]

In [3]:
# cohere_client = cohere.Client(cohere_key)

In [4]:
# train = pd.read_csv("./data/train_data.csv", encoding="utf-8")[["statement"]]
# train.head()

In [5]:
# train_text = train.statement.to_list()

In [6]:
# valid = pd.read_csv("./data/valid_data.csv", encoding="utf-8")[["statement"]]
# valid.head()

In [7]:
# valid_text = valid.statement.to_list()

In [8]:
# def embed_text(texts, batch_size=100, model_name="embed-english-v2.0"):
#     embeds = []
#     for i in trange(0, len(texts), batch_size):
#         batch = texts[i : i + batch_size]
#         response = cohere_client.embed(texts=batch, model=model_name)
#         embeds.extend(response.embeddings)
#         time.sleep(30)
#     return embeds

In [9]:
# batch_size = 200
# cohere_model_name = "embed-english-v2.0"
# train_features = embed_text(train_text, batch_size, cohere_model_name)
# valid_features = embed_text(valid_text, batch_size, cohere_model_name)

In [10]:
# train_features = np.array(train_features)
# valid_features = np.array(valid_features)

In [11]:
# np.save("./data/train_features.npy", train_features)
# np.save("./data/valid_features.npy", valid_features)

In [12]:
# Load the original (pre-expansion) labeling-function label matrices.
L_train, L_valid = (
    np.load("./data/L_train.npy"),
    np.load("./data/L_valid.npy"),
)

In [13]:
# Load the precomputed embedding features for the train and validation splits.
# NOTE(review): these were originally described as llama-2-7b features, but the
# commented-out cells above save Cohere "embed-english-v2.0" embeddings to these
# same paths — confirm which model actually produced the files.
train_features, valid_features = (
    np.load("./data/train_features.npy"),
    np.load("./data/valid_features.npy"),
)

In [14]:
# Show the shape of each feature matrix.
tuple(features.shape for features in (train_features, valid_features))

((12170, 4096), (3042, 4096))

In [15]:
# One similarity threshold per labeling function (i.e. per column of L_train).
num_lfs = np.shape(L_train)[-1]
thresholds = [0.85 for _ in range(num_lfs)]
print(f"Labeling Functions Thresholds:\n{thresholds}")

Labeling Functions Thresholds:
[0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85]


In [16]:
# Define the Liger object, which expands labeling-function votes to abstaining points via embedding similarity


class Liger:
    """Nearest-neighbour expansion of binary labeling-function votes.

    Each column of a label matrix holds the votes of one labeling function
    (LF), encoded as ``1`` (positive), ``0`` (negative) or ``-1`` (abstain).
    For every point on which an LF abstains, the LF's vote is copied from the
    most similar training point that the LF did label, provided that the
    cosine similarity between the two embeddings exceeds the LF's threshold.
    """

    POSITIVE = 1
    NEGATIVE = 0
    ABSTAIN = -1

    def __init__(self):
        pass

    @staticmethod
    def _closest_similarity(similarity, rows, cols):
        """Return, for each of ``rows``, its highest similarity to any of ``cols``.

        When ``cols`` is empty (the LF never cast that vote on the training
        set) every entry is ``-inf``, so the missing class can never win the
        comparison nor pass a threshold. ``-inf`` is used rather than ``-1``
        because ``-1`` is itself a valid cosine similarity.
        """
        if cols.size == 0:
            return np.full(rows.shape, -np.inf)
        # np.ix_ gathers the (rows x cols) sub-matrix in a single step, without
        # first materialising a copy of every selected row.
        return similarity[np.ix_(rows, cols)].max(axis=1)

    def expand_lfs(self, L_train, L_mat, train_embs, mat_embs, thresholds):
        """Fill abstains in ``L_mat`` using votes of similar training points.

        Parameters
        ----------
        L_train : ndarray, indexed as ``[train_point, lf]``
            Label matrix of the training set; supplies the labeled "support"
            points of each LF.
        L_mat : ndarray, indexed as ``[point, lf]``
            Label matrix to expand. It is not modified.
        train_embs : array-like
            Embeddings of the training points, row-aligned with ``L_train``.
        mat_embs : array-like
            Embeddings of the points to expand, row-aligned with ``L_mat``.
        thresholds : sequence of float, or a single float
            Minimum cosine similarity (strictly exceeded) required for an LF
            to adopt a neighbour's vote; one value per LF. A scalar is applied
            to every LF.

        Returns
        -------
        ndarray
            A copy of ``L_mat`` in which some abstains were replaced by ``1``
            or ``0``. Non-abstain entries are left untouched. An abstain is
            kept when neither class is strictly closer than the other or when
            the winning similarity does not exceed the threshold.
        """
        num_lfs = L_mat.shape[1]
        if np.ndim(thresholds) == 0:
            # A single number means "same threshold for every LF".
            thresholds = [thresholds] * num_lfs

        expanded_L_mat = np.copy(L_mat)

        # similarity[r, c]: cosine similarity between mat point r and train point c.
        similarity = pairwise.cosine_similarity(mat_embs, train_embs)

        # Handle one LF at a time so that only that LF's sub-matrices are alive.
        for lf in range(num_lfs):
            train_votes = L_train[:, lf]
            support_pos = np.flatnonzero(train_votes == self.POSITIVE)
            support_neg = np.flatnonzero(train_votes == self.NEGATIVE)
            abstains = np.flatnonzero(L_mat[:, lf] == self.ABSTAIN)

            closest_pos = self._closest_similarity(similarity, abstains, support_pos)
            closest_neg = self._closest_similarity(similarity, abstains, support_neg)

            cutoff = thresholds[lf]
            new_pos = (closest_pos > closest_neg) & (closest_pos > cutoff)
            new_neg = (closest_neg > closest_pos) & (closest_neg > cutoff)

            expanded_L_mat[abstains[new_pos], lf] = self.POSITIVE
            expanded_L_mat[abstains[new_neg], lf] = self.NEGATIVE
        return expanded_L_mat

In [17]:
liger = Liger()

# Expand the training label matrix, using its own labeled points as support.
L_train_expanded = liger.expand_lfs(
    L_train=L_train,
    L_mat=L_train,
    train_embs=train_features,
    mat_embs=train_features,
    thresholds=thresholds,
)

# Expand the validation label matrix, using the training points as support.
L_valid_expanded = liger.expand_lfs(
    L_train=L_train,
    L_mat=L_valid,
    train_embs=train_features,
    mat_embs=valid_features,
    thresholds=thresholds,
)

In [18]:
# Persist the expanded label matrices next to the original data files.
np.save(file="./data/L_train_expanded.npy", arr=L_train_expanded)
np.save(file="./data/L_valid_expanded.npy", arr=L_valid_expanded)