In [1]:
import pandas as pd
import random
import numpy as np

In [2]:
df = pd.read_csv('./dataset.csv') # GPT generated dataset
df

Unnamed: 0,text,label
0,The cat is sleeping on the couch.,not a question
1,Are you going to the party tonight?,a question
2,I finished reading a book yesterday.,not a question
3,What is your favorite type of cuisine?,a question
4,She went to the store to buy some milk.,not a question
...,...,...
193,She enjoys practicing meditation and mindfulness.,not a question
194,Do you like to go to the museum?,a question
195,He is a construction worker and builds structu...,not a question
196,What is your favorite type of ice cream flavor?,a question


In [3]:
df.describe(include='all')

Unnamed: 0,text,label
count,198,198
unique,181,2
top,What is your favorite TV show?,not a question
freq,2,100


In [4]:
# Data cleaning, removing duplicates
df = df[~df.duplicated()]
df

Unnamed: 0,text,label
0,The cat is sleeping on the couch.,not a question
1,Are you going to the party tonight?,a question
2,I finished reading a book yesterday.,not a question
3,What is your favorite type of cuisine?,a question
4,She went to the store to buy some milk.,not a question
...,...,...
193,She enjoys practicing meditation and mindfulness.,not a question
194,Do you like to go to the museum?,a question
195,He is a construction worker and builds structu...,not a question
196,What is your favorite type of ice cream flavor?,a question


In [5]:
df = df.rename(columns = {'label': 'true_label'})
df

Unnamed: 0,text,true_label
0,The cat is sleeping on the couch.,not a question
1,Are you going to the party tonight?,a question
2,I finished reading a book yesterday.,not a question
3,What is your favorite type of cuisine?,a question
4,She went to the store to buy some milk.,not a question
...,...,...
193,She enjoys practicing meditation and mindfulness.,not a question
194,Do you like to go to the museum?,a question
195,He is a construction worker and builds structu...,not a question
196,What is your favorite type of ice cream flavor?,a question


In [6]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, train_size= 0.5)

print(df_train.shape)
print(df_test.shape)

(90, 2)
(91, 2)


### Labeling_function class
Takes in an input data point and returns a label for that data point based on some heuristic or other rule.
Used to create labeled training data from large amounts of unlabeled data. By creating many different LabelingFunctions and combining their outputs using a generative model.

### PandasLFApplier class
Used to apply labeling functions (LFs) to a pandas DataFrame. It takes a set 
of LFs and a pandas DataFrame as input, and applies the LFs to each row of the DataFrame to produce label matrices. 
The resulting label matrices can be used to train other models.

Has the ability to automatically parallelize LF application across multiple CPU cores, and the ability to apply LFs to subsets of the DataFrame based on the values of certain columns. It also allows for easy integration with other parts of the Snorkel workflow, such as the LabelModel class for model training and the LabelingFunction class for defining LFs.

### LFAnalysis class
A utility class that provides various analysis functions for a set of labeling functions in a Snorkel project. 

In [7]:
from snorkel.labeling import labeling_function, PandasLFApplier, LFAnalysis
import re

# Labels
YES = 1
ABSTAIN = -1

## Creating labeling functions

In [8]:
@labeling_function()
def question_keyword_tokens(x):
    keywords = ['what', 'why', 'when', 'where', 'who', 'how']
    return YES if any(word in x.text.lower() for word in keywords) else ABSTAIN 

In [9]:
@labeling_function()
def question_regex_tokens(x):
    return YES if re.search(r".*?", x.text, flags=re.I) else ABSTAIN

In [10]:
@labeling_function()
def question_regex_are_tokens_(x): 
    return YES if re.search(r"what.*?", x.text.lower(), flags=re.I) else ABSTAIN

## Feed labeling functions into PandasLFApplier

In [11]:
l_functions = [question_keyword_tokens, question_regex_tokens, question_regex_are_tokens_]

applier = PandasLFApplier(lfs = l_functions)

## Create label matrix 

In [12]:
L_train = applier.apply(df=df_train)

100%|█████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 27776.85it/s]


In [13]:
#Print first 5 questions that are labeled with all 3 labeling functions
L_train[:5] 

array([[-1,  1, -1],
       [-1,  1, -1],
       [-1,  1, -1],
       [-1,  1, -1],
       [ 1,  1,  1]])

## Apply label matrix to LFAnalysis

In [14]:
LFAnalysis(L=L_train, lfs = l_functions).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
question_keyword_tokens,0,[1],0.266667,0.266667,0.0
question_regex_tokens,1,[1],1.0,0.266667,0.0
question_regex_are_tokens_,2,[1],0.144444,0.144444,0.0


- `Polarity`: the set of unique labels each labeling function outputs, excluding abstains
- `Coverage`: the fraction of the dataset each labeling function labels
- `Overlaps`: the fraction of the dataset where each labeling function and at least another labeling function label
- `Conflicts`: the fraction of the dataset where each labeling function and at least another labeling function label, and they’re disagree

## Label model
The label model aggregates the labels from labeling functions to produce a final label. 

In [15]:
from snorkel.labeling.model import MajorityLabelVoter, LabelModel

In [16]:
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train = L_train, n_epochs = 500, log_freq=100, seed=123)


INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|                                                                     | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.561]
INFO:root:[100 epochs]: TRAIN:[loss=0.008]
INFO:root:[200 epochs]: TRAIN:[loss=0.000]
INFO:root:[300 epochs]: TRAIN:[loss=0.000]
INFO:root:[400 epochs]: TRAIN:[loss=0.000]
100%|█████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 4349.37epoch/s]
INFO:root:Finished Training


In [17]:
df_train['labels'] = label_model.predict(L = L_train, tie_break_policy="abstain")

In [18]:
df_train

Unnamed: 0,text,true_label,labels
38,He likes to cook dinner for his friends.,not a question,-1
180,Do you like to go to the beach?,a question,-1
8,My favorite color is blue.,not a question,-1
187,She likes to go to the movies and watch films.,not a question,-1
63,What is your favorite type of dessert?,a question,1
...,...,...,...
126,The museum has a new exhibit on display.,not a question,-1
33,Would you like to go for a walk in the park?,a question,-1
70,She likes to read books in her free time.,not a question,-1
116,He is a journalist and writes articles for the...,not a question,-1


In [19]:
# To see if the "majority label voter" performed as expected.
print(L_train.shape)
result = L_train.sum(axis = 1)
pos_numbers = [i for i in result if i > 0]
print(f'Number of positive labels: {len(pos_numbers)}')
print(f'Number of negative labels: {len(L_train) - len(pos_numbers)}')
print(f'Number of real negative (ABSTAIN) labels: {len((df_train[df_train.labels == ABSTAIN]))}')

(90, 3)
Number of positive labels: 24
Number of negative labels: 66
Number of real negative (ABSTAIN) labels: 66


In [20]:
df_train[df_train.labels == ABSTAIN]

Unnamed: 0,text,true_label,labels
38,He likes to cook dinner for his friends.,not a question,-1
180,Do you like to go to the beach?,a question,-1
8,My favorite color is blue.,not a question,-1
187,She likes to go to the movies and watch films.,not a question,-1
163,She enjoys taking photos and capturing moments.,not a question,-1
...,...,...,...
126,The museum has a new exhibit on display.,not a question,-1
33,Would you like to go for a walk in the park?,a question,-1
70,She likes to read books in her free time.,not a question,-1
116,He is a journalist and writes articles for the...,not a question,-1


In [21]:
df2_train = df_train.copy()
df2_train['true_label'] = df_train['true_label'].replace(['not a question', 'a question'],[False, True])
df2_train['labels'] = df_train['labels'].replace([-1, 1],[False, True])
display(df2_train)

Unnamed: 0,text,true_label,labels
38,He likes to cook dinner for his friends.,False,False
180,Do you like to go to the beach?,True,False
8,My favorite color is blue.,False,False
187,She likes to go to the movies and watch films.,False,False
63,What is your favorite type of dessert?,True,True
...,...,...,...
126,The museum has a new exhibit on display.,False,False
33,Would you like to go for a walk in the park?,True,False
70,She likes to read books in her free time.,False,False
116,He is a journalist and writes articles for the...,False,False


In [22]:
# Calculate accuracy of pseudolabel to true label in majority vote
true_labels = df2_train['true_label'].to_numpy()
induced_labels = df2_train['labels'].to_numpy()
accuracy = np.mean(np.logical_not(np.logical_xor(true_labels, induced_labels)))

print(f"accuracy to true label: {accuracy}")

accuracy to true label: 0.8


### Observation:
> Changing the labeling function to take regex `r"what.*?"` (with better coverage) instead of `r"are.*"`,

- Before
|                           lf    |  j | Polarity | Coverage  | Overlaps  | Conflicts |
|---------------------------------|----|----------|-----------|-----------|-----------|
|      question_keyword_tokens    |  0 |      [1] |  0.288889 |  0.288889 |       0.0 |
|           question_regex_tokens |  1 |      [1] |  1.000000 |  0.333333 |       0.0 |
| question_regex_are_tokens_      |  2 |      [1] |  0.044444 |  0.044444 |       0.0 |

- After
|                           lf    |  j | Polarity | Coverage  | Overlaps  | Conflicts |
|---------------------------------|----|----------|-----------|-----------|-----------|
|      question_keyword_tokens    |  0 |      [1] |  0.233333 |  0.233333 |       0.0 |
|           question_regex_tokens |  1 |      [1] |  1.000000 |  0.233333 |       0.0 |
| question_regex_are_tokens_      |  2 |      [1] |  0.144444 |  0.144444 |       0.0 |

> Resulted in an improvement in final labels, with more accurate depiction of ABSTAINED data. 