In [2]:
import pandas as pd
import numpy as np
import os
import re
import snorkel

## Loading data

In [3]:
# load data into pandas
def read_data(dir_path):
    examples = []
    for filename in os.listdir(dir_path):
        if not filename.endswith("txt"):
            continue
        keys = filename.split(".")[0].split("_")
        assert len(keys) == 3
        # keys is [id, label, review_score]. For now we are only interested in the label
        label = keys[1]
        with open(os.path.join(dir_path, filename)) as f:
            text = f.read().strip().replace("<br />", " ")
        examples.append([text, 1 if label == 'pos' else 0])
    return examples

In [4]:
SDEV_DIR = '../imdb-data/sd1600'
sdev_data = read_data(SDEV_DIR)
sdev_df = pd.DataFrame(sdev_data, columns=['text', 'label'])
sdev_df

Unnamed: 0,text,label
0,It's beyond my comprehension that so much rubb...,1
1,Bonjour Tristesse covers similar ground as 'Th...,0
2,I have 2 words for you. Sean Bean. He is the o...,0
3,Big S isn't playing with taboos or forcing an ...,1
4,After seeing this film I complained to my loca...,0
...,...,...
1595,A wonderful early musical film from Rene Clair...,1
1596,"Nominated for the oscar ""worst script ever"" in...",0
1597,"A light-hearted comedy, Nothing shows us a wor...",1
1598,"This movie has its ups and downs, but to me th...",1


In [32]:
TRAIN_DIR = '../imdb-data/og'
train_data = read_data(TRAIN_DIR)
train_df = pd.DataFrame(train_data, columns=['text', 'label'])
train_df

Unnamed: 0,text,label
0,I have to start saying it has been a long time...,1
1,I thought that Mukhsin has been wonderfully wr...,1
2,First of all this was not a three hour movie -...,1
3,I cant understand at all why so many Godzilla ...,0
4,It's beyond my comprehension that so much rubb...,1
...,...,...
24995,This movie is about this wimpy guy who decides...,1
24996,**Warning! Spoilers Ahead!** This short is pa...,1
24997,I really enjoyed The 60's. Not being of that g...,1
24998,"While on a vacation at the beach, red-haired b...",0


## Labeling functions

In [33]:
# define constants to represent class labels
ABSTAIN = -1
POS = 1
NEG = 0

In [34]:
DISTANCE = 8

In [174]:
from snorkel.labeling import labeling_function

negative_inflection_words = ["but", "however", "otherwise"]
neg_adjs = set([
    'bad', 'worst', 'horrible', 'terrible', 'stupid', 'boring', 'dreadful', 'disgust',
    'disturbing', 'problem', 'disaster', 'a waste', 'not a fan'
])
pos_adjs = set([
  'good', 'best', 'great', 'awesome', 'perfect', 'clever', 'charming',
  'fascinating', 'pleasant', 'happy', 'hilarious', 'funny', 'wonderful', 'lovely'
])

pos_words = set(['stunning', 'wonderful', 'finest', 'professional', 'fate','crafted', 
                 'refreshing', 'tremendous', 'technology', 'genuine', 'wonderfully', 'favorites', 
                 'gorgeous', 'captivating', 'poignant', 
                 'segment', 'teaches', 'stayed', 'confronted', 
                 'perfection', 'peace', 'innocence', 'immensely', 'expensive',
                 'develops', 'covered', 'arrives', 'superbly', 'beaten'])

neg_words = set(['pointless', 'poorly', 'laughable', 'waste', 'mediocre',
                 'remotely', 'amateurish', 'drags', 'worst',
                 'blatantly', 'accents', 'garbage', 'terrible', 
                 'awful', 'wasting', 'lowbudget', 'horrible', 'infected',
                 'incomprehensible', 'attack', 'unwatchable', 'painfully',
                 'horrendous', 'forgettable', 'unfunny', 'pack', 'idiotic', 
                 'meaningless', 'zero', 'bland', 'crap', 'dire'])

@labeling_function()
def good(x):
    if re.search(r'not.{0,8}good', x.text.lower()):
        return NEG
    elif 'good' in x.text.lower():
        return POS
    return ABSTAIN

@labeling_function()
def bad(x):
    if re.search(r'not.{0,8}bad', x.text.lower()):
        return ABSTAIN
    elif 'bad' in x.text.lower():
        return NEG
    return ABSTAIN

@labeling_function()
def pos_adj(x):
    text = x.text.lower()

    for word in pos_adjs:
        char_index = text.find(word)
        if char_index != -1:
            substring = text[max(char_index - DISTANCE, 0):char_index]
            if ('not' in substring) or ('n\'t' in substring) :
                continue
            else:
                return POS
    return ABSTAIN


@labeling_function()
def neg_adj(x):
    text = x.text.lower()
    for word in neg_adjs:
        char_index = text.find(word)
        if char_index != -1:
            substring = text[max(char_index - DISTANCE, 0):char_index]
            if ('not' in substring) or ('n\'t' in substring) :
                continue
            else:
                return NEG
    return ABSTAIN


@labeling_function()
def detect_pos_words_from_naive_bayes(x):
    return POS if any(re.search(r'[.!?\-\s]' + word + r'[.!?\-\s]', x.text.lower()) \
                      for word in pos_words) else ABSTAIN
    #return POS if any(word in x.text.lower() for word in pos_words) else ABSTAIN

@labeling_function()
def detect_neg_words_from_naive_bayes(x):
    return NEG if any(re.search(r'[.!?\-\s]' + word + r'[.!?\-\s]', x.text.lower()) \
                      for word in neg_words) else ABSTAIN
    #return NEG if any(word in x.text.lower() for word in neg_words) else ABSTAIN
    
@labeling_function()    
def detect_pos_exclamation(x):
    return POS if all(not re.search(r'[.!?\-\s]' + word + r'[.!?\-\s]', x.text.lower()) for word in neg_words) \
                    and '!' in x.text.lower() else ABSTAIN
@labeling_function()                      
def detect_neg_exclamation(x):
    return NEG if all(not re.search(r'[.!?\-\s]' + word + r'[.!?\-\s]', x.text.lower()) for word in pos_words) \
                    and '!' in x.text.lower() else ABSTAIN

In [175]:
lfs = [bad, neg_adj, 
       detect_pos_words_from_naive_bayes, detect_neg_words_from_naive_bayes,
       detect_pos_exclamation, detect_neg_exclamation]

## Applying and tuning LFs

In [176]:
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis

applier = PandasLFApplier(lfs=lfs)

In [177]:
L_sdev = applier.apply(df=sdev_df)
LFAnalysis(L=L_sdev, lfs=lfs).lf_summary(Y=np.asarray(sdev_df["label"]))









  0%|          | 0/1600 [00:00<?, ?it/s][A[A[A[A[A[A[A[A







  1%|▏         | 22/1600 [00:00<00:07, 215.48it/s][A[A[A[A[A[A[A[A







  3%|▎         | 43/1600 [00:00<00:07, 213.78it/s][A[A[A[A[A[A[A[A







  4%|▍         | 65/1600 [00:00<00:07, 214.45it/s][A[A[A[A[A[A[A[A







  5%|▌         | 86/1600 [00:00<00:07, 211.89it/s][A[A[A[A[A[A[A[A







  7%|▋         | 110/1600 [00:00<00:06, 216.96it/s][A[A[A[A[A[A[A[A







  8%|▊         | 130/1600 [00:00<00:07, 207.47it/s][A[A[A[A[A[A[A[A







  9%|▉         | 151/1600 [00:00<00:06, 207.51it/s][A[A[A[A[A[A[A[A







 11%|█         | 171/1600 [00:00<00:07, 200.47it/s][A[A[A[A[A[A[A[A







 12%|█▏        | 193/1600 [00:00<00:06, 205.04it/s][A[A[A[A[A[A[A[A







 13%|█▎        | 213/1600 [00:01<00:07, 192.66it/s][A[A[A[A[A[A[A[A







 15%|█▍        | 233/1600 [00:01<00:07, 190.78it/s][A[A[A[A[A[A[A[A







 16%|█▌    

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
bad,0,[0],0.23375,0.233125,0.055,268,106,0.716578
neg_adj,1,[0],0.451875,0.39375,0.093125,508,215,0.702628
detect_pos_words_from_naive_bayes,2,[1],0.175625,0.099375,0.06625,254,27,0.903915
detect_neg_words_from_naive_bayes,3,[0],0.321875,0.28,0.024375,456,59,0.885437
detect_pos_exclamation,4,[1],0.198125,0.198125,0.165,241,76,0.760252
detect_neg_exclamation,5,[0],0.26375,0.26375,0.14875,250,172,0.592417


In [178]:
L_train = applier.apply(df=train_df)
LFAnalysis(L=L_train, lfs=lfs).lf_summary()









  0%|          | 0/25000 [00:00<?, ?it/s][A[A[A[A[A[A[A[A







  0%|          | 19/25000 [00:00<02:12, 188.24it/s][A[A[A[A[A[A[A[A







  0%|          | 38/25000 [00:00<02:13, 186.63it/s][A[A[A[A[A[A[A[A







  0%|          | 61/25000 [00:00<02:06, 196.47it/s][A[A[A[A[A[A[A[A







  0%|          | 84/25000 [00:00<02:01, 204.82it/s][A[A[A[A[A[A[A[A







  0%|          | 103/25000 [00:00<02:04, 199.87it/s][A[A[A[A[A[A[A[A







  0%|          | 124/25000 [00:00<02:03, 202.08it/s][A[A[A[A[A[A[A[A







  1%|          | 145/25000 [00:00<02:02, 203.54it/s][A[A[A[A[A[A[A[A







  1%|          | 170/25000 [00:00<01:55, 214.78it/s][A[A[A[A[A[A[A[A







  1%|          | 197/25000 [00:00<01:48, 228.05it/s][A[A[A[A[A[A[A[A







  1%|          | 220/25000 [00:01<01:49, 225.31it/s][A[A[A[A[A[A[A[A







  1%|          | 245/25000 [00:01<01:46, 231.67it/s][A[A[A[A[A[A[A[A








 10%|▉         | 2445/25000 [00:10<02:10, 172.46it/s][A[A[A[A[A[A[A[A







 10%|▉         | 2463/25000 [00:10<02:10, 172.33it/s][A[A[A[A[A[A[A[A







 10%|▉         | 2482/25000 [00:10<02:07, 176.33it/s][A[A[A[A[A[A[A[A







 10%|█         | 2506/25000 [00:10<01:59, 188.98it/s][A[A[A[A[A[A[A[A







 10%|█         | 2540/25000 [00:10<01:43, 217.66it/s][A[A[A[A[A[A[A[A







 10%|█         | 2564/25000 [00:11<01:45, 213.56it/s][A[A[A[A[A[A[A[A







 10%|█         | 2587/25000 [00:11<01:46, 211.33it/s][A[A[A[A[A[A[A[A







 10%|█         | 2610/25000 [00:11<01:45, 212.57it/s][A[A[A[A[A[A[A[A







 11%|█         | 2635/25000 [00:11<01:40, 222.15it/s][A[A[A[A[A[A[A[A







 11%|█         | 2659/25000 [00:11<01:39, 225.31it/s][A[A[A[A[A[A[A[A







 11%|█         | 2683/25000 [00:11<01:37, 228.50it/s][A[A[A[A[A[A[A[A







 11%|█         | 2709/25000 [00:11<01:34, 236.05it/s][A[A[A[A

 19%|█▉        | 4688/25000 [00:20<01:37, 207.45it/s][A[A[A[A[A[A[A[A







 19%|█▉        | 4713/25000 [00:21<01:32, 218.40it/s][A[A[A[A[A[A[A[A







 19%|█▉        | 4737/25000 [00:21<01:30, 223.33it/s][A[A[A[A[A[A[A[A







 19%|█▉        | 4760/25000 [00:21<01:31, 222.38it/s][A[A[A[A[A[A[A[A







 19%|█▉        | 4786/25000 [00:21<01:27, 231.47it/s][A[A[A[A[A[A[A[A







 19%|█▉        | 4811/25000 [00:21<01:25, 235.74it/s][A[A[A[A[A[A[A[A







 19%|█▉        | 4835/25000 [00:21<01:28, 227.26it/s][A[A[A[A[A[A[A[A







 19%|█▉        | 4858/25000 [00:21<01:29, 224.91it/s][A[A[A[A[A[A[A[A







 20%|█▉        | 4881/25000 [00:21<01:29, 226.01it/s][A[A[A[A[A[A[A[A







 20%|█▉        | 4904/25000 [00:21<01:32, 217.58it/s][A[A[A[A[A[A[A[A







 20%|█▉        | 4930/25000 [00:21<01:27, 228.49it/s][A[A[A[A[A[A[A[A







 20%|█▉        | 4954/25000 [00:22<01:28, 227.22it/s][A[A[A[A

 28%|██▊       | 7023/25000 [00:31<01:13, 243.12it/s][A[A[A[A[A[A[A[A







 28%|██▊       | 7049/25000 [00:31<01:12, 247.66it/s][A[A[A[A[A[A[A[A







 28%|██▊       | 7075/25000 [00:31<01:11, 249.77it/s][A[A[A[A[A[A[A[A







 28%|██▊       | 7101/25000 [00:31<01:12, 245.69it/s][A[A[A[A[A[A[A[A







 29%|██▊       | 7126/25000 [00:31<01:16, 233.30it/s][A[A[A[A[A[A[A[A







 29%|██▊       | 7150/25000 [00:31<01:22, 217.36it/s][A[A[A[A[A[A[A[A







 29%|██▊       | 7174/25000 [00:31<01:20, 221.49it/s][A[A[A[A[A[A[A[A







 29%|██▉       | 7198/25000 [00:31<01:19, 224.88it/s][A[A[A[A[A[A[A[A







 29%|██▉       | 7225/25000 [00:32<01:15, 235.81it/s][A[A[A[A[A[A[A[A







 29%|██▉       | 7251/25000 [00:32<01:14, 239.28it/s][A[A[A[A[A[A[A[A







 29%|██▉       | 7279/25000 [00:32<01:11, 248.63it/s][A[A[A[A[A[A[A[A







 29%|██▉       | 7305/25000 [00:32<01:13, 240.54it/s][A[A[A[A

 38%|███▊      | 9431/25000 [00:41<01:07, 231.57it/s][A[A[A[A[A[A[A[A







 38%|███▊      | 9455/25000 [00:41<01:08, 226.68it/s][A[A[A[A[A[A[A[A







 38%|███▊      | 9480/25000 [00:41<01:06, 231.84it/s][A[A[A[A[A[A[A[A







 38%|███▊      | 9504/25000 [00:41<01:08, 225.33it/s][A[A[A[A[A[A[A[A







 38%|███▊      | 9530/25000 [00:41<01:05, 234.50it/s][A[A[A[A[A[A[A[A







 38%|███▊      | 9554/25000 [00:41<01:07, 229.23it/s][A[A[A[A[A[A[A[A







 38%|███▊      | 9578/25000 [00:42<01:06, 232.18it/s][A[A[A[A[A[A[A[A







 38%|███▊      | 9602/25000 [00:42<01:06, 231.35it/s][A[A[A[A[A[A[A[A







 39%|███▊      | 9629/25000 [00:42<01:03, 241.61it/s][A[A[A[A[A[A[A[A







 39%|███▊      | 9654/25000 [00:42<01:06, 232.36it/s][A[A[A[A[A[A[A[A







 39%|███▊      | 9678/25000 [00:42<01:09, 221.26it/s][A[A[A[A[A[A[A[A







 39%|███▉      | 9706/25000 [00:42<01:06, 231.04it/s][A[A[A[A

 47%|████▋     | 11784/25000 [00:51<01:01, 216.25it/s][A[A[A[A[A[A[A[A







 47%|████▋     | 11806/25000 [00:51<01:04, 206.04it/s][A[A[A[A[A[A[A[A







 47%|████▋     | 11827/25000 [00:51<01:05, 202.08it/s][A[A[A[A[A[A[A[A







 47%|████▋     | 11848/25000 [00:51<01:06, 198.09it/s][A[A[A[A[A[A[A[A







 47%|████▋     | 11873/25000 [00:51<01:02, 209.41it/s][A[A[A[A[A[A[A[A







 48%|████▊     | 11898/25000 [00:52<00:59, 219.35it/s][A[A[A[A[A[A[A[A







 48%|████▊     | 11923/25000 [00:52<00:57, 226.07it/s][A[A[A[A[A[A[A[A







 48%|████▊     | 11946/25000 [00:52<00:57, 227.06it/s][A[A[A[A[A[A[A[A







 48%|████▊     | 11972/25000 [00:52<00:55, 234.57it/s][A[A[A[A[A[A[A[A







 48%|████▊     | 11998/25000 [00:52<00:54, 239.82it/s][A[A[A[A[A[A[A[A







 48%|████▊     | 12023/25000 [00:52<00:55, 235.61it/s][A[A[A[A[A[A[A[A







 48%|████▊     | 12047/25000 [00:52<00:59, 215.96it/s]

 56%|█████▋    | 14101/25000 [01:01<00:46, 236.31it/s][A[A[A[A[A[A[A[A







 57%|█████▋    | 14130/25000 [01:01<00:43, 248.39it/s][A[A[A[A[A[A[A[A







 57%|█████▋    | 14156/25000 [01:02<00:43, 250.93it/s][A[A[A[A[A[A[A[A







 57%|█████▋    | 14182/25000 [01:02<00:42, 253.03it/s][A[A[A[A[A[A[A[A







 57%|█████▋    | 14208/25000 [01:02<00:43, 248.67it/s][A[A[A[A[A[A[A[A







 57%|█████▋    | 14234/25000 [01:02<00:44, 242.61it/s][A[A[A[A[A[A[A[A







 57%|█████▋    | 14259/25000 [01:02<00:47, 228.26it/s][A[A[A[A[A[A[A[A







 57%|█████▋    | 14283/25000 [01:02<00:49, 218.67it/s][A[A[A[A[A[A[A[A







 57%|█████▋    | 14306/25000 [01:02<00:50, 210.59it/s][A[A[A[A[A[A[A[A







 57%|█████▋    | 14328/25000 [01:02<00:52, 204.98it/s][A[A[A[A[A[A[A[A







 57%|█████▋    | 14349/25000 [01:02<00:52, 204.16it/s][A[A[A[A[A[A[A[A







 57%|█████▋    | 14370/25000 [01:03<00:55, 191.36it/s]

 64%|██████▍   | 15978/25000 [01:12<00:47, 189.67it/s][A[A[A[A[A[A[A[A







 64%|██████▍   | 15998/25000 [01:12<00:49, 180.87it/s][A[A[A[A[A[A[A[A







 64%|██████▍   | 16020/25000 [01:12<00:47, 191.00it/s][A[A[A[A[A[A[A[A







 64%|██████▍   | 16042/25000 [01:12<00:45, 196.13it/s][A[A[A[A[A[A[A[A







 64%|██████▍   | 16070/25000 [01:12<00:41, 214.01it/s][A[A[A[A[A[A[A[A







 64%|██████▍   | 16094/25000 [01:13<00:40, 218.54it/s][A[A[A[A[A[A[A[A







 64%|██████▍   | 16117/25000 [01:13<00:50, 176.15it/s][A[A[A[A[A[A[A[A







 65%|██████▍   | 16137/25000 [01:13<00:53, 164.58it/s][A[A[A[A[A[A[A[A







 65%|██████▍   | 16163/25000 [01:13<00:48, 183.62it/s][A[A[A[A[A[A[A[A







 65%|██████▍   | 16183/25000 [01:13<00:47, 184.14it/s][A[A[A[A[A[A[A[A







 65%|██████▍   | 16203/25000 [01:13<00:49, 178.61it/s][A[A[A[A[A[A[A[A







 65%|██████▍   | 16228/25000 [01:13<00:45, 194.37it/s]

 73%|███████▎  | 18127/25000 [01:23<00:26, 258.06it/s][A[A[A[A[A[A[A[A







 73%|███████▎  | 18155/25000 [01:23<00:25, 264.20it/s][A[A[A[A[A[A[A[A







 73%|███████▎  | 18183/25000 [01:23<00:26, 259.72it/s][A[A[A[A[A[A[A[A







 73%|███████▎  | 18214/25000 [01:23<00:25, 267.26it/s][A[A[A[A[A[A[A[A







 73%|███████▎  | 18242/25000 [01:23<00:26, 257.95it/s][A[A[A[A[A[A[A[A







 73%|███████▎  | 18273/25000 [01:24<00:24, 271.61it/s][A[A[A[A[A[A[A[A







 73%|███████▎  | 18301/25000 [01:24<00:24, 268.67it/s][A[A[A[A[A[A[A[A







 73%|███████▎  | 18329/25000 [01:24<00:26, 256.14it/s][A[A[A[A[A[A[A[A







 73%|███████▎  | 18355/25000 [01:24<00:26, 253.47it/s][A[A[A[A[A[A[A[A







 74%|███████▎  | 18381/25000 [01:24<00:26, 250.95it/s][A[A[A[A[A[A[A[A







 74%|███████▎  | 18410/25000 [01:24<00:25, 260.73it/s][A[A[A[A[A[A[A[A







 74%|███████▎  | 18437/25000 [01:24<00:25, 253.06it/s]

 82%|████████▏ | 20574/25000 [01:34<00:22, 194.31it/s][A[A[A[A[A[A[A[A







 82%|████████▏ | 20594/25000 [01:34<00:22, 192.47it/s][A[A[A[A[A[A[A[A







 82%|████████▏ | 20614/25000 [01:34<00:24, 179.18it/s][A[A[A[A[A[A[A[A







 83%|████████▎ | 20633/25000 [01:34<00:25, 173.85it/s][A[A[A[A[A[A[A[A







 83%|████████▎ | 20658/25000 [01:34<00:22, 190.61it/s][A[A[A[A[A[A[A[A







 83%|████████▎ | 20688/25000 [01:34<00:20, 211.42it/s][A[A[A[A[A[A[A[A







 83%|████████▎ | 20711/25000 [01:34<00:19, 214.69it/s][A[A[A[A[A[A[A[A







 83%|████████▎ | 20734/25000 [01:35<00:20, 209.43it/s][A[A[A[A[A[A[A[A







 83%|████████▎ | 20756/25000 [01:35<00:21, 200.39it/s][A[A[A[A[A[A[A[A







 83%|████████▎ | 20778/25000 [01:35<00:20, 204.29it/s][A[A[A[A[A[A[A[A







 83%|████████▎ | 20804/25000 [01:35<00:19, 216.45it/s][A[A[A[A[A[A[A[A







 83%|████████▎ | 20830/25000 [01:35<00:18, 227.79it/s]

 91%|█████████ | 22746/25000 [01:44<00:10, 205.08it/s][A[A[A[A[A[A[A[A







 91%|█████████ | 22768/25000 [01:44<00:10, 208.54it/s][A[A[A[A[A[A[A[A







 91%|█████████ | 22789/25000 [01:44<00:10, 207.39it/s][A[A[A[A[A[A[A[A







 91%|█████████ | 22810/25000 [01:44<00:10, 207.66it/s][A[A[A[A[A[A[A[A







 91%|█████████▏| 22832/25000 [01:45<00:10, 210.99it/s][A[A[A[A[A[A[A[A







 91%|█████████▏| 22856/25000 [01:45<00:09, 218.51it/s][A[A[A[A[A[A[A[A







 92%|█████████▏| 22879/25000 [01:45<00:09, 220.15it/s][A[A[A[A[A[A[A[A







 92%|█████████▏| 22902/25000 [01:45<00:09, 216.30it/s][A[A[A[A[A[A[A[A







 92%|█████████▏| 22924/25000 [01:45<00:09, 213.60it/s][A[A[A[A[A[A[A[A







 92%|█████████▏| 22946/25000 [01:45<00:09, 213.57it/s][A[A[A[A[A[A[A[A







 92%|█████████▏| 22968/25000 [01:45<00:09, 211.73it/s][A[A[A[A[A[A[A[A







 92%|█████████▏| 22990/25000 [01:45<00:09, 212.99it/s]

100%|█████████▉| 24905/25000 [01:54<00:00, 200.76it/s][A[A[A[A[A[A[A[A







100%|█████████▉| 24926/25000 [01:54<00:00, 199.56it/s][A[A[A[A[A[A[A[A







100%|█████████▉| 24948/25000 [01:55<00:00, 204.24it/s][A[A[A[A[A[A[A[A







100%|█████████▉| 24970/25000 [01:55<00:00, 207.73it/s][A[A[A[A[A[A[A[A







100%|██████████| 25000/25000 [01:55<00:00, 216.86it/s][A[A[A[A[A[A[A[A


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
bad,0,[0],0.23448,0.23296,0.068
neg_adj,1,[0],0.44492,0.39444,0.12612
detect_pos_words_from_naive_bayes,2,[1],0.19572,0.12708,0.09228
detect_neg_words_from_naive_bayes,3,[0],0.32976,0.28624,0.05192
detect_pos_exclamation,4,[1],0.21528,0.21528,0.18048
detect_neg_exclamation,5,[0],0.27088,0.27088,0.16448


## Testing Majority Vote (baseline) on sdev set

In [179]:
from snorkel.labeling import MajorityLabelVoter

majority_model = MajorityLabelVoter()
majority_acc = majority_model.score(L=L_sdev, Y=np.asarray(sdev_df["label"]))["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")



Majority Vote Accuracy:   79.6%


In [180]:
majority_model_test_acc = majority_model.score(L=L_train, Y=np.asarray(train_df["label"]))["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_model_test_acc * 100:.1f}%")



Majority Vote Accuracy:   76.2%


## Training Generative Model on train set and testing it on sdev set

In [181]:
from snorkel.labeling import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, lr=0.001, log_freq=100, seed=123)

In [182]:
label_model_acc = label_model.score(L=L_sdev, Y=np.asarray(sdev_df["label"]))["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")



Label Model Accuracy:     75.4%


In [183]:
label_model_test_acc = label_model.score(L=L_train, Y=np.asarray(train_df["label"]))["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_test_acc * 100:.1f}%")



Label Model Accuracy:     73.4%
