In [1]:
import pandas as pd
import numpy as np
import os
import re

## Loading data

In [2]:
# load data into pandas
def read_data(dir_path):
    examples = []
    for filename in os.listdir(dir_path):
        if not filename.endswith("txt"):
            continue
        keys = filename.split(".")[0].split("_")
        assert len(keys) == 3
        # keys is [id, label, review_score]. For now we are only interested in the label
        label = keys[1]
        with open(os.path.join(dir_path, filename)) as f:
            text = f.read().strip().replace("<br />", " ")
        examples.append([text, 1 if label == 'pos' else 0])
    return examples

In [3]:
DEV_SIZE = 800

In [4]:
SDEV_DIR = '../imdb-data/sd{}'.format(DEV_SIZE)
sdev_data = read_data(SDEV_DIR)
sdev_df = pd.DataFrame(sdev_data, columns=['text', 'label'])
sdev_df

Unnamed: 0,text,label
0,It's beyond my comprehension that so much rubb...,1
1,'What I Like About You' is definitely a show t...,1
2,"ELVIRA, MISTRESS OF THE DARK (1988) directed ...",1
3,This is one of the most spiritual movies I hav...,1
4,"Man With the Gun is pretty much forgotten now,...",1
...,...,...
795,Police Story is one of Jackie Chan's classic f...,1
796,If you need cheering up on a cold weekday even...,1
797,A wonderful early musical film from Rene Clair...,1
798,"A light-hearted comedy, Nothing shows us a wor...",1


In [5]:
from sklearn.model_selection import train_test_split

TRAIN_DIR = '../imdb-data/og'
train_data = read_data(TRAIN_DIR)

all_train_df = pd.DataFrame(train_data, columns=['text', 'label'])
train_df = pd.concat([all_train_df, sdev_df]).drop_duplicates(keep=False)
train_df

Unnamed: 0,text,label
0,I have to start saying it has been a long time...,1
1,I thought that Mukhsin has been wonderfully wr...,1
2,First of all this was not a three hour movie -...,1
3,I cant understand at all why so many Godzilla ...,0
5,"""Hatred of a Minute"" is arguably one of the be...",1
...,...,...
24994,I agree with other users comments in that the ...,0
24995,This movie is about this wimpy guy who decides...,1
24997,I really enjoyed The 60's. Not being of that g...,1
24998,"While on a vacation at the beach, red-haired b...",0


## Get most indicative words using Naive Bayes on sdevset

In [6]:
from naive_bayes.imdb import return_keywords_indices

all_words, most_pos_indices, most_neg_indices = return_keywords_indices(sdev_data)

Size of dictionary:  3064


In [7]:
NUM_KEYWORDS = 50

top_pos_words = [all_words[ind] for ind in most_pos_indices[:NUM_KEYWORDS]]
top_neg_words = [all_words[ind] for ind in most_neg_indices[:NUM_KEYWORDS]]

In [8]:
print('Top pos words:')
print(top_pos_words)
print('\nTop neg words:')
print(top_neg_words)

# Remember to post-process these by removing those words that are correlational but not causal, e.g. 'germany'

Top pos words:
['eddie', 'jackie', 'douglas', 'everyday', 'stands', 'cold', 'ned', 'wonderful', 'built', 'wonderfully', 'spy', 'thrillers', 'professional', 'finest', 'gorgeous', 'wall', 'daniel', 'delightful', 'saturday', 'genuine', 'lonely', 'captures', 'glory', 'adapted', 'quiet', 'california', 'develops', 'captivating', 'counter', 'lucky', 'uncut', 'studios', 'strength', 'stunning', 'paris', 'flight', 'gordon', 'sentimental', 'melodrama', 'portrait', 'lovers', 'mrs', 'underground', 'critical', 'covered', 'brilliantly', 'chan', 'captured', 'builds', 'delight']

Top neg words:
['terrible', 'disappointing', 'waste', 'disappointment', 'villains', 'darkness', 'lowbudget', '310', 'ridiculous', 'nonexistent', 'pointless', 'joe', 'mess', 'toilet', 'amateur', 'remotely', 'forgettable', 'juvenile', 'ape', 'incomprehensible', 'garbage', 'horrendous', 'zero', 'unintentional', 'outer', 'cheated', 'offended', 'promise', 'ugly', 'wasted', 'atrocious', 'gotta', 'realized', 'unwatchable', 'tarzan', 

In [9]:
pos_words = set(['eddie', 'jackie', 'everyday', 'douglas', 'cold', 
                 'stands', 'ned', 'wonderful', 'spy', 'built', 
                 'wonderfully', 'wall', 'daniel', 'finest', 'thrillers',
                 'gorgeous', 'delightful', 'professional', 'lonely', 'genuine',
                 'captures', 'saturday', 'glory', 'adapted', 'counter', 'captivating',
                 'california', 'develops', 'lucky', 'strength', 'studios', 'uncut',
                 'quiet', 'stunning', 'paris', 'flight', 'gordon', 'underground',
                 'builds', 'brilliantly', 'captured', 'mrs', 'covered', 'portrait',
                 'delight', 'lovers', 'critical', 'intensity', 'melodrama', 'sentimental'])

neg_words = set(['terrible', 'disappointing', 'waste', 'lowbudget', '310', 'darkness',
                 'villains', 'disappointment', 'nonexistent', 'ridiculous', 'pointless',
                 'joe', 'mess', 'ape', 'horrendous', 'forgettable', 'juvenile', 'garbage',
                 'amateur', 'incomprehensible', 'remotely', 'toilet', 'zero', 'offended',
                 'gotta', 'unwatchable', 'realized', 'unintentional', 'outer', 'cheated',
                 'wasted', 'atrocious', 'ugly', 'promise', 'tarzan', 'awful', 'jane',
                 'looked', 'law', 'worst', 'pretentious', 'subjected', 'secondly',
                 'amazed', 'blatantly', 'riding', 'employed', 'woody', 'plague', 'incoherent'])

## Labeling functions

In [10]:
# define constants to represent class labels
ABSTAIN = -1
POS = 1
NEG = 0

In [15]:
from snorkel.labeling import labeling_function

@labeling_function()
def score_keywords(x):
    text = x.text.lower()
    pos_word_score = 1.0
    neg_word_score = -1.0
    score = 0
    for word in pos_words:
        if re.search(r'[.!?\-\s]' + word + r'[.!?\-\s]', text):
            score += pos_word_score
    for word in neg_words:
        if re.search(r'[.!?\-\s]' + word + r'[.!?\-\s]', text):
            score += neg_word_score
    if score > 0:
        return POS
    elif score < 0:
        return NEG
    return ABSTAIN

## Applying and tuning LFs

In [16]:
lfs = [score_keywords]

In [17]:
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis

applier = PandasLFApplier(lfs=lfs)

In [18]:
L_sdev = applier.apply(df=sdev_df)
LFAnalysis(L=L_sdev, lfs=lfs).lf_summary(Y=np.asarray(sdev_df["label"]))

100%|██████████| 800/800 [00:02<00:00, 373.97it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
score_keywords,0,"[0, 1]",0.5825,0.0,0.0,450,16,0.965665


In [19]:
L_train = applier.apply(df=train_df)
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

100%|██████████| 24016/24016 [00:55<00:00, 434.73it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
score_keywords,0,"[0, 1]",0.520778,0.0,0.0


## Use Majority Vote on sdev set (given there is only one lf)

In [24]:
from snorkel.labeling import MajorityLabelVoter

majority_model = MajorityLabelVoter()
majority_acc = majority_model.score(L=L_sdev, Y=np.asarray(sdev_df["label"]))["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")



Majority Vote Accuracy:   96.6%


In [25]:
# Attention: this part is pseudo-cheating!

majority_model_test_perf = majority_model.score(L=L_train, Y=np.asarray(train_df["label"]))
print(majority_model_test_perf)



{'accuracy': 0.7792436235708003}


## Filtering out unlabeled data points

In [26]:
from snorkel.labeling import filter_unlabeled_dataframe

In [27]:
model = majority_model

### Filter

In [28]:
probs_train = model.predict_proba(L=L_train)

df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=train_df, y=probs_train, L=L_train
)

## Export dataframe

In [29]:
df_train_filtered

Unnamed: 0,text,label
0,I have to start saying it has been a long time...,1
1,I thought that Mukhsin has been wonderfully wr...,1
3,I cant understand at all why so many Godzilla ...,0
7,Pretty awful but watchable and entertaining. I...,0
9,Cannot believe my eyes when read quite a bunch...,1
...,...,...
24983,"As with all of Angelopoulos' films, ""The Suspe...",1
24989,SPOILER ALERT In this generic and forgettable ...,0
24992,The 1930s saw a vogue for documentary films ab...,0
24993,I managed to catch a late night double feature...,1


In [30]:
np.sum(np.asarray(df_train_filtered['label']))

5388

In [31]:
export_path = '../imdb-data/wd-n.csv'
df_train_filtered.to_csv(export_path, header=False, index=False)