In [85]:
from paper_reader import PaperDatasetReader

dataset_reader = PaperDatasetReader()
print("hello")
train_data = dataset_reader.read('data/papers/unlabeled_train.csv')
print("checkpoint")
dev_data = dataset_reader.read('data/papers/labeled_dev.csv')

test_data = dev_data[:35]
dev_data = dev_data[35:]

data = train_data + dev_data + test_data
print("done")

71it [00:00, 2676.07it/s]
91it [00:00, 5751.77it/s]

hello
data/papers/unlabeled_train.csv
checkpoint
data/papers/labeled_dev.csv
done





In [86]:
from wiser.viewer import Viewer
Viewer(test_data, height=120)

<IPython.core.display.Javascript object>

Viewer(html='<head>\n<style>\nspan.active {\n    background-color: skyblue;\n    box-shadow: 1px 1px 1px grey;…

In [87]:
from wiser.rules import TaggingRule
from wiser.rules import DictionaryMatcher

In [88]:
increase_keywords = [['enrichment'], ['increase'], ['increases'], ['increased'],
                     ['increasing'], ['rising'], ['ample'], ['stimulatory'],
                     ['amplified'], ['higher'], ['intensify'], ['elevated']]
                  
tr = DictionaryMatcher("IncreaseKeywords", 
                       terms=increase_keywords, 
                       i_label="I-INCREASE", 
                       uncased=True)
tr.apply(data)

HBox(children=(IntProgress(value=0, max=162), HTML(value='')))




In [89]:
decrease_keywords = [['decrease'], ['decreased'], ['falling'], 
                     ['reduce'], ['reduced'], ['decline'], ['declines']]
                  
tr = DictionaryMatcher("DecreaseKeywords", 
                       terms=decrease_keywords, 
                       i_label="I-DECREASE",
                       uncased=True)
tr.apply(data)

HBox(children=(IntProgress(value=0, max=162), HTML(value='')))




In [90]:
negation_keywords = [['negatively'], ['not'], ['inversely']]
                  
tr = DictionaryMatcher("NegationKeywords",
                       terms=negation_keywords, 
                       i_label="I-NEGATION", 
                       uncased=True)
tr.apply(data)

HBox(children=(IntProgress(value=0, max=162), HTML(value='')))




In [91]:
cause_keywords = [['driven'], ['resulting'], ['cause'], ['causes'], ['caused'], ['shown'], 
                  ['led'], ['may'], ['considered'], ['sustained']]
                  
tr = DictionaryMatcher("CauseKeywords",
                       terms=cause_keywords, 
                       i_label="I-cause", 
                       uncased=True)
tr.apply(data)

HBox(children=(IntProgress(value=0, max=162), HTML(value='')))




In [92]:
and_keywords = [['and']]
                  
tr = DictionaryMatcher("AndKeywords",
                       terms=and_keywords, 
                       i_label="I-AND", 
                       uncased=True)
tr.apply(data)

HBox(children=(IntProgress(value=0, max=162), HTML(value='')))




In [93]:
non_entity_punctuation_chars = {'.', ';', '(', ')'}

tr = DictionaryMatcher("Non-EntityPunctuation", 
                       terms=non_entity_punctuation_chars, 
                       i_label="O")
tr.apply(data)

HBox(children=(IntProgress(value=0, max=162), HTML(value='')))




In [94]:
import spacy

nlp = spacy.load("en_core_web_sm")
tagger = nlp.create_pipe("tagger")

potential_variables = {'NOUN', 'PROPN'}

class VariableWords(TaggingRule):
    
    def apply_instance(self, instance):

        tokens = [t.text for t in instance['tokens']]
        increase_tokens = [t for t in instance['WISER_LABELS']['IncreaseKeywords']]
        decrease_tokens = [t for t in instance['WISER_LABELS']['DecreaseKeywords']]
        
        def near_change_word(position):
            for i in range(position-2, position+3):
                if i < 0 or i >= len(tokens):
                    continue
                if increase_tokens[i] != 'ABS' or decrease_tokens[i] != 'ABS':
                    return True
        
        # We obtain the parts-of-speech from SpaCy
        parts_of_speech = [token[0].pos_ for token in nlp.pipe(tokens)]        
        labels = ['ABS'] * len(tokens)

        for i, (token, pos) in enumerate(zip(tokens, parts_of_speech)):
            if pos in potential_variables and near_change_word(i):
                labels[i] = 'I-VARIABLE'
                
        return labels

tr = VariableWords()
tr.apply(data)

HBox(children=(IntProgress(value=0, max=162), HTML(value='')))




In [101]:
from wiser.eval import score_tagging_rules
score_tagging_rules(dev_data)

Unnamed: 0,TP,FP,FN,Token Acc.,Token Votes
AndKeywords,0,54,112,,54
CauseKeywords,0,8,112,,8
DecreaseKeywords,5,4,107,0.5556,9
IncreaseKeywords,10,12,102,0.4545,22
NegationKeywords,0,2,112,,2
Non-EntityPunctuation,0,0,112,0.9524,84
VariableWords,0,33,112,0.2826,46


In [102]:
from wiser.rules import LinkingRule

In [103]:
common_prepositions = {'a', 'in', 'the', 'at', 'with', 'of', 'by', '&', 'with'}

class CommonPrepositions(LinkingRule):

    def apply_instance(self, instance):
        tokens = [t.text for t in instance['tokens']]
        links = [0] * len(tokens)
        
        for i in range(1, len(tokens)-1):
            if tokens[i] in common_prepositions:
                links[i] = 1
                links[i-1] = 1
                links[i+1] = 1
        return links

lr = CommonPrepositions()
lr.apply(data)

HBox(children=(IntProgress(value=0, max=162), HTML(value='')))




In [104]:
from wiser.rules import ElmoLinkingRule

In [105]:
# Links tokens whose cosine similarity is larger than 0.8
lr = ElmoLinkingRule(0.8)
lr.apply(data)

HBox(children=(IntProgress(value=0, max=162), HTML(value='')))




In [110]:
linkers = {':', ';', '-'}

class PunctuationLinkers(LinkingRule):

    def apply_instance(self, instance):
        tokens = [t.text for t in instance['tokens']]
        links = [0] * len(tokens)
        
        for i in range(1, len(tokens)-1):
            if tokens[i] in linkers:
                
                # The linking punctuation character and it's succeeding character
                # share the same tag as the preceding one at index "i-1"
                links[i] = 1
                links[i+1] = 1
        return links

lr = PunctuationLinkers()
lr.apply(data)

HBox(children=(IntProgress(value=0, max=162), HTML(value='')))




In [111]:
contraction_suffixes = {'\'s', '\'nt', '\'ve', '\'', '\'d'}

class Contractions(LinkingRule):

    def apply_instance(self, instance):
        tokens = [t.text for t in instance['tokens']]
        links = [0] * len(tokens)
        
        for i in range(1, len(tokens)):
            if tokens[i] in contraction_suffixes:
                links[i] = 1
        return links

lr = Contractions()
lr.apply(data)

HBox(children=(IntProgress(value=0, max=162), HTML(value='')))




In [112]:
class ConsecutiveCapitals(LinkingRule):
    
    def apply_instance(self, instance):
        tokens = [t.text for t in instance['tokens']]
        links = [0] * len(tokens)
        
        for i in range(1, len(tokens)):
            if tokens[i].istitle() and tokens[i-1].istitle():
                links[i] = 1 # token at index "i" shares tag with token at index "i-1"
        return links

lr = ConsecutiveCapitals()
lr.apply(data)

HBox(children=(IntProgress(value=0, max=162), HTML(value='')))




In [113]:
from wiser.eval import score_linking_rules
score_linking_rules(train_data)

Unnamed: 0,Entity Links,Non-Entity Links,Incorrect Links,Accuracy
CommonPrepositions,0,1046,0,1.0
ConsecutiveCapitals,0,63,0,1.0
Contractions,0,0,0,
ElmoLinkingRule,0,3,0,1.0
PunctuationLinkers,0,92,0,1.0


In [120]:
from wiser.rules import remove_rule
remove_rule(data, 'AndKeywords')
remove_rule(data, 'CauseKeywords')
remove_rule(data, 'NegationKeywords')

In [121]:
import pickle

with open('output/tmp/train_data.p', 'wb') as f:
    pickle.dump(train_data, f)
with open('output/tmp/dev_data.p', 'wb') as f:
    pickle.dump(dev_data, f)
with open('output/tmp/test_data.p', 'wb') as f:
    pickle.dump(test_data, f)

### Checkpoint

In [4]:
import pickle
with open('output/tmp/train_data.p', 'rb') as f:
    train_data = pickle.load(f)

with open('output/tmp/dev_data.p', 'rb') as f:
    dev_data = pickle.load(f)
    
with open('output/tmp/test_data.p', 'rb') as f:
    test_data = pickle.load(f)

In [5]:
from wiser.viewer import Viewer
Viewer(dev_data, height=120)

<IPython.core.display.Javascript object>

Viewer(html='<head>\n<style>\nspan.active {\n    background-color: skyblue;\n    box-shadow: 1px 1px 1px grey;…

In [6]:
from wiser.eval import score_labels_majority_vote
score_labels_majority_vote(dev_data)

Unnamed: 0,TP,FP,FN,P,R,F1
Majority Vote,15,39,97,0.2778,0.1339,0.1807


In [7]:
from labelmodels import LinkedHMM
from wiser.generative import Model

model = Model(LinkedHMM)

In [8]:
from labelmodels import LearningConfig

config = LearningConfig()
config.num_epochs = 5

In [None]:
# Outputs the best development score
model.train(config, train_data=train_data, dev_data=dev_data)

In [None]:
model.evaluate(test_data)

In [None]:
model.save_output(data=train_data, path='output/generative/link_hmm/train_data.p', save_distribution=True)
model.save_output(data=dev_data, path='output/generative/link_hmm/dev_data.p', save_distribution=True, save_tags=True)
model.save_output(data=test_data, path='output/generative/link_hmm/test_data.p', save_distribution=True, save_tags=True)