# Distant Supervision

Create labeling functions based on distant supervision. For this purpose we build a knowledge base of known software names, which serves as the basis for extracting software mentions (hence the distant supervision).
We work with a software dictionary (positive hints) queried from Wikidata (and libraries.io). Additionally, we use a dictionary of known English words as negative hints.

## Base setup

In [None]:
%load_ext autoreload
%autoreload 2

import os
import re
import string
import math
import warnings
import numpy as np
import pandas as pd

from glob import glob
from shutil import copy
from difflib import SequenceMatcher
from functools import partial, update_wrapper 

# Identifiers of the corpus, its database and the annotation label set.
BASE_NAME = 'sosci_ssc_0'
DATABASE_NAME = 'sosci_ssc_0'
LABELS_NAME = 'sosci_ssc_annotation'

# The connection string is exported through the environment; it is set here,
# ahead of the snorkel imports that follow in this cell.
_SNORKEL_DB_PREFIX = 'postgres://snorkel:snorkel@localhost/'
os.environ['SNORKELDB'] = _SNORKEL_DB_PREFIX + DATABASE_NAME

from snorkel import SnorkelSession
from snorkel.models import candidate_subclass
from snorkel.annotations import load_gold_labels
from snorkel.learning.utils import MentionScorer
from snorkel.viewer import SentenceNgramViewer
from itertools import product

# Split name -> integer split id; the ids follow the order of the names below.
set_mapping = {
    split_name: split_id
    for split_id, split_name in enumerate(('train', 'test', 'new'))
}

In [None]:
# Open the Snorkel session and declare the candidate class named 'software'
# with its single 'software' argument.
session = SnorkelSession()
software = candidate_subclass('software', ['software'])

# Gold labels of annotator 'gold' for the split registered as 'train'.
devel_gold_labels = load_gold_labels(
    session,
    annotator_name='gold',
    split=set_mapping['train'],
)

from snorkel.lf_helpers import (
    contains_token, get_between_tokens, get_doc_candidate_spans,
    get_left_tokens, get_matches, get_right_tokens, 
    get_sent_candidate_spans, get_tagged_text, get_text_between, 
    get_text_splits, is_inverted
)

In [None]:
# Candidates and gold labels against which the labeling function is scored.
# NOTE(review): despite the `test_` prefix, both are read from the 'train'
# split -- confirm this is intended.
test_cands = (
    session.query(software)
    .filter(software.split == set_mapping['train'])
    .all()
)
test_labels = load_gold_labels(
    session,
    annotator_name="gold",
    split=set_mapping['train'],
)
scorer = MentionScorer(test_cands, test_labels)

## Helper Functions

**Important**: the `test_LF` function is not imported, because it has hard coded queries and does not evaluate the results in a meaningful way.

## Loading Libraries

The details of the used WikiData query can be found at `./distant_supervision/wiki_data.md`.

We use the standard English dictionary that Duck et al. used, and the acronym dictionary, to possibly exclude false positives.

In [None]:
# Load the Wikidata query result; item labels and abbreviations together form
# the set of known software names.
wiki_data_software_list_01_09 = pd.read_csv("distant_supervision/wikidata_query_result.csv")
# Rows without a label are skipped so that only strings reach the regex below.
known_software_01_09 = set(wiki_data_software_list_01_09['itemLabel'].dropna().tolist())
known_software_altNames_01_09 = set(wiki_data_software_list_01_09.dropna(subset=['abbreviation'])['abbreviation'].tolist())
known_software_01_09 = known_software_01_09.union(known_software_altNames_01_09)

# Names that are nothing but a Wikidata item id ("Q" followed by digits) are
# dropped (presumably items without a proper label -- verify against the query).
# The previous pattern r'Q\d[1,10]' used a character class where a quantifier
# was intended and therefore only caught ids whose third character was
# '1', ',' or '0'.
to_remove_01_09 = []
known_software_lower_01_09 = set()
for soft in known_software_01_09:
    if re.fullmatch(r'Q\d+', soft):
        to_remove_01_09.append(soft)
    elif len(soft) > 2:
        # Only names longer than two characters get a lower-cased variant.
        known_software_lower_01_09.add(soft.lower())
# Removal happens after the loop because a set must not change size while it
# is being iterated.
for wrong_software_name in to_remove_01_09:
    known_software_01_09.remove(wrong_software_name)
print("Loaded a list of " + str(len(known_software_01_09)) + " unique software names from Wikidata.")

# English word lists (negative hints): the words as written, a lower-cased
# variant, and a variant that also holds each word capitalized.
duck_dict = set()
duck_dict_first_char_upper = set()
duck_dict_lower = set()
with open('distant_supervision/english.dic', encoding='iso8859-1') as eng_dic:
    for raw_line in eng_dic:
        word = raw_line.split('\n')[0]
        duck_dict.add(word)
        duck_dict_lower.add(word.lower())
        duck_dict_first_char_upper.update((word, word.capitalize()))
print("Duck dict contained " + str(len(duck_dict)) + " English words.")
print("and in Duck capitalized dict are " + str(len(duck_dict_first_char_upper)))

# Country names and metric prefixes are added as written (no lower-cased or
# capitalized variants) to the plain and the capitalized dictionary.
for extra_path in ('distant_supervision/countries.dic',
                   'distant_supervision/metric_prefix.dic'):
    with open(extra_path) as extra_dic:
        for raw_line in extra_dic:
            word = raw_line.split('\n')[0]
            duck_dict.add(word)
            duck_dict_first_char_upper.add(word)
print("With explicit included words Duck dict contains now " + str(len(duck_dict)) + " English words.")

# Second English word list; entries may carry a '/'-separated suffix, which
# is cut off.
spell_right_dict = set()
spell_right_dict_lower = set()
with open('distant_supervision/spellright.dic') as eng_dic:
    for entry in eng_dic:
        # Cut at the first '/' when there is one, otherwise at the line break.
        separator = '/' if '/' in entry else '\n'
        word = entry.split(separator)[0]
        spell_right_dict.add(word)
        spell_right_dict_lower.add(word.lower())
print("Spellright dict contained " + str(len(spell_right_dict)) + " English words.")

# One acronym per line; the line break is stripped from each entry.
acronym_dic = set()
with open('distant_supervision/acronyms.dic') as acro_dic:
    acronym_dic.update(entry.split('\n')[0] for entry in acro_dic)
print("Loaded a set of " + str(len(acronym_dic)) + " Acronyms.")

# Every three-letter string over the letters T, C, G and A, in
# itertools.product order.
alpha = ['T', 'C', 'G', 'A']
gen_seq_triplets = list(map(''.join, product(alpha, repeat=3)))

Now just a simple save so that we can easily get the values from other Python scripts or notebooks in the same environment.

In [None]:
%store known_software_01_09
%store duck_dict_first_char_upper
%store known_software_lower_01_09
%store duck_dict_lower
%store acronym_dic
%store gen_seq_triplets

import pickle
# Write each dictionary to its own pickle file in the working directory.
# Every file is opened in a `with` block so the handle is flushed and closed
# right after the dump instead of being left open.
for _pickle_path, _payload in (
    ("known_software_01_09.p", known_software_01_09),
    ("duck_dict_first_char_upper.p", duck_dict_first_char_upper),
    ("known_software_lower_01_09.p", known_software_lower_01_09),
    ("duck_dict_lower.p", duck_dict_lower),
    ("acronym_dic.p", acronym_dic),
    ("gen_seq_triplets.p", gen_seq_triplets),
):
    with open(_pickle_path, "wb") as _pickle_file:
        pickle.dump(_payload, _pickle_file)

## Creating the Function

- WikiData is the basis for distant supervision
- But the exhaustive list matches too much: include a dictionary of plain English words which will serve to exclude WikiData matches
- Duck et al. Dictionary is actually better than a free web resource. Both in number of tp and fp. 
- Recall numbers are actually quite good 
- Precision is still not satisfactory
- Next consideration: Also include a dictionary of abbreviations to include? (of course not software abbreviations) -> not helpful
- Do partial matches help? -> They increase the run time far too much to be helpful

Observations:
- Some false positives are caused by 'normal' words which stand at the beginning of a sentence, e.g. Random, Motion, because the dictionary only contains them lowercased. A possible solution is to also add the variants with an upper-case first letter to the dictionary: without this we have a recognition of tp 274, fp 774 and fn 135. With it we have: 240, 486, 169. This trade-off could definitely be considered worth it.
- A better tokenization would actually help a lot. A large source of false positives is the splitting of tokens on hyphenation. The question is how to resolve this: best would be in spaCy since it is so nicely integrated in Snorkel. It is possible to create custom rules from an existing spaCy parser (following this post https://support.prodi.gy/t/how-to-tell-spacy-not-to-split-any-intra-hyphen-words/1456/4 and the docs https://spacy.io/usage/linguistic-features#native-tokenizer-additions). Tokenization had to be adjusted INSIDE THE SNORKEL CODE WE BUILT FROM SOURCE.
- A single small r is matched. This is a problem and is happening because of the lowercased software names. One solution could be to only lower case names which are longer than 2 or 3 characters, this would probably allow to exclude abbreviations which should not be lowercased and are probably mostly used correctly. 
- 'Switzerland' is in software names. This is strange but could be easily fixed by adding country names explicitly to the dictionary (which does not contain them right now).  
- A lot of false positives actually come from mentions of conventional software, where the developer's name matches the software name and is mentioned a second time in brackets. This is impossible to exclude with distant supervision alone.
- 'Review Board' (which is an actual software) is a source of a lot of false positives. This could maybe be addressed by matching 'Institutional' or 'Institute' before the actual word, because in that case it commonly does not refer to the software (the rule is applied in general, not just on a specific target; either way this might border on an overspecification).
- Consider lower-cased software names? This could be problematic or helpful: With considering them we have 240, 444, 169. Without considering them we have (numbers not recorded). Maybe do not lowercase all candidates?? Exclude those which are exclusively upper case or have more than one upper-case letter? This actually undermines the purpose of the initial lower-case match, but could also help to find a trade-off for its application.
- Maybe exclude words that are directly in brackets? Include **all types of brackets**. (Maybe also just a left bracket, followed by commas and closing in at least 6 tokens?? All of those points need individual testing. --- **do not apply**). 
- Explicitly adding metric prefixes which are sometimes mentioned in text and ambiguous with software.. add them to dictionary explicitly
- A lot of the errors that remain are actually recurring specific words: 'GPs', 'Tween', 'California', 'Geneva', 'ART' (this actually also appears as true positives), 'actin', 'PubMed', 'R' (in the wrong context), 'control groups', 'NaCl', 'ELISA'
- **.get_span()** method DOES **NOT PERFORM AS EXPECTED**: it does not actually return the entire span! Quick-fixing this yields 50 fewer false positives.

In [None]:
# Main distant supervision labeling function

# Spans that recur as false positives and are always rejected.
_LF_DIST_OMISSIONS = frozenset([
    'California', 'NaCl', 'control groups', 'FID', 'ELISA', 'GPs',
    'PubMed', 'Gaussian', 'synaptic', 'vivo', 'ionic',
])
# Tokens which, directly left of the candidate, mark a non-software reading
# (e.g. "Institutional Review Board").
_LF_DIST_LEFT_CUES = ('institutional', 'institution', 'ethics', 'ethic')
_LF_DIST_OPENING_BRACKETS = ('(', '[', '{')
_LF_DIST_CLOSING_BRACKETS = (')', ']', '}')


def LF_distant_supervision(c, software_dict, software_dict_lower, english_dict, english_dict_lower, acronym_dict, gen_seqs):
    """Label a candidate by looking it up in software and English dictionaries.

    Args:
        c: Candidate; ``c[0]`` must provide ``get_span()`` and
            ``get_attrib_tokens()``, and ``c`` is handed to
            ``get_left_tokens`` / ``get_right_tokens``.
        software_dict: Known software names, matched case-sensitively.
        software_dict_lower: Lower-cased software names, matched against the
            lower-cased candidate.
        english_dict: English words, matched case-sensitively.
        english_dict_lower: Accepted for interface compatibility; the rule
            that used it is disabled, so it does not influence the label.
        acronym_dict: Accepted for interface compatibility; not used.
        gen_seqs: Strings that are always rejected (three-letter sequences
            in this notebook).

    Returns:
        1 if the candidate is voted to be software, -1 if it is voted not to
        be software, 0 if the function casts no vote.
    """
    cand = c[0].get_span()
    tokens = [x.lower() for x in c[0].get_attrib_tokens()]
    # Workaround for get_span() not always returning the whole span: a
    # single-token candidate whose text length differs from its token is
    # not judged at all.
    if len(tokens) == 1 and len(tokens[0]) != len(cand):
        return 0
    if cand in _LF_DIST_OMISSIONS:
        return -1
    # Two-character strings, pure numbers and pure punctuation are rejected.
    if len(cand) == 2 or cand.isdigit() or all(char in string.punctuation for char in cand):
        return -1

    left_tokens = list(get_left_tokens(c, window=1))
    right_tokens = list(get_right_tokens(c, window=1))

    if any(cue in left_tokens for cue in _LF_DIST_LEFT_CUES):
        return -1
    # Candidate directly enclosed in brackets of any kind.
    if (left_tokens and right_tokens
            and left_tokens[-1] in _LF_DIST_OPENING_BRACKETS
            and right_tokens[0] in _LF_DIST_CLOSING_BRACKETS):
        return -1

    if cand in gen_seqs:
        return -1
    if cand in english_dict:
        # An English word that is also a software name is ambiguous: no vote.
        return 0 if cand in software_dict else -1
    if cand in software_dict or cand.lower() in software_dict_lower:
        return 1
    # Everything else gets no vote. This includes lower-case matches against
    # the English dictionary, for which a -1 vote was tried and disabled.
    return 0

In [None]:
# Bind all dictionaries so that the labeling function takes only a candidate.
LF_dist = partial(
    LF_distant_supervision,
    software_dict=known_software_01_09,
    software_dict_lower=known_software_lower_01_09,
    english_dict=duck_dict_first_char_upper,
    english_dict_lower=duck_dict_lower,
    acronym_dict=acronym_dic,
    gen_seqs=gen_seq_triplets,
)

lf = LF_dist
# Labels -1 / 0 / 1 are mapped onto marginals 0.0 / 0.5 / 1.0 before scoring.
test_marginals = np.array([0.5 * (lf(cand) + 1) for cand in test_cands])
tp, fp, tn, fn = scorer.score(
    test_marginals,
    set_unlabeled_as_neg=True,
    set_at_thresh_as_neg=False,
)

In [None]:
# Show the false positives (`fp`) returned by the scorer in the notebook viewer.
SentenceNgramViewer(fp, session)