In [2]:
from gensim.models import Word2Vec
import pickle
import random

from common.constants import file_paths
from common.constants import file_suffixes
from common.constants import regex_patterns

In [2]:
CONCEPTS_SIZE = 800
SAMPLE_SIZE = 100

HAND_LABEL_ECON_SAMPLES = False
HAND_LABEL_AUTOPHRASE_SAMPLES = False
HAND_LABEL_PRDR_SAMPLES = False

In [5]:
good_phrases = set(pickle.load(open('data/positive_samples.pkl', 'rb')))
print(f'Loaded {len(good_phrases)} good phrases.')

Loaded 1900 good phrases.


In [4]:
def get_file_path(file_suffix):
    extensionless_input_file_name = file_paths.INPUT_FILE.split('.')[0]
    return f'{file_paths.DATA_DIR}/{extensionless_input_file_name}{file_suffix}'

In [5]:
econ_embedding_path = get_file_path(file_suffixes.ECON_EMBEDDING)
autophrase_results_path = get_file_path(file_suffixes.AUTOPHRASE_RESULTS)
prdr_results_path = get_file_path('_prdr_results.txt')

In [6]:
def get_raw_concept(tagged_econ_concept):
    return tagged_econ_concept.replace('_', ' ')[3:-4]

In [7]:
if HAND_LABEL_ECON_SAMPLES:
    model = Word2Vec.load(econ_embedding_path)
    econ_concepts = [get_raw_concept(w) for w in model.wv.index2word if regex_patterns.CONCEPT_TAGGED_PHRASE.match(w)]
    print(f'Loaded {len(econ_concepts)} ECON concepts.')
    econ_samples = random.sample(econ_concepts, SAMPLE_SIZE)

In [8]:
if HAND_LABEL_AUTOPHRASE_SAMPLES:
    with open(autophrase_results_path, 'r') as f:
        autophrase_concepts = f.read().split('\n')[:-1]
        f.close()
    print(f'Loaded {len(autophrase_concepts)} AutoPhrase concepts.')
    autophrase_samples = random.sample(autophrase_concepts, SAMPLE_SIZE)

In [9]:
if HAND_LABEL_PRDR_SAMPLES:
    prdr_concepts = [line.split('\t')[0] for line in open(prdr_results_path, 'r').read().split('\n')[:-1]]
    print(f'Loaded {len(prdr_concepts)} concepts extracted via PRDR.')
    prdr_samples = random.sample(prdr_concepts, SAMPLE_SIZE)

In [10]:
if HAND_LABEL_ECON_SAMPLES:
    labeled_econ_samples = []
    num_hl_positive = 0
    num_gp_positive = 0
    for sample in econ_samples:
        label = input(f'{sample}: ')
        labeled_econ_samples.append((sample, label))
        if label == '1':
            num_hl_positive += 1
        if sample in good_phrases:
            num_gp_positive += 1
    pickle.dump(labeled_econ_samples, open('data/labeled_econ_samples.pkl', 'wb'))

In [11]:
if HAND_LABEL_AUTOPHRASE_SAMPLES:
    labeled_autophrase_samples = []
    num_hl_positive = 0
    num_gp_positive = 0
    for sample in autophrase_samples:
        label = input(f'{sample}: ')
        labeled_autophrase_samples.append((sample, label))
        if label == '1':
            num_hl_positive += 1
        if sample in good_phrases:
            num_gp_positive += 1
    pickle.dump(labeled_autophrase_samples, open('data/labeled_autophrase_samples.pkl', 'wb'))

In [12]:
if HAND_LABEL_PRDR_SAMPLES:
    labeled_prdr_samples = []
    num_hl_positive = 0
    num_gp_positive = 0
    for sample in prdr_samples:
        label = input(f'{sample}: ')
        labeled_prdr_samples.append((sample, label))
        if label == '1':
            num_hl_positive += 1
        if sample in good_phrases:
            num_gp_positive += 1
    pickle.dump(labeled_prdr_samples, open('data/labeled_prdr_samples.pkl', 'wb'))

In [13]:
labeled_econ_samples = pickle.load(open('data/labeled_econ_samples.pkl', 'rb'))
positive = []
negative = []
num_hl_positive = 0
num_gp_positive = 0
for ls in labeled_econ_samples:
    if ls[1] == '1' or ls[0] in good_phrases:
        positive.append(ls[0])
    else:
        negative.append(ls[0])
    if ls[1] == '1':
        num_hl_positive += 1
    if ls[0] in good_phrases:
        num_gp_positive += 1
print(f'ECON Hand-Labeled Precision: {num_hl_positive / len(labeled_econ_samples)}')
print(f'ECON Good Phrases Precision: {num_gp_positive / len(labeled_econ_samples)}')
print(random.sample(positive, 5))
print(random.sample(negative, 5))

ECON Hand-Labeled Precision: 0.57
ECON Good Phrases Precision: 0.05
['inverse problem', 'boosted trees', 'memory management', 'principal components analysis', 'web applications']
['left ventricle', 'irrelevant features', 'outperform existing', 'starting point', 'mid level']


In [14]:
labeled_autophrase_samples = pickle.load(open('data/labeled_autophrase_samples.pkl', 'rb'))
positive = []
negative = []
num_hl_positive = 0
num_gp_positive = 0
for ls in labeled_autophrase_samples:
    if ls[1] == '1' or ls[0] in good_phrases:
        positive.append(ls[0])
    else:
        negative.append(ls[0])
    if ls[1] == '1':
        num_hl_positive += 1
    if ls[0] in good_phrases:
        num_gp_positive += 1
print(f'AutoPhrase Hand-Labeled Precision: {num_hl_positive / len(labeled_autophrase_samples)}')
print(f'AutoPhrase Good Phrases Precision: {num_gp_positive / len(labeled_autophrase_samples)}')
print(random.sample(positive, 5))
print(random.sample(negative, 5))

AutoPhrase Hand-Labeled Precision: 0.55
AutoPhrase Good Phrases Precision: 0.03
['motion planning', 'predictive power', 'theoretical foundation', 'shared memory', 'handwriting recognition']
['brain imaging', 'i d', 'level features', 'road rash', 'labeled and unlabeled']


In [15]:
labeled_prdr_samples = pickle.load(open('data/labeled_prdr_samples.pkl', 'rb'))
positive = []
negative = []
num_hl_positive = 0
num_gp_positive = 0
for ls in labeled_prdr_samples:
    if ls[1] == '1' or ls[0] in good_phrases:
        positive.append(ls[0])
    else:
        negative.append(ls[0])
    if ls[1] == '1':
        num_hl_positive += 1
    if ls[0] in good_phrases:
        num_gp_positive += 1
print(f'PRDR Hand-Labeled Precision: {num_hl_positive / len(labeled_prdr_samples)}')
print(f'PRDR Good Phrases Precision: {num_gp_positive / len(labeled_prdr_samples)}')
print(random.sample(positive, 5))
print(random.sample(negative, 5))

PRDR Hand-Labeled Precision: 0.65
PRDR Good Phrases Precision: 0.11
['recursive matching structure', 'mnist', 'high level abstractions', 'methods', 'domain']
['different network architectures', 'recent output representations', 'stochastic loss selection', 'significant improvement', 'neural network sentence']


In [16]:
labeled_samples = labeled_econ_samples + labeled_autophrase_samples + labeled_prdr_samples

In [17]:
print('Hand-Labeled Good but not Ground Truth\n')
for ls in labeled_samples:
    if ls[1] == '1' and ls[0] not in good_phrases:
        print(ls[0])

Hand-Labeled Good but not Ground Truth

open set recognition
smooth function
boosted trees
geometric brownian motion
probabilistic program
estimation error
cifar 100
click through rate
standard deviation
long term memory
virtual worlds
nlp applications
group lasso
minimum spanning tree
disentangled representations
inverse problem
sampling distribution
markov random field
null hypothesis
joint probability
mixture components
performance gain
kernel approximation
deep reinforcement learning rl
functional analysis
restricted boltzmann machine
recognition tasks
novelty detection
potts model
predictive analytics
web browsers
principal components analysis
disjoint sets
training instances
cutting plane
pattern mining
weak classifiers
deep q networks
training phase
hyperbolic space
target dataset
symbolic computation
image datasets
configuration space
computational overhead
submodular optimization
control problems
web applications
moment matching
np hard
imagenet classification
graph database
p

In [18]:
print('Ground Truth but not Hand-Labeled Good\n')
for ls in labeled_samples:
    if ls[1] == '0' and ls[0] in good_phrases:
        print(ls[0])

Ground Truth but not Hand-Labeled Good

learning
methods
task


## Phraseness Evaluation

In [7]:
labeled_econ_samples = pickle.load(open('data/labeled_econ_samples.pkl', 'rb'))
labeled_autophrase_samples = pickle.load(open('data/labeled_autophrase_samples.pkl', 'rb'))
labeled_prdr_samples = pickle.load(open('data/labeled_prdr_samples.pkl', 'rb'))

In [8]:
phraseness_labeled_econ_samples = []
num_hl_positive = 0
num_gp_positive = 0
for sample, l in labeled_econ_samples:
    if l != '1':
        label = input(f'{sample}: ')
        phraseness_labeled_econ_samples.append((sample, label))
        if label == '1':
            num_hl_positive += 1
    if sample in good_phrases:
        num_gp_positive += 1
pickle.dump(phraseness_labeled_econ_samples, open('data/phraseness_labeled_econ_samples.pkl', 'wb'))

highly dependent:  0
theoretical and empirical:  0
sparsity inducing:  0
moving objects:  1
rank tensor:  0
promising results:  1
continuous action:  1
recently gained:  0
structural constraints:  1
crop yield:  1
optimal regret:  1
outperform existing:  0
malignant tumor:  1
previous approaches:  1
positive and negative:  0
multiple myeloma:  1
count based:  0
task oriented:  0
mid level:  0
at large:  0
gamma ray:  1
applications of artificial intelligence:  1
theoretically analyze:  0
number of arms:  0
gas turbine:  1
based regularization:  0
vending machines:  1
exponentially large:  0
few shot:  0
sufficient statistics:  1
irrelevant features:  1
traditional medicine:  1
set theoretic:  0
competitive inhibition:  1
image level:  0
starting point:  1
gray matter:  1
pablo picasso:  1
lancaster university:  1
a long standing:  0
spin glass:  1
flow cytometry:  1
left ventricle:  1


In [9]:
phraseness_labeled_autophrase_samples = []
num_hl_positive = 0
num_gp_positive = 0
for sample, l in labeled_autophrase_samples:
    if l != '1':
        label = input(f'{sample}: ')
        phraseness_labeled_autophrase_samples.append((sample, label))
        if label == '1':
            num_hl_positive += 1
    if sample in good_phrases:
        num_gp_positive += 1
pickle.dump(phraseness_labeled_autophrase_samples, open('data/phraseness_labeled_autophrase_samples.pkl', 'wb'))

potential applications:  1
stop sign:  1
in grid:  0
university of california irvine:  1
robust principal component:  1
hmm based:  0
fiber optic:  0
increasing attention:  0
rare disease:  1
empirically validate:  0
chemical process:  1
character level:  0
single nucleotide polymorphisms:  1
breaking news:  1
lung cancer:  1
group sparse:  0
task of classifying:  0
brain imaging:  1
local planning:  1
explicit knowledge:  1
level features:  0
excess risk:  1
primary care:  1
invasive species:  1
the past decade:  1
labeled and unlabeled:  0
frame prediction:  1
task specific:  0
small change:  1
few shot:  0
traffic congestion:  1
chemical species:  1
i d:  0
additional assumptions:  1
cost sensitive:  0
national park:  1
road rash:  1
powerful tools:  1
natural selection:  1
multiple classifiers:  11
vulnerable to adversarial examples:  0
human perception:  1
mathcal o:  0
margin based:  0
closely related:  0


In [10]:
phraseness_labeled_prdr_samples = []
num_hl_positive = 0
num_gp_positive = 0
for sample, l in labeled_prdr_samples:
    if l != '1':
        label = input(f'{sample}: ')
        phraseness_labeled_prdr_samples.append((sample, label))
        if label == '1':
            num_hl_positive += 1
    if sample in good_phrases:
        num_gp_positive += 1
pickle.dump(phraseness_labeled_prdr_samples, open('data/phraseness_labeled_prdr_samples.pkl', 'wb'))

challenging reinforcement learning:  1
structured output problems:  1
single input image:  1
most preprocessing steps:  1
discrete speech code:  1
simple similarity classifier:  1
commercial web search:  1
unique minimizer b:  0
character level:  0
high level concepts:  1
method:  1
learning:  1
methods:  1
real user experience:  1
physical hexapod robot:  1
many perception tasks:  1
multi label:  0
user:  1
significant improvement:  1
higher level:  0
task:  1
different network architectures:  1
word level:  0
human evaluation study:  1
non convex:  0
multiple image datasets:  1
hierarchical recurrent sequence:  1
neural machine:  1
available training data:  1
hard matching problem:  1
recent output representations:  1
new variance reduction:  1
neural network sentence:  1
stochastic loss selection:  1
higher level inference:  1


In [16]:
phraseness_labeled_econ_samples = pickle.load(open('data/phraseness_labeled_econ_samples.pkl', 'rb'))
positive = []
negative = []
num_hl_positive = 0
num_gp_positive = 0
for ls in phraseness_labeled_econ_samples:
    if ls[1] == '1' or ls[0] in good_phrases:
        positive.append(ls[0])
    else:
        negative.append(ls[0])
    if ls[1] == '1':
        num_hl_positive += 1
    if ls[0] in good_phrases:
        num_gp_positive += 1
print(f'ECON Hand-Labeled Precision: {num_hl_positive / len(phraseness_labeled_econ_samples)}')
print(f'ECON Good Phrases Precision: {num_gp_positive / len(phraseness_labeled_econ_samples)}')
print(random.sample(positive, 5))
print(random.sample(negative, 5))

ECON Hand-Labeled Precision: 0.5581395348837209
ECON Good Phrases Precision: 0.0
['structural constraints', 'previous approaches', 'competitive inhibition', 'multiple myeloma', 'lancaster university']
['rank tensor', 'positive and negative', 'at large', 'recently gained', 'task oriented']


In [18]:
print(f'ECON Hand-Labeled Precision: {(num_hl_positive + 57) / 100}')
print(f'ECON Good Phrases Precision: {(num_gp_positive + 5) / 100}')

ECON Hand-Labeled Precision: 0.81
ECON Good Phrases Precision: 0.05


In [19]:
phraseness_labeled_autophrase_samples = pickle.load(open('data/phraseness_labeled_autophrase_samples.pkl', 'rb'))
positive = []
negative = []
num_hl_positive = 0
num_gp_positive = 0
for ls in phraseness_labeled_autophrase_samples:
    if ls[1] == '1' or ls[0] in good_phrases:
        positive.append(ls[0])
    else:
        negative.append(ls[0])
    if ls[1] == '1':
        num_hl_positive += 1
    if ls[0] in good_phrases:
        num_gp_positive += 1
print(f'AutoPhrase Hand-Labeled Precision: {num_hl_positive / len(phraseness_labeled_autophrase_samples)}')
print(f'AutoPhrase Good Phrases Precision: {num_gp_positive / len(phraseness_labeled_autophrase_samples)}')
print(random.sample(positive, 5))
print(random.sample(negative, 5))

AutoPhrase Hand-Labeled Precision: 0.5777777777777777
AutoPhrase Good Phrases Precision: 0.0
['excess risk', 'chemical species', 'rare disease', 'frame prediction', 'traffic congestion']
['multiple classifiers', 'mathcal o', 'closely related', 'margin based', 'increasing attention']


In [20]:
print(f'AutoPhrase Hand-Labeled Precision: {(num_hl_positive + 55) / 100}')
print(f'AutoPhrase Good Phrases Precision: {(num_gp_positive + 3) / 100}')

AutoPhrase Hand-Labeled Precision: 0.81
AutoPhrase Good Phrases Precision: 0.03


In [21]:
phraseness_labeled_prdr_samples = pickle.load(open('data/phraseness_labeled_prdr_samples.pkl', 'rb'))
positive = []
negative = []
num_hl_positive = 0
num_gp_positive = 0
for ls in phraseness_labeled_prdr_samples:
    if ls[1] == '1' or ls[0] in good_phrases:
        positive.append(ls[0])
    else:
        negative.append(ls[0])
    if ls[1] == '1':
        num_hl_positive += 1
    if ls[0] in good_phrases:
        num_gp_positive += 1
print(f'PRDR Hand-Labeled Precision: {num_hl_positive / len(phraseness_labeled_prdr_samples)}')
print(f'PRDR Good Phrases Precision: {num_gp_positive / len(phraseness_labeled_prdr_samples)}')
print(random.sample(positive, 5))
print(random.sample(negative, 5))

PRDR Hand-Labeled Precision: 0.8285714285714286
PRDR Good Phrases Precision: 0.08571428571428572
['high level concepts', 'neural network sentence', 'many perception tasks', 'physical hexapod robot', 'learning']
['multi label', 'higher level', 'character level', 'unique minimizer b', 'word level']


In [22]:
print(f'PRDR Hand-Labeled Precision: {(num_hl_positive + 65) / 100}')
print(f'PRDR Good Phrases Precision: {(num_gp_positive + 11) / 100}')

PRDR Hand-Labeled Precision: 0.94
PRDR Good Phrases Precision: 0.14
