In [1]:
from gensim.models import Word2Vec
import pickle
import random

from common.constants import file_paths
from common.constants import file_suffixes
from common.constants import regex_patterns

In [2]:
# NOTE(review): CONCEPTS_SIZE is not referenced by any cell visible in this notebook.
CONCEPTS_SIZE = 800
# Number of concepts drawn at random from each extractor's output for hand labeling.
SAMPLE_SIZE = 100

# When a flag is True, the matching cells below draw a fresh random sample for
# that extractor, prompt for one label per sample via input(), and overwrite the
# corresponding pickle under data/. When False, those cells are skipped and the
# evaluation cells read whatever label pickles already exist on disk.
HAND_LABEL_ECON_SAMPLES = False
HAND_LABEL_AUTOPHRASE_SAMPLES = False
HAND_LABEL_PRDR_SAMPLES = False

In [3]:
# Reference ("ground truth") phrase set; later cells test sampled concepts for
# membership in it.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('data/positive_samples.pkl', 'rb') as positive_samples_file:
    # The context manager closes the handle, which the bare open() call never did.
    good_phrases = set(pickle.load(positive_samples_file))
print(f'Loaded {len(good_phrases)} good phrases.')

Loaded 1900 good phrases.


In [4]:
def get_file_path(file_suffix):
    """Build the path of a file derived from the configured input file.

    The input file name (``file_paths.INPUT_FILE``) is cut at its first ``.``,
    ``file_suffix`` is appended to what remains, and the result is placed
    under ``file_paths.DATA_DIR`` using ``/`` as the separator.

    Args:
        file_suffix: Text appended to the truncated input file name
            (including any extension).

    Returns:
        The resulting path as a string.
    """
    # Everything from the first dot onwards is discarded, not just the last extension.
    stem, _, _ = file_paths.INPUT_FILE.partition('.')
    return '{}/{}{}'.format(file_paths.DATA_DIR, stem, file_suffix)

In [5]:
# Locations of each extractor's output, derived from the configured input file name.
econ_embedding_path = get_file_path(file_suffixes.ECON_EMBEDDING)
autophrase_results_path = get_file_path(file_suffixes.AUTOPHRASE_RESULTS)
# NOTE(review): this suffix is hard-coded, unlike the two above which come from file_suffixes.
prdr_results_path = get_file_path('_prdr_results.txt')

In [6]:
def get_raw_concept(tagged_econ_concept):
    """Convert a tagged vocabulary token into a plain, space-separated phrase.

    Drops the first three and the last four characters of the token and turns
    every underscore in the remainder into a space.

    Args:
        tagged_econ_concept: Token string to convert. Presumably wrapped in a
            3-character opening and 4-character closing tag -- confirm against
            ``regex_patterns.CONCEPT_TAGGED_PHRASE``.

    Returns:
        The inner text with underscores replaced by spaces; an empty string if
        the token is too short to have any inner text.
    """
    inner_text = tagged_econ_concept[3:-4]
    return ' '.join(inner_text.split('_'))

In [7]:
# Draw SAMPLE_SIZE random concepts from the ECON embedding vocabulary for hand labeling.
if HAND_LABEL_ECON_SAMPLES:
    model = Word2Vec.load(econ_embedding_path)
    # gensim 4 renamed KeyedVectors.index2word to index_to_key, and reading the
    # old attribute there raises AttributeError. Prefer the new name and fall
    # back to the old one so the cell runs on either version.
    vocabulary = getattr(model.wv, 'index_to_key', None)
    if vocabulary is None:
        vocabulary = model.wv.index2word
    # Keep only tokens matching the concept-tag pattern, with tags stripped.
    econ_concepts = [get_raw_concept(w) for w in vocabulary if regex_patterns.CONCEPT_TAGGED_PHRASE.match(w)]
    print(f'Loaded {len(econ_concepts)} ECON concepts.')
    # random.sample raises ValueError if fewer than SAMPLE_SIZE concepts were found.
    econ_samples = random.sample(econ_concepts, SAMPLE_SIZE)

In [8]:
# Draw SAMPLE_SIZE random concepts from the AutoPhrase results file (one concept per line).
if HAND_LABEL_AUTOPHRASE_SAMPLES:
    with open(autophrase_results_path, 'r') as f:
        # Iterate line by line instead of read().split('\n')[:-1]: the old form
        # silently dropped the final concept when the file had no trailing
        # newline. The explicit f.close() was redundant inside the with block.
        autophrase_concepts = [line.rstrip('\n') for line in f]
    print(f'Loaded {len(autophrase_concepts)} AutoPhrase concepts.')
    # random.sample raises ValueError if fewer than SAMPLE_SIZE concepts were read.
    autophrase_samples = random.sample(autophrase_concepts, SAMPLE_SIZE)

In [9]:
# Draw SAMPLE_SIZE random concepts from the PRDR results file.
if HAND_LABEL_PRDR_SAMPLES:
    # The with block closes the file (the bare open() never did). Iterating by
    # line also keeps the last row when the file has no trailing newline.
    with open(prdr_results_path, 'r') as prdr_results_file:
        # Each row is tab-separated; only the first field is kept as the concept.
        prdr_concepts = [line.rstrip('\n').split('\t')[0] for line in prdr_results_file]
    print(f'Loaded {len(prdr_concepts)} concepts extracted via PRDR.')
    # random.sample raises ValueError if fewer than SAMPLE_SIZE concepts were read.
    prdr_samples = random.sample(prdr_concepts, SAMPLE_SIZE)

In [10]:
# Interactively label each ECON sample and persist the (sample, label) pairs.
if HAND_LABEL_ECON_SAMPLES:
    labeled_econ_samples = []
    # These counters are not read in this cell; the ECON evaluation cell resets
    # and recomputes them from the pickled labels.
    num_hl_positive = 0
    num_gp_positive = 0
    for sample in econ_samples:
        # The raw input string is stored as-is; later cells compare it to '1' and '0'.
        label = input(f'{sample}: ')
        labeled_econ_samples.append((sample, label))
        if label == '1':
            num_hl_positive += 1
        if sample in good_phrases:
            num_gp_positive += 1
    # Write through a context manager so the labels are flushed and the handle
    # closed; the bare open() left that to garbage collection.
    with open('data/labeled_econ_samples.pkl', 'wb') as labeled_file:
        pickle.dump(labeled_econ_samples, labeled_file)

In [11]:
# Interactively label each AutoPhrase sample and persist the (sample, label) pairs.
if HAND_LABEL_AUTOPHRASE_SAMPLES:
    labeled_autophrase_samples = []
    # These counters are not read in this cell; the AutoPhrase evaluation cell
    # resets and recomputes them from the pickled labels.
    num_hl_positive = 0
    num_gp_positive = 0
    for sample in autophrase_samples:
        # The raw input string is stored as-is; later cells compare it to '1' and '0'.
        label = input(f'{sample}: ')
        labeled_autophrase_samples.append((sample, label))
        if label == '1':
            num_hl_positive += 1
        if sample in good_phrases:
            num_gp_positive += 1
    # Write through a context manager so the labels are flushed and the handle
    # closed; the bare open() left that to garbage collection.
    with open('data/labeled_autophrase_samples.pkl', 'wb') as labeled_file:
        pickle.dump(labeled_autophrase_samples, labeled_file)

In [12]:
# Interactively label each PRDR sample and persist the (sample, label) pairs.
if HAND_LABEL_PRDR_SAMPLES:
    labeled_prdr_samples = []
    # These counters are not read in this cell; the PRDR evaluation cell resets
    # and recomputes them from the pickled labels.
    num_hl_positive = 0
    num_gp_positive = 0
    for sample in prdr_samples:
        # The raw input string is stored as-is; later cells compare it to '1' and '0'.
        label = input(f'{sample}: ')
        labeled_prdr_samples.append((sample, label))
        if label == '1':
            num_hl_positive += 1
        if sample in good_phrases:
            num_gp_positive += 1
    # Write through a context manager so the labels are flushed and the handle
    # closed; the bare open() left that to garbage collection.
    with open('data/labeled_prdr_samples.pkl', 'wb') as labeled_file:
        pickle.dump(labeled_prdr_samples, labeled_file)

In [13]:
# Score the hand-labeled ECON samples against the hand labels and the good-phrase set.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('data/labeled_econ_samples.pkl', 'rb') as labeled_file:
    # Context manager closes the handle, which the bare open() never did.
    labeled_econ_samples = pickle.load(labeled_file)
positive = []
negative = []
num_hl_positive = 0
num_gp_positive = 0
for ls in labeled_econ_samples:
    # ls is a (phrase, label) pair; label '1' means the annotator accepted the phrase.
    hand_labeled_good = ls[1] == '1'
    in_good_phrases = ls[0] in good_phrases
    # A phrase counts as positive if either source accepts it.
    if hand_labeled_good or in_good_phrases:
        positive.append(ls[0])
    else:
        negative.append(ls[0])
    if hand_labeled_good:
        num_hl_positive += 1
    if in_good_phrases:
        num_gp_positive += 1
# Each figure is the fraction of all labeled samples accepted by that source.
print(f'ECON Hand-Labeled Precision: {num_hl_positive / len(labeled_econ_samples)}')
print(f'ECON Good Phrases Precision: {num_gp_positive / len(labeled_econ_samples)}')
# Show up to five examples per bucket; capping at the bucket size avoids the
# ValueError random.sample raises when a bucket holds fewer than five phrases.
print(random.sample(positive, min(5, len(positive))))
print(random.sample(negative, min(5, len(negative))))

ECON Hand-Labeled Precision: 0.57
ECON Good Phrases Precision: 0.05
['inverse problem', 'boosted trees', 'memory management', 'principal components analysis', 'web applications']
['left ventricle', 'irrelevant features', 'outperform existing', 'starting point', 'mid level']


In [14]:
# Score the hand-labeled AutoPhrase samples against the hand labels and the good-phrase set.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('data/labeled_autophrase_samples.pkl', 'rb') as labeled_file:
    # Context manager closes the handle, which the bare open() never did.
    labeled_autophrase_samples = pickle.load(labeled_file)
positive = []
negative = []
num_hl_positive = 0
num_gp_positive = 0
for ls in labeled_autophrase_samples:
    # ls is a (phrase, label) pair; label '1' means the annotator accepted the phrase.
    hand_labeled_good = ls[1] == '1'
    in_good_phrases = ls[0] in good_phrases
    # A phrase counts as positive if either source accepts it.
    if hand_labeled_good or in_good_phrases:
        positive.append(ls[0])
    else:
        negative.append(ls[0])
    if hand_labeled_good:
        num_hl_positive += 1
    if in_good_phrases:
        num_gp_positive += 1
# Each figure is the fraction of all labeled samples accepted by that source.
print(f'AutoPhrase Hand-Labeled Precision: {num_hl_positive / len(labeled_autophrase_samples)}')
print(f'AutoPhrase Good Phrases Precision: {num_gp_positive / len(labeled_autophrase_samples)}')
# Show up to five examples per bucket; capping at the bucket size avoids the
# ValueError random.sample raises when a bucket holds fewer than five phrases.
print(random.sample(positive, min(5, len(positive))))
print(random.sample(negative, min(5, len(negative))))

AutoPhrase Hand-Labeled Precision: 0.55
AutoPhrase Good Phrases Precision: 0.03
['motion planning', 'predictive power', 'theoretical foundation', 'shared memory', 'handwriting recognition']
['brain imaging', 'i d', 'level features', 'road rash', 'labeled and unlabeled']


In [15]:
# Score the hand-labeled PRDR samples against the hand labels and the good-phrase set.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('data/labeled_prdr_samples.pkl', 'rb') as labeled_file:
    # Context manager closes the handle, which the bare open() never did.
    labeled_prdr_samples = pickle.load(labeled_file)
positive = []
negative = []
num_hl_positive = 0
num_gp_positive = 0
for ls in labeled_prdr_samples:
    # ls is a (phrase, label) pair; label '1' means the annotator accepted the phrase.
    hand_labeled_good = ls[1] == '1'
    in_good_phrases = ls[0] in good_phrases
    # A phrase counts as positive if either source accepts it.
    if hand_labeled_good or in_good_phrases:
        positive.append(ls[0])
    else:
        negative.append(ls[0])
    if hand_labeled_good:
        num_hl_positive += 1
    if in_good_phrases:
        num_gp_positive += 1
# Each figure is the fraction of all labeled samples accepted by that source.
print(f'PRDR Hand-Labeled Precision: {num_hl_positive / len(labeled_prdr_samples)}')
print(f'PRDR Good Phrases Precision: {num_gp_positive / len(labeled_prdr_samples)}')
# Show up to five examples per bucket; capping at the bucket size avoids the
# ValueError random.sample raises when a bucket holds fewer than five phrases.
print(random.sample(positive, min(5, len(positive))))
print(random.sample(negative, min(5, len(negative))))

PRDR Hand-Labeled Precision: 0.65
PRDR Good Phrases Precision: 0.11
['recursive matching structure', 'mnist', 'high level abstractions', 'methods', 'domain']
['different network architectures', 'recent output representations', 'stochastic loss selection', 'significant improvement', 'neural network sentence']


In [16]:
# Pool the labeled samples from all three extractors (ECON, AutoPhrase, PRDR, in
# that order) so the disagreement listings below cover every labeled phrase.
labeled_samples = labeled_econ_samples + labeled_autophrase_samples + labeled_prdr_samples

In [17]:
# List phrases the annotator accepted (label '1') that are absent from good_phrases.
print('Hand-Labeled Good but not Ground Truth\n')
hand_labeled_only = (
    entry[0]
    for entry in labeled_samples
    if not (entry[1] != '1' or entry[0] in good_phrases)
)
for phrase in hand_labeled_only:
    print(phrase)

Hand-Labeled Good but not Ground Truth

open set recognition
smooth function
boosted trees
geometric brownian motion
probabilistic program
estimation error
cifar 100
click through rate
standard deviation
long term memory
virtual worlds
nlp applications
group lasso
minimum spanning tree
disentangled representations
inverse problem
sampling distribution
markov random field
null hypothesis
joint probability
mixture components
performance gain
kernel approximation
deep reinforcement learning rl
functional analysis
restricted boltzmann machine
recognition tasks
novelty detection
potts model
predictive analytics
web browsers
principal components analysis
disjoint sets
training instances
cutting plane
pattern mining
weak classifiers
deep q networks
training phase
hyperbolic space
target dataset
symbolic computation
image datasets
configuration space
computational overhead
submodular optimization
control problems
web applications
moment matching
np hard
imagenet classification
graph database
p

In [18]:
# List phrases present in good_phrases that the annotator rejected (label '0').
print('Ground Truth but not Hand-Labeled Good\n')
ground_truth_only = (
    entry[0]
    for entry in labeled_samples
    if not (entry[1] != '0' or entry[0] not in good_phrases)
)
for phrase in ground_truth_only:
    print(phrase)

Ground Truth but not Hand-Labeled Good

learning
methods
task
