# AutoPhrase vs. ECON Extracted Concept Results Comparison

In [1]:
from gensim.models import Word2Vec

from common.constants import file_paths
from common.constants import file_suffixes
from common.constants import regex_patterns

In [2]:
def get_file_path(file_suffix):
    """Build the path of a pipeline artefact that belongs to the input file.

    The input file name (``file_paths.INPUT_FILE``) is cut at its first ``.``,
    ``file_suffix`` is appended to that stem, and the result is placed under
    ``file_paths.DATA_DIR``.

    Args:
        file_suffix: Text appended to the stem of the input file name.

    Returns:
        A string of the form ``<DATA_DIR>/<stem><file_suffix>``.
    """
    # partition() keeps everything before the FIRST dot, so a name with
    # several dots ("a.b.txt") yields "a".
    input_stem, _, _ = file_paths.INPUT_FILE.partition('.')
    return f'{file_paths.DATA_DIR}/{input_stem}{file_suffix}'

In [3]:
# Resolve the locations of the two artefacts being compared: the ECON
# embedding and the AutoPhrase results.
econ_embedding_path = get_file_path(file_suffixes.ECON_EMBEDDING)
autophrase_results_path = get_file_path(file_suffixes.AUTOPHRASE_RESULTS)

In [4]:
# Read one AutoPhrase concept per line. Iterating over the file and stripping
# only the newline keeps the final concept even when the file does not end
# with a trailing newline (slicing off the last split element would drop it).
# The `with` block closes the file, so no explicit close() is needed.
with open(autophrase_results_path, 'r') as f:
    autophrase_concepts = [line.rstrip('\n') for line in f]
print(f'Loaded {len(autophrase_concepts)} AutoPhrase concepts.')

Loaded 4413 AutoPhrase concepts.


In [5]:
def get_raw_concept(tagged_econ_concept):
    """Turn a tagged vocabulary token into a plain, space-separated phrase.

    The first three and last four characters are removed (presumably the
    opening and closing concept tags -- TODO confirm against the tagging
    code), and every underscore in the remainder becomes a space.

    Args:
        tagged_econ_concept: Tagged token string.

    Returns:
        The untagged phrase with underscores replaced by spaces.
    """
    untagged = tagged_econ_concept[3:-4]
    return untagged.replace('_', ' ')

In [6]:
# Load the ECON embedding and pull the concepts out of its vocabulary: keep
# only the entries matching the concept-tag pattern, then strip the tags.
model = Word2Vec.load(econ_embedding_path)
econ_concepts = list(
    map(
        get_raw_concept,
        filter(regex_patterns.CONCEPT_TAGGED_PHRASE.match, model.wv.index2word),
    )
)
print(f'Loaded {len(econ_concepts)} ECON concepts.')

Loaded 4473 ECON concepts.


In [7]:
# Set versions of both concept lists, used for membership tests and set
# differences; the original lists are kept for ordered iteration.
autophrase_concept_set = set(autophrase_concepts)
econ_concept_set = set(econ_concepts)

## Difference Comparison

In [8]:
# Count the ECON concepts that are absent from the AutoPhrase concept set.
print(f'{len(econ_concept_set - autophrase_concept_set)} ECON concepts not found in AutoPhrase concepts.')

96 ECON concepts not found in AutoPhrase concepts.


In [9]:
# List the ECON-only concepts. Set iteration order is arbitrary, so the order
# of this listing is not guaranteed to be the same on every run.
list(econ_concept_set - autophrase_concept_set)

['computational performance',
 'ability to capture',
 'data efficient',
 'supervised approaches',
 'bayes classifier',
 'structured learning',
 'ensemble approach',
 'convex problems',
 'complex models',
 'challenging problem',
 'proposed algorithm',
 'based method',
 'task performance',
 'connectionist temporal classification',
 'non convex optimization',
 'a long standing',
 'learning objective',
 'classification loss',
 'connected layers',
 'generative framework',
 'deterministic policy',
 'difficult to train',
 'problems in machine learning',
 'specific knowledge',
 'learning task',
 'and vice versa',
 'pca based',
 'score based',
 'temporal data',
 'machine learning models',
 'number of model parameters',
 'curriculum learning',
 'accuracy and computational',
 'dimensional representation',
 'learning strategy',
 'simple and effective',
 'based regularization',
 'prediction results',
 'regression and classification',
 'graphical models',
 'deep model',
 'number of training samples'

In [10]:
# Count the AutoPhrase concepts that are absent from the ECON concept set.
print(f'{len(autophrase_concept_set - econ_concept_set)} AutoPhrase concepts not found in ECON concepts.')

36 AutoPhrase concepts not found in ECON concepts.


In [11]:
# List the AutoPhrase-only concepts. Set iteration order is arbitrary, so the
# order of this listing is not guaranteed to be the same on every run.
list(autophrase_concept_set - econ_concept_set)

['optimal performance',
 'next frame',
 'recommender system',
 'traditional machine learning',
 'efficient solutions',
 'high computational',
 'norm based',
 'o log',
 'mathcal f',
 'provide insights',
 'dynamic bayesian',
 'ell 2 norm',
 'function estimation',
 'approach combines',
 'cnns trained',
 'structured sparse',
 'input distribution',
 'pre specified',
 'annotated training',
 'training images',
 'noise regime',
 'top down',
 'singular value',
 'handle large',
 'proposed method achieves',
 'inductive learning',
 'second order statistics',
 'global information',
 'renewed interest',
 'action classification',
 'sparse gaussian',
 'framework outperforms',
 '100 000',
 'bf x',
 'improve accuracy',
 'publicly available']

## Sub/Super-Sequence Comparison

In [12]:
def concept_contains_words_in_order(concept, words):
    """Check whether `words` occur in `concept` as one contiguous run of words.

    The comparison is done on whole, whitespace-separated tokens. A plain
    substring test is not enough: 'a b' is a substring of 'xa b a', yet the
    words 'a' and 'b' are never adjacent there. Token comparison also makes
    the result independent of how much whitespace separates the words of
    `concept`.

    Args:
        concept: Phrase to search in; split on whitespace.
        words: Sequence of single words that must appear adjacently and in
            this order.

    Returns:
        True if `words` is a contiguous run of the words of `concept`
        (an empty `words` is always contained), otherwise False.
    """
    concept_words = concept.split()
    wanted = list(words)
    width = len(wanted)
    if width == 0:
        # Matches the previous behaviour: the empty phrase is in every concept.
        return True
    # Slide a window of `width` tokens over the concept and compare each one.
    return any(
        concept_words[start:start + width] == wanted
        for start in range(len(concept_words) - width + 1)
    )

In [13]:
def concept_contains_concept(c1, c2):
    """Tell whether concept `c2` appears, word for word and in order, in `c1`.

    Args:
        c1: The containing concept (a phrase string).
        c2: The candidate sub-concept (a phrase string); it is split on
            whitespace into words before the check.

    Returns:
        The result of `concept_contains_words_in_order(c1, <words of c2>)`.
    """
    inner_words = c2.split()
    return concept_contains_words_in_order(c1, inner_words)

In [14]:
def print_sub_super_sequence_comparison(results, for_econ):
    """Print one aligned line per labelled sub/super-sequence match.

    Args:
        results: Iterable of 4-tuples
            `(is_good_concept, c1, c2, is_subsequence)`. `c1` is printed
            under the first method's label and `c2` under the second's.
        for_econ: If truthy, `c1` is labelled as the ECON concept and `c2`
            as the AutoPhrase concept; otherwise the labels are swapped.

    Returns:
        None. The table is written to standard output.
    """
    if for_econ:
        first_method, second_method = 'ECON', 'AutoPhrase'
    else:
        first_method, second_method = 'AutoPhrase', 'ECON'

    for is_good_concept, c1, c2, is_subsequence in results:
        quality_label = 'GOOD' if is_good_concept else 'BAD'
        relation_label = 'SUBSEQUENCE' if is_subsequence else 'SUPERSEQUENCE'
        print(f'{quality_label : <5} | {first_method} Concept: {c1 : <40} | {second_method} concept: {c2 : <40} | {relation_label}')

### ECON

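Identify each ECON concept, `ec`, that AutoPhrase did not extract and that is a non-identical subsequence or supersequence of some AutoPhrase concept, `ac`. Here a subsequence means a contiguous run of words: for example, `temporal data` is a subsequence of `spatio temporal data`. Each match is labelled by hand at the prompt: entering `1` marks the concept as good, and any other input marks it as bad.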

In [15]:
# Manually label every ECON-only concept that is a sub- or super-sequence of
# an AutoPhrase concept. Each hit prompts with the ECON concept; typing `1`
# marks it as good, and any other input marks it as bad.
#
# Whether an ECON concept is missing from the AutoPhrase set does not depend
# on `ac`, so the filter runs once here instead of inside the nested loop.
econ_only_concepts = [ec for ec in econ_concepts if ec not in autophrase_concept_set]

results = []
for ac in autophrase_concepts:
    for ec in econ_only_concepts:
        if ec == ac:
            # Identical concepts are not sub/super-sequence candidates.
            continue
        if concept_contains_concept(ac, ec):
            is_subsequence = True    # `ec` lies inside `ac`
        elif concept_contains_concept(ec, ac):
            is_subsequence = False   # `ec` extends `ac`
        else:
            continue
        is_good_concept = input(ec)
        results.append((is_good_concept == '1', ec, ac, is_subsequence))

temporal data 1
computer vision and machine learning 0
problems in machine learning 0
machine learning models 1
synthetic and real world datasets 1
simulated and real world data 1
non convex optimization 1
based optimization 0
deep recurrent neural network 1
real life data 1
publicly available datasets 1
simulated and real world data 1
graphical models 1
synthetic and real world datasets 1
computer vision and machine learning 0
deep recurrent neural network 1
and vice versa 0
connected layers 1
each data point 0
simulated and real world data 1
computer aided 0
low dimensional representations 1
graphical models 1
simulated and real world data 1
number of training samples 0
bayes classifier 1
synthetic and real world datasets 1
graphical models 1
deep recurrent neural network 1
large data 0


In [16]:
# Print the labelled matches in `results`, with the ECON concept in the first column.
print_sub_super_sequence_comparison(results, for_econ=True)

GOOD  | ECON Concept: temporal data                            | AutoPhrase concept: spatio temporal data                     | SUBSEQUENCE
BAD   | ECON Concept: computer vision and machine learning     | AutoPhrase concept: machine learning                         | SUPERSEQUENCE
BAD   | ECON Concept: problems in machine learning             | AutoPhrase concept: machine learning                         | SUPERSEQUENCE
GOOD  | ECON Concept: machine learning models                  | AutoPhrase concept: machine learning                         | SUPERSEQUENCE
GOOD  | ECON Concept: synthetic and real world datasets        | AutoPhrase concept: real world                               | SUPERSEQUENCE
GOOD  | ECON Concept: simulated and real world data            | AutoPhrase concept: real world                               | SUPERSEQUENCE
GOOD  | ECON Concept: non convex optimization                  | AutoPhrase concept: convex optimization                      | SUPERSEQUENCE
BAD   | 

### AutoPhrase

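Identify each AutoPhrase concept, `ac`, that ECON did not extract and that is a non-identical subsequence or supersequence of some ECON concept, `ec`. Here a subsequence means a contiguous run of words: for example, `publicly available` is a subsequence of `publicly available datasets`. Each match is labelled by hand at the prompt: entering `1` marks the concept as good, and any other input marks it as bad.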

In [17]:
# Manually label every AutoPhrase-only concept that is a sub- or super-sequence
# of an ECON concept. Each hit prompts with the AutoPhrase concept; typing `1`
# marks it as good, and any other input marks it as bad.
#
# Whether an AutoPhrase concept is missing from the ECON set does not depend
# on `ec`, so the filter runs once here instead of inside the nested loop.
autophrase_only_concepts = [ac for ac in autophrase_concepts if ac not in econ_concept_set]

results = []
for ec in econ_concepts:
    for ac in autophrase_only_concepts:
        if ac == ec:
            # Identical concepts are not sub/super-sequence candidates.
            continue
        if concept_contains_concept(ec, ac):
            is_subsequence = True    # `ac` lies inside `ec`
        elif concept_contains_concept(ac, ec):
            is_subsequence = False   # `ac` extends `ec`
        else:
            continue
        is_good_concept = input(ac)
        results.append((is_good_concept == '1', ac, ec, is_subsequence))

traditional machine learning 1
proposed method achieves 0
proposed method achieves 0
ell 2 norm 1
publicly available 0
high computational 0
dynamic bayesian 0
singular value 1
singular value 1


In [18]:
# Print the labelled matches in `results`, with the AutoPhrase concept in the first column.
print_sub_super_sequence_comparison(results, for_econ=False)

GOOD  | AutoPhrase Concept: traditional machine learning             | ECON concept: machine learning                         | SUPERSEQUENCE
BAD   | AutoPhrase Concept: proposed method achieves                 | ECON concept: proposed method                          | SUPERSEQUENCE
BAD   | AutoPhrase Concept: proposed method achieves                 | ECON concept: method achieves                          | SUPERSEQUENCE
GOOD  | AutoPhrase Concept: ell 2 norm                               | ECON concept: ell 2                                    | SUPERSEQUENCE
BAD   | AutoPhrase Concept: publicly available                       | ECON concept: publicly available datasets              | SUBSEQUENCE
BAD   | AutoPhrase Concept: high computational                       | ECON concept: high computational cost                  | SUBSEQUENCE
BAD   | AutoPhrase Concept: dynamic bayesian                         | ECON concept: dynamic bayesian networks                | SUBSEQUENCE
GOOD  | Auto