In [1]:
from gensim.models import Word2Vec
import numpy as np
import os
import pickle
import re

In [2]:
re_concept_tagged = re.compile(
    r"<c>(?P<phrase>[^<]*)</c>"
)

In [3]:
def to_oneWord(w):
    return w.replace(' ', '_')

In [4]:
def to_concept_gensim(w):
    return '<c>%s</c>' % to_oneWord(w)

In [5]:
def to_concept_natural(w):
    # return w
    return re.sub(r'</?c>', '', w).replace('_', ' ')

In [6]:
def to_concept_natural_lower(w):
    return to_concept_natural(w.lower())

In [7]:
re_nonASCII = re.compile(r'[^\x00-\x7F]+')
wiki_concept_set = set()

AUTOPHRASE_PATH = '/Users/rishimasand/Documents/school/college/research/text_mining/AutoPhrase'

# load target_word_set
for l in open(os.path.join(AUTOPHRASE_PATH, 'data/EN/wiki_quality.txt')):
    concept = l.strip()
    if not re_nonASCII.search(l):
        wiki_concept_set.add(to_concept_natural_lower(concept).replace(' ', '_'))

target_concept_set = wiki_concept_set

In [8]:
def computeFeatures(concept, model):
    neighbor_word2sim = {to_concept_natural_lower(w): sim for w, sim in model.most_similar(concept, topn=50, restrict_vocab=None) if sim > 0.5}
    if len(neighbor_word2sim) < 2:
        return np.array([0, 0, 0, 0])

    meaningfulness = len(neighbor_word2sim)
    purity = np.mean(list(neighbor_word2sim.values()))

    targetness = len(set(neighbor_word2sim.keys()) & target_concept_set)

    completeness = -len([w for w in neighbor_word2sim.keys() if to_concept_natural_lower(concept) in w])

    return np.array([meaningfulness, purity, targetness, completeness])

In [9]:
data_dir = 'data'
input_file_name = 'arxiv_abstracts_10000.txt'
extensionless_input_file_name = input_file_name.split('.')[0]
input_file_path = f'{data_dir}/{input_file_name}'

model_save_path = f'{data_dir}/{extensionless_input_file_name}_embedding.bin'
concept_feature_bin_path = f'{data_dir}/{extensionless_input_file_name}_econ_feature.bin'
concept_feature_path = f'{data_dir}/{extensionless_input_file_name}_econ_feature.txt'

model = Word2Vec.load(model_save_path)

concept_feature_dict = {}

with open(concept_feature_path, 'w') as f_out:
    for i, w in enumerate(model.wv.index2word):
        if i % 10000 == 0:
            pickle.dump(concept_feature_dict, open(concept_feature_bin_path, 'wb'))
        if re_concept_tagged.match(w):
            concept_feature_dict[w] = computeFeatures(w, model)
            f_out.write('%s\t%s\n' % (to_concept_natural(w), concept_feature_dict[w]))

  neighbor_word2sim = {to_concept_natural_lower(w): sim for w, sim in model.most_similar(concept, topn=50, restrict_vocab=None) if sim > 0.5}
