In [1]:
from collections import defaultdict
from gensim.models import Word2Vec
import json
import numpy as np
import pickle
import re
from tqdm import tqdm

In [2]:
# --- Input corpus -----------------------------------------------------------
data_dir = 'data'
input_file_name = 'arxiv_abstracts_10000.txt'
input_file_path = f'{data_dir}/{input_file_name}'

# Everything before the first '.' of the input file name; used as the stem of
# every artifact path below.
extensionless_input_file_name = input_file_name.partition('.')[0]

# --- Artifacts read by this notebook -----------------------------------------
supersequence_path = f'{data_dir}/{extensionless_input_file_name}_superspan_sequence.json'
model_save_path = f'{data_dir}/{extensionless_input_file_name}_embedding.bin'
concept_feature_path = f'{data_dir}/{extensionless_input_file_name}_econ_feature.txt'
concept_score_path = f'{data_dir}/{extensionless_input_file_name}_score_list.bin'

# --- Artifact written by this notebook ---------------------------------------
concept_representation_path = f'{data_dir}/{extensionless_input_file_name}_concept_representation.txt'

In [3]:
# Matches text wrapped in <c>...</c> tags; the wrapped text (anything without
# a '<') is exposed through the named group "phrase".
re_concept_tagged = re.compile(r"<c>(?P<phrase>[^<]*)</c>")

In [4]:
model = Word2Vec.load(model_save_path)
# Vocabulary entries that are wrapped in <c>...</c> tags, in the model's index order.
concept_list = [w for w in model.wv.index2word if re_concept_tagged.match(w)]

# Fallback mapping, kept when the score file cannot be loaded.
concept_lowered2score = defaultdict(lambda :1)

try:
    # SECURITY NOTE(review): pickle.load can execute arbitrary code; only point
    # concept_score_path at files from a trusted source.
    # Pickle data is binary, so the file must be opened in 'rb' mode; the
    # `with` block also makes sure the handle is closed.
    with open(concept_score_path, 'rb') as score_file:
        concept_score_list = pickle.load(score_file)

    # Scores are paired with concepts by position.
    concept2score = dict(zip(concept_list[:len(concept_score_list)], concept_score_list))

    # Keep the highest score among concepts that only differ by letter case.
    best_score_by_lowered = {}
    for concept, concept_score in concept2score.items():
        lowered = concept.lower()
        if lowered not in best_score_by_lowered or concept_score > best_score_by_lowered[lowered]:
            best_score_by_lowered[lowered] = concept_score
    concept_lowered2score = best_score_by_lowered
except Exception as e:
    # Loading scores stays best-effort (the notebook keeps running with the
    # fallback mapping), but the failure is reported instead of hidden.
    print('Could not load concept scores from %s (%s: %s); using fallback scores.'
          % (concept_score_path, type(e).__name__, e))

In [5]:
# Vocabulary entries keyed by the lower-cased word. When several words share a
# lower-cased form, the one iterated last wins.
vocab_lower = dict(
    (word.lower(), entry) for word, entry in model.wv.vocab.items()
)
# Lower-cased word -> the word stored in the model at that entry's index.
concept_lower2Concept = dict(
    (lowered, model.wv.index2word[entry.index]) for lowered, entry in vocab_lower.items()
)

In [6]:
# One list entry per line of the superspan sequence file (line endings kept);
# each entry is parsed with json.loads later on.
with open(supersequence_path) as fin:
    supersequences = fin.readlines()

In [7]:
# Any single character that is not an ASCII letter.
re_nonLetter = re.compile('[^a-zA-Z]')

def removeNonLetter(doc, replaceWithSpace=False):
    """Strip every non-ASCII-letter character from `doc`.

    Args:
        doc: the string to clean.
        replaceWithSpace: when truthy, each removed character is replaced by a
            single space; otherwise it is dropped.

    Returns:
        The cleaned string.
    """
    replacement = ' ' if replaceWithSpace else ''
    return re_nonLetter.sub(replacement, doc)

In [8]:
def to_oneWord(w):
    """Turn every space in `w` into an underscore so the phrase is one token."""
    return '_'.join(w.split(' '))

In [9]:
def to_concept_gensim(w):
    """Return `w` as a single token (via to_oneWord) wrapped in <c>...</c> tags."""
    single_token = to_oneWord(w)
    return '<c>%s</c>' % single_token

In [10]:
def get_candidate_list(superspan):
    """List the candidate textual units of one superspan entry.

    Args:
        superspan: dict with a 'tag' key; entries tagged 'superspan' carry a
            'spans' list of dicts with a 'text' key, other entries carry
            'text' directly.

    Returns:
        For a 'superspan' entry, the concept-tagged form of each span's text;
        otherwise a one-element list holding the entry's own text.
    """
    if superspan['tag'] != 'superspan':
        return [superspan['text']]

    candidates = []
    for span in superspan['spans']:
        candidates.append(to_concept_gensim(span['text']))
    return candidates

In [11]:
def word2internal(raw_textual_unit):
    """Convert a raw textual unit to its internal token form.

    Title-cased input (per str.istitle) keeps its casing; anything else is
    lower-cased. Spaces are then replaced by underscores.
    """
    if raw_textual_unit.istitle():
        cased = raw_textual_unit
    else:
        cased = raw_textual_unit.lower()
    return '_'.join(cased.split(' '))

In [12]:
def getNormalizedTextualUnits(superspan):
    """Return the candidate units of `superspan`, each passed through word2internal."""
    return [word2internal(unit) for unit in get_candidate_list(superspan)]

In [13]:
def get_cleaned_superspan_sequence(superspan_sequence):
    """Drop superspans whose 'text' contains no ASCII letter at all.

    Returns a new list; the surviving superspans are returned unmodified and
    in their original order.
    """
    kept = []
    for superspan in superspan_sequence:
        # removeNonLetter yields '' (falsy) when the text has no letters.
        if removeNonLetter(superspan['text']):
            kept.append(superspan)
    return kept

In [14]:
def to_concept_natural(w):
    """Return the natural-text form of a token: <c>/</c> tags removed, underscores as spaces."""
    untagged = re.sub(r'</?c>', '', w)
    return untagged.replace('_', ' ')

In [15]:
def to_concept_natural_lower(w):
    """Lower-case `w`, then convert it to natural text with to_concept_natural."""
    lowered = w.lower()
    return to_concept_natural(lowered)

In [16]:
def getIsDominatedScore(sequence, superspan_sequence, model=model):
    """Count how often a chosen concept is "dominated" inside its superspan.

    For every concept-tagged unit in `sequence`, the model's most similar
    words are filtered down to those whose similarity exceeds
    BASIC_THRESHOLD and whose natural-text form contains the concept's
    natural-text form. The candidate units of the matching superspan that
    appear among those neighbours are counted.

    Args:
        sequence: chosen textual units, aligned with `superspan_sequence`.
        superspan_sequence: the superspans the units were chosen from.
        model: word2vec model used for the similarity query.

    Returns:
        The total count over all concept-tagged units. A unit whose lookup
        fails contributes nothing.

    NOTE(review): TOPN, restrict_vocab and BASIC_THRESHOLD are not defined in
    the visible cells, and `partition_only` does not look like a stock gensim
    `most_similar` argument - confirm they exist in the environment. If they
    do not, the resulting error is swallowed below and this score is always 0.
    """
    score = 0
    for concept, superspan in zip(sequence, superspan_sequence):
        if not re_concept_tagged.match(concept):
            continue

        concept = concept.lower()
        try:
            # Fix: query the concept being scored. The code used to reference
            # an undefined name (`concept1`), so every lookup failed.
            vocabulary_word = model.wv.index2word[vocab_lower[concept].index]
            concept_natural = to_concept_natural_lower(concept)

            covered_neighbor_word2sim = {
                covered_concept.lower(): sim
                for covered_concept, sim in model.most_similar(
                    vocabulary_word, topn=TOPN, restrict_vocab=restrict_vocab, partition_only=True)
                if sim > BASIC_THRESHOLD
                and concept_natural in to_concept_natural_lower(covered_concept)
            }

            # A set, so a candidate listed twice is only counted once.
            covered_concepts = {
                other_concept
                for other_concept in getNormalizedTextualUnits(superspan)
                if other_concept.lower() in covered_neighbor_word2sim
            }
            score += len(covered_concepts)
        except Exception as e:
            # Best effort: e.g. the concept is missing from the vocabulary.
            continue

    return score

In [17]:
def getNormalizedLengthScore(sequence, superspan_sequence):
    """Sum, over aligned pairs, the fraction of the superspan's words covered by the chosen unit.

    Args:
        sequence: chosen textual units, aligned with `superspan_sequence`.
        superspan_sequence: superspan dicts; each one's 'text' is split on
            whitespace to get its word count.

    Returns:
        The sum of (words in unit) / (words in superspan text).
    """
    score = 0.0
    for w, superspan in zip(sequence, superspan_sequence):
        # to_concept_natural_lower already turns underscores into spaces, so
        # words must be counted by splitting on whitespace. Splitting on '_'
        # always produced a single piece.
        unit_word_count = len(to_concept_natural_lower(w).split())
        superspan_word_count = len(superspan['text'].split())
        if superspan_word_count == 0:
            # Whitespace-only superspan text: nothing to normalise against.
            continue
        score += unit_word_count / float(superspan_word_count)
    return score

In [18]:
def getConceptQualityScore(sequence):
    """Add up the quality scores of the units in `sequence`.

    Each unit is looked up by its lower-cased form in concept_lowered2score;
    units without an entry count as 0.
    """
    total = 0
    for w in sequence:
        total += concept_lowered2score.get(w.lower(), 0)
    # todo: add length rewards
    return total

In [19]:
def normalize(array):
    """Min-max scale `array` into the range [0, 1].

    Args:
        array: a numpy array, or any sequence of numbers (converted to a
            float array).

    Returns:
        A numpy array of the same shape. A constant input maps to all zeros:
        machine epsilon is added to the denominator so the division is
        always defined. An empty input yields an empty float array.
    """
    if not isinstance(array, np.ndarray):
        # The `np.float` alias was removed in NumPy 1.24; the builtin `float`
        # is the same dtype.
        array = np.array(array, dtype=float)
    if array.size == 0:
        # np.min / np.max raise on empty input; there is nothing to scale.
        return array.astype(float)
    lowest = np.min(array)
    value_range = np.max(array) - lowest
    return (array - lowest) / (value_range + np.finfo(float).eps)

In [20]:
def getEndsWithScore(sequence, superspan_sequence):
    """Count the chosen units whose last word is also the ending of their superspan's text.

    Both sides are compared in lower-cased natural-text form
    (to_concept_natural_lower).

    Args:
        sequence: chosen textual units, aligned with `superspan_sequence`.
        superspan_sequence: superspan dicts with a 'text' key.

    Returns:
        The number of aligned pairs for which the superspan text ends with
        the unit's last word.
    """
    score = 0
    for span, superspan in zip(sequence, superspan_sequence):
        # The natural form separates words with spaces (underscores are
        # already replaced), so the last word comes from a whitespace split.
        # Splitting on '_' returned the whole phrase.
        span_words = to_concept_natural_lower(span).split()
        # An empty unit keeps its previous behaviour: '' is a suffix of
        # every string.
        last_word = span_words[-1] if span_words else ''
        if to_concept_natural_lower(superspan['text']).endswith(last_word):
            score += 1
    return score

In [21]:
# Weight of the is-dominated score in the combined score (negative: a penalty).
IS_DOMINATED_COEFF = -20

def select_best(superspan_sequence, ALPHA = .5, BETA = .51, GAMMA = 10, model=model):
    """Pick the highest-scoring combination of textual units for a superspan sequence.

    Every combination made of one candidate unit per superspan is enumerated
    and scored as

        model score + ALPHA * quality + BETA * length + GAMMA * ends-with
        + IS_DOMINATED_COEFF * is-dominated

    where each component is min-max normalised across the combinations.

    Args:
        superspan_sequence: non-empty list of superspan dicts.
        ALPHA: weight of the concept quality score.
        BETA: weight of the normalised length score.
        GAMMA: weight of the ends-with score.
        model: word2vec model used for `model.score` and the is-dominated score.

    Returns:
        The best combination as a list of units, one per superspan. On ties
        the combination enumerated first wins.
    """
    # Enumerate all combinations position by position. Only the previous level
    # is needed to build the next one, so earlier levels are not retained.
    possible_sequences = [[span] for span in getNormalizedTextualUnits(superspan_sequence[0])]
    for current_superSpan in superspan_sequence[1:]:
        # todo: take in non overlapping textual units within same superspan
        possible_sequences = [
            previous_possible_sequence + [span]
            for span in getNormalizedTextualUnits(current_superSpan)
            for previous_possible_sequence in possible_sequences
        ]

    model_scores = normalize(model.score(possible_sequences))

    # is add, because is computing negative log likelihood

    # if two words are similar in the same span, the contained words are dominated and will not be selected
    concept_quality_scores = normalize(
        [getConceptQualityScore(possible_sequence) for possible_sequence in possible_sequences])
    concept_length_scores = normalize(
        [getNormalizedLengthScore(possible_sequence, superspan_sequence) for possible_sequence in possible_sequences])
    concept_endswith_scores = normalize(
        [getEndsWithScore(possible_sequence, superspan_sequence) for possible_sequence in possible_sequences])
    concept_is_dominated_scores = normalize(
        [getIsDominatedScore(possible_sequence, superspan_sequence, model) for possible_sequence in possible_sequences])

    scores = (
        model_scores
        + ALPHA * concept_quality_scores
        + BETA * concept_length_scores
        + GAMMA * concept_endswith_scores
        + IS_DOMINATED_COEFF * concept_is_dominated_scores
    )

    # np.argmax returns the first maximum, which is what a stable descending
    # sort followed by [0] would pick, without sorting every combination.
    return possible_sequences[int(np.argmax(scores))]

In [22]:
ALPHA = .5
BETA = .51
GAMMA = 10
# Upper bound on the number of unit combinations scored in one select_best call.
MAX_CHOICES = 5000

def process_superspan_sequence(superspan_sequence, ALPHA=.5, BETA=.51, GAMMA=10, model=model):
    """Segment a superspan sequence into its best-scoring textual units.

    Superspans without any ASCII letter are dropped first. The remaining
    sequence is cut into consecutive chunks: a chunk is closed as soon as the
    product of its superspans' candidate counts reaches MAX_CHOICES. Each
    chunk is resolved independently with select_best.

    Args:
        superspan_sequence: list of superspan dicts for one input line.
        ALPHA, BETA, GAMMA: scoring weights forwarded to select_best.
        model: word2vec model forwarded to select_best.

    Returns:
        The selected units of all chunks, joined by single spaces.
    """
    superspan_sequence = get_cleaned_superspan_sequence(superspan_sequence)
    recognized_spans = []

    current_start = 0
    num_current_choices = 1

    for i, current_superSpan in enumerate(superspan_sequence):
        # Number of combinations in the chunk if this superspan is included.
        num_current_choices *= len(getNormalizedTextualUnits(current_superSpan))

        # if MAX_CHOICES is reached
        if num_current_choices >= MAX_CHOICES:
            # Resolve the chunk ending here and start a new one after it.
            recognized_spans += select_best(
                superspan_sequence[current_start:i + 1],
                ALPHA=ALPHA, BETA=BETA, GAMMA=GAMMA, model=model)
            current_start = i + 1
            num_current_choices = 1

    remaining_superspans = superspan_sequence[current_start:]
    if remaining_superspans:
        # Fix: forward the caller's weights here as well. The trailing chunk
        # used to be scored with select_best's own defaults.
        recognized_spans += select_best(
            remaining_superspans,
            ALPHA=ALPHA, BETA=BETA, GAMMA=GAMMA, model=model)

    return ' '.join(recognized_spans)

In [23]:
# Positions of the input lines to process (all of them).
indexes = range(len(supersequences))

model.init_sims()

# Segment each superspan sequence and write one result line per input line.
with open(concept_representation_path, 'w') as f:
    for i in tqdm(indexes):
        segmentation = process_superspan_sequence(
            json.loads(supersequences[i]),
            model=model,
        )
        f.write(segmentation + '\n')

100%|██████████| 10000/10000 [01:00<00:00, 164.98it/s]
