In [1]:
import gensim
from gensim.models import word2vec, Word2Vec
import itertools
import json
import re
from smart_open import smart_open
import spacy

In [2]:
# Maximum positional distance between two superspans that get paired up by
# LineSuperWordSequenceAsWordPair; the same value is passed to Word2Vec as `window`.
WINDOW_SIZE = 5

In [3]:
# Matches a token that starts with a concept wrapper "<c>...</c>"; the wrapped
# text (anything without a '<') is exposed as the named group `phrase`.
re_concept_tagged = re.compile(r"<c>(?P<phrase>[^<]*)</c>")

In [4]:
# Load a spaCy pipeline through the 'en' shortcut name.
# NOTE(review): `_nlp` is not referenced by any of the visible cells — confirm it is still needed.
# NOTE(review): the 'en' shortcut presumably relies on a spaCy 2.x model link; newer spaCy
# versions expect a full package name such as 'en_core_web_sm' — verify against the installed version.
_nlp = spacy.load('en')

In [5]:
def to_oneWord(w):
    """Collapse a phrase into a single token.

    Every space character in ``w`` is replaced by an underscore; all other
    characters are left untouched.
    """
    pieces = w.split(' ')
    return '_'.join(pieces)

In [6]:
def word2internal(raw_textual_unit):
    """Normalize a textual unit into the internal vocabulary form.

    Title-cased input (per ``str.istitle``) keeps its casing; anything else is
    lower-cased. In both cases spaces are then replaced with underscores.
    """
    if raw_textual_unit.istitle():
        cased = raw_textual_unit
    else:
        cased = raw_textual_unit.lower()
    return cased.replace(' ', '_')

In [7]:
def trim_rule(word, count, min_count):
    """Vocabulary trim rule handed to Word2Vec.

    Tokens that start with a ``<c>...</c>`` concept wrapper are always kept;
    every other token falls back to gensim's default handling. ``count`` and
    ``min_count`` are accepted to satisfy the callback signature but unused.
    """
    is_concept = re_concept_tagged.match(word) is not None
    return gensim.utils.RULE_KEEP if is_concept else gensim.utils.RULE_DEFAULT

In [8]:
def to_concept_gensim(w):
    """Wrap ``w`` in ``<c>...</c>`` after joining its words with underscores."""
    single_token = to_oneWord(w)
    return '<c>%s</c>' % single_token

In [9]:
def to_concept_natural_lower(w):
    """Lower-case ``w`` and pass the result to ``to_concept_natural``.

    NOTE(review): ``to_concept_natural`` is not defined in any of the visible
    cells; unless it is provided elsewhere, calling this raises NameError.
    """
    lowered = w.lower()
    return to_concept_natural(lowered)

In [10]:
def get_candidate_list(superspan):
    """Return the candidate textual units for one superspan record.

    A record whose ``'tag'`` is ``'superspan'`` yields one concept-wrapped
    candidate per entry of ``superspan['spans']``; any other record yields a
    single-element list holding its own ``'text'``.
    """
    if superspan['tag'] != 'superspan':
        return [superspan['text']]

    candidates = []
    for span in superspan['spans']:
        candidates.append(to_concept_gensim(span['text']))
    return candidates

In [11]:
def getNormalizedTextualUnits(superspan):
    """Return the superspan's candidates, each normalized by ``word2internal``."""
    return list(map(word2internal, get_candidate_list(superspan)))

In [12]:
# Any single character that is not an ASCII letter.
re_nonLetter = re.compile('[^a-zA-Z]')

def removeNonLetter(doc, replaceWithSpace=False):
    """Strip every non-ASCII-letter character from ``doc``.

    When ``replaceWithSpace`` is true each such character is replaced by a
    single space; otherwise it is removed outright.
    """
    filler = ' ' if replaceWithSpace else ''
    return re_nonLetter.sub(filler, doc)

In [13]:
def get_list_of_candidateLists(superspan_sequence):
    superspan_sequence_removed_letters = [superspan for superspan in superspan_sequence if removeNonLetter(superspan['text'])]
    return [getNormalizedTextualUnits(superspan) for superspan in superspan_sequence_removed_letters]

In [14]:
class LineSuperWordSequenceAsWordPair(object):
    """Restartable iterable of two-token "sentences" built from a JSON-lines file.

    Each line of ``source`` is parsed as JSON and converted with
    ``get_list_of_candidateLists``. Every position is then paired with each of
    the following positions up to ``WINDOW_SIZE`` away, and for each such pair
    of positions every combination of their candidates is yielded as
    ``[earlier_candidate, later_candidate]``.

    Args:
        source: path/URI handed to ``smart_open``.
        limit: maximum number of lines to read; ``None`` reads all lines.
    """

    def __init__(self, source, limit=None):
        self.limit = limit
        self.source = source

    def __iter__(self):
        with smart_open(self.source) as fin:
            for line in itertools.islice(fin, self.limit):
                candidate_lists = get_list_of_candidateLists(json.loads(line))
                total = len(candidate_lists)
                for i, left_candidates in enumerate(candidate_lists):
                    # Look ahead only; never past the end of the line.
                    window_end = min(i + WINDOW_SIZE + 1, total)
                    for j in range(i + 1, window_end):
                        right_candidates = candidate_lists[j]
                        for left, right in itertools.product(left_candidates, right_candidates):
                            yield [left, right]

In [15]:
# Input/output locations, all relative to `data_dir`.
data_dir = 'data'
input_file_name = 'arxiv_abstracts_10000.txt'
# Everything before the first '.' in the file name.
extensionless_input_file_name = input_file_name.split('.')[0]
input_file_path = f'{data_dir}/{input_file_name}'

# JSON-lines file of superspan sequences consumed by LineSuperWordSequenceAsWordPair.
supersequence_path = f'{data_dir}/{extensionless_input_file_name}_superspan_sequence.json'

# Where the trained model is written.
model_save_path = f'{data_dir}/{extensionless_input_file_name}_embedding.bin'

# Train Word2Vec on the generated word pairs; `trim_rule` keeps <c>...</c> tokens
# regardless of `min_count`.
# NOTE(review): `iter=` is presumably the gensim 3.x spelling (renamed `epochs` in gensim 4) —
# confirm against the installed version. `workers=32` is hard-coded; verify it suits the host.
file = supersequence_path
model = Word2Vec(LineSuperWordSequenceAsWordPair(file), min_count=30, window=WINDOW_SIZE, sg=1, iter=5, workers=32, hs=1, negative=0, trim_rule=trim_rule)

model.save(model_save_path)