In [None]:
import logging
# Emit log records at INFO level and above, prefixed with a timestamp and the level name.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
import csv

def load_csv(filepath, encoding='utf-8'):
    """Read a CSV file with a header row into a list of row dictionaries.

    Args:
        filepath: Path of the CSV file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        list: One dictionary per data row, keyed by the header names.
    """
    # newline='' leaves newline handling to the csv module, so line breaks
    # inside quoted fields are preserved.
    with open(filepath, newline='', encoding=encoding) as csvfile:
        return list(csv.DictReader(csvfile))

In [None]:
# Load all rows of the source CSV; later cells read the 'text' and 'title' columns of each row.
csv_data = load_csv('../data/BBairline200722_coreffed.csv')

In [None]:
# Code point of the plain ASCII apostrophe that every quote variant is mapped to.
single_quote_unicode = ord("'")
# The backtick and the curly single/double quotes are all replaced by the apostrophe.
translation_table_text = str.maketrans(
    dict.fromkeys(('`', '‘', '’', '“', '”'), single_quote_unicode)
)

# Quote-normalised texts and the unmodified titles, both in CSV row order.
corpus_texts_full = [row['text'].translate(translation_table_text) for row in csv_data]
corpus_titles_full = [row['title'] for row in csv_data]
    

In [None]:
import spacy
from spacy.language import Language
from spacy.lang.en import STOP_WORDS

# NOTE(review): requires the "en_core_web_lg" model package to be installed in the kernel's environment.
nlp = spacy.load("en_core_web_lg")
# Append the "merge_entities" component to the pipeline (per the spaCy docs it merges each named entity into a single token).
nlp.add_pipe("merge_entities")
# nlp.add_pipe("merge_noun_chunks")
# Display the resulting pipeline component names as the cell output.
nlp.pipe_names

In [None]:
import tqdm
from typing import NamedTuple

# Maps each tag key to a dense integer id, assigned in first-seen order.
tag_id_map = {}

def update_tag_id_map(keys):
    """Register every unseen key in ``tag_id_map`` under the next free id.

    Args:
        keys: Iterable of hashable tag keys. Keys that are already present
            keep their existing id.
    """
    for key in keys:
        # Membership test rather than ``.get(key) == None``: it states the
        # intent directly and avoids an equality comparison against None.
        if key not in tag_id_map:
            # Ids are dense: the next id is always the current map size.
            tag_id_map[key] = len(tag_id_map)


class TokenData(NamedTuple):
    """Plain snapshot of one token.

    Each field holds the same-named attribute of a pipeline token, as copied
    in ``doc2tokens``. Field order is part of the tuple layout.
    """
    text: str  # token.text
    lower_: str  # token.lower_; used as the training word in process_tokens_doc
    lemma_: str  # token.lemma_
    ent_type_: str  # token.ent_type_
    pos_: str  # token.pos_
    tag_: str  # token.tag_
    is_punct: bool  # tokens with this flag set are dropped in process_tokens_doc
    is_space: bool  # tokens with this flag set are dropped in process_tokens_doc
    is_stop: bool  # token.is_stop


def corpus2tokens(corpus_text, *args, **kwargs):
    """Run every text through the spaCy pipeline and convert each result to a token record.

    Args:
        corpus_text: Iterable of raw document strings.
        *args: Extra positional arguments forwarded unchanged to ``nlp.pipe``.
        **kwargs: Extra keyword arguments forwarded unchanged to ``nlp.pipe``.

    Returns:
        list: One ``doc2tokens`` record per document; each ``doc_id`` is the
        document's position in ``corpus_text``.
    """
    # ``import tqdm`` alone does not load the ``tqdm.notebook`` submodule, so
    # import the progress bar explicitly instead of relying on another
    # package having imported it as a side effect.
    from tqdm.notebook import tqdm as notebook_tqdm

    nlp_pipe = nlp.pipe(notebook_tqdm(corpus_text), *args, **kwargs)
    return [doc2tokens(doc_id, doc) for doc_id, doc in enumerate(nlp_pipe)]


def doc2tokens(doc_id, doc):
    """Register ``doc_id`` as a tag and snapshot every token of ``doc``.

    Returns:
        dict: The document id under ``'doc_id'`` and, under ``'tokens'``,
        one ``TokenData`` per token in document order.
    """
    update_tag_id_map((doc_id,))

    token_records = []
    for tok in doc:
        record = TokenData(
            text=tok.text,
            lower_=tok.lower_,
            lemma_=tok.lemma_,
            ent_type_=tok.ent_type_,
            pos_=tok.pos_,
            tag_=tok.tag_,
            is_punct=tok.is_punct,
            is_space=tok.is_space,
            is_stop=tok.is_stop,
        )
        token_records.append(record)

    return {'doc_id': doc_id, 'tokens': token_records}

# def doc2tokens(doc_id, doc):
#     return [sent2tokens(doc_id, sent_id, sent) for sent_id, sent in enumerate(doc.sents)]


# def sent2tokens(doc_id, sent_id, sent):
#     compound_sent_id = (doc_id, sent_id,)
#     update_tag_id_map((doc_id, compound_sent_id,))
#     return {
#         'sent_id': compound_sent_id,
#         'tokens': [
#             TokenData(
#                 text=token.text,
#                 lower_=token.lower_,
#                 lemma_=token.lemma_,
#                 ent_type_=token.ent_type_,
#                 pos_=token.pos_,
#                 tag_=token.tag_,
#                 is_punct=token.is_punct,
#                 is_space=token.is_space,
#             )
#             for token in sent
#         ]
#     }

In [None]:
%%time
# Run every normalised text through the spaCy pipeline and keep one token record per document.
corpus_token_objects = corpus2tokens(corpus_texts_full)

In [None]:
# Inspect the record produced for the first document.
corpus_token_objects[0]

In [None]:
# Print a (text, entity type) pair for every token of the first document.
for token in corpus_token_objects[0]['tokens']:
    print((token.text, token.ent_type_))

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# def gen_tagged_docs_by_sent(token_objects, process_tokens_func, process_tags_func):
#     tagged_docs = []
#     for doc in token_objects:
#         for sent in doc:
#             sent_tokens = process_tokens_func(sent)
#             sent_tags = process_tags_func(sent)
#             tagged_docs.append(TaggedDocument(sent_tokens, sent_tags))
#     return tagged_docs

def gen_tagged_docs_by_doc(token_objects, process_tokens_func, process_tags_func):
    """Build one ``TaggedDocument`` per document record.

    Args:
        token_objects: Iterable of per-document records.
        process_tokens_func: Callable mapping a record to its words.
        process_tags_func: Callable mapping a record to its tags.

    Returns:
        list: ``TaggedDocument`` instances in the same order as ``token_objects``.
    """
    return [
        TaggedDocument(process_tokens_func(record), process_tags_func(record))
        for record in token_objects
    ]

In [None]:
# def process_tokens_sent(sent):
#     return [token.lower_ for token in sent['tokens'] if not (token.is_punct or token.is_space)]

# def process_tags_sent(sent):
#     doc_id, sent_id = sent['sent_id']
#     return [tag_id_map[doc_id], tag_id_map[doc_id, sent_id]]

def process_tokens_doc(doc):
    """Return the lower-cased text of every token in ``doc['tokens']`` that is neither punctuation nor whitespace."""
    kept_words = []
    for tok in doc['tokens']:
        if not tok.is_punct and not tok.is_space:
            kept_words.append(tok.lower_)
    return kept_words

def process_tags_doc(doc):
    """Return a one-element list holding the integer id registered in ``tag_id_map`` for this document."""
    doc_key = doc['doc_id']
    return [tag_id_map[doc_key]]


In [None]:
# corpus_full = gen_tagged_docs_by_sent(corpus_token_objects, process_tokens_sent, process_tags_sent)

In [None]:
# Build the training corpus: one TaggedDocument per document record, using the per-document token and tag extractors.
corpus_full = gen_tagged_docs_by_doc(corpus_token_objects, process_tokens_doc, process_tags_doc)

In [None]:
# Spot-check a single tagged document (raises IndexError if the corpus has fewer than 49 documents).
corpus_full[48]

In [None]:
# https://groups.google.com/g/gensim/c/6JmSsx4iIv0
# projects with larger vocabularies tend to lean more towards negative-sampling than hierarchical-softmax
# VERY NB - https://stackoverflow.com/a/37502976/1782641
# https://radimrehurek.com/gensim/models/doc2vec.html
model = Doc2Vec(
    vector_size=300,  # dimensionality of the learned vectors
    epochs=200,  # stored on the model and reused by the model.train() call in a later cell
    min_count=10,  # per the gensim docs: words with a lower total frequency are ignored
    window=10,  # per the gensim docs: maximum distance between current and predicted word
    hs=0,  # hierarchical softmax disabled, so negative sampling is used
    negative=20,  # per the gensim docs: number of noise words drawn for negative sampling
    sample=1e-3,  # per the gensim docs: threshold for downsampling high-frequency words
    workers=3  # 64
)

In [None]:
%%time
# Build the model vocabulary from the tagged corpus; this must happen before training.
model.build_vocab(corpus_full)

In [None]:
# Sanity check on the built vocabulary.
# NOTE(review): presumably fails if 'airport' did not survive the min_count filter — confirm.
print(f"Word 'airport' appeared {model.wv.get_vecattr('airport', 'count')} times in the full corpus.")

In [None]:
%%time
# Train on the full corpus, reusing the example count and epoch count already stored on the model.
model.train(corpus_full, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# Persist the trained model to the current working directory.
model.save("./doc2vec.model")

In [None]:
# Keep a handle on the model's word vectors and save them to their own file, separate from the full model.
wv = model.wv
wv.save('./doc2vec.wv')

In [None]:
def corpus_to_dicts(corpus):
    """Lazily yield one ``{'tokens': ..., 'tags': ...}`` dict per document.

    Args:
        corpus: Iterable of objects exposing ``words`` and ``tags`` attributes.

    Yields:
        dict: The document's ``words`` under ``'tokens'`` and its ``tags`` under ``'tags'``.
    """
    yield from (
        {'tokens': tagged_doc.words, 'tags': tagged_doc.tags}
        for tagged_doc in corpus
    )

In [None]:
import simplejson


def json_save(data, filename):
    """Write ``data`` to ``filename`` as compact JSON in a UTF-8 text file.

    Args:
        data: Object to serialise. Because ``iterable_as_array=True`` is
            passed, arbitrary iterables are written as JSON arrays.
        filename: Destination path; an existing file is overwritten.
    """
    # No whitespace after separators keeps the output as small as possible.
    compact_separators = (',', ':')
    with open(filename, mode='w', encoding='utf-8') as out_file:
        simplejson.dump(
            data,
            out_file,
            separators=compact_separators,
            iterable_as_array=True,
        )

In [None]:
# Export the training corpus as JSON; the generator is accepted because json_save passes iterable_as_array=True.
json_save(corpus_to_dicts(corpus_full), './doc2vec.corpus.full.json')

In [None]:
# Keys in insertion order: position i holds the key that was assigned id i, because ids are
# handed out as len(tag_id_map) at insertion time. This is a live dict view, not a list copy.
id_tag_map = tag_id_map.keys()

In [None]:
# Persist the id -> tag lookup; the keys view is written as a JSON array via iterable_as_array=True.
json_save(id_tag_map, './doc2vec.id_tag_map.json')

In [None]:
# Persist the raw per-document token records.
# NOTE(review): the records contain TokenData namedtuples — confirm how simplejson serialises them (objects vs arrays).
json_save(corpus_token_objects, './doc2vec.corpus_token_objects.json')