In [None]:
import logging
# Configure root logging at INFO level; each record shows timestamp, level name and message.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
import simplejson

def json_load(filename):
    """Read a UTF-8 encoded JSON file and return the decoded object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever `simplejson.load` decodes from the file.
    """
    with open(filename, mode='r', encoding='utf-8') as handle:
        parsed = simplejson.load(handle)
    return parsed

In [None]:
%%time
# Placeholder; reassigned with the real per-document token lists at the end of this cell.
corpus_token_objects = []

# Start with 'fine-grained-ner.2021-02-11' on 'BBairline200722_coreffed';
# compare the other dataset/model combinations listed below later.
# datasets = ('BBairline200722', 'BBairline200722_coreffed')
# predictor_models = ('ner-model-2020.02.10', 'fine-grained-ner.2021-02-11', 'fgner-transformer.2021-02-11',)
# `dataset` selects the input file `merged_ents-{dataset}.json`;
# `predictor` selects which top-level key of that file is read.
dataset = 'BBairline200722_coreffed'
predictor = 'fine-grained-ner.2021-02-11'
# possible_ents = [
#     'TIME',
#     'ORDINAL',
#     'MONEY',
#     'DATE',
#     'PERSON',
#     'WORK_OF_ART',
#     'NORP',
#     'ORG',
#     'QUANTITY',
#     'EVENT',
#     'LANGUAGE',
#     'LOC',
#     'GPE',
#     'O',
#     'PERCENT',
#     'FAC',
#     'LAW',
#     'CARDINAL',
#     'PRODUCT'
# ]

# Manual entity-type overrides, keyed by lower-cased token text.
# Built once at definition time instead of on every call.
_HARDCODED_ENTS = {
    # pandemic vocabulary
    'covid': 'MISC',
    'coronavirus': 'MISC',
    'delta': 'MISC',
    'vaccinated': 'MISC',
    'vaccination': 'MISC',
    'vaccinations': 'MISC',
    'vaccine': 'MISC',
    'virus': 'MISC',
    'quarantine': 'MISC',
    'lockdown': 'MISC',
    'mutation': 'MISC',
    'mutations': 'MISC',
    'variant': 'MISC',
    'variants': 'MISC',
    # products / organisations
    'quicktake': 'PRODUCT',
    'ryanair': 'ORG',
    # countries and states
    'united-states': 'GPE',
    'united-kingdom': 'GPE',
    'germany': 'GPE',
    'belarus': 'GPE',
    'hawaii': 'GPE',
    'israel': 'GPE',
    'russia': 'GPE',
    # continents / regions
    'europe': 'LOC',
    'africa': 'LOC',
    'asia': 'LOC',
    # nationalities and groups
    'belarusian': 'NORP',
    'american': 'NORP',
    'european': 'NORP',
    'union': 'NORP',
}


def get_corrected_ent_type(token, current_ent_type):
    """Replace the predicted entity type with a hard-coded one for known tokens.

    The override table is keyed by lower-cased text and the lookup is
    case-insensitive, so both 'belarusian' (the form produced by `normalize`)
    and 'Belarusian' resolve to the same override.

    Args:
        token: Token text to look up. It is returned unchanged.
        current_ent_type: Entity type to fall back on when there is no override.

    Returns:
        tuple: `(token, ent_type)` where `ent_type` is the override if one
        exists for `token`, otherwise `current_ent_type`.
    """
    # Non-string tokens cannot be lower-cased; look them up as-is so they
    # simply fall through to `current_ent_type`.
    lookup_key = token.lower() if isinstance(token, str) else token
    return (token, _HARDCODED_ENTS.get(lookup_key) or current_ent_type,)

def normalize(token, ent):
    """Lower-case and clean one (possibly multi-word) token.

    Cleaning steps, in order: '&' becomes 'and'; contraction suffixes
    ('s, 'd, 'll, 're, 'm, 've) are dropped; "n't" becomes 'not'; dashes
    become spaces; 'u.k.' / 'u.s.' and covid spellings are canonicalised;
    remaining punctuation (. ' " , : ?) becomes spaces; a standalone 'u s'
    becomes 'united-states'; whitespace runs are collapsed.

    Args:
        token: Raw token text.
        ent: Entity type attached to the token; copied onto every result.

    Returns:
        list[tuple[str, str]]:
            * `[]` when `token` is a single character;
            * one `(word, ent)` tuple per word when the cleaned text is
              longer than 100 characters, or longer than 20 characters with
              `ent` being 'WORK_OF_ART' or 'O';
            * otherwise a single `(cleaned_text, ent)` tuple. The cleaned
              text may be the empty string.
    """
    # Function-scope import keeps this definition self-contained within its cell.
    import re

    if len(token) == 1:
        return []

    text = token.lower().replace("&", 'and')

    # Strip contraction suffixes only where they end a word. A plain substring
    # replace also fired mid-word ("o'donnell" -> "o onnell"), and the old "'v"
    # rule left a stray "e" behind ("we've" -> "we e").
    text = re.sub(r"'(?:s|d|ll|re|m|ve?)\b", ' ', text)

    text = (
        text
        .replace("n't", 'not')
        .replace("-", ' ')
        .replace("—", ' ')
        # These run after dash removal, so the hyphen in the canonical
        # 'united-kingdom' / 'united-states' forms survives.
        .replace("u.k.", 'united-kingdom')
        .replace("u.s.", 'united-states')
        .replace("covid19", 'covid')
        .replace("covid 19", 'covid')
    )

    for punctuation in ('.', "'", '"', ',', ':', '?'):
        text = text.replace(punctuation, ' ')

    # Catch 'u.s' written without the trailing dot (now 'u s'). Word boundaries
    # are required: a substring replace turned "you said" into
    # "younited-statesaid".
    text = re.sub(r"\bu s\b", 'united-states', text)

    normalized_token = ' '.join(text.split())

    # Very long "tokens" are treated as several tokens incorrectly merged
    # upstream and are split back into single words. Sample cleaned values seen
    # in this corpus:
    # >> more state department warns of passport backlog of up to 18 weeks sara dahiya <<
    # >> megaphone fm the labor episode how omni hotels and resorts <<
    # >> pic twitter com/l5ggijtdga <<
    # >> https //t co/eboys3vm65#roadtocarbonneutrality <<
    # >> key developments big read attending her last g 7 <<
    # >> odd lots tracy alloway tracy alloway <<
    # >> ryanair posts record loss expects to break even this year <<
    # >> psmith@bloomberglaw com <<
    # >> https //apnews com/hub/coronavirus pandemic <<
    # >> united-states capitol riot why presidential pardons are normal <<
    looks_merged = len(normalized_token) > 100 or (
        len(normalized_token) > 20 and ent in ('WORK_OF_ART', 'O')
    )
    if looks_merged:
        return [(word, ent,) for word in normalized_token.split()]
    return [(normalized_token, ent,)]

def corpus2tokens(corpus):
    """Turn a corpus of (token, entity_type) pairs into cleaned token lists.

    Each raw pair is passed through `normalize`, which may return zero, one
    or several pairs (several when an over-long token is split back into
    words). Empty results and anything containing '@', '/' or 'http' are
    dropped; every remaining pair is passed through `get_corrected_ent_type`.

    Args:
        corpus: Iterable of documents, each an iterable of (token, ent) pairs.

    Returns:
        list[list[tuple]]: One list per input document, in input order,
        holding the values returned by `get_corrected_ent_type`.
    """
    # Substrings that mark a normalized token as a URL fragment, handle,
    # e-mail address or slash expression. Sample values dropped by this filter:
    # >> august / september DATE <<
    # >> kremlin pess service / handout / anadolu agency ORG <<
    # >> com/l5ggijtdga O <<
    # >> https O <<
    # >> //t O <<
    # >> co/h4cieiuzec@a4europe O <<
    # >> @realdonaldtrump O <<
    # >> 6kg / kwh QUANTITY <<
    # >> 9/11 DATE <<
    # >> 24/7 CARDINAL <<
    # >> astrazeneca / oxford ORG <<
    # >> and/or O <<
    # >> psmith@bloomberglaw O <<
    unwanted_markers = ['@', '/', 'http']

    corpus_tokens = []
    for document in corpus:
        kept = []
        for raw_token, raw_ent in document:
            for text, ent_type in normalize(raw_token, raw_ent):
                if not text:
                    continue
                if any(marker in text for marker in unwanted_markers):
                    continue
                kept.append(get_corrected_ent_type(text, ent_type))
        corpus_tokens.append(kept)
    return corpus_tokens


# Load `merged_ents-{dataset}.json` (relative to the working directory), take the
# documents stored under the chosen predictor key, and convert them to
# per-document lists of (token, entity_type) tuples.
corpus_token_objects = corpus2tokens(json_load(f'merged_ents-{dataset}.json')[predictor])

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


def gen_tagged_docs_by_doc(token_objects, process_tokens_func=None, process_tags_func=None):
    """Build one `TaggedDocument` per document, tagged with its position.

    Args:
        token_objects: Sequence of documents, each an iterable of
            (token, ent_type) pairs. The entity type is discarded.
        process_tokens_func: Unused. Kept so existing three-argument calls
            still work; now optional so callers need not pass a placeholder.
        process_tags_func: Unused; see `process_tokens_func`.

    Returns:
        list[TaggedDocument]: For the document at index `i`, the words are its
        tokens in order and the tags are `[i]`.
    """
    return [
        TaggedDocument([token for token, _ent_type in doc], [doc_id])
        for doc_id, doc in enumerate(token_objects)
    ]

In [None]:
# corpus_full = gen_tagged_docs_by_sent(corpus_token_objects, process_tokens_sent, process_tags_sent)

In [None]:
# gen_tagged_docs_by_doc accepts two callback arguments but never uses them, and
# no visible cell defines `process_tokens_doc` / `process_tags_doc`. Passing those
# names raises NameError on a fresh kernel, so pass explicit placeholders instead.
corpus_full = gen_tagged_docs_by_doc(corpus_token_objects, None, None)

In [None]:
# Spot-check one tagged document. Needs at least 490 documents in the corpus.
# NOTE(review): index 489 looks like an arbitrary sample — confirm.
corpus_full[489]

In [None]:
# Background reading for the hyper-parameters below:
# https://groups.google.com/g/gensim/c/6JmSsx4iIv0
# projects with larger vocabularies tend to lean more towards negative-sampling than hierarchical-softmax
# VERY NB - https://stackoverflow.com/a/37502976/1782641
# https://radimrehurek.com/gensim/models/doc2vec.html
#
# The model is only configured here; the vocabulary is built and training is
# run in later cells.
model = Doc2Vec(
    vector_size=300,  # dimensionality of the learned vectors
    epochs=200,  # stored on the model and reused as `model.epochs` when training
    min_count=10,  # per the gensim docs, words seen fewer times than this are ignored
    window=10,  # context window size
    hs=0,  # hierarchical softmax off ...
    negative=20,  # ... negative sampling on, per the note above
    sample=1e-3,  # down-sampling threshold for frequent words (see gensim docs)
    workers=3  # 64  NOTE(review): 64 looks like an alternative worker count for a larger machine — confirm
)

In [None]:
%%time
# Build the model vocabulary from the tagged corpus; this must run before training.
model.build_vocab(corpus_full)

In [None]:
# Sanity check on the built vocabulary: report the stored count for 'airport'.
# NOTE(review): presumably raises KeyError if 'airport' did not make it into the vocabulary — confirm.
print(f"Word 'airport' appeared {model.wv.get_vecattr('airport', 'count')} times in the full corpus.")

In [None]:
%%time
# Train on the full corpus, reusing the document count and epoch count already
# stored on the model (`model.corpus_count`, `model.epochs`).
model.train(corpus_full, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# Persist the trained model to the working directory.
model.save("./doc2vec.model")

In [None]:
# Save the model's `wv` attribute to its own file, separate from the full model above.
wv = model.wv
wv.save('./doc2vec.wv')

In [None]:
def corpus_to_dicts(corpus):
    """Lazily convert tagged documents into plain dictionaries.

    Args:
        corpus: Iterable of objects exposing `.words` and `.tags`.

    Yields:
        dict: `{'tokens': doc.words, 'tags': doc.tags}` for each document,
        in input order.
    """
    for tagged_doc in corpus:
        record = dict(tokens=tagged_doc.words, tags=tagged_doc.tags)
        yield record

In [None]:
import simplejson


def json_save(data, filename):
    """Write `data` to `filename` as compact UTF-8 JSON.

    Separators carry no extra whitespace, and `iterable_as_array=True` is
    passed to `simplejson.dump`, so generators such as the result of
    `corpus_to_dicts` can be given directly.

    Args:
        data: Object to serialise.
        filename: Destination path; an existing file is overwritten.
    """
    compact_separators = (',', ':')
    with open(filename, mode='w', encoding='utf-8') as handle:
        simplejson.dump(data, handle, separators=compact_separators, iterable_as_array=True)

In [None]:
# Export the tagged corpus as JSON; the generator from corpus_to_dicts is
# accepted because json_save passes `iterable_as_array=True`.
json_save(corpus_to_dicts(corpus_full), './doc2vec.corpus.full.json')

In [None]:
# Export the per-document (token, entity_type) lists produced by corpus2tokens.
json_save(corpus_token_objects, './doc2vec.corpus_token_objects.json')