In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
import pandas as pd
import numpy as np

In [None]:
df_full = pd.read_csv('./full.csv')
df_full.shape

In [None]:
df_full.sample(10)

In [None]:
df_full.iloc[395].text

In [None]:
# df_train = pd.read_csv('./train.csv')
# df_train.shape

In [None]:
# df_test = pd.read_csv('./test.csv')
# df_test.shape

In [None]:
# # dev
# df_train = df_train.sample(50, random_state=42)
# df_test = df_test.sample(50, random_state=42)

In [None]:
# df_train.shape, df_test.shape

In [None]:
# df_train.head(2)

In [None]:
# corpus_text_train = df_train.text.tolist()

In [None]:
corpus_text_full = df_full.text.tolist()

In [None]:
# print(corpus_text_train[-1])

In [None]:
print(corpus_text_full[-1][:200])

In [None]:
# def friendly_tag_corpus_train(row):
#     doc_categories = row.categories.split('\n')
#     doc_title = row.title
#     return [doc_title, *doc_categories]

In [None]:
import ast

def friendly_tag_corpus_full(row):
    """Build the human-readable tag list for one row of the full corpus.

    Parameters
    ----------
    row : object with ``title`` and ``category`` attributes
        Typically a row produced by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    list of str
        ``['title:<title>']`` plus ``'category:<category>'`` when the category
        is present. A category counts as absent when it is missing
        (NaN / None / pd.NA), the literal string "nan", or otherwise falsy
        (e.g. an empty string).

    NOTE(review): a missing title is not filtered and would yield the tag
    'title:nan' — confirm whether that is acceptable.
    """
    category = row.category
    # pd.isna must be checked first: it recognises pd.NA, whose truth value is
    # ambiguous and would make the `not category` test below raise TypeError.
    # The string comparison is kept so values that stringify to "nan" are still
    # dropped exactly as before.
    category_missing = pd.isna(category) or str(category) == "nan" or not category

    tags = [f'title:{row.title}']
    if not category_missing:
        tags.append(f'category:{category}')
    return tags

In [None]:
# corpus_tags_friendly_train = df_train[["title", "categories"]].apply(friendly_tag_corpus_train, axis=1).to_list()

In [None]:
corpus_tags_friendly_full = df_full[["title", "category"]].apply(friendly_tag_corpus_full, axis=1).to_list()

In [None]:
# corpus_tags_friendly_train[-1]

In [None]:
corpus_tags_friendly_full[-6]

In [None]:
# len(corpus_tags_friendly_train), len(corpus_text_train)

In [None]:
len(corpus_tags_friendly_full), len(corpus_text_full)

In [None]:
def build_tag_id_mapping(corpus_tags):
    """Assign a contiguous integer id (0..n-1) to every distinct tag.

    Parameters
    ----------
    corpus_tags : iterable of iterables of hashable
        One collection of tags per document.

    Returns
    -------
    dict
        Maps each distinct tag to its id. Ids follow the order in which tags
        are first encountered, so the same input always yields the same
        mapping. (Iterating a ``set`` of strings, as was done before, gives an
        order that can change from one interpreter run to the next, which made
        the saved mapping irreproducible.)
    """
    # dict.fromkeys de-duplicates while keeping first-seen order
    # (dicts preserve insertion order in Python 3.7+).
    unique_tags = dict.fromkeys(tag for tags in corpus_tags for tag in tags)
    return {tag: idx for idx, tag in enumerate(unique_tags)}

In [None]:
# tag_id_mapping = build_tag_id_mapping(corpus_tags_friendly_train)
# id_tag_mapping = {v: k for k, v in tag_id_mapping.items()}

In [None]:
tag_id_mapping = build_tag_id_mapping(corpus_tags_friendly_full)
id_tag_mapping = {v: k for k, v in tag_id_mapping.items()}

In [None]:
# corpus_tags_train = [[tag_id_mapping[tag] for tag in tags] for tags in corpus_tags_friendly_train]

In [None]:
corpus_tags_full = [[tag_id_mapping[tag] for tag in tags] for tags in corpus_tags_friendly_full]

In [None]:
# len(corpus_tags_train)

In [None]:
len(corpus_tags_full)

In [None]:
from tqdm.notebook import tqdm_notebook

In [None]:
# from gensim.utils import simple_preprocess

# def corpus2tokens(raw_corpus):
#     return [simple_preprocess(doc) for doc in tqdm_notebook(raw_corpus)]

In [None]:
import spacy
from spacy.language import Language
from spacy.lang.en import STOP_WORDS

nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("merge_entities")
# nlp.add_pipe("merge_noun_chunks")
nlp.pipe_names

In [None]:
import tqdm

# Code point of the plain ASCII apostrophe; target of the quote normalisation below.
single_quote_unicode = ord("'")

# Text-level table: maps the backtick and the curly single/double quotes to a
# plain apostrophe and deletes hyphens.
# NOTE(review): this table is not referenced anywhere else in the visible cells.
translation_table_text = str.maketrans(
    {
        **dict.fromkeys('`‘’“”', single_quote_unicode),
        '-': None,
    }
)

# Token-level table: deletes apostrophes, double quotes and full stops.
translation_table_token = str.maketrans('', '', '\'".')

def substitute_token(token):
    """Replace noisy tokens with a placeholder, leaving all others untouched.

    The checks are case-insensitive substring tests, applied in this order:

    * contains ``http``      -> ``'URL'``
    * contains ``@``         -> ``'USERMENTION'``
    * contains ``&amp;``     -> ``'and'``
    * contains ``ain't``     -> ``'am not'``
    * contains ``'\\x89û_'`` -> the lowercased token with every occurrence of
      that sequence replaced by ``' ... '`` (trailing whitespace removed)

    Every substitution prints a WARNING line to stdout.

    Parameters
    ----------
    token : str

    Returns
    -------
    str
        The placeholder / rewritten token, or ``token`` itself (original case)
        when no rule matches.
    """
    token_lowered = token.lower()
    # Presumably a mis-encoded ellipsis, given that it is replaced by '...'.
    garbled_marker = '\x89û_'

    if 'http' in token_lowered:
        print(f'WARNING: {token_lowered} is getting converted to URL')
        return 'URL'
    if '@' in token_lowered:
        print(f'WARNING: {token_lowered} is getting converted to USERMENTION')
        return 'USERMENTION'
    if '&amp;' in token_lowered:
        print(f'WARNING: {token_lowered} is getting converted to "and"')
        return 'and'
    if "ain't" in token_lowered:
        print(f'WARNING: {token_lowered} is getting converted to "am not"')
        return 'am not'
    if garbled_marker in token_lowered:
        # Replace the marker wherever it occurs. Slicing off the last three
        # characters is only correct when the marker is at the very end; for a
        # trailing marker this produces the same '<prefix> ...' as before.
        replacement = token_lowered.replace(garbled_marker, ' ... ').rstrip()
        print(f'WARNING: {token_lowered} is getting converted to "{replacement}"')
        return replacement
    return token

def corpus2tokens(corpus_text, *args, **kwargs):
    """Clean raw texts, run them through the spaCy pipeline and tokenise them.

    Each text is passed through ``translation_table_token``, split on
    whitespace, rewritten token by token with ``substitute_token`` and
    re-joined with single spaces. The cleaned texts are then streamed through
    ``nlp.pipe`` (wrapped in a notebook progress bar) and each resulting doc is
    converted with ``doc2tokens``.

    Parameters
    ----------
    corpus_text : iterable of str
    *args, **kwargs
        Forwarded unchanged to ``nlp.pipe``.

    Returns
    -------
    list
        One ``doc2tokens`` result per input text, in input order.

    NOTE(review): translation_table_token strips apostrophes before
    substitute_token runs, so substitute_token's "ain't" check cannot match
    here, and translation_table_text is never applied — confirm which table
    was intended for the raw text.
    """
    # Import the submodule explicitly: `tqdm.notebook` is only reachable as an
    # attribute of the `tqdm` package once the submodule itself has been
    # imported, so relying on a bare `import tqdm` depends on another cell
    # having imported it first.
    from tqdm.notebook import tqdm as notebook_tqdm

    cleaned_texts = [
        ' '.join(
            substitute_token(token)
            for token in text.translate(translation_table_token).split()
        )
        for text in corpus_text
    ]
    return [
        doc2tokens(doc)
        for doc in nlp.pipe(notebook_tqdm(cleaned_texts), *args, **kwargs)
    ]

def doc2tokens(doc):
    """Discard punctuation and whitespace tokens, then post-process the rest.

    The surviving tokens are handed to ``process_tokens`` together with the
    doc's entities (``doc.ents``); its result is returned as is.
    """
    kept_tokens = []
    for token in doc:
        if token.is_punct or token.is_space:
            continue
        kept_tokens.append(token)
    return process_tokens(kept_tokens, doc.ents)

def show_ents(ents):
    """Print one line per entity for inspection.

    Each line holds, separated by ' - ': the entity text, its start and end
    character offsets, its label, and ``spacy.explain`` of that label.
    """
    for ent in ents:
        fields = (
            ent.text,
            str(ent.start_char),
            str(ent.end_char),
            ent.label_,
            str(spacy.explain(ent.label_)),
        )
        print(' - '.join(fields))

def process_tokens(tokens, ents, rm_stopwords=False):
    """Lowercase tokens and rewrite those whose text matches a named entity.

    For every token, its lowercased text is looked up among the lowercased
    entity texts:

    * entity with a label in ``ent_labels_to_sub`` -> the bare label
      (e.g. ``'DATE'``) replaces the token;
    * any other entity -> ``'<LABEL>|<text>'`` where ``<text>`` has been passed
      through ``translation_table_token``;
    * non-entity token -> its lowercased text.

    Parameters
    ----------
    tokens : iterable
        Objects exposing ``.text``.
    ents : sequence
        Objects exposing ``.text`` and ``.label_``. Entities whose text is in
        ``ent_vals_to_skip`` are ignored. When several entities share the same
        lowercased text, the label of the first one is used.
    rm_stopwords : bool, default False
        When true, results found in ``STOP_WORDS`` are dropped. Substituted
        labels are never dropped.

    Returns
    -------
    list of str

    Side effects
    ------------
    Prints a WARNING block to stdout when some entity text never appeared as a
    token.
    """
    ent_vals_to_skip = {'#', '\\\\\\'}
    ent_labels_to_sub = {
        "DATE", # Absolute or relative dates or periods
        "CARDINAL", # Numerals that do not fall under another type
        "PERCENT", # Percentage, including "%"
        "TIME", # Times smaller than a day
        "MONEY", # Monetary values, including unit
        "ORDINAL", # "first", "second", etc.
        "QUANTITY", # Measurements, as of weight or distance
    }

    # Lowercased entity text -> label. Building the mapping from the filtered
    # entities keeps text and label together; looking the label up in `ents`
    # by a position taken from the filtered list returned the wrong label as
    # soon as one entity had been skipped. setdefault keeps the first label
    # for a repeated text.
    ent_label_by_text = {}
    for ent in ents:
        if ent.text in ent_vals_to_skip:
            continue
        ent_label_by_text.setdefault(ent.text.lower(), ent.label_)

    tokens_processed = []
    matched_ent_texts = set()
    for token in tokens:
        stringed_token = token.text.lower()
        if stringed_token in ent_label_by_text:
            matched_ent_texts.add(stringed_token)
            ent_label = ent_label_by_text[stringed_token]
            if ent_label in ent_labels_to_sub:
                tokens_processed.append(ent_label)
                continue
            stringed_token = ent_label + "|" + stringed_token.translate(translation_table_token)
        if rm_stopwords and stringed_token in STOP_WORDS:
            continue
        tokens_processed.append(stringed_token)

    # Matched texts are always a subset of the entity texts, so the only
    # possible mismatch is an entity that never showed up as a token.
    unmatched_ent_texts = set(ent_label_by_text) - matched_ent_texts
    if unmatched_ent_texts:
        len_ent_tokens = len(matched_ent_texts)
        len_stringed_ents = len(ent_label_by_text)
        print(f'WARNING: Somehow the number of unique tokens which are ents ({len_ent_tokens}) does not match the total number of unique ents ({len_stringed_ents})')
        print(sorted(unmatched_ent_texts), "exist in ents but not in tokens")
        print("tokens: ", "\n", tokens, "\n\n")
        print("ents: ", "\n", ents, "\n\n")
    return tokens_processed

In [None]:
# %%time
# corpus_words_train = corpus2tokens(corpus_text_train)

In [None]:
%%time
corpus_words_full = corpus2tokens(corpus_text_full)

In [None]:
corpus_words_full[-1][:50]

In [None]:
spacy.explain('GPE')

In [None]:
# corpus_tags_train[-1]

In [None]:
corpus_tags_full[-1]

In [None]:
# [id_tag_mapping[tag] for tag in corpus_tags_train[-1]]

In [None]:
[id_tag_mapping[tag] for tag in corpus_tags_full[-1]]

In [None]:
len(id_tag_mapping)

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
def gen_tagged_docs(corpus_words, corpus_tags):
    """Pair each document's words with its tags as a ``TaggedDocument``.

    The two inputs are walked in lockstep with ``zip``, so the result has as
    many entries as the shorter input.
    """
    tagged_docs = []
    for doc_words, doc_tags in zip(corpus_words, corpus_tags):
        tagged_docs.append(TaggedDocument(doc_words, doc_tags))
    return tagged_docs

In [None]:
# corpus_train = gen_tagged_docs(corpus_words_train, corpus_tags_train)

In [None]:
corpus_full = gen_tagged_docs(corpus_words_full, corpus_tags_full)

In [None]:
# corpus_train[-1]

In [None]:
corpus_full[-1]

In [None]:
# Background reading for the hyperparameter choices below:
# https://groups.google.com/g/gensim/c/6JmSsx4iIv0
# projects with larger vocabularies tend to lean more towards negative-sampling than hierarchical-softmax
# VERY NB - https://stackoverflow.com/a/37502976/1782641
# https://radimrehurek.com/gensim/models/doc2vec.html
#
# This cell only constructs the model object; the vocabulary is built and the
# model is trained in later cells via model.build_vocab(...) and model.train(...).
model = Doc2Vec(
    vector_size=1000,
    epochs=200,  # also passed explicitly to model.train(...) as epochs=model.epochs
    min_count=10,
    window=10,
    hs=0,  # NOTE(review): hs=0 with negative=20 presumably selects negative sampling, per the note above — confirm against the gensim docs
    negative=20,
    sample=1e-3,
    workers=3  # 64
)

In [None]:
# %%time
# model.build_vocab(corpus_train)

In [None]:
%%time
model.build_vocab(corpus_full)

In [None]:
# print(f"Word 'airport' appeared {model.wv.get_vecattr('airport', 'count')} times in the training corpus.")

In [None]:
print(f"Word 'airport' appeared {model.wv.get_vecattr('airport', 'count')} times in the full corpus.")

In [None]:
# %%time
# model.train(corpus_train, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
%%time
model.train(corpus_full, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
model.save("./doc2vec.model")

In [None]:
wv = model.wv
wv.save('./doc2vec.wv')

In [None]:
def corpus_to_dicts(corpus):
    """Lazily convert tagged documents into plain dictionaries.

    Yields one ``{'words': ..., 'tags': ...}`` dict per document, read from the
    document's ``words`` and ``tags`` attributes, in corpus order.
    """
    for tagged_doc in corpus:
        record = dict(words=tagged_doc.words, tags=tagged_doc.tags)
        yield record

In [None]:
import simplejson


def json_save(data, filename):
    """Write ``data`` to ``filename`` as compact UTF-8 JSON.

    Uses ``simplejson.dump`` with no whitespace after separators and with
    ``iterable_as_array=True``, so iterables such as generators are written as
    JSON arrays. The file is opened in write mode and replaced if it exists.
    """
    dump_options = {'separators': (',', ':'), 'iterable_as_array': True}
    with open(filename, mode='w', encoding='utf-8') as handle:
        simplejson.dump(data, handle, **dump_options)

In [None]:
# json_save(corpus_to_dicts(corpus_train), './doc2vec.corpus.train.json')

In [None]:
json_save(corpus_to_dicts(corpus_full), './doc2vec.corpus.full.json')

In [None]:
json_save(tag_id_mapping, './doc2vec.tag_id_mapping.json')

In [None]:
# Process test corpus

In [None]:
# df_test = pd.read_csv('./test.csv')
# df_test.shape

In [None]:
# corpus_text_test = df_test.text.tolist()

In [None]:
# print(corpus_text_test[-1])

In [None]:
# def friendly_tag_corpus_test(row):
#     return row.categories.split('\n')

In [None]:
# corpus_tags_friendly_test = df_test[["categories"]].apply(friendly_tag_corpus_test, axis=1).to_list()

In [None]:
# corpus_tags_friendly_test[-1]

In [None]:
# len(corpus_tags_friendly_test), len(corpus_text_test)

In [None]:
# # weed out tags that were not seen in training
# corpus_tags_test = [[tag_id_mapping.get(tag) for tag in tags] for tags in corpus_tags_friendly_test]

In [None]:
# corpus_tags_friendly_test[89], corpus_tags_test[89]

In [None]:
# %%time
# corpus_words_test = corpus2tokens(corpus_text_test)

In [None]:
# corpus_words_test[-1]

In [None]:
# corpus_tags_test[5]

In [None]:
# [id_tag_mapping[tag] for tag in corpus_tags_test[5] if tag]

In [None]:
# corpus_test = gen_tagged_docs(corpus_words_test, corpus_tags_test)

In [None]:
# json_save(corpus_to_dicts(corpus_test), './doc2vec.corpus.test.json')