In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
import pandas as pd
import numpy as np

In [None]:
df_train = pd.read_csv('./train.csv')
df_train.shape

In [None]:
df_test = pd.read_csv('./test.csv')
df_test.shape

In [None]:
# # dev: uncomment the two lines below to iterate quickly on a 50-row sample
# df_train = df_train.sample(50, random_state=42)
# df_test = df_test.sample(50, random_state=42)

In [None]:
df_train.shape, df_test.shape

In [None]:
df_train.head(2)

In [None]:
corpus_text_train = df_train.text.tolist()

In [None]:
print(corpus_text_train[-1])

In [None]:
def friendly_tag_corpus_train(row):
    """Return the human-readable tags for one training row.

    The row's ``title`` comes first, followed by every newline-separated
    entry of its ``categories`` field.
    """
    category_names = row.categories.split('\n')
    return [row.title] + category_names

In [None]:
corpus_tags_friendly_train = df_train[["title", "categories"]].apply(friendly_tag_corpus_train, axis=1).to_list()

In [None]:
corpus_tags_friendly_train[-1]

In [None]:
len(corpus_tags_friendly_train), len(corpus_text_train)

In [None]:
def build_tag_id_mapping(corpus_tags):
    """Assign a reproducible integer id to every distinct tag in the corpus.

    Args:
        corpus_tags: iterable with one iterable of (hashable) tags per document.

    Returns:
        dict mapping each distinct tag to an id in ``range(n_distinct_tags)``.
        Ids are handed out in order of first appearance, so the same corpus
        always yields the same mapping.
    """
    # Iterating a plain set of strings depends on per-process hash
    # randomisation, which would assign different ids after every kernel
    # restart. dict.fromkeys de-duplicates while keeping first-seen order.
    unique_tags = dict.fromkeys(tag for doc_tags in corpus_tags for tag in doc_tags)
    return {tag: idx for idx, tag in enumerate(unique_tags)}

In [None]:
tag_id_mapping = build_tag_id_mapping(corpus_tags_friendly_train)
id_tag_mapping = {v: k for k, v in tag_id_mapping.items()}

In [None]:
corpus_tags_train = [[tag_id_mapping[tag] for tag in tags] for tags in corpus_tags_friendly_train]

In [None]:
len(corpus_tags_train)

In [None]:
from tqdm.notebook import tqdm_notebook

In [None]:
from gensim.utils import simple_preprocess

def corpus2tokens(raw_corpus):
    """Tokenise every document in ``raw_corpus`` with ``simple_preprocess``.

    Iteration is wrapped in ``tqdm_notebook`` so progress is shown while the
    corpus is processed. Returns one list of tokens per input document, in
    input order.
    """
    tokenised_docs = []
    for raw_doc in tqdm_notebook(raw_corpus):
        tokenised_docs.append(simple_preprocess(raw_doc))
    return tokenised_docs

In [None]:
%%time
corpus_words_train = corpus2tokens(corpus_text_train)

In [None]:
corpus_words_train[-1][:20]

In [None]:
corpus_tags_train[-1]

In [None]:
[id_tag_mapping[tag] for tag in corpus_tags_train[-1]]

In [None]:
len(id_tag_mapping)

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
def gen_tagged_docs(corpus_words, corpus_tags):
    """Pair each document's tokens with its tags as ``TaggedDocument`` objects.

    The two inputs are walked in parallel with ``zip``, so pairing stops at
    the shorter of the two. Returns a list with one ``TaggedDocument`` per
    pair.
    """
    tagged_docs = []
    for words, tags in zip(corpus_words, corpus_tags):
        tagged_docs.append(TaggedDocument(words, tags))
    return tagged_docs

In [None]:
corpus_train = gen_tagged_docs(corpus_words_train, corpus_tags_train)

In [None]:
corpus_train[-1]

In [None]:
# Doc2Vec hyper-parameters. Background reading:
#   - https://groups.google.com/g/gensim/c/6JmSsx4iIv0
#     (projects with larger vocabularies tend to lean more towards
#     negative sampling than hierarchical softmax)
#   - VERY NB: https://stackoverflow.com/a/37502976/1782641
#   - API reference: https://radimrehurek.com/gensim/models/doc2vec.html
# The model is only configured here; the vocabulary is built and training is
# run in the cells that follow.
model = Doc2Vec(
    vector_size=1000,
    epochs=200,  # reused below through model.epochs when model.train is called
    min_count=10,
    window=10,
    hs=0,  # presumably turns hierarchical softmax off in favour of negative sampling (see note above) -- confirm against the gensim docs
    negative=20,
    sample=1e-3,
    workers=16  # NOTE(review): hard-coded worker count -- confirm it suits the machine running this notebook
)

In [None]:
%%time
model.build_vocab(corpus_train)

In [None]:
print(f"Word 'airport' appeared {model.wv.get_vecattr('airport', 'count')} times in the training corpus.")

In [None]:
%%time
model.train(corpus_train, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
model.save("./doc2vec.model")

In [None]:
wv = model.wv
wv.save('./doc2vec.wv')

In [None]:
def corpus_to_dicts(corpus):
    """Lazily convert tagged documents into plain dictionaries.

    For every item of ``corpus`` a dict with the keys ``'words'`` and
    ``'tags'`` (taken from the item's attributes of the same names) is
    yielded, one at a time.
    """
    for tagged_doc in corpus:
        record = dict(words=tagged_doc.words, tags=tagged_doc.tags)
        yield record

In [None]:
import simplejson


def json_save(data, filename):
    """Write ``data`` to ``filename`` as compact, UTF-8 encoded JSON.

    The file is opened for writing (an existing file is overwritten) and
    ``simplejson.dump`` is called with separators that carry no whitespace
    and with ``iterable_as_array=True``.
    """
    compact_separators = (',', ':')
    with open(filename, mode='w', encoding='utf-8') as out_file:
        simplejson.dump(
            data,
            out_file,
            iterable_as_array=True,
            separators=compact_separators,
        )

In [None]:
json_save(corpus_to_dicts(corpus_train), './doc2vec.corpus.train.json')

In [None]:
json_save(tag_id_mapping, './doc2vec.tag_id_mapping.json')

In [None]:
# Process test corpus

In [None]:
# df_test was already loaded from ./test.csv near the top of the notebook.
# Re-reading the file here would silently discard the optional dev-mode
# sample taken there (and repeat the I/O), so the existing frame is reused.
df_test.shape

In [None]:
corpus_text_test = df_test.text.tolist()

In [None]:
print(corpus_text_test[-1])

In [None]:
def friendly_tag_corpus_test(row):
    """Return the human-readable category tags for one test row.

    The row's ``categories`` field is split on newlines; the title is not
    included.
    """
    raw_categories = row.categories
    return raw_categories.split('\n')

In [None]:
corpus_tags_friendly_test = df_test[["categories"]].apply(friendly_tag_corpus_test, axis=1).to_list()

In [None]:
corpus_tags_friendly_test[-1]

In [None]:
len(corpus_tags_friendly_test), len(corpus_text_test)

In [None]:
# Weed out tags that were not seen in training: only tags present in
# tag_id_mapping are converted to ids. (Looking them up with .get() would
# leave a None placeholder for every unseen tag, and those placeholders would
# end up in the tagged test documents and in the saved JSON.)
corpus_tags_test = [
    [tag_id_mapping[tag] for tag in tags if tag in tag_id_mapping]
    for tags in corpus_tags_friendly_test
]

In [None]:
corpus_tags_friendly_test[89], corpus_tags_test[89]

In [None]:
%%time
corpus_words_test = corpus2tokens(corpus_text_test)

In [None]:
corpus_words_test[-1]

In [None]:
corpus_tags_test[5]

In [None]:
# Compare against None explicitly: tag id 0 is a valid id but falsy, so a
# bare truthiness check would drop it from the displayed tags.
[id_tag_mapping[tag] for tag in corpus_tags_test[5] if tag is not None]

In [None]:
corpus_test = gen_tagged_docs(corpus_words_test, corpus_tags_test)

In [None]:
json_save(corpus_to_dicts(corpus_test), './doc2vec.corpus.test.json')