In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
import pandas as pd
import numpy as np

pd.options.display.max_colwidth = 120

In [None]:
# Raw extract of the travel wiki used as the full corpus.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is replaced by a configurable data directory.
raw_csv_path = '/home/ubuntu/internship/data/travel-wiki-extract-full-templates-processed.csv'
df = pd.read_csv(raw_csv_path)
df.shape

In [None]:
# Keep only rows whose "text" cell is present; other columns are not checked.
has_text = df["text"].notna()
df = df[has_text]
df.shape

In [None]:
# # dev
# df = df.sample(50, random_state=42)
# df.shape

In [None]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
train_df.shape, test_df.shape

In [None]:
train_df.head(2)

In [None]:
corpus_text = train_df.text.tolist()

In [None]:
print(corpus_text[-1])

In [None]:
def friendly_tag_corpus(row):
    """Build the human-readable tag list for one training document.

    Parameters
    ----------
    row : object exposing ``title`` and ``categories`` attributes
        ``categories`` is expected to be a newline-separated string.

    Returns
    -------
    list
        The title followed by every non-empty category. When ``categories``
        is not a string (for example the float NaN pandas uses for a missing
        cell) the document is tagged with its title only.
    """
    doc_title = row.title
    raw_categories = row.categories
    # Only the "text" column is NaN-filtered before this runs, so a missing
    # categories cell can still arrive here as a float and has no .split().
    if not isinstance(raw_categories, str):
        return [doc_title]
    # Blank values and trailing newlines produce empty fragments; drop them
    # so the empty string never becomes a tag shared by unrelated documents.
    doc_categories = [category for category in raw_categories.split('\n') if category]
    return [doc_title, *doc_categories]

In [None]:
# One friendly tag list (title + categories) per training row, in row order.
tag_source_columns = train_df[["title", "categories"]]
friendly_tags_per_row = tag_source_columns.apply(friendly_tag_corpus, axis=1)
corpus_tags_friendly = friendly_tags_per_row.to_list()

In [None]:
corpus_tags_friendly[-1]

In [None]:
len(corpus_tags_friendly), len(corpus_text)

In [None]:
def build_tag_id_mapping(corpus_tags):
    """Assign a dense integer id to every distinct tag in the corpus.

    Parameters
    ----------
    corpus_tags : iterable of iterables of hashable
        One collection of tags per document.

    Returns
    -------
    dict
        ``{tag: id}`` with ids ``0..n-1`` handed out in first-seen order.
    """
    # dict.fromkeys de-duplicates while preserving insertion order. Going
    # through a set instead would order string tags by their hash, which
    # changes between interpreter runs, so the ids would not be reproducible.
    unique_tags = dict.fromkeys(
        tag for doc_tags in corpus_tags for tag in doc_tags
    )
    return {tag: idx for idx, tag in enumerate(unique_tags)}

In [None]:
tag_id_mapping = build_tag_id_mapping(corpus_tags_friendly)

In [None]:
# Inverse lookup: integer id -> friendly tag name.
id_tag_mapping = dict(zip(tag_id_mapping.values(), tag_id_mapping.keys()))

In [None]:
# Encode each document's friendly tags as the integer ids from tag_id_mapping.
corpus_tags = []
for doc_tag_names in corpus_tags_friendly:
    corpus_tags.append([tag_id_mapping[tag_name] for tag_name in doc_tag_names])

In [None]:
len(corpus_tags)

In [None]:
from tqdm.notebook import tqdm_notebook

In [None]:
import spacy
from spacy.language import Language

nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("merge_entities")
nlp.pipe_names

In [None]:
def corpus2tokens(corpus_text, *args, **kwargs):
    """Tokenise every text in ``corpus_text`` with the notebook's ``nlp`` pipeline.

    The texts are wrapped in ``tqdm_notebook`` before being handed to
    ``nlp.pipe``; any extra positional or keyword arguments are forwarded to
    ``nlp.pipe`` unchanged.

    Returns a list holding one token list (see ``doc2tokens``) per text.
    """
    texts_with_progress = tqdm_notebook(corpus_text)
    tokenised_docs = []
    for parsed_doc in nlp.pipe(texts_with_progress, *args, **kwargs):
        tokenised_docs.append(list(doc2tokens(parsed_doc)))
    return tokenised_docs

def doc2tokens(doc):
    """Return the lower-cased text of every token in ``doc``.

    Tokens flagged as punctuation (``is_punct``) or whitespace (``is_space``)
    are left out.
    """
    kept_tokens = []
    for tok in doc:
        if tok.is_punct or tok.is_space:
            continue
        kept_tokens.append(tok.text.lower())
    return kept_tokens

In [None]:
%%time
corpus_words = corpus2tokens(corpus_text, batch_size=40, n_process=8)

In [None]:
corpus_words[-1]

In [None]:
corpus_tags[-1]

In [None]:
[id_tag_mapping[tag] for tag in corpus_tags[-1]]

In [None]:
len(id_tag_mapping)

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
def gen_tagged_docs(corpus_words, corpus_tags):
    """Lazily yield one ``TaggedDocument`` per (words, tags) pair.

    The two inputs are paired positionally with ``zip``, so iteration ends as
    soon as the shorter of the two is exhausted.
    """
    paired = zip(corpus_words, corpus_tags)
    yield from (TaggedDocument(words, tags) for words, tags in paired)

In [None]:
train_corpus = list(gen_tagged_docs(corpus_words, corpus_tags))

In [None]:
train_corpus[-1]

In [None]:
# Why negative sampling (hs=0, negative=20) instead of hierarchical softmax:
# projects with larger vocabularies tend to lean more towards negative sampling
# (see https://groups.google.com/g/gensim/c/6JmSsx4iIv0).
# Important - read before changing these settings: https://stackoverflow.com/a/37502976/1782641
# Doc2Vec API reference: https://radimrehurek.com/gensim/models/doc2vec.html
# Hyper-parameters are collected in one place so they are easy to read and
# tweak; they are passed to Doc2Vec unchanged.
doc2vec_params = dict(
    vector_size=1000,  # dimensionality of the learned vectors
    epochs=200,        # passes over the corpus during training
    min_count=10,      # words seen fewer times than this are ignored
    window=10,         # context window size
    hs=0,              # hierarchical softmax off ...
    negative=20,       # ... negative sampling with 20 noise words instead
    sample=1e-3,       # down-sampling threshold for frequent words
    workers=16,        # training threads
    # NOTE(review): no `seed` is set and several workers are used, so two
    # training runs are presumably not bit-for-bit repeatable - confirm
    # against the gensim documentation if reproducibility matters.
)
model = Doc2Vec(**doc2vec_params)

In [None]:
%%time
model.build_vocab(train_corpus)

In [None]:
print(f"Word 'airport' appeared {model.wv.get_vecattr('airport', 'count')} times in the training corpus.")

In [None]:
%%time
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
model.save("./doc2vec.model")

In [None]:
wv = model.wv
wv.save('./doc2vec.wv')

In [None]:
[id_tag_mapping[tag] for tag in train_corpus[0].tags]

In [None]:
import random

random.seed(42)

# Sanity check on a random sample of training documents: re-infer a vector
# for each one and record where the document's own tags land in the list of
# most similar document tags.
# NOTE(review): random.seed only pins the shuffle below; whether
# model.infer_vector is repeatable between runs depends on gensim - confirm.
ranks = []          # best (lowest) 0-based position among a document's own tags
second_ranks = []   # worst position among a document's own tags, plus one

train_corpus_copy = train_corpus.copy()
random.shuffle(train_corpus_copy)
sample_train_corpus = train_corpus_copy[:50]
for sample_doc in sample_train_corpus:
    inferred_vector = model.infer_vector(sample_doc.words)
    sims = model.dv.most_similar([inferred_vector], topn=len(id_tag_mapping))
    # Build the tag -> position lookup once per document. Rebuilding the id
    # list and scanning it with .index() for every tag made each document
    # cost (number of tags) x (number of known tags) operations.
    position_of = {}
    for position, (docid, _similarity) in enumerate(sims):
        # setdefault keeps the first position, matching list.index().
        position_of.setdefault(docid, position)
    most_similar_tag_indices = [position_of[tag] for tag in sample_doc.tags]
    rank = min(most_similar_tag_indices)
    second_rank = max(most_similar_tag_indices) + 1
    ranks.append(rank)
    second_ranks.append(second_rank)

In [None]:
%matplotlib inline

import collections
import matplotlib.pyplot as plt


# Split the sampled documents into "one of its own tags ranked first"
# (rank 0) versus everything else, then plot and print the two counts.
counter = collections.Counter(ranks)
sum_0 = 0
sum_all_else = 0
for rank_value, occurrences in counter.items():
    if rank_value <= 0:
        sum_0 += occurrences
    else:
        sum_all_else += occurrences
plt.bar([0,1], [sum_0, sum_all_else])
print([sum_0, sum_all_else])

In [None]:
# Express both counts as a share of all sampled documents.
total_sampled = sum_0 + sum_all_else
print('Training example correctly matched (%): ', 100 * sum_0 / total_sampled)
print('Training example incorrectly matched (%): ', 100 * sum_all_else / total_sampled)

In [None]:
def stream_corpus_to_dicst(corpus):
    """Lazily convert tagged documents into plain dictionaries.

    Each item of ``corpus`` must expose ``words`` and ``tags`` attributes; one
    ``{'words': ..., 'tags': ...}`` dict is yielded per item, in order.
    """
    for tagged_doc in corpus:
        record = dict(words=tagged_doc.words, tags=tagged_doc.tags)
        yield record

In [None]:
import simplejson


def json_save(data, filename):
    """Write ``data`` to ``filename`` as compact JSON (UTF-8, file is overwritten).

    ``iterable_as_array=True`` is passed to ``simplejson.dump`` so that
    iterables such as generators can be handed in directly.
    """
    dump_options = {'separators': (',', ':'), 'iterable_as_array': True}
    with open(filename, mode='w', encoding='utf-8') as out_file:
        simplejson.dump(data, out_file, **dump_options)

In [None]:
json_save(stream_corpus_to_dicst(train_corpus), './doc2vec.corpus.json')

In [None]:
json_save(tag_id_mapping, './doc2vec.tag_id_mapping.json')

In [None]:
# Process test corpus

In [None]:
# NOTE(review): './test.csv' is not written by any cell visible in this
# notebook, and the `test_df` produced by train_test_split is never used -
# confirm which of the two is the intended held-out set.
test_csv_path = './test.csv'
df_test = pd.read_csv(test_csv_path)
df_test.shape

In [None]:
corpus_text_test = df_test.text.tolist()

In [None]:
print(corpus_text_test[-1])

In [None]:
def friendly_tag_test_corpus(row):
    """Build the human-readable tag list for one test document.

    Parameters
    ----------
    row : object exposing a ``categories`` attribute
        ``categories`` is expected to be a newline-separated string.

    Returns
    -------
    list
        Every non-empty category. An empty list is returned when
        ``categories`` is not a string (for example the float NaN pandas uses
        for a missing cell).
    """
    raw_categories = row.categories
    # A missing cell arrives as a float NaN, which has no .split().
    if not isinstance(raw_categories, str):
        return []
    # Drop the empty fragments left by blank values or trailing newlines.
    return [category for category in raw_categories.split('\n') if category]

In [None]:
# One friendly tag list (categories only) per test row, in row order.
test_tag_source = df_test[["categories"]]
friendly_test_tags_per_row = test_tag_source.apply(friendly_tag_test_corpus, axis=1)
corpus_tags_friendly_test = friendly_test_tags_per_row.to_list()

In [None]:
corpus_tags_friendly_test[-1]

In [None]:
len(corpus_tags_friendly_test), len(corpus_text_test)

In [None]:
# Map test tags to the ids assigned from the training corpus and weed out
# tags that were not seen in training. A plain `.get(tag)` would keep such
# tags as `None` entries instead of removing them.
corpus_tags_test = [
    [tag_id_mapping[tag] for tag in tags if tag in tag_id_mapping]
    for tags in corpus_tags_friendly_test
]

In [None]:
corpus_tags_friendly_test[89], corpus_tags_test[89]

In [None]:
%%time
corpus_words_test = corpus2tokens(corpus_text_test)

In [None]:
corpus_words_test[-1]

In [None]:
corpus_tags_test[-1]

In [None]:
# Decode the last test document's tag ids back to friendly names. Ids that
# are None (a tag with no training id) are skipped, because looking them up
# in id_tag_mapping would raise KeyError.
[id_tag_mapping[tag] for tag in corpus_tags_test[-1] if tag is not None]

In [None]:
corpus_test = list(gen_tagged_docs(corpus_words_test, corpus_tags_test))

In [None]:
json_save(stream_corpus_to_dicst(corpus_test), './doc2vec.corpus.test.json')