In [None]:
import simplejson

def json_load(filename):
    """Parse the UTF-8 encoded JSON file at ``filename`` and return the decoded object."""
    with open(filename, mode='r', encoding='utf-8') as handle:
        parsed = simplejson.load(handle)
    return parsed

In [None]:
from gensim.models.doc2vec import TaggedDocument

def gen_tagged_docs(corpus):
    """Turn each corpus entry into a ``TaggedDocument``.

    Every entry must support ``entry["words"]`` and ``entry["tags"]``; the two
    values are passed to ``TaggedDocument`` in that order. The result is a list
    in the same order as ``corpus``.
    """
    tagged = []
    for entry in corpus:
        tagged.append(TaggedDocument(entry["words"], entry["tags"]))
    return tagged

In [None]:
def display_similar_article_and_categories(corpus, doc_id=0, topn=10, by_article_tokens=True, by_article_tag=False):
    """Print one document's nearest keys in ``model.dv`` next to its known tags.

    Reads the notebook-level globals ``model`` and ``id_tag_mapping``.

    Args:
        corpus: Indexable collection whose items expose ``.words`` and ``.tags``.
        doc_id: Index of the document to inspect.
        topn: Number of neighbours requested from ``model.dv.most_similar``.
        by_article_tokens: When true, query with a vector inferred from the
            document's words.
        by_article_tag: When true, query with the stored vector of the
            document's first tag.

    Output order: the first 200 characters of the space-joined words, one
    section per enabled strategy (tokens first, then tag), then the known tags.
    """
    document = corpus[doc_id]
    tokens = document.words
    print(' '.join(tokens)[:200])

    # Each strategy is a (heading, vector factory) pair. The factory is only
    # called after the heading has been printed, so the query vector is
    # computed at the same point as in a straight-line implementation.
    strategies = []
    if by_article_tokens:
        strategies.append(("Get simlarity based on tokens:", lambda: model.infer_vector(tokens)))
    if by_article_tag:
        # NOTE(review): tags[0] raises IndexError for a document without tags.
        strategies.append(("Get simlarity based on article tag:", lambda: model.dv[document.tags[0]]))

    for heading, make_vector in strategies:
        print("************")
        print(heading)
        print()
        neighbours = model.dv.most_similar([make_vector()], topn=topn)
        for key, score in neighbours:
            print(score, id_tag_mapping[key])

    print("************")
    print("Actual known tags:")
    print()
    # NOTE(review): the truthiness filter also drops a tag equal to 0 — confirm
    # that 0 is never a valid tag id.
    known_tags = [id_tag_mapping.get(tag) for tag in document.tags if tag]
    print(known_tags)

In [None]:
import random
import collections
import matplotlib.pyplot as plt


def rank_by_inferredvector(corpus, sent_ids):
    """Rank each selected document's best known tag among the model's neighbours.

    For every id in ``sent_ids`` a vector is inferred from the document's words
    and ``model.dv.most_similar`` is asked for ``len(id_tag_mapping)``
    neighbours. The document's rank is the smallest position at which any of
    its (truthy) tags appears in that neighbour list; 0 means the closest
    neighbour is one of the known tags.

    Reads the notebook-level globals ``model`` and ``id_tag_mapping`` and prints
    one line per ranked document.

    Args:
        corpus: Indexable collection whose items expose ``.words`` and ``.tags``.
        sent_ids: Iterable of indices into ``corpus``.

    Returns:
        List of ranks, one per document that has at least one truthy tag.
        Documents without such a tag are skipped.

    Raises:
        ValueError: If a tag of a document is absent from the neighbour list.
    """
    ranks = []
    for sent_id in sent_ids:
        document = corpus[sent_id]
        inferred_vector = model.infer_vector(document.words)
        sims = model.dv.most_similar([inferred_vector], topn=len(id_tag_mapping))
        # Extract the ranked keys once per document; rebuilding this list for
        # every tag was loop-invariant work.
        ranked_keys = [key for key, _ in sims]
        # NOTE(review): the truthiness filter also drops a tag equal to 0 —
        # confirm that 0 is never a valid tag id.
        tag_positions = [ranked_keys.index(tag) for tag in document.tags if tag]
        if tag_positions:
            rank = min(tag_positions)
            print(f'{sent_id}: Ranked {rank} ({id_tag_mapping[sims[rank][0]]}) out of {len(sims)}')
            ranks.append(rank)
    return ranks

            
def rank_by_random(corpus, sent_ids):
    """Baseline ranker: draw one uniformly random rank per entry of ``sent_ids``.

    ``corpus`` is accepted only so the signature matches the other rank
    functions; it is not read. Ranks are drawn from ``0 .. len(id_tag_mapping) - 1``,
    the same range a real rank (an index into the neighbour list) can take.

    Returns:
        List of ints with one entry per element of ``sent_ids``.
    """
    # randint(0, n) includes n itself, which is one past the largest real rank;
    # randrange(n) excludes it. max(..., 1) keeps the empty-mapping case
    # returning 0 instead of raising.
    rank_count = max(len(id_tag_mapping), 1)
    return [random.randrange(rank_count) for _ in sent_ids]


def plot_matches(corpus, rank_func=rank_by_inferredvector, take_sample=True, sample_size=50, sample_seed=42, topn_perc=0.1):
    """Rank documents with ``rank_func`` and chart how well their tags were matched.

    Ranks are split into three groups: exactly 0, below
    ``len(id_tag_mapping) / (100 / topn_perc)``, and everything else. The group
    sizes are drawn as a bar chart and printed as counts and percentages.

    Args:
        corpus: Indexable collection handed to ``rank_func``.
        rank_func: Callable ``(corpus, sent_ids) -> iterable of int ranks``.
        take_sample: When true, evaluate a random sample; otherwise every document.
        sample_size: Requested sample size; capped at ``len(corpus)``.
        sample_seed: Seed applied to the module-level ``random`` generator
            before sampling.
        topn_perc: Percentage of ``len(id_tag_mapping)`` that still counts as
            an acceptable rank.

    Returns:
        None. When ``rank_func`` yields no ranks, a notice is printed and
        nothing is plotted.
    """
    if take_sample:
        # Seeds the shared RNG, so random draws made later by rank_func are
        # reproducible as well.
        random.seed(sample_seed)
        # Cap the request: random.sample raises ValueError when asked for more
        # items than the population holds.
        sent_ids = random.sample(range(len(corpus)), min(sample_size, len(corpus)))
    else:
        sent_ids = list(range(len(corpus)))

    counter = collections.Counter(rank_func(corpus, sent_ids))
    total = sum(counter.values())
    if total == 0:
        # Without this guard the percentages below would divide by zero.
        print('No ranks were produced; nothing to plot.')
        return

    # Loop-invariant threshold, kept in its original arithmetic form so the
    # bucketing of borderline ranks is unchanged.
    acceptable_below = len(id_tag_mapping) / (100 / topn_perc)

    sum_0 = 0
    sum_1_acceptable = 0
    sum_all_else = 0
    for rank, occurrences in counter.items():
        if rank == 0:
            sum_0 += occurrences
        elif rank < acceptable_below:
            sum_1_acceptable += occurrences
        else:
            sum_all_else += occurrences

    plt.bar([0, 1, 2], [sum_0, sum_1_acceptable, sum_all_else])
    print([sum_0, sum_1_acceptable, sum_all_else])
    print('Test example correctly matched (%): ', 100 * sum_0 / total)
    print(f'Test example matched in top {topn_perc}% (%): ', 100 * sum_1_acceptable / total)
    print('Test example badly matched (%): ', 100 * sum_all_else / total)

In [None]:
from gensim.models.doc2vec import Doc2Vec
# Load the saved Doc2Vec model; the helper functions in this notebook read it
# through the global name `model`.
# NOTE(review): gensim's load() relies on pickle according to its documentation —
# only load model files from a trusted source.
model = Doc2Vec.load('./doc2vec.model')

In [None]:
from gensim.models import KeyedVectors
# Load the KeyedVectors saved at ./doc2vec.wv.
# NOTE(review): `wv` is not referenced by any other visible cell — confirm it is still needed.
wv = KeyedVectors.load('./doc2vec.wv')

In [None]:
# corpus_train_raw = json_load('./doc2vec.corpus.train.json')
# corpus_test_raw = json_load('./doc2vec.corpus.test.json')

In [None]:
# Load the full corpus. Later cells iterate over it and read each entry's
# "words" and "tags" values.
corpus_full = json_load('./doc2vec.corpus.full.json')

In [None]:
# Distinct prefixes (the text before the first '|') of every token that contains a '|'.
entity_types = {
    token.split('|')[0]
    for document in corpus_full
    for token in document['words']
    if '|' in token
}
entity_types

In [None]:
def save_image(word_count_dict, label):
    """Render a word cloud from a frequency mapping and write it to ``<label>.png``.

    Args:
        word_count_dict: Mapping passed to ``WordCloud.generate_from_frequencies``.
        label: File name stem; the image is saved in the working directory.
    """
    output_name = f'{label}.png'
    cloud = WordCloud(width=1000, height=500).generate_from_frequencies(word_count_dict)

    plt.figure(figsize=(15, 8))
    plt.imshow(cloud)
    plt.axis("off")
    # The figure is saved rather than shown, then closed so repeated calls do
    # not accumulate open figures.
    plt.savefig(output_name, bbox_inches='tight')
    plt.close()

In [None]:
# Extra tokens excluded from the combined ("ALL") word cloud, in addition to
# spaCy's STOP_WORDS.
# NOTE(review): the filter further down compares these entries against the text
# after the last '|' of each token, not against the prefix before it — confirm
# that the upper-case labels actually occur in that position.
extra_stopwords = [
    "DATE", # Absolute or relative dates or periods
    "CARDINAL", # Numerals that do not fall under another type
    "PERCENT", # Percentage, including "%"
    "TIME", # Times smaller than a day
    "MONEY", # Monetary values, including unit
    "ORDINAL", # "first", "second", etc.
    "QUANTITY", # Measurements, as of weight or distance
    "said"
]

from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from spacy.lang.en import STOP_WORDS

# One word cloud per entity type. A token belongs to a type only when the text
# before its first '|' equals that type, which is how `entity_types` was built.
# A plain substring test would also pick up tokens whose text merely contains
# the label, and tokens of another type whose label contains this one.
for entity_type in entity_types:
    word_count_dict = Counter(
        word.rsplit('|', 1)[-1]
        for doc in corpus_full
        for word in doc['words']
        if '|' in word and word.split('|', 1)[0] == entity_type
    )
    save_image(word_count_dict, entity_type)

# Combined cloud over every token's text (the part after the last '|'), minus
# spaCy stop words and the extra stop words defined above.
token_texts = (word.rsplit('|', 1)[-1] for doc in corpus_full for word in doc['words'])
word_count_dict = Counter(
    text for text in token_texts
    if text not in STOP_WORDS and text not in extra_stopwords
)
save_image(word_count_dict, 'ALL')

In [None]:
# Scratch check of how str.split breaks a '|'-separated token into its parts.
'hello|there'.split('|')

In [None]:
# # dev
# corpus_train_raw = corpus_train_raw[:50]
# corpus_test_raw = corpus_test_raw[:50]

In [None]:
# corpus_train = gen_tagged_docs(corpus_train_raw)

In [None]:
# Wrap every corpus entry in a TaggedDocument so the helper functions can read
# `.words` and `.tags` from it.
tagged_corpus_full = gen_tagged_docs(corpus_full)

In [None]:
# Load the stored mapping and build its inverse (values become keys); the helper
# functions look entries up through `id_tag_mapping`. If two keys share a value,
# the later one wins.
tag_id_mapping = json_load('./doc2vec.tag_id_mapping.json')
id_tag_mapping = dict(zip(tag_id_mapping.values(), tag_id_mapping.keys()))

In [None]:
# %matplotlib inline
# plot_matches(corpus_train)

In [None]:
%matplotlib inline
# Evaluate on a seeded random sample of the full corpus using the defaults
# (sample_size=50, sample_seed=42, topn_perc=0.1).
plot_matches(tagged_corpus_full)

In [None]:
# display_similar_article_and_categories(corpus_train, doc_id=425, by_article_tag=True)

In [None]:
# Display the tagged document at index 425.
tagged_corpus_full[425]

In [None]:
# Print the token-based and tag-vector-based neighbours of document 425,
# followed by its known tags (by_article_tokens is left at its default of True).
display_similar_article_and_categories(tagged_corpus_full, doc_id=425, by_article_tag=True)

In [None]:
# Analyse (unseen) test set

In [None]:
# corpus_test = gen_tagged_docs(corpus_test_raw)

In [None]:
# %matplotlib inline
# plot_matches(corpus_test, take_sample=False)

In [None]:
# display_similar_article_and_categories(corpus_test, doc_id=13317)

In [None]:
# import pandas as pd
# df_test = pd.read_csv('./test.csv')

In [None]:
# df_test.iloc[5914]

In [None]:
# %matplotlib inline
# plot_matches(corpus_test, sample_size=1000)

In [None]:
# %matplotlib inline
# plot_matches(corpus_test, sample_size=1000, topn_perc=0.01)