In [None]:
import numpy as np
from gensim import matutils

def similarity_cosine(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Each input is passed through ``matutils.unitvec`` and the dot product
    of the two results is returned.

    Args:
        vec1: First vector.
        vec2: Second vector.

    Returns:
        The dot product of the two scaled vectors.
    """
    unit_first = matutils.unitvec(vec1)
    unit_second = matutils.unitvec(vec2)
    return np.dot(unit_first, unit_second)

In [None]:
import simplejson

def json_load(filename):
    """Read a UTF-8 encoded JSON file and return the decoded content.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``simplejson.load`` decodes from the file.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        decoded = simplejson.load(handle)
    return decoded

In [None]:
from gensim.models.doc2vec import TaggedDocument

def gen_tagged_docs(corpus):
    """Wrap every corpus entry in a ``TaggedDocument``.

    Args:
        corpus: Iterable of mappings, each providing a ``"tokens"`` and a
            ``"tags"`` entry.

    Returns:
        A list with one ``TaggedDocument(tokens, tags)`` per entry, in the
        same order as ``corpus``.
    """
    tagged_docs = []
    for entry in corpus:
        tagged_docs.append(TaggedDocument(entry["tokens"], entry["tags"]))
    return tagged_docs

In [None]:
def display_similar_article_and_categories(corpus, doc_id=0, topn=10, by_article_tokens=True, by_article_tag=False):
    """Print a document preview, its most similar tags and its known tags.

    Reads the notebook-level globals ``model`` and ``id_tag_map``.

    Args:
        corpus: Indexable collection of documents exposing ``words`` and
            ``tags`` attributes.
        doc_id: Index of the document in ``corpus``.
        topn: Number of neighbours requested from ``model.dv.most_similar``.
        by_article_tokens: When true, look up neighbours of the vector
            inferred from the document's words.
        by_article_tag: When true, look up neighbours of the stored vector
            of the document's first tag.

    Returns:
        None. Everything is written with ``print``.
    """
    separator = "************"
    document = corpus[doc_id]
    tokens = document.words
    preview = ' '.join(tokens)
    # Only the first 200 characters of the joined text are shown.
    print(preview[:200])

    def announce(title):
        # Section banner: separator line, title, blank line.
        print(separator)
        print(title)
        print()

    def report_neighbours(vector):
        # One line per neighbour: similarity score, then the mapped tag.
        for tag, score in model.dv.most_similar([vector], topn=topn):
            print(score, id_tag_map[tag])

    if by_article_tokens:
        # Using words
        announce("Get simlarity based on tokens:")
        report_neighbours(model.infer_vector(tokens))

    if by_article_tag:
        # Using doc vector
        announce("Get simlarity based on article tag:")
        report_neighbours(model.dv[document.tags[0]])

    announce("Actual known tags:")
    # Falsy tags are skipped; tags missing from id_tag_map show up as None.
    known_tags = [id_tag_map.get(tag) for tag in document.tags if tag]
    print(known_tags)

In [None]:
import random
import collections
import matplotlib.pyplot as plt


def rank_by_inferredvector(corpus, sent_ids):
    """Rank each document's known tags among the model's nearest tags.

    For every id in ``sent_ids`` a vector is inferred from the document's
    words, all ``len(id_tag_map)`` neighbours are requested from
    ``model.dv.most_similar``, and the best (lowest) position of any of the
    document's truthy tags in that ranking is recorded. Documents without
    any truthy tag contribute no rank.

    Reads the notebook-level globals ``model`` and ``id_tag_map``.

    Args:
        corpus: Indexable collection of documents exposing ``words`` and
            ``tags`` attributes.
        sent_ids: Iterable of indices into ``corpus``.

    Returns:
        A list of 0-based ranks, one per document that had at least one
        truthy tag.

    Raises:
        ValueError: If a document tag does not occur in the ranking.
    """
    ranks = []
    for sent_id in sent_ids:
        document = corpus[sent_id]
        inferred_vector = model.infer_vector(document.words)
        sims = model.dv.most_similar([inferred_vector], topn=len(id_tag_map))
        # Build the ranked tag list once per document rather than once per
        # tag; it does not change between tags of the same document.
        ranked_tags = [ranked_tag for ranked_tag, _ in sims]
        tag_positions = [ranked_tags.index(tag) for tag in document.tags if tag]
        if tag_positions:
            rank = min(tag_positions)
            print(f'{sent_id}: Ranked {rank} ({id_tag_map[sims[rank][0]]}) out of {len(sims)}')
            ranks.append(rank)
    return ranks

            
def rank_by_random(corpus, sent_ids):
    """Random baseline: draw one uniformly random rank per document.

    Ranks are 0-based positions in a ranking of ``len(id_tag_map)`` tags,
    so the largest valid rank is ``len(id_tag_map) - 1``.
    ``random.randint`` includes both endpoints, hence the explicit
    ``- 1`` (the previous upper bound of ``len(id_tag_map)`` could produce
    a rank one past the end of the ranking).

    Reads the notebook-level global ``id_tag_map``.

    Args:
        corpus: Unused; present so the signature matches other rank
            functions.
        sent_ids: Iterable of document ids; only its length matters.

    Returns:
        A list with one random rank per element of ``sent_ids``. With an
        empty ``id_tag_map`` every rank is 0, as before.
    """
    last_rank = max(len(id_tag_map) - 1, 0)
    return [random.randint(0, last_rank) for _ in sent_ids]


def plot_matches(corpus, rank_func=rank_by_inferredvector, take_sample=True, sample_size=50, sample_seed=42, topn_perc=0.1):
    """Plot and print how well documents are ranked against their known tags.

    Ranks returned by ``rank_func`` are split into three buckets: rank 0,
    ranks below ``len(id_tag_map) / (100 / topn_perc)`` (the top
    ``topn_perc`` percent), and everything else. The bucket sizes are drawn
    as a bar chart and printed together with their percentages.

    Reads the notebook-level global ``id_tag_map``.

    Args:
        corpus: Collection of documents passed through to ``rank_func``.
        rank_func: Callable ``(corpus, sent_ids) -> list of ranks``.
        take_sample: When true, evaluate a seeded random sample instead of
            every document.
        sample_size: Requested sample size; capped at ``len(corpus)``.
        sample_seed: Seed passed to ``random.seed`` before sampling. This
            reseeds the global ``random`` generator.
        topn_perc: Percentage of the tag ranking that still counts as an
            acceptable match.

    Returns:
        None.
    """
    if take_sample:
        random.seed(sample_seed)
        # random.sample raises when asked for more items than exist, which
        # made the default sample_size fail on small corpora.
        sent_ids = random.sample(range(len(corpus)), min(sample_size, len(corpus)))
    else:
        sent_ids = list(range(len(corpus)))
    ranks = rank_func(corpus, sent_ids)

    # Initialise the buckets up front so they exist even when rank_func
    # returns no ranks at all.
    sum_0 = 0
    sum_1_acceptable = 0
    sum_all_else = 0
    for rank, count in collections.Counter(ranks).items():
        if rank == 0:
            sum_0 += count
        elif rank < len(id_tag_map) / (100 / topn_perc):
            sum_1_acceptable += count
        else:
            sum_all_else += count

    counts = [sum_0, sum_1_acceptable, sum_all_else]
    total = sum(counts)
    plt.bar([0,1,2], counts)
    print(counts)
    if total == 0:
        # Nothing was ranked, so percentages are undefined.
        print('No ranked examples to report.')
        return
    print('Test example correctly matched (%): ', 100 * sum_0 / total)
    print(f'Test example matched in top {topn_perc}% (%): ', 100 * sum_1_acceptable / total)
    print('Test example badly matched (%): ', 100 * sum_all_else / total)
    

def determine_matches_strict(corpus, sent_ids):
    """Count documents whose nearest tags are exactly their known tags.

    For a document with ``k`` tags, the ``k`` most similar tags of its
    inferred vector must form the same set as the document's own tags.
    Mismatches are printed with both sets mapped through ``id_tag_map``.

    Reads the notebook-level globals ``model`` and ``id_tag_map``.

    Args:
        corpus: Indexable collection of documents exposing ``words`` and
            ``tags`` attributes.
        sent_ids: Iterable of indices into ``corpus``.

    Returns:
        A ``(matches, mismatches)`` tuple of counts.
    """
    matched = 0
    unmatched = 0
    for sent_id in sent_ids:
        document = corpus[sent_id]
        vector = model.infer_vector(document.words)
        neighbours = model.dv.most_similar([vector], topn=len(id_tag_map))
        # Compare only as many neighbours as the document has tags.
        top_neighbours = neighbours[:len(document.tags)]
        actual_tags = {tag for tag, _ in top_neighbours}
        expected_tags = set(document.tags)
        if actual_tags != expected_tags:
            print('actual_tags:', [id_tag_map[t] for t in actual_tags])
            print('expected_tags:', [id_tag_map[t] for t in expected_tags])
            unmatched += 1
            continue
        matched += 1
    return matched, unmatched
    
    
def plot_matches_strict(corpus, take_sample=True, sample_size=50, sample_seed=42):
    """Plot and print the strict match / mismatch counts for a corpus.

    Delegates the counting to ``determine_matches_strict`` and shows the
    two counts as a bar chart plus percentages.

    Args:
        corpus: Collection of documents to evaluate.
        take_sample: When true, evaluate a seeded random sample instead of
            every document.
        sample_size: Requested sample size; capped at ``len(corpus)``.
        sample_seed: Seed passed to ``random.seed`` before sampling. This
            reseeds the global ``random`` generator.

    Returns:
        None.
    """
    if take_sample:
        random.seed(sample_seed)
        # random.sample raises when asked for more items than exist, which
        # made the default sample_size fail on small corpora.
        sent_ids = random.sample(range(len(corpus)), min(sample_size, len(corpus)))
    else:
        sent_ids = list(range(len(corpus)))
    sum_match, sum_nomatch = determine_matches_strict(corpus, sent_ids)
    counts = [sum_match, sum_nomatch]
    total = sum(counts)
    plt.bar([0,1], counts)
    print(counts)
    if total == 0:
        # No documents were evaluated, so percentages are undefined.
        print('No examples to report.')
        return
    print('Test example correctly matched (%): ', 100 * sum_match / total)
    print('Test example badly matched (%): ', 100 * sum_nomatch / total)

In [None]:
from gensim.models.doc2vec import Doc2Vec
# Load the saved Doc2Vec model. The helper functions defined earlier in this
# notebook read it through the global name `model`.
model = Doc2Vec.load('./doc2vec.model')

In [None]:
from gensim.models import KeyedVectors
# Load the saved KeyedVectors from disk.
# NOTE(review): `wv` is not referenced by any other cell visible here; confirm it is still needed.
wv = KeyedVectors.load('./doc2vec.wv')

In [None]:
# corpus_train_raw = json_load('./doc2vec.corpus.train.json')
# corpus_test_raw = json_load('./doc2vec.corpus.test.json')

In [None]:
# Load the full corpus. gen_tagged_docs reads each entry's "tokens" and "tags".
corpus_full = json_load('./doc2vec.corpus.full.json')

In [None]:
# Peek at the tags of the first corpus entry.
corpus_full[0]['tags']

In [None]:
# # dev
# corpus_train_raw = corpus_train_raw[:50]
# corpus_test_raw = corpus_test_raw[:50]

In [None]:
# corpus_train = gen_tagged_docs(corpus_train_raw)

In [None]:
# Wrap every raw corpus entry in a TaggedDocument (exposes .words and .tags).
tagged_corpus_full = gen_tagged_docs(corpus_full)

In [None]:
# Load the tag lookup table. The helpers index it with tags returned by
# model.dv.most_similar and use its length as the `topn` of their queries.
id_tag_map = json_load('./doc2vec.id_tag_map.json')

In [None]:
def convert_key(key):
    """Return a hashable form of ``key`` suitable for use as a dict key.

    Lists are converted to tuples. Every other value (ints, strings, ...)
    is returned unchanged. Previously any value that was neither an int
    nor a list fell through and became ``None``, so distinct string keys
    all collapsed into a single ``None`` entry.

    Args:
        key: The value to convert.

    Returns:
        ``tuple(key)`` for lists, otherwise ``key`` itself.
    """
    if isinstance(key, list):
        return tuple(key)
    return key

# Map each entry of id_tag_map (made hashable by convert_key) to its position
# in iteration order.
tag_id_map = {
    convert_key(tag): position
    for position, tag in enumerate(id_tag_map)
}

In [None]:
%matplotlib inline
# Strict evaluation with the function defaults: a 50-document sample, seed 42.
plot_matches_strict(tagged_corpus_full)

In [None]:
%matplotlib inline
# Rank-based evaluation with the function defaults: a 50-document sample,
# seed 42, topn_perc=0.1, ranks from rank_by_inferredvector.
plot_matches(tagged_corpus_full)

In [None]:
# display_similar_article_and_categories(corpus_train, doc_id=425, by_article_tag=True)

In [None]:
# Inspect the tagged document at index 61.
tagged_corpus_full[61]

In [None]:
# Document 327: neighbours by inferred vector (default) and by stored tag vector.
display_similar_article_and_categories(tagged_corpus_full, doc_id=327, by_article_tag=True)

In [None]:
# Document 480: neighbours by inferred vector (default) and by stored tag vector.
display_similar_article_and_categories(tagged_corpus_full, doc_id=480, by_article_tag=True)

In [None]:
# Analyse (unseen) test set

In [None]:
# corpus_test = gen_tagged_docs(corpus_test_raw)

In [None]:
# %matplotlib inline
# plot_matches(corpus_test, take_sample=False)

In [None]:
# display_similar_article_and_categories(corpus_test, doc_id=13317)

In [None]:
# import pandas as pd
# df_test = pd.read_csv('./test.csv')

In [None]:
# df_test.iloc[5914]

In [None]:
# %matplotlib inline
# plot_matches(corpus_test, sample_size=1000)

In [None]:
# %matplotlib inline
# plot_matches(corpus_test, sample_size=1000, topn_perc=0.01)

In [None]:
doc_category_full = json_load('./lda.doc_category_full.json')
# dict.fromkeys keeps first-appearance order, so every category gets the same
# id on every run. Iterating a plain set of strings is not stable across
# interpreter sessions, which made the ids change between kernel restarts.
set_of_cats = list(dict.fromkeys(doc_category_full))
# One dictionary lookup per document instead of a linear list.index() scan.
cat_to_id = {cat: cat_id for cat_id, cat in enumerate(set_of_cats)}
doc_category_id_full = [cat_to_id[cat] for cat in doc_category_full]
doc_category_id_full

In [None]:
# Display the per-document categories loaded from the LDA output file.
doc_category_full

In [None]:
# Load the per-document top topics; the PCA scatter plots below use them as point colours.
doc_top_topics_full = json_load('./lda.doc_top_topics_full.json')

In [None]:
%matplotlib notebook

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# One document vector per entry of tag_id_map (no filtering on the tag key),
# stacked row-wise into a 2-D array.
doc_vecs = [model.dv[index] for index in tag_id_map.values()]
X = np.stack(doc_vecs, axis=0)

# NOTE(review): `sc` is not used anywhere in this cell.
sc = StandardScaler()

# Scale the features, then project onto three principal components.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

pca = PCA(n_components=3).fit(X_scaled)
X_pca = pca.transform(X_scaled)

Xax, Yax, Zax = X_pca[:, 0], X_pca[:, 1], X_pca[:, 2]

# plot: 3-D scatter coloured by doc_top_topics_full
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(Xax, Yax, Zax, c=doc_top_topics_full, s=20)
ax.view_init(30, 185)
plt.show()

In [None]:
%matplotlib notebook

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# One document vector per entry of tag_id_map (no filtering on the tag key),
# stacked row-wise into a 2-D array.
doc_vecs = [model.dv[index] for index in tag_id_map.values()]
X = np.stack(doc_vecs, axis=0)

# NOTE(review): `sc` is not used anywhere in this cell.
sc = StandardScaler()

# Scale the features, then project onto two principal components.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

pca = PCA(n_components=2).fit(X_scaled)
X_pca = pca.transform(X_scaled)

Xax, Yax = X_pca[:, 0], X_pca[:, 1]

# plot: 2-D scatter coloured by doc_top_topics_full
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(Xax, Yax, c=doc_top_topics_full, s=20)
plt.show()

In [None]:
import itertools

# Cosine similarity for every unordered pair of indices below 490, stored as
# (a, b, similarity) triples.
sims = []
for a, b in itertools.combinations(range(490), 2):
    pair_similarity = similarity_cosine(model.dv[a], model.dv[b])
    sims.append((a, b, pair_similarity))

In [None]:
# Transpose the triples into three parallel tuples: first indices, second
# indices and similarity values. This rebinds `a` and `b`.
a, b, sim = zip(*sims)

In [None]:
# Display every (a, b, similarity) triple collected in `sims`.
sims

In [None]:
# Document 219: neighbours by inferred vector (default) and by stored tag vector.
display_similar_article_and_categories(tagged_corpus_full, doc_id=219, by_article_tag=True)

In [None]:
# Document 11: neighbours by inferred vector (default) and by stored tag vector.
display_similar_article_and_categories(tagged_corpus_full, doc_id=11, by_article_tag=True)

In [None]:
# Display the tuple of similarity values unpacked from `sims`.
sim

In [None]:
# Print every triple whose similarity (third element) is above 0.6.
for i in sims:
    pair_similarity = i[2]
    if pair_similarity > 0.6:
        print(i)

In [None]:
# Inspect the raw corpus entry at index 103.
corpus_full[103]

In [None]:
# Inspect the raw corpus entry at index 301.
corpus_full[301]

In [None]:
# Document 319: neighbours by inferred vector (default) and by stored tag vector.
display_similar_article_and_categories(tagged_corpus_full, doc_id=319, by_article_tag=True)

In [None]:
# Inspect the raw corpus entry at index 29.
corpus_full[29]

In [None]:
# Inspect the raw corpus entry at index 319.
corpus_full[319]