In [None]:
import numpy as np
from gensim import matutils

def similarity_cosine(vec1, vec2):
    """Return the cosine similarity of ``vec1`` and ``vec2``.

    Each input is passed through ``matutils.unitvec`` and the dot product of
    the two results is returned.
    """
    unit_first = matutils.unitvec(vec1)
    unit_second = matutils.unitvec(vec2)
    return np.dot(unit_first, unit_second)

In [None]:
import simplejson

def json_load(filename):
    """Open ``filename`` as UTF-8 text and return its content parsed by ``simplejson``."""
    with open(filename, mode='r', encoding='utf-8') as handle:
        parsed = simplejson.load(handle)
    return parsed

In [None]:
from gensim.models.doc2vec import TaggedDocument

def gen_tagged_docs(corpus):
    """Build one ``TaggedDocument`` per corpus entry.

    Every entry must provide a ``"tokens"`` value (used as the words) and a
    ``"tags"`` value (used as the tags).
    """
    tagged_docs = []
    for entry in corpus:
        tagged_docs.append(TaggedDocument(entry["tokens"], entry["tags"]))
    return tagged_docs

In [None]:
def display_similar_article_and_categories(corpus, doc_id=0, topn=10, by_article_tokens=True, by_article_tag=False):
    """Print the nearest neighbours of ``corpus[doc_id]`` next to its known tags.

    Reads the notebook-level ``model`` (the loaded Doc2Vec model), so that name
    must be defined before this function is called.

    Args:
        corpus: Sequence of tagged documents exposing ``.words`` and ``.tags``.
        doc_id: Position in ``corpus`` of the document to inspect.
        topn: Number of neighbours printed per section.
        by_article_tokens: When true, query ``model.dv`` with a vector inferred
            from the document's words.
        by_article_tag: When true, query ``model.dv`` with the vector stored
            under the document's first tag.

    Returns:
        None. Everything is written to stdout.
    """
    def _print_neighbours(heading, query_vector):
        # Shared output for both query modes: one "<similarity> <key>" line per hit.
        print("************")
        print(heading)
        print()
        for key, similarity in model.dv.most_similar([query_vector], topn=topn):
            print(similarity, key)

    document = corpus[doc_id]
    # Preview: the first 200 characters of the space-joined tokens.
    print(' '.join(document.words)[:200])

    if by_article_tokens:
        _print_neighbours("Get similarity based on tokens:", model.infer_vector(document.words))

    if by_article_tag:
        _print_neighbours("Get similarity based on article tag:", model.dv[document.tags[0]])

    print("************")
    print("Actual known tags:")
    print()
    # Falsy tags (for example an empty string or 0) are left out of the listing.
    print([tag for tag in document.tags if tag])

In [None]:
import random
import collections
import matplotlib.pyplot as plt


def rank_by_inferredvector(corpus, sent_ids):
    """Rank each document's own tags among the neighbours of its inferred vector.

    For every id in ``sent_ids`` a vector is inferred from the document's words
    with the notebook-level ``model``, all ``len(corpus)`` nearest keys are
    requested from ``model.dv``, and the best (lowest) position of any of the
    document's truthy tags in that ranking is recorded and printed.

    Args:
        corpus: Sequence of tagged documents exposing ``.words`` and ``.tags``.
        sent_ids: Iterable of positions in ``corpus`` to evaluate.

    Returns:
        List of best ranks (0 = top hit), one per document that has at least one
        truthy tag; documents without such a tag contribute nothing.

    Raises:
        ValueError: If a truthy tag does not appear in the returned ranking.
    """
    ranks = []
    for sent_id in sent_ids:
        document = corpus[sent_id]
        inferred_vector = model.infer_vector(document.words)
        sims = model.dv.most_similar([inferred_vector], topn=len(corpus))
        # Build the ranked key list once per document instead of once per tag.
        ranked_keys = [key for key, _ in sims]
        # Falsy tags (for example '' or 0) are skipped, as before.
        tag_positions = [ranked_keys.index(tag) for tag in document.tags if tag]
        if tag_positions:
            rank = min(tag_positions)
            print(f'{sent_id}: Ranked {rank} ({sims[rank][0]}) out of {len(sims)}')
            ranks.append(rank)
    return ranks

            
def rank_by_random(corpus, sent_ids):
    """Baseline ranker: draw one uniformly random rank per entry of ``sent_ids``.

    Args:
        corpus: Sized collection; only its length is used.
        sent_ids: Iterable whose length determines how many ranks are drawn.

    Returns:
        List of integers, each in ``0 .. len(corpus) - 1``.

    Raises:
        ValueError: If ``corpus`` is empty while ``sent_ids`` is not.
    """
    corpus_size = len(corpus)
    # randrange excludes the upper bound, so every draw is a valid 0-based rank;
    # randint(0, corpus_size) could also return corpus_size, one past the last rank.
    return [random.randrange(corpus_size) for _ in sent_ids]


def plot_matches(corpus, rank_func=rank_by_inferredvector, take_sample=True, sample_size=50, sample_seed=42, topn_perc=0.1):
    """Plot and print how many documents are ranked first, near the top, or badly.

    Args:
        corpus: Sequence of documents handed to ``rank_func``.
        rank_func: Callable ``(corpus, sent_ids) -> list of ranks``.
        take_sample: When true, evaluate a seeded random sample of document
            positions; otherwise evaluate every position.
        sample_size: Number of positions to sample; clamped to ``len(corpus)``.
        sample_seed: Seed passed to ``random.seed`` before sampling (this reseeds
            the module-level generator).
        topn_perc: Percentage of the corpus size below which a non-zero rank
            still counts as acceptable.

    Returns:
        None. A three-bar chart is drawn and the counts/percentages are printed.
    """
    if take_sample:
        random.seed(sample_seed)
        # Clamp so a corpus smaller than sample_size is evaluated in full
        # instead of making random.sample raise ValueError.
        sent_ids = random.sample(range(len(corpus)), min(sample_size, len(corpus)))
    else:
        sent_ids = list(range(len(corpus)))
    ranks = rank_func(corpus, sent_ids)

    # Ranks strictly below this bound (and above 0) fall in the "acceptable" bucket.
    acceptable_bound = len(corpus) / (100 / topn_perc)
    # Initialise before the loop so an empty rank list yields zeros, not a NameError.
    sum_0 = sum_1_acceptable = sum_all_else = 0
    for rank in ranks:
        if rank == 0:
            sum_0 += 1
        elif rank < acceptable_bound:
            sum_1_acceptable += 1
        else:
            sum_all_else += 1

    counts = [sum_0, sum_1_acceptable, sum_all_else]
    plt.bar([0, 1, 2], counts)
    print(counts)

    total = sum(counts)
    if total == 0:
        # Nothing was ranked, so percentages are undefined.
        print('No ranks were produced; nothing to summarise.')
        return
    print('Test example correctly matched (%): ', 100 * sum_0 / total)
    print(f'Test example matched in top {topn_perc}% (%): ', 100 * sum_1_acceptable / total)
    print('Test example badly matched (%): ', 100 * sum_all_else / total)
    

def determine_matches_strict(corpus, sent_ids):
    """Count documents whose top neighbours are exactly their own tags.

    For each id in ``sent_ids`` a vector is inferred with the notebook-level
    ``model``; the first ``len(tags)`` keys returned by ``model.dv.most_similar``
    are compared, as a set, with the document's tags. Mismatches are printed.

    Returns:
        Tuple ``(matches, mismatches)``.
    """
    matched = 0
    unmatched = 0
    for sent_id in sent_ids:
        query = model.infer_vector(corpus[sent_id].words)
        neighbours = model.dv.most_similar([query], topn=len(corpus))
        own_tags = corpus[sent_id].tags
        actual_tags = {key for key, _ in neighbours[:len(own_tags)]}
        expected_tags = set(own_tags)
        if actual_tags != expected_tags:
            print('actual_tags:', list(actual_tags))
            print('expected_tags:', list(expected_tags))
            unmatched += 1
        else:
            matched += 1
    return matched, unmatched
    
    
def plot_matches_strict(corpus, take_sample=True, sample_size=50, sample_seed=42):
    """Plot and print the strict match / no-match counts from ``determine_matches_strict``.

    Args:
        corpus: Sequence of documents to evaluate.
        take_sample: When true, evaluate a seeded random sample of document
            positions; otherwise evaluate every position.
        sample_size: Number of positions to sample; clamped to ``len(corpus)``.
        sample_seed: Seed passed to ``random.seed`` before sampling (this reseeds
            the module-level generator).

    Returns:
        None. A two-bar chart is drawn and the counts/percentages are printed.
    """
    if take_sample:
        random.seed(sample_seed)
        # Clamp so a corpus smaller than sample_size is evaluated in full
        # instead of making random.sample raise ValueError.
        sent_ids = random.sample(range(len(corpus)), min(sample_size, len(corpus)))
    else:
        sent_ids = list(range(len(corpus)))

    sum_match, sum_nomatch = determine_matches_strict(corpus, sent_ids)
    plt.bar([0, 1], [sum_match, sum_nomatch])
    print([sum_match, sum_nomatch])

    total = sum_match + sum_nomatch
    if total == 0:
        # No documents were evaluated, so percentages are undefined.
        print('No documents were evaluated; nothing to summarise.')
        return
    print('Test example correctly matched (%): ', 100 * sum_match / total)
    print('Test example badly matched (%): ', 100 * sum_nomatch / total)

In [None]:
%matplotlib notebook

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler

def cluster_plot(X, n, cluster_ids, **kwargs):
    """Scatter-plot ``X`` in 2-D or 3-D after standardising and PCA projection.

    Args:
        X: Feature rows to plot (one row per sample).
        n: Number of PCA components / plot dimensions; must be 2 or 3.
        cluster_ids: Per-sample values used as scatter colours.
        **kwargs: Forwarded to ``StandardScaler``.

    Returns:
        None. For any ``n`` other than 2 or 3, ``'Invalid'`` is printed and
        nothing is fitted or drawn.
    """
    # Validate first so an unsupported n neither fits models nor leaves an
    # empty figure behind.
    if n not in (2, 3):
        print('Invalid')
        return

    # np.matrix (e.g. the result of a sparse .todense()) is rejected by recent
    # scikit-learn releases; hand the estimators a plain ndarray instead.
    if isinstance(X, np.matrix):
        X = np.asarray(X)

    X_scaled = StandardScaler(**kwargs).fit(X).transform(X)
    X_pca = PCA(n_components=n).fit(X_scaled).transform(X_scaled)

    fig = plt.figure()
    if n == 3:
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2], c=cluster_ids, s=20)
        # Fixed camera: elevation 30, azimuth 185.
        ax.view_init(30, 185)
    else:
        ax = fig.add_subplot(111)
        ax.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_ids, s=20)
    plt.show()

In [None]:
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

def kmeans_plot(vecs, k_range, random_state=None):
    """Draw an elbow plot of KMeans inertia over the cluster counts in ``k_range``.

    Args:
        vecs: Sequence of equally shaped vectors; stacked into one array.
        k_range: Iterable of cluster counts to try.
        random_state: Optional seed forwarded to ``KMeans`` so the curve can be
            reproduced; ``None`` keeps the estimator's default behaviour.

    Returns:
        None. The plot is shown.
    """
    x = np.stack(vecs)
    # Materialise once: a one-shot iterable would otherwise be exhausted by the
    # fitting loop and leave nothing for the x-axis.
    k_values = list(k_range)

    distortions = [
        KMeans(n_clusters=k, random_state=random_state).fit(x).inertia_
        for k in k_values
    ]

    plt.figure(figsize=(16, 8))
    plt.plot(k_values, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()

In [None]:
def get_cluster_ids(n, vecs, random_state=None):
    """Fit KMeans with ``n`` clusters on ``vecs`` and return each vector's cluster id.

    Args:
        n: Number of clusters.
        vecs: Sequence of equally shaped vectors; stacked into one array.
        random_state: Optional seed forwarded to ``KMeans`` so the assignment can
            be reproduced; ``None`` keeps the estimator's default behaviour.

    Returns:
        Array of cluster ids, one per input vector, in input order.
    """
    x = np.stack(vecs)
    kmeans = KMeans(n_clusters=n, random_state=random_state)
    return kmeans.fit(x).predict(x)

In [None]:
import itertools


def get_pairwise_simularities(vectors, threshold=0.8, n_vectors=490):
    """Compute cosine similarity for every unordered pair among the leading vectors.

    Args:
        vectors: Indexable, sized collection of vectors (accessed as ``vectors[i]``).
        threshold: Pairs whose similarity is strictly greater are printed.
        n_vectors: Upper bound on how many leading indices are compared. The
            default keeps the previously hard-coded 490; the bound is clamped to
            ``len(vectors)`` so shorter inputs no longer raise IndexError.

    Returns:
        List of ``(a, b, similarity)`` tuples with ``a < b``; empty when fewer
        than two vectors are available.
    """
    count = min(n_vectors, len(vectors))
    sims = [
        (a, b, similarity_cosine(vectors[a], vectors[b]))
        for a, b in itertools.combinations(range(count), 2)
    ]
    for entry in sims:
        if entry[2] > threshold:
            print(entry)
    return sims

In [None]:
from gensim.models.doc2vec import Doc2Vec
# Load the Doc2Vec model from disk. The helper functions defined earlier in the
# notebook read it through the global name `model`.
model = Doc2Vec.load('./doc2vec.model')

In [None]:
from gensim.models import KeyedVectors
# Load keyed vectors from './doc2vec.wv'. `wv` is not referenced again in the
# cells visible here.
wv = KeyedVectors.load('./doc2vec.wv')

In [None]:
# Parsed JSON corpus; gen_tagged_docs reads a "tokens" and a "tags" value from
# every entry.
corpus_full = json_load('./doc2vec.corpus.full.json')

In [None]:
# One TaggedDocument per corpus entry.
tagged_corpus_full = gen_tagged_docs(corpus_full)

In [None]:
# %matplotlib inline
# plot_matches_strict(tagged_corpus_full)

In [None]:
# Show neighbours for document 319, both from a freshly inferred vector (the
# default) and from the vector stored under its first tag.
display_similar_article_and_categories(tagged_corpus_full, doc_id=319, by_article_tag=True)

In [None]:
# Same two views for document 29.
display_similar_article_and_categories(tagged_corpus_full, doc_id=29, by_article_tag=True)

In [None]:
# Look up model.dv with the integer keys 0 .. len(corpus_full) - 1, then draw the
# elbow curve for k = 1 .. 39.
# NOTE(review): assumes these integer keys address the corpus documents in
# corpus order — confirm against how the model was trained.
doc_vecs_unfiltered = [model.dv[v] for v in range(len(corpus_full))]
kmeans_plot(doc_vecs_unfiltered, range(1,40))

In [None]:
# Two clusters. NOTE(review): presumably read off the elbow plot above — confirm.
cluster_ids_unfiltered = get_cluster_ids(2, doc_vecs_unfiltered)

In [None]:
# 2-D PCA projection coloured by cluster id.
cluster_plot(doc_vecs_unfiltered, 2, cluster_ids_unfiltered)

In [None]:
%matplotlib notebook

# 3-D PCA projection of the same vectors and cluster ids.
cluster_plot(doc_vecs_unfiltered, 3, cluster_ids_unfiltered)

In [None]:
# Pairwise cosine similarities taken straight from model.dv;
# get_pairwise_simularities compares indices 0 .. 489 only.
sims_unfiltered = get_pairwise_simularities(model.dv)

In [None]:
# Entity labels whose tokens are kept when filtering the corpus.
informing_ents = [
    'PERSON', 'NORP', 'ORG', 'EVENT', 'LANGUAGE', 'LOC',
    'GPE', 'FAC', 'LAW', 'PRODUCT', 'MISC',
]

# Each loaded document is a sequence of (token, entity label) pairs; keep only
# the tokens whose label is listed above.
corpus_filtered_by_ents = [
    [token for token, ent_type in token_objects if ent_type in informing_ents]
    for token_objects in json_load('./doc2vec.corpus_token_objects.json')
]
# Tag every filtered document with its position in the list.
tagged_corpus_filtered_by_ents = [
    TaggedDocument(kept_tokens, [position])
    for position, kept_tokens in enumerate(corpus_filtered_by_ents)
]
# Infer one vector per filtered document, in corpus order.
doc_vecs_filtered_by_ents = list(map(model.infer_vector, corpus_filtered_by_ents))

In [None]:
%matplotlib inline

# Elbow curve (k = 1 .. 39) for the vectors inferred from entity tokens only.
kmeans_plot(doc_vecs_filtered_by_ents, range(1,40))

In [None]:
# Three clusters. NOTE(review): presumably read off the elbow plot above — confirm.
clusters_filtered_by_ents = get_cluster_ids(3, doc_vecs_filtered_by_ents)

In [None]:
# 2-D PCA projection coloured by cluster id.
cluster_plot(doc_vecs_filtered_by_ents, 2, clusters_filtered_by_ents)

In [None]:
%matplotlib notebook

# 3-D PCA projection of the same vectors and cluster ids.
cluster_plot(doc_vecs_filtered_by_ents, 3, clusters_filtered_by_ents)

In [None]:
# Pairwise cosine similarities between the entity-only document vectors.
sims_filtered_by_ents = get_pairwise_simularities(doc_vecs_filtered_by_ents)

In [None]:
# Display two raw corpus entries.
# NOTE(review): presumably a pair surfaced by the similarity printout above — confirm.
corpus_full[414]

In [None]:
corpus_full[431]

In [None]:
# Neighbours of documents 29 and 319 when queried with a vector inferred from
# their entity tokens only (by_article_tag keeps its default of False).
display_similar_article_and_categories(tagged_corpus_filtered_by_ents, doc_id=29)

In [None]:
display_similar_article_and_categories(tagged_corpus_filtered_by_ents, doc_id=319)

In [None]:
# Entity-only token lists for documents 11, 319 and 29 ...
corpus_filtered_by_ents[11]

In [None]:
corpus_filtered_by_ents[319]

In [None]:
corpus_filtered_by_ents[29]

In [None]:
# ... and the corresponding full corpus entries for comparison.
corpus_full[11]

In [None]:
corpus_full[319]

In [None]:
corpus_full[29]

In [None]:
# Reduced label whitelist: compared with `informing_ents`, the labels NORP,
# EVENT, LANGUAGE, LAW, PRODUCT and MISC are left out.
informing_ents_reduced = ['PERSON', 'ORG', 'LOC', 'GPE', 'FAC']

# Each loaded document is a sequence of (token, entity label) pairs; keep only
# the tokens whose label is in the reduced whitelist.
corpus_filtered_by_ents_reduced = [
    [token for token, ent_type in token_objects if ent_type in informing_ents_reduced]
    for token_objects in json_load('./doc2vec.corpus_token_objects.json')
]
# Tag every filtered document with its position in the list.
tagged_corpus_filtered_by_ents_reduced = [
    TaggedDocument(kept_tokens, [position])
    for position, kept_tokens in enumerate(corpus_filtered_by_ents_reduced)
]
# Infer one vector per filtered document, in corpus order.
doc_vecs_filtered_by_ents_reduced = list(map(model.infer_vector, corpus_filtered_by_ents_reduced))

In [None]:
%matplotlib inline

# Elbow curve (k = 1 .. 39) for the vectors inferred from the reduced entity set.
kmeans_plot(doc_vecs_filtered_by_ents_reduced, range(1,40))

In [None]:
# Three clusters. NOTE(review): presumably read off the elbow plot above — confirm.
clusters_filtered_by_ents_reduced = get_cluster_ids(3, doc_vecs_filtered_by_ents_reduced)

In [None]:
# 2-D PCA projection coloured by cluster id.
cluster_plot(doc_vecs_filtered_by_ents_reduced, 2, clusters_filtered_by_ents_reduced)

In [None]:
# 3-D PCA projection. No `%matplotlib notebook` switch precedes this cell,
# unlike the earlier 3-D plots.
cluster_plot(doc_vecs_filtered_by_ents_reduced, 3, clusters_filtered_by_ents_reduced)

In [None]:
# Plain list copy of the cluster ids; its length is displayed as the cell output.
clusters_filtered_by_ents_reduced_list = list(clusters_filtered_by_ents_reduced)
len(clusters_filtered_by_ents_reduced_list)

In [None]:
# Collect, per cluster, every entity token that occurs in a document assigned
# to that cluster.
cluster0_terms = set()
cluster1_terms = set()
cluster2_terms = set()
# Explicit label -> set mapping: an unexpected cluster id now raises KeyError
# instead of silently adding its terms to whichever set was used last.
terms_by_cluster = {0: cluster0_terms, 1: cluster1_terms, 2: cluster2_terms}
for cluster, ents in zip(clusters_filtered_by_ents_reduced_list, corpus_filtered_by_ents_reduced):
    terms_by_cluster[int(cluster)].update(ents)

In [None]:
# Display the collected terms of each cluster in sorted order, one cell per cluster.
sorted(cluster0_terms)

In [None]:
sorted(cluster1_terms)

In [None]:
sorted(cluster2_terms)

In [None]:
# Turn every filtered document into one string: spaces inside a token become
# underscores, then the tokens are joined with single spaces.
# NOTE(review): presumably so multi-word entities survive as single terms —
# confirm against the vectorizer's tokenisation settings.
corpus_filtered_by_ents_for_tfidf_reduced = [' '.join((token.replace(' ', '_') for token in doc)) for doc in corpus_filtered_by_ents_reduced]

In [None]:
# Vectorise the entity strings with TF-IDF (bag of words); the vocabulary is
# built from the entity tokens.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus_filtered_by_ents_for_tfidf_reduced)

# Displayed as the cell output.
X.shape

In [None]:
# Vocabulary terms of the fitted vectorizer, as a plain list.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()

In [None]:
%matplotlib inline

kmeans_plot(X.todense(), range(1,40))

In [None]:
from sklearn.cluster import KMeans
# Cluster the TF-IDF rows into two groups and keep the per-row labels as a
# plain Python list.
num_clusters = 2
km = KMeans(n_clusters=num_clusters)
clusters = km.fit(X).labels_.tolist()

In [None]:
# 2-D PCA projection of the TF-IDF rows coloured by cluster label.
# toarray() yields a plain ndarray; todense() returns np.matrix, which recent
# scikit-learn estimators reject.
cluster_plot(X.toarray(), 2, clusters)