In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
import pandas as pd
import numpy as np

pd.options.display.max_colwidth = 1200

In [None]:
# NOTE(review): this is an absolute, user-specific path, so the cell only runs
# on a machine where this exact directory exists. Consider deriving it from a
# configurable data directory.
df = pd.read_csv('/home/dnk8n/src/clients/internship/internship/data/travel-wiki-extract-full-templates-processed.csv')
df.shape

In [None]:
# # dev
# df = df.sample(50, random_state=42)
# df.shape

In [None]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
train_df.shape, test_df.shape

In [None]:
train_df.head(2)

In [None]:
corpus_text = train_df.text.tolist()

In [None]:
print(corpus_text[1])

In [None]:
def friendly_tag_corpus(row):
    """Build the human-readable tag list for one article row.

    Parameters
    ----------
    row : object exposing ``title`` and ``categories`` attributes
        ``categories`` is read as a newline-separated string of category
        names.

    Returns
    -------
    list
        ``[title, category_1, category_2, ...]``. The title is always the
        first element, so ``tags[0]`` identifies the article itself.
    """
    raw_categories = row.categories
    # A missing cell does not arrive as a string (e.g. NaN from pandas), and
    # calling ``.split`` on it would raise AttributeError. Treat it as an
    # article without categories instead of aborting the whole ``apply``.
    if not isinstance(raw_categories, str):
        return [row.title]
    # Blank lines (trailing newline, doubled newline, empty field) would
    # otherwise become an empty-string tag shared by unrelated articles.
    doc_categories = [category for category in raw_categories.split('\n') if category]
    return [row.title, *doc_categories]

In [None]:
corpus_tags_friendly = train_df[["title", "categories"]].apply(friendly_tag_corpus, axis=1).to_list()

In [None]:
corpus_tags_friendly[1]

In [None]:
def build_tag_id_mapping(corpus_tags):
    """Assign a dense integer id to every distinct tag in the corpus.

    Parameters
    ----------
    corpus_tags : iterable of iterables
        One collection of tags per document.

    Returns
    -------
    dict
        Maps each distinct tag to a unique id in ``range(number_of_tags)``.
        Ids follow the sorted order of the tags, so the same corpus always
        produces the same mapping.
    """
    unique_tags = {tag for doc_tags in corpus_tags for tag in doc_tags}
    # Enumerating the set directly would hand out ids in hash order, which for
    # strings changes from one interpreter start to the next; a mapping built
    # in a new kernel would then disagree with a previously saved model.
    # ``key=str`` keeps the sort usable even if a non-string tag slips in.
    return {tag: idx for idx, tag in enumerate(sorted(unique_tags, key=str))}

In [None]:
tag_id_mapping = build_tag_id_mapping(corpus_tags_friendly)

In [None]:
id_tag_mapping = {v: k for k,v in tag_id_mapping.items()}

In [None]:
corpus_tags = [[tag_id_mapping[tag] for tag in tags] for tags in corpus_tags_friendly]

In [None]:
import spacy
from spacy.language import Language

nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("merge_entities")
nlp.pipe_names

In [None]:
def corpus2tokens(corpus_text, *args, **kwargs):
    """Tokenize every text of the corpus with the module-level ``nlp`` pipeline.

    Extra positional and keyword arguments are passed straight to
    ``nlp.pipe``. The result holds one list per input text, each being the
    materialised output of ``doc2tokens`` for that text.
    """
    tokenized_corpus = []
    for parsed_doc in nlp.pipe(corpus_text, *args, **kwargs):
        tokenized_corpus.append(list(doc2tokens(parsed_doc)))
    return tokenized_corpus


def doc2tokens(doc):
    """Lazily yield the token list of each sentence in ``doc``.

    Sentences are taken from ``doc.sents`` and converted with
    ``sent2tokens``; sentences whose token list is empty are dropped.
    """
    # ``filter(None, ...)`` keeps only the truthy (non-empty) token lists.
    yield from filter(None, map(sent2tokens, doc.sents))


def sent2tokens(sent):
    """Return the lower-cased text of every token in ``sent``.

    Tokens flagged as punctuation (``is_punct``) or whitespace (``is_space``)
    are left out.
    """
    words = []
    for tok in sent:
        if tok.is_punct or tok.is_space:
            continue
        words.append(tok.text.lower())
    return words

In [None]:
corpus_words = corpus2tokens(corpus_text, batch_size=15, n_process=3)

In [None]:
corpus_words[1]

In [None]:
corpus_tags[1]

In [None]:
[id_tag_mapping[tag] for tag in corpus_tags[1]]

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
def gen_tagged_docs(corpus_words, corpus_tags):
    """Yield one ``TaggedDocument`` per sentence of the corpus.

    ``corpus_words`` and ``corpus_tags`` are walked in parallel, document by
    document. Every sentence of a document is paired with that document's
    complete tag list.
    """
    for sentences, tags in zip(corpus_words, corpus_tags):
        yield from (TaggedDocument(words, tags) for words in sentences)

In [None]:
train_corpus = list(gen_tagged_docs(corpus_words, corpus_tags))

In [None]:
# model = Doc2Vec(vector_size=300, min_count=2, epochs=100)
# Hyper-parameter notes below follow the gensim Doc2Vec documentation.
# NOTE(review): no ``seed`` is passed and several workers are used, so
# repeated runs are presumably not bit-for-bit reproducible -- confirm if
# reproducibility matters here.
model = Doc2Vec(
    vector_size=300,  # dimensionality of the learned vectors
    epochs=200,  # passes over the corpus; reused by the later model.train(...) call
    dm=0,  # 0 selects the distributed-bag-of-words (PV-DBOW) training mode
    min_count=3,  # ignore words with a total frequency lower than this
    negative=5,  # negative sampling with 5 noise words
    hs=0,  # hierarchical softmax off; negative sampling is used instead
    sample=0,  # no down-sampling of high-frequency words
    workers=3  # number of training threads
)

In [None]:
%%time
model.build_vocab(train_corpus)

In [None]:
print(f"Word 'airport' appeared {model.wv.get_vecattr('airport', 'count')} times in the training corpus.")

In [None]:
%%time
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# model.save("./doc2vec.model")

In [None]:
wv = model.wv
# wv.save('./doc2vec.wv')

In [None]:
wv.most_similar("travel")

In [None]:
# Sanity check on the first training sentence: list the 10 tags closest to it,
# once starting from its words and once from a stored tag vector.
doc = train_corpus[0].words
print(doc)

# Using words
# A fresh vector is inferred from the token list and compared with the
# trained tag vectors in ``model.dv``.
inferred_vector = model.infer_vector(doc)
sims = model.dv.most_similar([inferred_vector], topn=10)
for doc_id, factor in sims:
    print(factor, id_tag_mapping[doc_id])

print("************")    

# Using doc vector
# Despite the variable name, nothing is inferred here: the trained vector of
# the "Donakonda Airport" tag is looked up directly. The lookup raises
# KeyError if that title is not among the training tags.
inferred_vector = model.dv[tag_id_mapping["Donakonda Airport"]]
sims = model.dv.most_similar([inferred_vector], topn=10)
for doc_id, factor in sims:
    print(factor, id_tag_mapping[doc_id])

In [None]:
[id_tag_mapping[tag] for tag in train_corpus[0].tags]

In [None]:
import random

random.seed(42)


# For a random sample of 50 training sentences, infer a vector and record
# where the sentence's own tags land in the similarity ranking.
ranks = []
second_ranks = []

# Shuffle a copy so the order of ``train_corpus`` itself is left untouched.
train_corpus_copy = train_corpus.copy()
random.shuffle(train_corpus_copy)
sample_train_corpus = train_corpus_copy[:50]

# ``model.dv`` holds one vector per tag, not per sentence, so the full ranking
# has exactly this many entries. Sizing ``topn`` from the sentence count only
# works while sentences outnumber tags; otherwise the ranking is truncated and
# a tag lookup can fail.
n_tag_vectors = len(model.dv)

for sent_id, sample in enumerate(sample_train_corpus):
    inferred_vector = model.infer_vector(sample.words)
    sims = model.dv.most_similar([inferred_vector], topn=n_tag_vectors)
    # Index the ranking once instead of rebuilding the id list and scanning it
    # linearly for every tag of the sentence.
    position_of = {docid: position for position, (docid, sim) in enumerate(sims)}
    most_similar_tag_indices = [position_of[tag] for tag in sample.tags]
    # rank: 0-based position of the best-ranked own tag (0 means a perfect hit).
    rank = min(most_similar_tag_indices)
    # second_rank: position right after the worst-ranked own tag.
    second_rank = max(most_similar_tag_indices) + 1
    ranks.append(rank)
    second_ranks.append(second_rank)

In [None]:
%matplotlib inline

import collections
import matplotlib.pyplot as plt


# Split the sampled sentences into two buckets by the value stored in
# ``ranks``: 0 means one of the sentence's own tags was ranked first,
# anything greater means another tag was ranked first.
counter = collections.Counter(ranks)
sum_0 = sum([v for k, v in counter.items() if k <= 0])
sum_all_else = sum([v for k, v in counter.items() if k > 0])
# Bar 0 = own tag ranked first, bar 1 = all other outcomes.
plt.bar([0,1], [sum_0, sum_all_else])
print([sum_0, sum_all_else])

In [None]:
print('Training example correctly matched (%): ', 100 * sum_0 / (sum_0 + sum_all_else))
print('Training example incorrectly matched (%): ', 100 * sum_all_else / (sum_0 + sum_all_else))

In [None]:
# Inspect one training sentence: infer its vector, rank every tag vector
# against it and print a few representative positions of that ranking.
sent_id = 42
sentence = train_corpus[sent_id]
article_tag_id = sentence.tags[0]
inferred_vector = model.infer_vector(sentence.words)
# ``model.dv`` holds one vector per tag, so its length (not the number of
# training sentences) is the size of a complete ranking.
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
print('Document ({} - {}): «{}»\n'.format(id_tag_mapping[article_tag_id], train_df.loc[train_df.title == id_tag_mapping[article_tag_id]]['url'], ' '.join(sentence.words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

# Index the ranking once instead of rebuilding the id list for every tag.
position_of = {docid: position for position, (docid, sim) in enumerate(sims)}
most_similar_tag_indices = [position_of[tag] for tag in sentence.tags]

# If one of the sentence's own tags is ranked last, "one past it" would fall
# off the end of ``sims``; clamp to the last valid position.
just_outside_tags = min(max(most_similar_tag_indices) + 1, len(sims) - 1)

for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('THIRD-MOST', 2), ('JUST-OUTSIDE-TAGS', just_outside_tags), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], id_tag_mapping[sims[index][0]]))

In [None]:
# def stream_corpus_to_dicst(corpus):
#     for doc in corpus:
#         yield {
#             'words': doc.words,
#             'tags': doc.tags
#         }

In [None]:
# import simplejson


# def json_save(data, filename):
#     with open(filename, 'w', encoding='utf-8') as f:
#         simplejson.dump(data, f, separators=(',', ':'), iterable_as_array=True)

In [None]:
# json_save(stream_corpus_to_dicst(train_corpus), './doc2vec.corpus.json')

In [None]:
# json_save(tag_id_mapping, './doc2vec.tag_id_mapping.json')

In [None]:
import numpy as np

# Feature matrix for clustering: one trained vector per distinct leading tag
# (``tags[0]``) found in the training corpus, stacked row-wise.
x = np.stack([
    model.dv[tag_id]
    for tag_id in {tagged.tags[0] for tagged in train_corpus}
])

In [None]:
doc_tags = set(doc.tags[0] for doc in train_corpus)

In [None]:
len(doc_tags)

In [None]:
len(x)

In [None]:
x.shape

In [None]:
from sklearn.cluster import KMeans


# Elbow search: fit k-means for several k and record the inertia
# (within-cluster sum of squared distances) of each fit.
distortions = []
# KMeans raises when asked for more clusters than there are samples, which
# happens when the notebook runs on a small development sample; skip those k.
testks = [k for k in range(10, 60, 10) if k <= len(x)]
for k in testks:
    # Fixed seed: k-means starts from random centroids, so an unseeded run
    # draws a different curve every time the cell is executed.
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(x)
    distortions.append(kmeans.inertia_)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(16,8))
plt.plot(testks, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Final clustering used to colour the scatter plot. The seed is fixed so that
# cluster labels, and therefore point colours, are the same on every run.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(x)
# One cluster label per row of ``x``.
clusters = kmeans.predict(x)

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(x)
x_pca = pca_result[:, 0]
y_pca = pca_result[:, 1]

In [None]:
cluster_colors = np.array(['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff', '#000000'])
color = cluster_colors[clusters]
#train_df['text'] = train_df.apply(lambda row: str(row.doc_id) + '-' + str(row.sent_id), axis=1)


In [None]:
import bokeh.io
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet

# from bokeh.charts import Donut, HeatMap, Histogram, Line, Scatter, show, output_notebook, output_file
bokeh.io.output_notebook()

In [None]:
#visualize the data using bokeh

# TOOLS = "pan, xpan, ypan, xwheel_pan, ywheel_pan, 
#          wheel_zoom, xwheel_zoom, ywheel_zoom, zoom_in,
#          xzoom_in, yzoom_in, zoom_out, xzoom_out, yzoom_out,
#          click, tap, doubletap, crosshair, box_select,
#          xbox_select, ybox_select, poly_select, lasso_select,
#          box_zoom, xbox_zoom, ybox_zoom, save, undo, redo, reset,
#          help, box_edit, line_edit, point_draw, poly_draw,
#          poly_edit, freehand_draw or hover"

# The column name must match the field referenced by ``fill_color`` below.
# It was previously spelled 'colur', so the glyph pointed at a column that
# did not exist in the data source and the per-cluster colours were never used.
source = ColumnDataSource(dict(x=x_pca, y=y_pca, color=color))
tools = "pan,wheel_zoom,box_zoom,reset,hover,save"

# NOTE(review): newer Bokeh releases use ``width``/``height`` instead of
# ``plot_width``/``plot_height`` -- confirm against the installed version.
plot = figure(plot_width=800, plot_height=450, tools=tools)

#draw circles
plot.circle(y='y', x='x', source=source, size=15, fill_color='color')
show(plot)