In [11]:
# enable autocomplete
%config IPCompleter.greedy=True

In [12]:
import json
with open('articles.json', 'r') as f:
    articles = json.load(f)

In [13]:
import logging
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO)

print(articles[1].keys())
print(articles[1]['url'])

dict_keys(['feed', 'source_id', 'url', 'title', 'top_image', 'movies', 'keywords', 'tags', 'authors', 'publish_date', 'summary', 'html', 'meta_data', 'language', 'text', 'vector', 'vector_2d'])
https://blog.fefe.de/?ts=a5add387


In [14]:
articles_by_language = {}

for article in articles:
    language = article.get('language', 'none')
    if not language:
        #print(article['text'][:100])
        pass
    articles_in_language = articles_by_language.get(language, False)
    if not articles_in_language:
        articles_by_language[language] = []
    articles_by_language[language].append(article)

articles_by_url = {article['url']: article for article in articles}
    
# sort
for language in articles_by_language.keys():
    articles_by_language[language] = sorted(articles_by_language[language], 
                                            key=lambda article: article['publish_date'])
    
for language in articles_by_language.keys():
    print(f'{language}: {len(articles_by_language[language])}')

de: 12354
en: 1820
: 15


In [15]:

from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec

def preprocess_text(text):
    text = text.lower()

    # remove whitespaces
    text = ' '.join(text.strip().split())

    text = text.replace('ä', 'ae') \
        .replace('ö', 'oe') \
        .replace('ü', 'ue')

    return text.split('.')


def train_model(all_docs, vec_size=100, alpha=0.025, min_alpha=0.00025, min_count=2, epochs=40):
    tagged_data = [TaggedDocument(words=word_tokenize(' '.join(preprocess_text(doc['text']))),
                                  tags=[doc['url']]) for doc in all_docs if doc.get('text', False)]
    logging.info(f'tagged {len(tagged_data)} of {len(all_docs)} docs')
    
    model = Doc2Vec(vector_size=vec_size,
                    alpha=alpha,
                    min_alpha=min_alpha,
                    min_count=min_count,
                    epochs=epochs)

    model.build_vocab(tagged_data)
    logging.info('training....')
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    return model


In [16]:
# needed on first run
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
%%time
#german_articles = articles_by_language['de']

DIMENSIONS = 100
model = train_model(articles_by_language['de'], vec_size=DIMENSIONS, min_count=20)


2018-09-18 21:42:34,724 INFO:tagged 12138 of 12354 docs
2018-09-18 21:42:34,724 INFO:collecting all words and their counts
2018-09-18 21:42:34,725 INFO:PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-09-18 21:42:35,986 INFO:PROGRESS: at example #10000, processed 4437618 words (3521862/s), 209083 word types, 9995 tags
2018-09-18 21:42:36,281 INFO:collected 235768 word types and 12131 unique tags from a corpus of 12138 examples and 5514303 words
2018-09-18 21:42:36,282 INFO:Loading a fresh vocabulary
2018-09-18 21:42:36,459 INFO:effective_min_count=20 retains 17029 unique words (7% of original 235768, drops 218739)
2018-09-18 21:42:36,459 INFO:effective_min_count=20 leaves 4930654 word corpus (89% of original 5514303, drops 583649)
2018-09-18 21:42:36,568 INFO:deleting the raw counts dictionary of 235768 items
2018-09-18 21:42:36,577 INFO:sample=0.001 downsamples 53 most-common words
2018-09-18 21:42:36,582 INFO:downsampling leaves estimated 3723992 word corpu

2018-09-18 21:43:16,954 INFO:EPOCH 9 - PROGRESS: at 25.09% examples, 794796 words/s, in_qsize 6, out_qsize 0
2018-09-18 21:43:17,964 INFO:EPOCH 9 - PROGRESS: at 46.87% examples, 792344 words/s, in_qsize 5, out_qsize 0
2018-09-18 21:43:18,971 INFO:EPOCH 9 - PROGRESS: at 63.87% examples, 751508 words/s, in_qsize 5, out_qsize 0
2018-09-18 21:43:19,976 INFO:EPOCH 9 - PROGRESS: at 83.56% examples, 757419 words/s, in_qsize 5, out_qsize 0
2018-09-18 21:43:20,890 INFO:worker thread finished; awaiting finish of 2 more threads
2018-09-18 21:43:20,894 INFO:worker thread finished; awaiting finish of 1 more threads
2018-09-18 21:43:20,906 INFO:worker thread finished; awaiting finish of 0 more threads
2018-09-18 21:43:20,907 INFO:EPOCH - 9 : training on 5514303 raw words (3737049 effective words) took 5.0s, 754266 effective words/s
2018-09-18 21:43:21,930 INFO:EPOCH 10 - PROGRESS: at 17.75% examples, 522486 words/s, in_qsize 6, out_qsize 0
2018-09-18 21:43:22,937 INFO:EPOCH 10 - PROGRESS: at 32.86% 

2018-09-18 21:44:07,634 INFO:worker thread finished; awaiting finish of 2 more threads
2018-09-18 21:44:07,638 INFO:worker thread finished; awaiting finish of 1 more threads
2018-09-18 21:44:07,645 INFO:worker thread finished; awaiting finish of 0 more threads
2018-09-18 21:44:07,646 INFO:EPOCH - 17 : training on 5514303 raw words (3734975 effective words) took 4.9s, 764381 effective words/s
2018-09-18 21:44:08,655 INFO:EPOCH 18 - PROGRESS: at 23.35% examples, 733588 words/s, in_qsize 5, out_qsize 0
2018-09-18 21:44:09,659 INFO:EPOCH 18 - PROGRESS: at 44.43% examples, 747993 words/s, in_qsize 5, out_qsize 1
2018-09-18 21:44:10,673 INFO:EPOCH 18 - PROGRESS: at 64.21% examples, 755022 words/s, in_qsize 5, out_qsize 0
2018-09-18 21:44:11,677 INFO:EPOCH 18 - PROGRESS: at 83.69% examples, 758323 words/s, in_qsize 5, out_qsize 0
2018-09-18 21:44:12,530 INFO:worker thread finished; awaiting finish of 2 more threads
2018-09-18 21:44:12,539 INFO:worker thread finished; awaiting finish of 1 more

2018-09-18 21:44:54,437 INFO:EPOCH 27 - PROGRESS: at 68.31% examples, 809159 words/s, in_qsize 5, out_qsize 0
2018-09-18 21:44:55,442 INFO:EPOCH 27 - PROGRESS: at 88.91% examples, 811515 words/s, in_qsize 5, out_qsize 0
2018-09-18 21:44:55,999 INFO:worker thread finished; awaiting finish of 2 more threads
2018-09-18 21:44:56,007 INFO:worker thread finished; awaiting finish of 1 more threads
2018-09-18 21:44:56,018 INFO:worker thread finished; awaiting finish of 0 more threads
2018-09-18 21:44:56,019 INFO:EPOCH - 27 : training on 5514303 raw words (3735566 effective words) took 4.6s, 811660 effective words/s
2018-09-18 21:44:57,035 INFO:EPOCH 28 - PROGRESS: at 25.45% examples, 801646 words/s, in_qsize 5, out_qsize 0
2018-09-18 21:44:58,037 INFO:EPOCH 28 - PROGRESS: at 47.00% examples, 795048 words/s, in_qsize 5, out_qsize 0
2018-09-18 21:44:59,038 INFO:EPOCH 28 - PROGRESS: at 66.39% examples, 785661 words/s, in_qsize 5, out_qsize 0
2018-09-18 21:45:00,040 INFO:EPOCH 28 - PROGRESS: at 86

2018-09-18 21:45:43,499 INFO:EPOCH 36 - PROGRESS: at 96.89% examples, 720648 words/s, in_qsize 5, out_qsize 0
2018-09-18 21:45:43,636 INFO:worker thread finished; awaiting finish of 2 more threads
2018-09-18 21:45:43,652 INFO:worker thread finished; awaiting finish of 1 more threads
2018-09-18 21:45:43,655 INFO:worker thread finished; awaiting finish of 0 more threads
2018-09-18 21:45:43,656 INFO:EPOCH - 36 : training on 5514303 raw words (3736091 effective words) took 5.2s, 722639 effective words/s
2018-09-18 21:45:44,665 INFO:EPOCH 37 - PROGRESS: at 25.09% examples, 793774 words/s, in_qsize 5, out_qsize 0
2018-09-18 21:45:45,680 INFO:EPOCH 37 - PROGRESS: at 46.95% examples, 792457 words/s, in_qsize 5, out_qsize 0
2018-09-18 21:45:46,684 INFO:EPOCH 37 - PROGRESS: at 67.82% examples, 802305 words/s, in_qsize 5, out_qsize 0
2018-09-18 21:45:47,689 INFO:EPOCH 37 - PROGRESS: at 88.28% examples, 805044 words/s, in_qsize 5, out_qsize 0
2018-09-18 21:45:48,270 INFO:worker thread finished; aw

CPU times: user 8min 40s, sys: 8.68 s, total: 8min 49s
Wall time: 4min 4s


In [18]:
len(model.docvecs.vectors_docs[0])

keys = list(model.docvecs.doctags.keys())
vectors = model.docvecs.vectors_docs


In [19]:
%%time
#model.docvecs.vectors_docs[:10]

import numpy as np
from sklearn import cluster
from sklearn import metrics
from scipy.spatial.distance import cdist

def get_centers(model, vectors):
    number_of_clusters = len(np.unique(model.labels_))

    centers = np.zeros((number_of_clusters, DIMENSIONS))
    for i in range(0, number_of_clusters):
        cluster_points = vectors[model.labels_ == i]
        cluster_mean = np.mean(cluster_points, axis=0)
        centers[i, :] = cluster_mean

    return centers, model.labels_ 

NUM_CLUSTERS = 200

clustering = cluster.AgglomerativeClustering(affinity='euclidean', linkage='ward', 
                                             compute_full_tree=True, memory='clustercache')
clustering.set_params(n_clusters=NUM_CLUSTERS)
clustering.fit(vectors)


"""
# k means determine k
distortions = []

step = 100
K = range(2, 6000 + step, step)
for k in K:
    print(k)
    clustering.set_params(n_clusters=k)
    clustering.fit(vectors)
    distortions.append(sum(
        np.min(cdist(vectors, get_centers(clustering, vectors)[0], 'euclidean'), axis=1)) 
                       / vectors.shape[0])

    #kmeanModel = cluster.KMeans(n_clusters=k)
    #kmeanModel.fit(vectors)
    #distortions.append(sum(np.min(cdist(vectors, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / vectors.shape[0])
"""


CPU times: user 10.3 s, sys: 594 ms, total: 10.9 s
Wall time: 10.9 s


In [20]:

##clustering.labels_

In [21]:
import matplotlib.pyplot as plt

def plot_elbow(K, distortions):
    # Plot the elbow
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()

2018-09-18 21:46:13,211 INFO:font search path ['/usr/local/lib/python3.6/site-packages/matplotlib/mpl-data/fonts/ttf', '/usr/local/lib/python3.6/site-packages/matplotlib/mpl-data/fonts/afm', '/usr/local/lib/python3.6/site-packages/matplotlib/mpl-data/fonts/pdfcorefonts']
2018-09-18 21:46:13,570 INFO:generated new fontManager


In [22]:
clusters = {}
keys = list(keys)

for idx, label in enumerate(clustering.labels_):
    if not clusters.get(label):
        clusters[label] = []

    clusters[label].append(keys[idx])

print('\n--\n'.join(clusters[1]))


https://www.heise.de/newsticker/meldung/Missing-Link-Verschwoerungstheorien-Karl-Popper-und-die-politische-Diskussion-4117676.html?wt_mc=rss.ho.beitrag.atom
--
https://www.zeit.de/kultur/literatur/2018-08/vidiadhar-surajprasad-naipaul-schriftsteller-literatur-nobelpreis
--
https://www.zeit.de/kultur/literatur/2018-08/vidiadhar-surajprasad-naipaul-schriftsteller-literatur-nobelpreis-nachruf
--
https://www.zeit.de/kultur/literatur/2018-08/james-baldwin-beale-street-blues-roman
--
https://www.zeit.de/politik/ausland/2018-08/kofi-annan-tod-reaktionen-angela-merkel-barack-obama
--
https://www.zeit.de/kultur/musik/2018-08/blood-orange-negro-swan-devonte-hynes-album
--
https://www.zeit.de/kultur/kunst/2018-08/kunsthalle-mannheim-kunst-praesentation-rita-mcbride-jeff-wall
--
https://www.zeit.de/sport/2018-09/julius-thomas-nfl-doktor
--
https://www.zeit.de/sport/2018-09/diego-maradona-trainer-mexiko-sinaloa
--
http://www.spiegel.de/panorama/justiz/u-boot-mord-von-peter-madsen-rechtsmedizinerin-

In [23]:

import matplotlib

# Force matplotlib to not use any Xwindows backend.
matplotlib.use('Agg')

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA


def build_2d_vecs(vectors):
    #pca = PCA(n_components=DIMENSIONS)

    #fiftyDimVecs = pca.fit_transform(vectors)
    tsne = TSNE(n_components=2)

    two_dim_vectors = tsne.fit_transform(vectors)

    #serializable_two_dim_vectors = [vec.tolist() for vec in two_dim_vectors]

    return two_dim_vectors

print('building 2d vecs')

vecs_2d = build_2d_vecs(vectors)


building 2d vecs


This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

The backend was *originally* set to 'module://ipykernel.pylab.backend_inline' by the following code:
  File "/usr/local/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/local/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/var/task/env/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/var/task/env/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/var/task/env/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 497, in start
    self.io_loop.start()
  File "/var/task/env/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_l

In [24]:
import numpy as np
import bokeh.plotting as bp
import bokeh.models as bmo

from bokeh.plotting import save, show, reset_output
from bokeh.layouts import column

from bokeh.models import HoverTool
from bokeh.palettes import Inferno256, Viridis256, viridis
from bokeh.transform import linear_cmap

from bokeh.models import ColumnDataSource, OpenURL, TapTool
from bokeh.plotting import figure, output_file, show

bp.output_notebook()

def plot_vector(vec_2d, keys, cluster_labels, title):
    centroids = bp.ColumnDataSource({
        'c_x': [d[0] for d in vec_2d],
        'c_y': [d[1] for d in vec_2d],
        'url': [d for d in keys],
        'key': [articles_by_url[d]['text'][:50] for d in keys],
        'cluster_label': cluster_labels
    })

    plot = bp.figure(plot_width=900, plot_height=1200,
                     title=title,
                     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave,tap",
                     x_axis_type=None, y_axis_type=None, min_border=1,
                     )

    mapper = linear_cmap(field_name='cluster_label', palette=Inferno256, low=0, high=NUM_CLUSTERS -1)

    plot.scatter(x='c_x', y='c_y',
                 color=mapper,
                 source=centroids,
                 )

    hover = plot.select(dict(type=HoverTool))
    hover.tooltips = [
        ("key", "@key"),
        ("cluster", "@cluster_label"),
    ]

    taptool = plot.select(type=TapTool)
    taptool.callback = OpenURL(url='@url')

    return plot

print('ploting...')
#reset_output()

plot = plot_vector(vecs_2d, keys, clustering.labels_, '%s articles' % len(vecs_2d))
show(plot)


ploting...
