In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
!test -f newsticker2019.db || wget https://datanizing.com/bert/heise/newsticker2019.db

In [3]:
import sqlite3
sql = sqlite3.connect("newsticker2019.db")

In [4]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# uncomment for GPU
#n_gpu = torch.cuda.device_count()
#torch.cuda.get_device_name(0)

In [5]:
texts = pd.read_sql("SELECT text, time FROM news", sql, parse_dates=["time"])

In [6]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

## Tokenization, model evaluation

In [7]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-multilingual-uncased')

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(105879, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Li

In [8]:
from tqdm import tqdm
tokenized_texts = []
indexed_tokens = []
encoded_layers = []
for text in tqdm(texts["text"]):
    marked_text = "[CLS] " + str(text) + " [SEP]"
    tokenized = tokenizer.tokenize(marked_text)
    tokenized_texts.append(tokenized)
    indexed = tokenizer.convert_tokens_to_ids(tokenized)
    indexed_tokens.append(indexed)
    # segments => all in one sentence
    segments_ids = [1] * len(tokenized)
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed])
    segments_tensors = torch.tensor([segments_ids])
    # Predict hidden states features for each layer
    with torch.no_grad():
        el, _ = model(tokens_tensor, segments_tensors)
    encoded_layers.append(el)

100%|██████████| 10077/10077 [06:37<00:00, 25.33it/s]


## BERT embedding für jede Headline berechnen

In [9]:
headline_embeddings = []
for e in encoded_layers:
    token_vecs = e[11][0]
    embedding = torch.mean(token_vecs, dim=0)
    headline_embeddings.append(embedding.numpy())

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosinus-Ähnlichkeit aller Dokumente miteinander berechnen
sim = cosine_similarity(headline_embeddings, headline_embeddings)

In [11]:
# Ergebnis ist eine Matrix
sim.shape

(10077, 10077)

In [12]:
# Größter Wert wird für identische Meldungen gefunden
sim.max()

1.000001

In [13]:
for threshhold in [0.7, 0.8, 0.9, 0.95]:
    s = sim.copy()
    # alle über dem Threshhold ausmaskieren
    s[s > threshhold] = 0
    (max_a, max_b) = np.unravel_index(np.argmax(s), s.shape)
    print("%f '%s' '%s'" % (threshhold, texts.iloc[max_a]["text"], texts.iloc[max_b]["text"]))

0.700000 'Linkflow Fitt360: Die Nackenkamera, die man immer dabei haben kann' 'Großer Lausch-Anruf: Signal für Android nimmt selbsttätig Anrufe an'
0.800000 'Reporter ohne Grenzen: Immer mehr Hetze gegen Medienschaffende in Europa und den USA' 'Missing Link: Auch eine Geschichte der Öffentlichkeit - Habermas und der Maulwurf der Vernunft'
0.900000 'Ins richtige Licht gerückt: Die Bilder der Woche (KW 22)' 'Das Ungewöhnliche sehen: Die Bilder der Woche (KW 23)'
0.950000 'IT-Jobtag in Hamburg: Arbeitgeber treffen am 27. Februar auf Bewerber' 'IT-Jobtag in München: Am 16. Mai treffen Bewerber auf Arbeitgeber'


In [14]:
# Alternative Aufgabe: Ähnlichste Nachrichten in unterschiedlichen Monaten finden