In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
# Download the news database only when newsticker2019.db is not already
# present in the working directory (the `test -f ... ||` guard skips wget otherwise).
!test -f newsticker2019.db || wget https://datanizing.com/bert/heise/newsticker2019.db

In [None]:
import sqlite3
# Open a connection to the local SQLite file newsticker2019.db.
# Note: sqlite3.connect creates an empty database if the file does not exist,
# so a missing download only shows up later, when the query runs.
sql = sqlite3.connect("newsticker2019.db")

In [None]:
!pip install pytorch_pretrained_bert
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# uncomment for GPU
#n_gpu = torch.cuda.device_count()
#torch.cuda.get_device_name(0)

In [None]:
# Read the `text` and `time` columns of the `news` table into a DataFrame;
# the `time` column is parsed into datetime values while loading.
news_query = "SELECT text, time FROM news"
texts = pd.read_sql(news_query, con=sql, parse_dates=["time"])

In [None]:
# Load the tokenizer (vocabulary) that belongs to the pre-trained
# 'bert-base-multilingual-uncased' checkpoint. It must match the checkpoint
# used for the model so that token ids index the right embedding rows.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

## Tokenization, model evaluation

In [None]:
# Fetch the pre-trained weights for the same checkpoint the tokenizer uses.
checkpoint_name = 'bert-base-multilingual-uncased'
model = BertModel.from_pretrained(checkpoint_name)

# Inference only: evaluation mode switches off training-time behaviour such
# as dropout, so repeated forward passes give the same result.
model.eval()

In [None]:
from tqdm import tqdm

# Run the forward passes on the device chosen earlier (GPU if available).
model.to(device)

# BERT can only embed as many positions as its position-embedding table holds;
# longer inputs would fail inside the embedding lookup.
max_seq_len = model.config.max_position_embeddings

tokenized_texts = []   # word-piece tokens per text, including [CLS]/[SEP]
indexed_tokens = []    # vocabulary ids for the tokens above
encoded_layers = []    # per text: list of hidden-state tensors, one per encoder layer (on CPU)
for text in tqdm(texts["text"]):
    marked_text = "[CLS] " + str(text) + " [SEP]"
    tokenized = tokenizer.tokenize(marked_text)
    if len(tokenized) > max_seq_len:
        # Truncate over-long inputs but keep the closing [SEP] marker.
        tokenized = tokenized[:max_seq_len - 1] + ["[SEP]"]
    tokenized_texts.append(tokenized)
    indexed = tokenizer.convert_tokens_to_ids(tokenized)
    indexed_tokens.append(indexed)
    # Single-sentence input: every token belongs to sentence A, i.e. segment id 0
    # (segment id 1 is reserved for the second sentence of a pair).
    segments_ids = [0] * len(tokenized)
    # Convert inputs to PyTorch tensors (batch of one) on the model's device
    tokens_tensor = torch.tensor([indexed], device=device)
    segments_tensors = torch.tensor([segments_ids], device=device)
    # Predict hidden states features for each layer; no gradients are needed
    with torch.no_grad():
        el, _ = model(tokens_tensor, segments_tensors)
    # Move the results back to the CPU so later cells can call .numpy() on them
    # and GPU memory is not held for every text.
    encoded_layers.append([layer.cpu() for layer in el])

## BERT embedding für jede Headline berechnen

In [None]:
# One vector per headline: take layer index 11 (the final encoder layer of a
# 12-layer BERT-base model), pick the only batch element, and average the
# token vectors over the sequence dimension.
headline_embeddings = [
    layers[11][0].mean(dim=0).numpy()
    for layers in encoded_layers
]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of every document embedding with every other one.
# With the second argument omitted, scikit-learn compares the input with itself.
sim = cosine_similarity(headline_embeddings)

In [None]:
# The result is a square matrix: one row and one column per document
sim.shape

In [None]:
# The largest value is found for identical messages
# (a vector compared with itself has cosine similarity 1, up to rounding)
sim.max()

In [None]:
# For each similarity ceiling, print the most similar pair of texts whose
# cosine similarity does not exceed that ceiling.
for threshold in [0.7, 0.8, 0.9, 0.95]:
    # Mask out everything above the threshold. Using -inf instead of 0 ensures
    # a masked cell can never win the argmax, even if all remaining
    # similarities happen to be negative.
    s = np.where(sim > threshold, -np.inf, sim)
    # argmax works on the flattened matrix; convert back to (row, column)
    (max_a, max_b) = np.unravel_index(np.argmax(s), s.shape)
    print("%f '%s' '%s'" % (threshold, texts.iloc[max_a]["text"], texts.iloc[max_b]["text"]))

In [None]:
# Alternative Aufgabe: Ähnlichste Nachrichten in unterschiedlichen Monaten finden