# Statistik & QA

In [None]:
import pandas as pd

In [None]:
# Download the CSV export into the working directory (IPython shell escape; requires wget).
!wget https://datanizing.com/heise/newsticker2019.csv

In [None]:
# Load the file with the "id" column as index and the "time" column parsed as datetimes.
docs = pd.read_csv("newsticker2019.csv", index_col="id", parse_dates=["time"])

In [None]:
# Display the loaded frame for a quick visual check.
docs

In [None]:
# Plot the number of (non-null) headlines per weekly ("W") resampling bucket.
docs.set_index("time").resample("W").agg({"headline": "count"}).plot()

In [None]:
import re

# Concatenate headline, lead and body into one text per article.
# fillna("") keeps an article usable when one of the parts is missing; without
# it a single NaN part would turn the whole concatenation into NaN and break
# the string operations in the following cells.
docs["full_text"] = (
    docs["headline"].fillna("")
    + " "
    + docs["lead"].fillna("")
    + " "
    + docs["text"].fillna("")
)

# Lower-case the text and replace each punctuation character with a blank.
# The pattern is a raw string: its backslashes are regex escapes, not
# (invalid) Python string escapes.
docs["simple_text"] = docs["full_text"].str.lower().str.replace(
    re.compile(r"[\-,\.\!\?\:\;\"\+\&\'„“–\(\)\[\]]"), " ", regex=True
)

# Preview the first 120 characters of the first ten normalized texts.
[t[0:120] for t in docs["simple_text"].values[0:10]]

In [None]:
# Keep only the articles whose normalized text contains the substring "apple"
# and show how many there are.
apple = docs[docs["simple_text"].str.contains("apple")]
len(apple)

In [None]:
# Preview the first 120 characters of matches 10..29.
[t[0:120] for t in apple["simple_text"].values[10:30]]

In [None]:
# Weekly count of (non-null) headlines, restricted to the "apple" subset.
apple.set_index("time").resample("W").agg({"headline": "count"}).plot()

# Co-Occurrence

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# First pass with default settings; the cell output is the resulting
# document-term matrix.
cv = CountVectorizer()
cv.fit_transform(docs["simple_text"])

In [None]:
# Second pass with document-frequency limits (min_df=10, max_df=0.3).
# This rebinds cv; the co-occurrence cell further down reads the vocabulary
# of this fitted vectorizer.
cv = CountVectorizer(min_df=10, max_df=0.3)
cv.fit_transform(docs["simple_text"])

In [None]:
from collections import Counter, defaultdict
from tqdm import tqdm

# c[w][v] counts how often v occurs inside the context window around w.
c = defaultdict(Counter)
# Vocabulary as a set for O(1) membership tests in the loops below.
# vocabulary_ is the fitted term -> column-index mapping; it is used instead
# of get_feature_names(), which newer scikit-learn releases no longer provide.
voc = set(cv.vocabulary_)
window = 5  # should be odd
skip = (window - 1) // 2  # number of neighbours considered on each side
for doc in tqdm(docs["simple_text"]):
    tokens = doc.split(" ")
    for i, w in enumerate(tokens):
        if w not in voc:
            continue
        # Visit the window [i - skip, i + skip], clipped to the document,
        # skipping the centre token itself.
        for j in range(max(0, i - skip), min(i + 1 + skip, len(tokens))):
            if j != i and tokens[j] in voc:
                c[w][tokens[j]] += 1

In [None]:
# Ten most frequent context words counted for "apple".
c["apple"].most_common(10)

In [None]:
# Five most frequent context words counted for "iphone".
c["iphone"].most_common(5)

In [None]:
# Five most frequent context words counted for "ipad".
c["ipad"].most_common(5)

In [None]:
# Five most frequent context words counted for "macbook".
c["macbook"].most_common(5)

# Embeddings

In [None]:
# Tokenize every normalized text by splitting on single blanks (consecutive
# blanks therefore yield empty-string tokens). The resulting list of token
# lists is the corpus passed to the embedding models below.
texts = [t.split(" ") for t in docs["simple_text"]]

## word2vec

### Wörter

In [None]:
from gensim.models import Word2Vec

In [None]:
# Train a word2vec model on the token lists (min_count=5, 8 worker threads).
model_word = Word2Vec(texts, min_count=5, workers=8)

In [None]:
# Number of word vectors held by the trained model.
len(model_word.wv.vectors)

In [None]:
# Nearest neighbours of a few product/brand words (library-default result count).
model_word.wv.most_similar("apple")

In [None]:
model_word.wv.most_similar("iphone")

In [None]:
model_word.wv.most_similar("ipad")

In [None]:
model_word.wv.most_similar("macbook")

In [None]:
# Odd-one-out checks: ask the model which word fits the group least.
model_word.wv.doesnt_match(["apple", "microsoft", "google", "siemens"])

In [None]:
model_word.wv.doesnt_match(["hund", "katze", "maus", "python"])

Apple - ? = Microsoft - Windows

Apple - Microsoft + Windows = ?

In [None]:
# Analogy query: "apple" + "windows" - "microsoft"; show the four best matches.
model_word.wv.most_similar(
    positive=["apple", "windows"],
    negative=["microsoft"],
    topn=4,
)

In [None]:
# Analogy query: "microsoft" + "iphone" - "apple"; show the four best matches.
model_word.wv.most_similar(
    positive=["microsoft", "iphone"],
    negative=["apple"],
    topn=4,
)

In [None]:
# Analogy query: "android" + "apple" - "iphone"; show the four best matches.
model_word.wv.most_similar(
    positive=["android", "apple"],
    negative=["iphone"],
    topn=4,
)

### Phrasen

In [None]:
from gensim.models import Phrases

In [None]:
# Fit gensim's Phrases model on the token lists; it is applied to the corpus
# in the next cell before training.
entity_transformer = Phrases(texts)

In [None]:
# Train a second word2vec model on the Phrases-transformed corpus, with the
# same settings as the plain word model (min_count=5, 8 worker threads).
model_phrase = Word2Vec(entity_transformer[texts], min_count=5, workers=8)

In [None]:
# Same neighbour queries as for the word model, now against the phrase model.
model_phrase.wv.most_similar("apple")

In [None]:
model_phrase.wv.most_similar("iphone")

In [None]:
model_phrase.wv.most_similar("ipad")

In [None]:
model_phrase.wv.most_similar("macbook")

In [None]:
# Odd-one-out checks against the phrase model.
model_phrase.wv.doesnt_match(["apple", "microsoft", "google", "siemens"])

In [None]:
model_phrase.wv.doesnt_match(["hund", "katze", "maus", "python"])

Apple - ? = Microsoft - Windows

Apple - Microsoft + Windows = ?

In [None]:
# Analogy query on the phrase model: "apple" + "windows" - "microsoft";
# show the ten best matches.
model_phrase.wv.most_similar(
    positive=["apple", "windows"],
    negative=["microsoft"],
    topn=10,
)

## fastText

In [None]:
from gensim.models import FastText

In [None]:
# Train a fastText model on the token lists, with the same settings as the
# word2vec models (min_count=5, 8 worker threads).
model_fast = FastText(texts, min_count=5, workers=8)

In [None]:
# Four nearest neighbours of "apple".
model_fast.wv.most_similar("apple", topn=4)

In [None]:
# Neighbour queries with the library-default result count.
model_fast.wv.most_similar("iphone")

In [None]:
model_fast.wv.most_similar("ipad")

In [None]:
model_fast.wv.most_similar("macbook")

In [None]:
# Odd-one-out checks against the fastText model.
model_fast.wv.doesnt_match(["apple", "microsoft", "google", "siemens"])

In [None]:
model_fast.wv.doesnt_match(["hund", "katze", "maus", "python"])

In [None]:
# Analogy query on the fastText model: "google" + "windows" - "microsoft";
# show the ten best matches.
model_fast.wv.most_similar(
    positive=["google", "windows"],
    negative=["microsoft"],
    topn=10,
)

## GloVe

In [None]:
# Download the vector file into the working directory (IPython shell escape;
# requires wget).
!wget https://datanizing.com/heise/glove-w2v.txt

In [None]:
from gensim.models import KeyedVectors
# Load the downloaded vectors; the file is read in word2vec text format.
glove = KeyedVectors.load_word2vec_format("glove-w2v.txt")

In [None]:
# Four nearest neighbours of "apple".
glove.most_similar("apple", topn=4)

In [None]:
# Neighbour queries with the library-default result count.
glove.most_similar("iphone")

In [None]:
glove.most_similar("ipad")

In [None]:
glove.most_similar("macbook")

In [None]:
# Analogy query on the loaded vectors: "apple" + "windows" - "microsoft";
# show the ten best matches.
glove.most_similar(
    positive=["apple", "windows"],
    negative=["microsoft"],
    topn=10,
)

In [None]:
# Analogy query on the loaded vectors: "google" + "iphone" - "apple";
# show the ten best matches.
glove.most_similar(
    positive=["google", "iphone"],
    negative=["apple"],
    topn=10,
)

In [None]:
# Odd-one-out checks against the loaded vectors.
glove.doesnt_match(["apple", "microsoft", "google", "siemens"])

In [None]:
glove.doesnt_match(["hund", "katze", "maus", "python"])

# Anwendungen

## Semantischer Graph

In [None]:
import networkx as nx

# Similarity graph around a seed word: follow the 5 most similar words for
# three hops, linking every word to its neighbours with the similarity score
# as edge weight. add_edge creates missing endpoints itself, so only the seed
# needs an explicit add_node.
G = nx.Graph()
e = model_word.wv
w0 = "apple"
G.add_node(w0)
for hop1, score1 in e.most_similar(w0, topn=5):
    G.add_edge(w0, hop1, weight=score1)
    for hop2, score2 in e.most_similar(hop1, topn=5):
        G.add_edge(hop1, hop2, weight=score2)
        for hop3, score3 in e.most_similar(hop2, topn=5):
            G.add_edge(hop2, hop3, weight=score3)

In [None]:
import matplotlib.pyplot as plt
# Draw G with node labels on a 12x12 figure, positioned by networkx's spring
# layout. No seed is passed to spring_layout here.
plt.figure(figsize=(12,12))
nx.draw(G, with_labels=True, pos=nx.spring_layout(G))

In [None]:
import networkx as nx

# Larger variant of the similarity graph: follow the 10 most similar words
# for three hops from the seed, with the similarity score as edge weight.
# add_edge creates missing endpoints itself, so only the seed needs an
# explicit add_node.
G = nx.Graph()
e = model_word.wv
w0 = "apple"
G.add_node(w0)
for hop1, score1 in e.most_similar(w0, topn=10):
    G.add_edge(w0, hop1, weight=score1)
    for hop2, score2 in e.most_similar(hop1, topn=10):
        G.add_edge(hop1, hop2, weight=score2)
        for hop3, score3 in e.most_similar(hop2, topn=10):
            G.add_edge(hop2, hop3, weight=score3)

In [None]:
# Export the current graph G to the file "apple-w2v.gexf" in GEXF format.
nx.write_gexf(G, "apple-w2v.gexf")

## Semantische Karte

In [None]:
%matplotlib inline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import umap.umap_ as umap 


In [None]:
def plot_2d(vecs, labels, method, filename):
    """Project vectors to 2D, draw a labelled scatter plot and save it as PNG.

    Args:
        vecs: Feature vectors, one row per label. Accepts a list of vectors,
            a dense array, or a sparse matrix exposing ``toarray()``.
        labels: One text label per row of ``vecs``, in the same order.
        method: ``"tsne"`` or ``"pca"``; any other value selects UMAP.
        filename: Output path without extension; ``.png`` is appended.
    """
    import numpy as np  # local import so the function does not need a new file-level import

    is_sparse = hasattr(vecs, "toarray")
    # A plain list of vectors is turned into one 2D array so that every
    # reducer receives a proper matrix; sparse input is passed on unchanged.
    data = vecs if is_sparse else np.asarray(vecs)

    if method == "tsne":
        coords = TSNE(n_components=2, random_state=0).fit_transform(data)
    elif method == "pca":
        # PCA is fed dense data; only sparse input needs densifying. Calling
        # toarray() unconditionally failed for list/array input.
        dense = data.toarray() if is_sparse else data
        coords = PCA(n_components=2, random_state=0).fit_transform(dense)
    else:
        coords = umap.UMAP(n_components=2, random_state=0).fit_transform(data)
    df = pd.DataFrame(data=coords, columns=["x", "y"])

    plt.rcParams["figure.figsize"] = (16, 9)
    # Newer Matplotlib releases ship this style under the "seaborn-v0_8-"
    # prefix; pick whichever name the installed version offers.
    style_names = ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
    style = next((s for s in style_names if s in plt.style.available), None)
    if style is not None:
        plt.style.use(style)
    fig, ax = plt.subplots()
    ax.scatter(df.x, df.y, marker="o")

    # Shift each label slightly to the right of its marker (1/150 of the x range).
    diff = abs(df.x.max() - df.x.min()) / 150.0
    for i, txt in enumerate(labels):
        ax.annotate(txt, (df.x[i] + diff, df.y[i]), fontsize=10)
    # Hide the axis ticks; the projected coordinates are not labelled.
    plt.xticks([], [])
    plt.yticks([], [])
    plt.savefig(f'{filename}.png')

In [None]:
# Words to place on the semantic map.
# Note the comma after "streaming": without it Python joins the adjacent
# literals into the single string "streamingamazon" and both words are lost.
words = ["apple", "ipad", "ios", "iphone", "xs", "xr",
         "mac", "macbook", "air", "osx", "macos",
         "android", "google", "samsung", "galaxy",
         "huawei", "mate", "honor", "htc", "lg",
         "windows", "microsoft", "surface",
         "laptop", "notebook", "smartphone", "tablet",
         "app", "store", "play", "music", "itunes",
         "netflix", "disney", "spotify", "streaming",
         "amazon", "facebook"]

# Keep only the words the word2vec model has a vector for; labels and vecs
# stay aligned because vecs is built from labels.
text = model_word.wv
labels = [w for w in words if w in text]
vecs = [text[w] for w in labels]

# Project with UMAP and save the plot as "apple.png".
plot_2d(vecs, labels, 'umap', "apple")