In [1]:
import json

import numpy as np

from search_clustering.pipeline import *
from search_clustering.client import *
from search_clustering.preprocessing import *
from search_clustering.embedding import *
from search_clustering.reduction import *
from search_clustering.clustering.spatial import *
from search_clustering.clustering.temporal import *
from search_clustering.labeling import *

# --- Configuration ---------------------------------------------------------
query = "energie"        # search term; also part of the cache file name
samples = 500            # maximum number of hits kept for the analysis
use_cached_data = True   # False: query Elasticsearch and rewrite the cache

cache_path = f"datasets/results_{query}.json"

# --- Load the raw hits -----------------------------------------------------
if not use_cached_data:
    es = ElasticClient()

    # Print every name returned by the cluster's alias listing.
    for index in es.client.indices.get_alias():
        print(index)

    results = es.search(index="faz", field="body", query=query, size=1_000)

    # Store the raw, unfiltered hits so later runs can use the cached branch.
    with open(cache_path, "w") as json_file:
        json.dump(results, json_file)
else:
    with open(cache_path, "r") as json_file:
        results = json.load(json_file)

# --- Clean up (identical for both branches) --------------------------------
# Hits without a publication date are dropped because later cells slice that
# field as a string; the list is then capped at `samples` entries. Doing this
# after the if/else keeps a fresh download and a cached load equivalent.
results = [
    res for res in results if res["_source"]["publication_date"] is not None
][:samples]

print(f"{len(results)} results")

500 results


In [None]:
# Components handed to SpatialPipeline, in the order the original call used.
# NOTE(review): judging by the modules they are imported from, these are the
# preprocessing, embedding, reduction, clustering and labeling stages —
# confirm against SpatialPipeline's signature.
spatial_stages = (
    ParagraphKeywordPreprocessor(),
    SentenceMiniLM("topics"),
    DensMAP(),
    HDBSCAN(),
    Topically(),
)
pipe = SpatialPipeline(*spatial_stages)

# Run the pipeline on the loaded hits with visualisation switched on.
vecs, clusters, labels = pipe.run(results, visualize=True)

[1/6] Preprocessing


In [None]:
# List index and title of every document whose cluster id is -1
# (presumably the clusterer's "noise" label — TODO confirm).
unassigned_mask = clusters == -1
for doc_idx in np.nonzero(unassigned_mask)[0]:
    title = results[doc_idx]["_source"]["title"]
    print(doc_idx, title)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from datetime import timedelta

# Keep only the first ten characters of each publication date
# (assumes an ISO-like "YYYY-MM-DD..." string — TODO confirm against the index).
timestamps = [res["_source"]["publication_date"][:10] for res in results]

df = pd.DataFrame(pd.to_datetime(timestamps), columns=["date"])

first_day = df.date.min()
last_day = df.date.max()
timespan = last_day - first_day
interval = timedelta(30)
bins = 100  # fixed bin count; the alternative would be timespan // interval
real_interval = timespan / bins

# np.histogram needs plain numbers, so the datetimes are converted to their
# integer representation. "int64" is spelled out because a bare `int` maps to
# a platform-dependent width. `pd.np` is gone from current pandas, so numpy is
# called directly. `hist` stays a [counts, edges] list because later cells
# read `hist[0]`.
hist = list(np.histogram(df["date"].astype("int64"), bins=bins, density=False))

# Absolute change in document count between neighbouring bins.
diff = np.abs(np.diff(hist[0]))

# First series: documents per bin. Second series: the part of each jump that
# exceeds the mean jump, drawn at the bin where the jump lands.
plt.bar(range(bins), hist[0])
plt.bar(np.arange(bins - 1) + 1, np.clip(diff - np.mean(diff), 0, np.inf))
plt.show()

In [None]:
# Temporal clustering with target_bins=20 and window_size=0.
tpipe = TemporalPipeline(
    DummyPreprocessor(),
    TemporalClustering(target_bins=20, window_size=0),
    TemporalLabeling(),
)
docs, clusters, labels = tpipe.run(results, verbose=False)

# Give every cluster a run of bar colours whose length is proportional to the
# cluster's share of documents, using matplotlib's "C<n>" colour-cycle names.
# NOTE(review): this only lines up with the histogram bars if clusters are
# contiguous in time and ordered by id — confirm.
total_docs = len(clusters)
cluster_ids, cluster_sizes = np.unique(clusters, return_counts=True)

colors = []
for cluster_id, size in zip(cluster_ids, cluster_sizes):
    share_in_bars = int(size) / total_docs * bins + .5
    colors.extend([f"C{cluster_id}"] * round(share_in_bars))

plt.bar(range(bins), hist[0], color=colors)
plt.show()

# `labels` is computed above but intentionally not printed in this cell.

In [None]:
# Temporal clustering with target_bins=10 and window_size="auto".
tpipe = TemporalPipeline(
    DummyPreprocessor(),
    TemporalClustering(target_bins=10, window_size="auto"),
    TemporalLabeling(),
)
docs, clusters, labels = tpipe.run(results, verbose=False)

# Build one colour per histogram bar: each cluster contributes a run of its
# matplotlib cycle colour ("C<n>") sized by its share of the documents,
# plus one and truncated to an integer.
# NOTE(review): assumes clusters are contiguous in time and ordered by id —
# confirm.
n_docs = len(clusters)
colors = []
for cluster_id in np.unique(clusters):
    member_count = int(np.count_nonzero(clusters == cluster_id))
    bar_count = int(member_count / n_docs * bins + 1)
    colors.extend(f"C{cluster_id}" for _ in range(bar_count))

plt.bar(range(bins), hist[0], color=colors)
plt.show()

# Show the label produced for each temporal cluster.
for cluster_label in labels:
    print(cluster_label)