In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')

In [2]:
import json

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import spacy
from spacy.lang.nl.stop_words import STOP_WORDS

from readers import JsonReader
from analysis import TopicDetector

In [None]:
# Load the spaCy pipeline "nl_core_news_sm"; `nlp.vocab` is used by a later
# cell for word-to-word similarity lookups.
nlp = spacy.load("nl_core_news_sm")
# Build the project's JsonReader on biomassa.json and pull the texts from it.
# NOTE(review): presumably `subjects` restricts which entries are returned —
# confirm against readers.JsonReader.
json_reader = JsonReader(source="biomassa.json", subjects=["biomassa"])
texts = json_reader.get_texts()
# Alternative input kept for reference: three texts from the data.biomass module.
#from data.biomass import *
#texts = [TEXT_1, TEXT_2, TEXT_3]

In [None]:
def get_word_frame(texts, ngram, sums=True):
    """Build a TF-IDF table for n-grams of exactly ``ngram`` words.

    Parameters
    ----------
    texts : iterable of str
        Documents to vectorize; each document becomes one row of the table.
    ngram : int
        N-gram size, used as both the lower and upper bound of ``ngram_range``.
    sums : bool, default True
        When true, return the per-feature column totals instead of the table.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        With ``sums=True`` a Series indexed by feature name holding the summed
        TF-IDF weight over all texts; otherwise the full DataFrame with one
        row per text and one column per feature.
    """
    # Pass the stop words as a list: recent scikit-learn releases validate the
    # parameter and no longer accept a set. Sorting keeps the order stable.
    tfidf_vectorizer = TfidfVectorizer(
        stop_words=sorted(STOP_WORDS),
        ngram_range=(ngram, ngram),
    )
    # A single pass over `texts`, so one-shot iterables work as well.
    tfidf_vectors = tfidf_vectorizer.fit_transform(texts)
    # `get_feature_names` was removed from scikit-learn; prefer the replacement
    # and fall back to the old name for installations that predate it.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        feature_names = list(tfidf_vectorizer.get_feature_names_out())
    else:
        feature_names = tfidf_vectorizer.get_feature_names()
    frame = pd.DataFrame(tfidf_vectors.toarray(), columns=feature_names)
    if ngram == 1:
        # For single words, discard every feature that is not purely alphabetic.
        number_features = [feature for feature in feature_names if not feature.isalpha()]
        frame = frame.drop(columns=number_features)
    return frame.sum(axis=0) if sums else frame

In [None]:
# Unigram TF-IDF table (one row per text) and its per-word totals, highest
# first. Later cells read `tfidf_frame` and `tfidf_words_sorted`, so both must
# be defined here for the notebook to run on a fresh kernel.
tfidf_frame = get_word_frame(texts, 1, sums=False)
tfidf_words_sorted = tfidf_frame.sum(axis=0).sort_values(ascending=False)

# Bigram totals, highest first. `get_word_frame` already returns the summed
# Series by default, so it only needs sorting.
tfidf_words_sorted_bi = get_word_frame(texts, 2).sort_values(ascending=False)

# Higher-order n-grams, disabled; uncomment to compute them the same way.
#tfidf_words_sorted_tri = get_word_frame(texts, 3).sort_values(ascending=False)
#tfidf_words_sorted_tetra = get_word_frame(texts, 4).sort_values(ascending=False)

In [None]:
# Print the first ten entries of the bigram ranking with the row limit lifted
# for the duration of the block.
display_options = ('display.max_rows', None, 'display.max_columns', 5)
with pd.option_context(*display_options):
    top_bigrams = tfidf_words_sorted_bi.iloc[:10]
    print(top_bigrams)

#tfidf_words_sorted_bi
#tfidf_words_sorted_tri
#tfidf_words_sorted_tetra

In [None]:
# For each of the first 50 entries of `tfidf_words_sorted`, print the five most
# and five least similar words among all ranked words, using the similarity
# reported by the spaCy vocabulary entries.
all_words = tfidf_words_sorted.index.tolist()
all_tokens = [nlp.vocab[word] for word in all_words]
most_important_tokens = all_tokens[:50]
banner = "*" * 10
for important_token in most_important_tokens:
    # Score every other vocabulary entry against the current one.
    similarities = [
        (token.text, important_token.similarity(token))
        for token in all_tokens
        if token is not important_token
    ]
    similarities.sort(key=lambda item: item[1])
    most_similar = similarities[-5:][::-1]
    most_different = similarities[:5]

    print(important_token.text)
    print(banner + "most similar" + banner)
    print("\n".join(
        "{0} {1:.2f}".format(word, similarity)
        for word, similarity in most_similar
    ))
    print(banner + "most different" + banner)
    print("\n".join(
        "{0} {1:.2f}".format(word, similarity)
        for word, similarity in most_different
    ))
    print()
    print()

In [None]:
# Keep only the columns of the first 500 ranked words and multiply the reduced
# table with its own transpose to obtain a word-by-word matrix.
all_words = tfidf_words_sorted.index.tolist()
least_important_words = all_words[500:]
most_important_frame = tfidf_frame.drop(columns=least_important_words)

most_important_cooccurence = most_important_frame.T @ most_important_frame
# Optional post-processing, currently disabled: zero the diagonal and
# suppress values below 0.3.
#np.fill_diagonal(most_important_cooccurence.values, 0)
#most_important_cooccurence = most_important_cooccurence.applymap(lambda v: v if v >= 0.3 else 0.0)

most_important_cooccurence

In [None]:
# Total each column of the co-occurrence matrix and sort ascending, so the
# words with the largest totals appear at the bottom of the output.
mic_sum = most_important_cooccurence.sum(axis=0).sort_values()
mic_sum

In [None]:
# Rank the texts per probe word: argsort gives row positions ordered by
# ascending TF-IDF weight, and the [::-1] reversal puts the highest weight first.
# NOTE(review): the probe words are English while the stop words and spaCy
# model loaded above are Dutch — confirm these columns exist in `tfidf_frame`.
life_text_ixs = tfidf_frame["life"].argsort()[::-1]
justice_text_ixs = tfidf_frame["justice"].argsort()[::-1]
court_text_ixs = tfidf_frame["court"].argsort()[::-1]
law_text_ixs = tfidf_frame["law"].argsort()[::-1]
# Note the `_ids` suffix here versus `_ixs` on the other names.
reward_text_ids = tfidf_frame["reward"].argsort()[::-1]

In [None]:
# Same rankings without the reversal, i.e. lowest TF-IDF weight first.
# These assignments reuse the names set by the preceding cell, so whichever of
# the two cells ran last decides the order seen by the lookups below.
# NOTE(review): confirm which ordering is intended before relying on the lookups.
life_text_ixs = tfidf_frame["life"].argsort()
justice_text_ixs = tfidf_frame["justice"].argsort()
court_text_ixs = tfidf_frame["court"].argsort()
law_text_ixs = tfidf_frame["law"].argsort()
reward_text_ids = tfidf_frame["reward"].argsort()

In [None]:
# Show the text at the first position of the "life" ranking.
texts[life_text_ixs.iloc[0]]

In [None]:
# Show the text at the third position of the "justice" ranking.
texts[justice_text_ixs.iloc[2]]

In [None]:
# Show the text at the first position of the "court" ranking.
texts[court_text_ixs.iloc[0]]

In [None]:
# Show the text at the third position of the "law" ranking.
texts[law_text_ixs.iloc[2]]

In [None]:
# Show the text at the sixth position of the "reward" ranking.
texts[reward_text_ids.iloc[5]]

In [None]:
# Remove every word whose column minimum is positive from both axes of the
# co-occurrence matrix, then report the size of what remains.
mins = most_important_cooccurence.min()
nzeros = mins[mins > 0]
frame = most_important_cooccurence.drop(index=nzeros.index, columns=nzeros.index)
frame.shape

In [None]:
# Export the filtered matrix as a node/link graph in JSON.
# Every column becomes a node; a link is written for each (column, row) cell
# whose value is exactly 0.
nodes = [{"name": column, "group": 0} for column in frame.columns]
node_names = [node["name"] for node in nodes]
# Name -> position lookup built once instead of calling list.index per cell.
# setdefault keeps the first position, matching what list.index would return.
node_positions = {}
for position, name in enumerate(node_names):
    node_positions.setdefault(name, position)
# `.items()` replaces `.iteritems()`, which was removed in pandas 2.0.
links = [
    {"source": node_positions[column], "target": node_positions[key], "value": 1}
    for column, row in frame.items()
    for key, value in row.items()
    if value == 0
]
with open("../data/cooccurence-graph.json", "w") as file:
    json.dump({
        "nodes": nodes,
        "links": links
    }, file, indent=4)

In [29]:
# Run the project's TopicDetector over the texts. The callable passed to the
# constructor returns its argument unchanged.
# NOTE(review): presumably this is a per-text preprocessing hook — confirm
# against analysis.TopicDetector.
td = TopicDetector(lambda text: text)
td.run(texts)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


[['op het gebied van', 7.318341094985242],
 ['het gebruik van biomassa', 3.014911840658396],
 ['ministerie van economische zaken', 2.871670107429956],
 ['van het gebruik van', 2.547652273224091],
 ['van tak en tophout', 2.4615774061795843],
 ['met het oog op', 2.2387230921530272],
 ['wageningen ur food biobased', 2.119317704029361],
 ['ur food biobased research', 2.118398452288819],
 ['het ministerie van economische', 2.006552672604752],
 ['biobased research instituut binnen', 1.975358334433946],
 ['alterra rapport 1320', 0.45583501114996805],
 ['alterra rapport 1813', 0.45298737073806383],
 ['special mei 2013', 0.3981506866188229],
 ['ecn 07 015', 0.39739530174594595],
 ['bosbeheerplan ocmw herentals', 0.37825219694423384],
 ['nederlandse economie 2009', 0.3614416575450811],
 ['ecn 07 030', 0.32433651802553753],
 ['bodemlabel advies asr', 0.3212653523344494],
 ['2018transitie agendacirculaire economie', 0.3027969478216],
 ['ocf themarapport toerisme', 0.29477336562363193],
 ['figuur 1

In [None]:
# Scratch cell for inspecting the detector's results; only the entry stored
# under key 1 is displayed, the other probes are disabled.
# NOTE(review): the keys of `sorted_ngrams` look like n-gram sizes — confirm.
#td.sorted_ngrams[2]

#td.sorted_ngrams[3]
#len(td.sorted_ngrams[4].where(lambda value: value >= 0.95))
td.sorted_ngrams[1]
#minimum = td.sorted_ngrams[4].min()
#maximum = td.sorted_ngrams[4].max()

In [17]:
# Number of entries stored under key 4 of the detector's results.
len(td.sorted_ngrams[4])

14806228

In [22]:
# Last value stored under key 4. `.iloc` makes the positional lookup explicit;
# a bare integer key on a label-indexed Series relies on a deprecated fallback.
# NOTE(review): assumes `td.sorted_ngrams[4]` is a pandas Series — confirm.
td.sorted_ngrams[4].iloc[-1]

4.6788447046272064e-05

In [None]:
# Walk from the largest n-gram size down to 2 and drop labels from the next
# smaller size's series, accumulating the labels to drop as the loop proceeds.
# `self` does not exist at notebook level, so the detector instance `td` is
# used instead.
# NOTE(review): presumably `_get_drop_index` returns the shorter n-grams
# contained in the longer ones — confirm against analysis.TopicDetector, and
# check whether `td.run` already performs this pruning.
drop_index = set()
for ix in range(td.max_ngram, 1, -1):
    drop_index = td._get_drop_index(td.sorted_ngrams[ix].index, drop_index)
    # In place on purpose: the series stored on `td` is what gets pruned.
    td.sorted_ngrams[ix - 1].drop(labels=drop_index, inplace=True, errors="ignore")

In [None]:
# Take the first ten entries of every series in `td.sorted_ngrams` as
# (topic, word count, importance) triples, then order them by word count and
# importance, largest first.
results = [
    (topic, len(topic.split(" ")), importance)
    for serie in td.sorted_ngrams.values()
    for topic, importance in serie[:10].items()
]
results.sort(key=lambda result: (result[1], result[2]), reverse=True)
    

In [None]:
# Display the collected (topic, word count, importance) triples.
results

In [None]:
# Project each triple in `results` down to a (topic, importance) pair.
[(topic, importance) for topic, _word_count, importance in results]