In [None]:
import matplotlib.pyplot as plt

from derive_conceptualspace.pipeline import SnakeContext, load_envfiles
from misc_util.logutils import setup_logging
from misc_util.pretty_print import display
from derive_conceptualspace.util.result_analysis_tools import getfiles_allconfigs
from derive_conceptualspace.util.desc_object import DescriptionList
from derive_conceptualspace.pipeline import cluster_loader

# Default size (width, height in inches) for every figure created in this notebook.
plt.rcParams.update({"figure.figsize": [16, 10]})

In [None]:
# Set up logging and load the "siddata" env files before querying the configurations.
# NOTE(review): presumably getfiles_allconfigs depends on the environment loaded here — confirm.
setup_logging()
load_envfiles("siddata")
# `configs` is indexed further down (configs[0]) to choose the configuration to load;
# `print_cnf` is not used in any of the cells visible here.
configs, print_cnf = getfiles_allconfigs("filtered_dcm", verbose=True)

In [None]:
# def filter_conf(conflist, restrictions=None):
#     restrictions = restrictions or (lambda x: True)
#     return [elem for elem in conflist if restrictions(elem)]

# ctx2 = SnakeContext.loader_context(config=filter_conf(configs, restrictions=lambda x: x["embed_dimensions"] == 50)[0])

In [None]:
# Create the loader context for the first configuration in `configs`.
# NOTE(review): configs[0] raises IndexError when `configs` is empty — confirm that is acceptable here.
ctx = SnakeContext.loader_context(config=configs[0], silent=False)

In [None]:
# Load "pp_descriptions" and "filtered_dcm" through the context.
# Loaders for "clusters" and "embedding" are registered as well, but those two are
# currently not requested (append "embedding", "clusters" to the names to load them).
descriptions, filtered_dcm = ctx.load(
    "pp_descriptions",
    "filtered_dcm",
    loaders={
        "pp_descriptions": DescriptionList.from_json,
        "clusters": cluster_loader,
        "embedding": lambda **args: args["embedding"].embedding_,
    },
)

In [None]:
# Show whatever ctx.display_output returns for "pp_descriptions".
# `display` is misc_util.pretty_print.display (see the import cell), not IPython's display.
display(ctx.display_output("pp_descriptions"))

In [None]:
# filtered_dcm.show_info(descriptions=descriptions)

In [None]:
from collections import Counter

from misc_util.pretty_print import pretty_print as print
from derive_conceptualspace.util.mpl_tools import show_hist
import numpy as np
from tqdm import tqdm
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages

# NOTE: from this cell on, `print` is misc_util's pretty_print and no longer the builtin.
# `self` is an alias for the loaded filtered_dcm; the following cells refer to it by this name.
self = filtered_dcm

# For every document: the set of term indices it contains. The first element of each
# dtm entry is used as the term index; empty/falsy rows become an empty set.
occurs_in = [set(j[0] for j in i) if i else set() for i in self.dtm]

# Number of documents each term occurs in. The per-document sets are tallied in one pass
# rather than rescanning every document for every term (#terms x #docs membership tests);
# the resulting list is identical. Terms that occur in no document get a count of 0.
doc_counts = Counter()
for doc_terms in tqdm(occurs_in, desc="Counting Occurences [verbose]"):
    doc_counts.update(doc_terms)
num_occurences = [doc_counts[term_ind] for term_ind in range(len(self.all_terms))]

In [None]:
# Summary statistics of "number of documents per candidate term".
counts = pd.Series(np.array(num_occurences))
# Minimum, mean, median and maximum. The aggregator list previously contained `min`
# twice, so the maximum was never reported. String names are the documented way to
# request pandas' built-in reductions.
aggs = counts.agg(["min", "mean", "median", "max"])
# Append the 5% and 95% quantiles (indexed by 0.05 and 0.95).
aggs = pd.concat((aggs, counts.quantile([.05, .95])))
# Displayed as the cell output, cast to whole numbers for readability.
aggs.astype(int)

In [None]:
%%capture
# Histogram of how many texts contain each candidate; `fig` and `ax` are reused by the
# next cell, which restyles the figure and exports it.
fig, ax = show_hist(
    num_occurences,
    f"Docs per Candidate ({self.n_docs} docs, {len(self.all_terms)} terms)",
    xlabel="#Texts containing a Candidate",
    ylabel="Candidate-count (log scale)",
    cutoff_percentile=97,
    no_plot=False,
    log=True,
    fig_kwargs={"figsize": (16, 6)},
)

In [None]:
# Restyle the histogram created in the previous cell for use as a print graphic and
# export it as a PDF. The title is printed as text here and removed from the figure below.
print(f"Docs per Candidate ({self.n_docs} docs, {len(self.all_terms)} terms)")
# Insert a line break after every "-" in the last x tick label.
# NOTE(review): presumably that label is a range ("a-b") that is too wide on one line — confirm against show_hist.
ax.get_xticklabels()[-1].set_text(ax.get_xticklabels()[-1].get_text().replace("-","-\n"))
# Re-apply the existing axis labels with larger fonts.
ax.set_ylabel(ax.get_ylabel(), fontsize=22)
ax.set_xlabel(ax.get_xlabel(), fontsize=24)
# Re-apply the current tick labels (including the edited last x label) with a larger font.
# This has to come after the set_text call above.
ax.set_xticklabels(ax.get_xticklabels(), fontsize=24)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=24)
ax.grid()
ax.set_title("")  # the figure itself carries no title
# NOTE(review): absolute, machine-specific output path — the cell fails on any other
# machine; consider a configurable output directory.
with PdfPages("/home/chris/Documents/UNI_neu/Masterarbeit/MastersThesisText/graphics/dataset_new/docs_per_phrase.pdf") as pdf:
    plt.show()
    pdf.savefig(fig, bbox_inches='tight')

# Last expression of the cell: display the restyled figure.
fig

In [None]:
# Report, for several thresholds, how many candidate terms occur in at least that many
# descriptions, followed by the 25 most widespread terms and the matrix's maximum entry.
# Everything that does not depend on `thresh` is computed once up front rather than
# once per threshold; the printed output is unchanged.

# [term index, #descriptions containing it], most widespread term first.
sorted_canditerms = sorted([[ind, elem] for ind, elem in enumerate(num_occurences)], key=lambda x:x[1], reverse=True)
top_terms = ", ".join([f"{self.all_terms[ind]} ({occs})" for ind, occs in sorted_canditerms[:25]])

# Position of the largest entry of the sparse matrix. max_ind[0] is used as the term
# index and max_ind[1] as the document index below.
csr = self.as_csr()
max_ind = np.unravel_index(csr.argmax(), csr.shape)
# dtm[doc] is turned into a {term index: value} dict to look up the value itself.
max_msg = f"Max value: Term *b*{self.all_terms[max_ind[0]]}*b* has value *b*{dict(self.dtm[max_ind[1]])[max_ind[0]]:.3f}*b* for doc *b*{descriptions._descriptions[max_ind[1]].title}*b*"

for thresh in [25, 50, 100]:
    above_threshold = sum(occs >= thresh for occs in num_occurences)
    print(f"Found {len(self.all_terms)} candidate Terms, {above_threshold} ({round(above_threshold/len(self.all_terms)*100)}%) of which occur in at least {thresh} descriptions.")
    print("The 25 terms that occur in the most descriptions (incl the #descriptions they occur in):",
          top_terms)
    print(max_msg)
    print("\n\n")

<br><br><br><br>
# Checking Frequencies of unprocessed & processed texts:

In [None]:
# Relative and absolute term frequency of a few hand-picked terms, as reported by
# filtered_dcm.term_freq. The terms are padded to a common width so the output lines up.
terms = ["computer", "mathe", "mathematik", "wissenschaft"]
label_width = max(map(len, terms))
for term in terms:
    padded = term.ljust(label_width)
    rel_freq = filtered_dcm.term_freq(term, relative=True)
    abs_freq = filtered_dcm.term_freq(term)
    print(f"TF of `{padded}`: {rel_freq:.2%} | {abs_freq:.0f}")

In [None]:
# Compare two ways of counting the descriptions that contain a term: the `in` operator
# on the description object versus a substring test on its lower-cased unprocessed text.
for term in ("mathematik", "mathe"):
    all_descriptions = descriptions._descriptions
    n_contains = sum(1 for desc in all_descriptions if term in desc)
    n_in_raw_text = sum(1 for desc in all_descriptions if term in desc.unprocessed_text.lower())
    print(f"Checking Frequencies for {term}")
    print("  Using Description.contains:", n_contains)
    print("  Checking if it's in unprocessed-text:", n_in_raw_text)
    print()

**Open question: does lemmatizing make it better? Would "synsetizing" (mapping terms to their synsets) help?**