# Basic Plots

In [None]:
from collections import Counter

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

from misc_util.logutils import setup_logging
from misc_util.pretty_print import Markdown, display

from derive_conceptualspace.util.mpl_tools import show_hist
from derive_conceptualspace.pipeline import SnakeContext, load_envfiles,  load_lang_translate_files
from derive_conceptualspace.create_spaces.preprocess_descriptions import preprocess_descriptions_full
from fb_classifier.preprocess_data import make_classifier_dict, make_classifier_class

def flatten(l):
    """Return a flat list with the items of every sub-iterable of ``l``.

    Exactly one level of nesting is removed: the items of the sub-iterables
    are copied into the result unchanged, in iteration order.
    """
    return [item for sublist in l for item in sublist]
# Default figure size used by every plot created in this notebook.
plt.rcParams.update({'figure.figsize': [16, 10]})

In [None]:
# Configure logging before anything else runs.
setup_logging()
# Load the environment files registered under the name "siddata".
load_envfiles("siddata")
# Create the pipeline context with MIN_WORDS_PER_DESC overridden to 0
# (presumably so that no description is filtered out for being too short
# -- TODO confirm against the pipeline defaults).
ctx = SnakeContext.loader_context(config={"MIN_WORDS_PER_DESC": 0}, silent=False)

In [None]:
# Load the raw descriptions via ``ctx.p`` from the configured file.
raw_descriptions = ctx.p.load(
    ctx.get_config("raw_descriptions_file"),
    "raw_descriptions",
)
# Language and translation information for the configured preprocessing components.
languages, translations = load_lang_translate_files(
    ctx,
    ctx.p,
    ctx.get_config("pp_components"),
)
# Run the full preprocessing; only the first element of the returned pair is
# used in this notebook.
descriptions, _ = preprocess_descriptions_full(
    raw_descriptions,
    ctx.obj["dataset_class"],
    ctx.get_config("pp_components"),
    "de",
    ctx.get_config("translate_policy"),
    languages,
    verbose=False,
)

# Words per Description

In [None]:
%%capture
fig, ax = show_hist([i.n_words() for i in descriptions._descriptions], "Words per Description", xlabel="Number of Words", cutoff_percentile=98, no_plot=False)

In [None]:
# Restyle the histogram axes returned by show_hist: larger labels, no title.
ax.set_ylabel("Count of Descriptions", fontsize=26)
ax.set_xlabel("Number of Words", fontsize=26)
# Resize the tick labels via tick_params instead of re-assigning the current
# label texts with set_xticklabels/set_yticklabels: the latter freezes the
# label strings without fixing the tick positions, which matplotlib
# discourages because labels can end up at unexpected positions.
ax.tick_params(axis="both", which="major", labelsize=20)
# NOTE: called without arguments, grid() toggles the grid visibility, so
# re-running this cell flips the grid on and off.
ax.grid()
# Trailing semicolon suppresses the notebook's echo of the return value.
ax.set_title("");

In [None]:
# Target file for the exported histogram (absolute path on the author's machine).
words_per_desc_pdf = "/home/chris/Documents/UNI_neu/Masterarbeit/MastersThesisText/graphics/dataset_new/words_per_desc.pdf"

with PdfPages(words_per_desc_pdf) as pdf:
    plt.show()
    # Write the restyled histogram figure as a tightly cropped PDF page.
    pdf.savefig(fig, bbox_inches="tight")

# Last expression of the cell: displays the figure in the notebook.
fig

# Number of Descriptions per Faculty (Fachbereich)

In [None]:
# Read the "veranstaltungsnummer" entry of every description; falsy results
# are normalised to None.
# SECURITY NOTE(review): eval() executes arbitrary code contained in the
# dataset. If the stored values are plain Python literals (presumably they
# are -- confirm), ast.literal_eval would be the safe replacement.
veranst_nums = [
    eval(desc._additionals.get("veranstaltungsnummer")) or None
    for desc in descriptions._descriptions
]
new_dset = make_classifier_dict(dict(enumerate(veranst_nums)))

# Keep only numeric labels up to 10 (presumably the faculty numbers -- TODO
# confirm); entries that are "other" as a whole or per label are skipped.
usables = {
    k: [int(v) for v in vs if v != "other" and int(v) <= 10]
    for k, vs in new_dset.items()
    if vs != "other"
}
# Drop courses that are left without any usable label. The labels are ints at
# this point, so an additional "is not None" check would always be true.
usables = {k: v for k, v in usables.items() if v}

n_dropped = len(new_dset) - len(usables)
print(f"Dropped {n_dropped}/{len(new_dset)} ({n_dropped / len(new_dset) * 100:.2f}%) courses")

# Count the courses per label and order the bars by ascending label number.
label_counts = Counter(flatten(usables.values()))
counter = {str(label): count for label, count in sorted(label_counts.items())}
#counter["Other"] = len(new_dset)-len(usables)
# NOTE: plt.bar returns the bar container, not a Figure; the name ``fig`` is
# kept unchanged in case other cells refer to it.
fig = plt.bar(list(counter.keys()), list(counter.values()))
plt.title("Number of Courses per Faculty");