# Basic Plots

In [None]:
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.patches as mpatches
from tqdm import tqdm

from misc_util.logutils import setup_logging
from misc_util.pretty_print import Markdown, display

from derive_conceptualspace.util.mpl_tools import show_hist
from derive_conceptualspace.pipeline import SnakeContext, load_envfiles,  load_lang_translate_files
from derive_conceptualspace.create_spaces.preprocess_descriptions import preprocess_descriptions_full
from fb_classifier.preprocess_data import make_classifier_dict, make_classifier_class
from derive_conceptualspace.load_data.dataset_specifics.siddata2022 import Dataset

def flatten(l):
    """Return a flat list with the items of every sub-iterable of *l*, in order (one level deep)."""
    return [item for sublist in l for item in sublist]
# Default figure size for matplotlib figures that do not pass their own `figsize`.
plt.rcParams['figure.figsize'] = [16, 10]
# Table styles (18px font for header and data cells) passed to `Styler.set_table_styles` below.
largedfstyle = [
    {"selector": cell_selector, "props": [("font-size", "18px")]}
    for cell_selector in ("th", "td")
]

In [None]:
# Set up logging and load the "siddata" env-files before creating the context.
setup_logging()
load_envfiles("siddata")
# Config overrides for this notebook.
# NOTE(review): MIN_WORDS_PER_DESC=0 presumably disables the minimum-length filter - confirm.
ctx = SnakeContext.loader_context(
    config={"MIN_WORDS_PER_DESC": 0, "debug": False},
    silent=False,
)

In [None]:
# Load the raw descriptions and the language/translation files, then run the full preprocessing on them.
raw_descriptions_file = ctx.get_config("raw_descriptions_file")
raw_descriptions = ctx.p.load(raw_descriptions_file, "raw_descriptions")
languages, translations = load_lang_translate_files(ctx, ctx.p, ctx.get_config("pp_components"))
# Only the first element of the returned pair (the preprocessed descriptions) is used in this notebook.
# NOTE(review): "de" is presumably the language argument - confirm against preprocess_descriptions_full.
descriptions, _ = preprocess_descriptions_full(
    raw_descriptions,
    ctx.obj["dataset_class"],
    ctx.get_config("pp_components"),
    "de",
    ctx.get_config("translate_policy"),
    languages,
    verbose=False,
)

# Words per Description

In [None]:
%%capture
# Histogram over the number of words of every description; figure and axes are kept for the styling cells below.
words_per_description = [desc.n_words() for desc in descriptions._descriptions]
fig, ax = show_hist(
    words_per_description,
    "Words per Description",
    xlabel="Number of Words",
    cutoff_percentile=98,
    no_plot=False,
    fig_kwargs=dict(figsize=(16, 6)),
)

In [None]:
# Insert a line break after each "-" in the text of the last x tick label.
ax.get_xticklabels()[-1].set_text(ax.get_xticklabels()[-1].get_text().replace("-","-\n"))
# Axis captions at font size 28.
ax.set_ylabel("Count of Descriptions", fontsize=28)
ax.set_xlabel("Number of Words", fontsize=28)
# Re-apply the current tick labels at font size 24.
ax.set_xticklabels(ax.get_xticklabels(), fontsize=24)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=24)
ax.grid()
# Blank out the title. NOTE(review): the trailing ";" presumably only suppresses the notebook's output echo.
ax.set_title("");

In [None]:
# Save the words-per-description figure as a PDF at the hard-coded path below.
words_per_desc_pdf = "/home/chris/Documents/UNI_neu/Masterarbeit/MastersThesisText/graphics/dataset_new/words_per_desc.pdf"
with PdfPages(words_per_desc_pdf) as pdf:
    plt.show()
    pdf.savefig(fig, bbox_inches='tight')

# Last expression of the cell, so the notebook renders the figure.
fig

In [None]:
# Bag-of-words of every description, keyed by its title.
vecs = {desc.title: desc.bow() for desc in descriptions._descriptions}
all_words = set(flatten([set(bow.keys()) for bow in vecs.values()]))
# Per word: "df" = number of bag-of-words vectors containing it, "stf" = its occurrences summed over all vectors.
summed_counts = {word: {"df": 0, "stf": 0} for word in all_words}
for bow in tqdm(vecs.values()):
    for word, occs in bow.items():
        word_stats = summed_counts[word]
        word_stats["stf"] += occs
        word_stats["df"] += 1
# Thresholds: fixed values plus a tenth and a quarter of the number of descriptions.
thresholds = [1, 2, 5, 10, 25, 50, 100, 500, 1000, len(vecs)//10, len(vecs)//4]
words_of_len = {}
for key in ["df", "stf"]:
    # Number of words whose df (or stf) reaches each threshold.
    words_of_len[key] = {
        nwords: sum(1 for word_stats in summed_counts.values() if word_stats[key] >= nwords)
        for nwords in thresholds
    }
display(Markdown("### Number of words that have a df or stf of at least..."))
pd.DataFrame(words_of_len).T.style.set_table_styles(largedfstyle)

In [None]:
# Per description: vocabulary size ("unique words") and total token count ("words"), one row per title.
vecs = {i.title: i.bow() for i in descriptions._descriptions}
counts = pd.DataFrame({k: {"unique words": len(v), "words": sum(v.values())} for k, v in vecs.items()}).T
# Summary rows: minimum, mean, median and maximum of both columns.
aggs = counts.agg((min, np.mean, np.median, max), axis="rows")
# Append the 5% and 95% quantiles as two further rows.
aggs = pd.concat((aggs, counts.quantile([.05, .95])))
# Cast to int for display (fractional parts of mean/median/quantiles are dropped).
aggs.astype(int).style.set_table_styles(largedfstyle)

In [None]:
# Same statistics as for the full dataset, restricted to descriptions with at least 80 words in total.
# The bag-of-words is computed once per description and reused for both the filter and the table.
title_bow_pairs = ((desc.title, desc.bow()) for desc in descriptions._descriptions)
vecs = {title: bow for title, bow in title_bow_pairs if sum(bow.values()) >= 80}
counts = pd.DataFrame({k: {"unique words": len(v), "words": sum(v.values())} for k, v in vecs.items()}).T
# Summary rows: minimum, mean, median and maximum of both columns.
aggs = counts.agg((min, np.mean, np.median, max), axis="rows")
# Append the 5% and 95% quantiles as two further rows.
aggs = pd.concat((aggs, counts.quantile([.05, .95])))
# Cast to int for display (fractional parts of mean/median/quantiles are dropped).
aggs.astype(int).style.set_table_styles(largedfstyle)

# Number of Descriptions per Fachbereich (Faculty)

In [None]:
#TODO this functionality is also in the Dataset-class of siddata2022, use that one!
# NOTE(review): eval() executes whatever code is stored in the "veranstaltungsnummer" field. If that field only
# ever holds Python literals, ast.literal_eval would be the safe replacement - confirm before switching.
veranst_nums = []
for desc in descriptions._descriptions:
    # Falsy results are normalised to None.
    veranst_nums.append(eval(desc._additionals.get("veranstaltungsnummer")) or None)
new_dset = make_classifier_dict(dict(enumerate(veranst_nums)))
# Keep, per course, only numeric classes up to 10; courses left with no class are dropped.
usables = {}
for course_idx, classes in new_dset.items():
    if classes != "other":
        kept = [int(cls) for cls in classes if cls != "other" and int(cls) <= 10]
        if kept and any(cls is not None for cls in kept):
            usables[course_idx] = kept
print(f"Dropped {len(new_dset)-len(usables)}/{len(new_dset)} ({(len(new_dset)-len(usables))/len(new_dset)*100:.2f}%) courses")
# Occurrences per class, sorted numerically, with string keys.
class_counts = Counter(flatten(usables.values()))
counter = {str(cls): n_courses for cls, n_courses in sorted(class_counts.items(), key=lambda pair: int(pair[0]))}
#counter["Other"] = len(new_dset)-len(usables)


def rescale(y, fac=1):
    """Min-max rescale *y*; *fac* multiplies the maximum in the denominator, so fac > 1 keeps results below 1."""
    return (y - np.min(y)) / (np.max(y)*fac - np.min(y))


fig, ax = plt.subplots(figsize=(16,8))
# One cividis colour per bar; fac=3 restricts the colours to the lower part of the colormap.
bar_colors = plt.get_cmap("cividis")(rescale(range(len(counter)), fac=3))
ax.bar(*zip(*counter.items()), color=bar_colors)
ax.set_title(f"Number of Courses per Faculty")
# Uniform font size for title, axis labels and tick labels.
text_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
text_items += ax.get_xticklabels()
text_items += ax.get_yticklabels()
for text_item in text_items:
    text_item.set_fontsize(27)

# NOTE(review): legend entries are paired with bar colours purely by position; this assumes
# Dataset.FB_MAPPER.values() is ordered like the bars and that no faculty is missing from `counter` - confirm.
patches = [mpatches.Patch(label=fb_name, color=col) for fb_name, col in zip(Dataset.FB_MAPPER.values(), bar_colors)]
plt.legend(handles=patches, fontsize=18, bbox_to_anchor=(1.13, .999), framealpha=1)

# Save the bar chart as a PDF at the hard-coded path below.
courses_per_faculty_pdf = "/home/chris/Documents/UNI_neu/Masterarbeit/MastersThesisText/graphics/dataset_new/courses_per_faculty.pdf"
with PdfPages(courses_per_faculty_pdf) as pdf:
    fig.show()
    pdf.savefig(fig, bbox_inches='tight')