# Lengths of the associated Bags-of-Words

In [None]:
from functools import partial

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
from tqdm import tqdm

from misc_util.pretty_print import Markdown
from derive_conceptualspace.util.mpl_tools import show_hist
from derive_conceptualspace.load_data.load_semanticspaces import load_ppmi_weighted_feature_vectors

# Default size applied to every figure created in this notebook.
plt.rcParams["figure.figsize"] = [16, 8]
# Table styles passed to `Styler.set_table_styles`: the same font size for
# header cells ("th") and data cells ("td").
largedfstyle = [
    {"selector": cell_tag, "props": [("font-size", "18px")]}
    for cell_tag in ("th", "td")
]
def flatten(l):
    """Return one flat list with the items of every sub-iterable of ``l``.

    Exactly one level of nesting is removed; the order of the items is kept.
    """
    return [item for sublist in l for item in sublist]

## Places
### Words overall

In [None]:
# Load the feature vectors of the "places" dataset. The code below treats
# `vecs` as a mapping entity -> {word: occurrences}.
data_base = "/home/chris/Documents/UNI_neu/Masterarbeit/data_new/semanticspaces/"
vecs = load_ppmi_weighted_feature_vectors(data_base, "places")

# One row per entity: number of distinct words and summed word occurrences.
bow_sizes = {}
for entity, bow in vecs.items():
    bow_sizes[entity] = {"unique words": len(bow), "words": sum(bow.values())}
counts = pd.DataFrame(bow_sizes).T

In [None]:
# Vocabulary: every word that occurs in at least one entity.
all_words = {word for bow in vecs.values() for word in bow.keys()}

# Per word: "df" counts the entities that contain it, "stf" sums its
# occurrences over all entities.
summed_counts = {word: {"df": 0, "stf": 0} for word in all_words}
for bow in tqdm(vecs.values()):
    for word, occs in bow.items():
        tally = summed_counts[word]
        tally["stf"] += occs
        tally["df"] += 1

# For each threshold, count the words whose df (resp. stf) reaches it.
# The last two thresholds are 10% and 25% of the number of entities.
thresholds = [1, 2, 5, 10, 25, 50, 100, 500, 1000, len(vecs) // 10, len(vecs) // 4]
words_of_len = {
    key: {
        nwords: sum(1 for tally in summed_counts.values() if tally[key] >= nwords)
        for nwords in thresholds
    }
    for key in ["df", "stf"]
}
display(Markdown("### Number of words that have a df or stf of at least..."))
pd.DataFrame(words_of_len).T.style.set_table_styles(largedfstyle)

In [None]:
%%capture
# Histogram over the total number of words per entity. The axis labels are
# restyled in the following cell, which replaces the y-label passed here.
words_per_entity = counts["words"]
fig, ax = show_hist(
    words_per_entity,
    cutoff_percentile=90,
    zero_bin=True,
    ylabel="unique words",
)

In [None]:
# Axis titles in the font size used for the thesis figures.
ax.set_ylabel("Number of Entities", fontsize=26)
ax.set_xlabel("Words", fontsize=26)
# Put a line break after the dash in the last x tick label. The label texts
# are read once and edited as plain strings: editing the Text object in place
# and then calling `ax.get_xticklabels()` again is fragile, because matplotlib
# may regenerate the label text from the formatter on that call.
xtick_texts = [label.get_text() for label in ax.get_xticklabels()]
xtick_texts[-1] = xtick_texts[-1].replace("-", "-\n")
ax.set_xticklabels(xtick_texts, fontsize=22)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=22)
ax.grid()
# The trailing semicolon suppresses the notebook's echo of the return value.
ax.set_title("");

In [None]:
# Write the styled figure into a single-page PDF for the thesis text.
pdf_target = "/home/chris/Documents/UNI_neu/Masterarbeit/MastersThesisText/graphics/figures/placetypes_dist.pdf"
with PdfPages(pdf_target) as pdf:
    plt.show()
    pdf.savefig(fig, bbox_inches="tight")

# Last expression of the cell: renders the figure as the cell output.
fig

<br><br> 
### Unique words

In [None]:
%%capture
# Histogram over the number of distinct words per entity. The axis labels are
# restyled in the following cell.
unique_words_per_entity = counts["unique words"]
fig, ax = show_hist(
    unique_words_per_entity,
    cutoff_percentile=92,
    zero_bin=True,
    ylabel="unique words",
)

In [None]:
# Axis titles in the font size used for the thesis figures.
ax.set_ylabel("Number of Entities", fontsize=26)
ax.set_xlabel("Unique Words", fontsize=26)
# Put a line break after the dash in the last x tick label. The label texts
# are read once and edited as plain strings: editing the Text object in place
# and then calling `ax.get_xticklabels()` again is fragile, because matplotlib
# may regenerate the label text from the formatter on that call.
xtick_texts = [label.get_text() for label in ax.get_xticklabels()]
xtick_texts[-1] = xtick_texts[-1].replace("-", "-\n")
ax.set_xticklabels(xtick_texts, fontsize=22)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=22)
ax.grid()
# The trailing semicolon suppresses the notebook's echo of the return value.
ax.set_title("");

In [None]:
# Write the styled figure into a single-page PDF for the thesis text.
pdf_target = "/home/chris/Documents/UNI_neu/Masterarbeit/MastersThesisText/graphics/figures/placetypes_dist_unique.pdf"
with PdfPages(pdf_target) as pdf:
    plt.show()
    pdf.savefig(fig, bbox_inches="tight")

# Last expression of the cell: renders the figure as the cell output.
fig

In [None]:
# Summary statistics of both count columns over all entities: minimum, mean,
# median and maximum. The original listed `min` twice, so the maximum was
# never reported. String names are used so the row labels stay
# "min"/"mean"/"median"/"max".
aggs = counts.agg(["min", "mean", "median", "max"], axis="rows")
# Append the 5% and 95% quantiles as two further rows.
aggs = pd.concat((aggs, counts.quantile([.05, .95])))
# Show whole numbers only, in the enlarged table style.
aggs.astype(int).style.set_table_styles(largedfstyle)

<br><br><br><br> 
## Movies

In [None]:
# Load the feature vectors of the "movies" dataset; this rebinds `vecs`.
data_base = "/home/chris/Documents/UNI_neu/Masterarbeit/data_new/semanticspaces/"
vecs = load_ppmi_weighted_feature_vectors(
    data_base,
    "movies",
)

In [None]:
# Vocabulary: every word that occurs in at least one entity.
all_words = {word for bow in vecs.values() for word in bow.keys()}

# Per word: "df" counts the entities that contain it, "stf" sums its
# occurrences over all entities.
summed_counts = {word: {"df": 0, "stf": 0} for word in all_words}
for bow in tqdm(vecs.values()):
    for word, occs in bow.items():
        tally = summed_counts[word]
        tally["stf"] += occs
        tally["df"] += 1

# For each threshold, count the words whose df (resp. stf) reaches it.
# The last two thresholds are 10% and 25% of the number of entities.
thresholds = [1, 2, 5, 10, 25, 50, 100, 500, 1000, len(vecs) // 10, len(vecs) // 4]
words_of_len = {
    key: {
        nwords: sum(1 for tally in summed_counts.values() if tally[key] >= nwords)
        for nwords in thresholds
    }
    for key in ["df", "stf"]
}
display(Markdown("### Number of words that have a df or stf of at least..."))
pd.DataFrame(words_of_len).T.style.set_table_styles(largedfstyle)

In [None]:
# Keep only the 15000 entities with the highest summed word count.
# "we selected the 15.000 movies whose associated reviews contained the highest
# number of words" - selecting them this way is consistent with the rest of
# the data.
total_words = {name: sum(bow.values()) for name, bow in vecs.items()}
ranked = sorted(total_words.items(), key=lambda pair: pair[1], reverse=True)
vecs = {name: vecs[name] for name, _ in ranked[:15000]}

# One row per remaining entity: number of distinct words and summed occurrences.
bow_sizes = {}
for name, bow in vecs.items():
    bow_sizes[name] = {"unique words": len(bow), "words": sum(bow.values())}
counts = pd.DataFrame(bow_sizes).T

In [None]:
# Summary statistics of both count columns over all entities: minimum, mean,
# median and maximum. The original listed `min` twice, so the maximum was
# never reported. String names are used so the row labels stay
# "min"/"mean"/"median"/"max".
aggs = counts.agg(["min", "mean", "median", "max"], axis="rows")
# Append the 5% and 95% quantiles as two further rows.
aggs = pd.concat((aggs, counts.quantile([.05, .95])))
# Show whole numbers only, in the enlarged table style.
aggs.astype(int).style.set_table_styles(largedfstyle)