### Imports

In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### Config

In [None]:
# Global plot styling.
# matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases removed the old names. Prefer the new name when this
# matplotlib ships it; otherwise fall back to the legacy name (which still
# raises if that is missing too, exactly as before).
_whitegrid_style = "seaborn-v0_8-whitegrid"
if _whitegrid_style not in plt.style.available:
    _whitegrid_style = "seaborn-whitegrid"
plt.style.use(_whitegrid_style)

# Font handling for the PostScript and PDF backends.
plt.rc('ps', fonttype=42)
plt.rc('pdf', fonttype=42)
plt.rcParams.update({'font.size': 20})
plt.rcParams['ps.useafm'] = True
plt.rcParams['pdf.use14corefonts'] = True
# Draw minus signs as an ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

### Load data

In [None]:
# Load the preprocessed texts table.
# NOTE: read_pickle unpickles arbitrary Python objects, so only point it at
# files you trust.
texts_pickle_path = "data/processed/texts.p"
texts_df = pd.read_pickle(texts_pickle_path)

In [None]:
# Display the row(s) stored under index label 0 as a quick look at one record.
texts_df.loc[0]

In [None]:
# Full text per file: join every "text" row that shares a filename with a
# single space. Selecting the column before apply() matches how the text-length
# cells build the same series and avoids applying over the whole grouped frame
# (including the grouping column), which newer pandas versions warn about.
texts_df.groupby("filename")["text"].apply(" ".join)

In [None]:
# Number of distinct filenames that have at least one row whose "places"
# entry is non-empty.
has_places = texts_df["places"].apply(lambda places: len(places) > 0)
len(texts_df[has_places]["filename"].unique())

In [None]:
# Distinct languages per (title, author, volume, issue) group, narrowed down
# to the groups that contain exactly two languages.
languages_per_issue = texts_df.groupby(["title", "author", "volume", "issue"])["language"].unique()
is_bilingual = languages_per_issue.apply(lambda langs: len(langs) == 2)
languages_per_issue[is_bilingual]

### Calculate statistics

#### Languages

In [None]:
# Bar chart: how many (title, author, volume, issue) groups use each
# combination of languages.
issue_languages = texts_df.groupby(["title", "author", "volume", "issue"])["language"].unique()
# unique() yields one ndarray per group, and ndarrays are unhashable, so
# value_counts() cannot tally them directly. Collapse each array into a
# sorted, comma-separated string label first; sorting keeps the same set of
# values in one bar regardless of the order they appeared in.
language_labels = issue_languages.apply(lambda values: ", ".join(sorted(map(str, values))))
ax = language_labels.value_counts().plot(kind="bar", figsize=(5, 7))
for bar in ax.patches:
    count = bar.get_height()
    # Write the count just above the bar, formatted as a whole number.
    ax.text(bar.get_x() + bar.get_width() / 2.0, count + 0.1, "{:.0f}".format(count), ha="center", fontsize=10)
plt.tight_layout()
plt.savefig("results/plots/languages.pdf")
plt.close()

#### Countries

In [None]:
# Bar chart: how many (title, author, volume, issue) groups fall under each
# combination of countries.
issue_countries = texts_df.groupby(["title", "author", "volume", "issue"])["country"].unique()
# unique() yields one ndarray per group, and ndarrays are unhashable, so
# value_counts() cannot tally them directly. Collapse each array into a
# sorted, comma-separated string label first.
country_labels = issue_countries.apply(lambda values: ", ".join(sorted(map(str, values))))
ax = country_labels.value_counts().plot(kind="bar", figsize=(25, 5))
for bar in ax.patches:
    count = bar.get_height()
    # Write the count just above the bar, formatted as a whole number.
    ax.text(bar.get_x() + bar.get_width() / 2.0, count + 0.1, "{:.0f}".format(count), ha="center", fontsize=10)
plt.tight_layout()
plt.savefig("results/plots/countries.pdf")
plt.close()

#### Topics

In [None]:
# Tally how many files mention each topic. Only the first row per filename is
# used, so a file's topic collection is counted once. Counter keeps topics in
# first-seen order, and the explicit int64 dtype keeps the counts integral
# (a bare pd.Series() has no integer dtype and is grown one label at a time).
file_topics = texts_df.drop_duplicates(subset=["filename"])["topics"]
topic_tally = Counter(topic for topic_list in file_topics for topic in topic_list)
topics_count = pd.Series(topic_tally, dtype="int64")

In [None]:
# Bar chart of the per-topic tallies, with each bar's value printed above it.
ax = topics_count.plot.bar(figsize=(25, 7))
for bar in ax.patches:
    bar_top = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_top + 0.1, bar_top, ha="center", fontsize=10)
plt.tight_layout()
plt.savefig("results/plots/topics.pdf")
plt.close()

#### Date

In [None]:
# Bar chart: number of files per date value.
file_dates = texts_df.groupby("filename")["date"].unique()
# unique() yields one ndarray per file, and ndarrays are unhashable, so
# value_counts() cannot tally them directly. Collapse each array into a
# sorted, comma-separated string label first.
date_labels = file_dates.apply(lambda values: ", ".join(sorted(map(str, values))))
ax = date_labels.value_counts().plot(kind="bar", figsize=(80, 5))
for bar in ax.patches:
    count = bar.get_height()
    # Write the count just above the bar, formatted as a whole number.
    ax.text(bar.get_x() + bar.get_width() / 2.0, count + 0.1, "{:.0f}".format(count), ha="center", fontsize=10)
# NOTE: this cell neither saves nor closes the figure; it is only shown inline.

#### Authors

In [None]:
# Bar chart: number of files per author (or author combination).
file_authors = texts_df.groupby("filename")["author"].unique()
# unique() yields one ndarray per file, and ndarrays are unhashable, so
# value_counts() cannot tally them directly. Collapse each array into a
# sorted, comma-separated string label first.
author_labels = file_authors.apply(lambda values: ", ".join(sorted(map(str, values))))
ax = author_labels.value_counts().plot(kind="bar", figsize=(25, 25))
for bar in ax.patches:
    count = bar.get_height()
    # Write the count just above the bar, formatted as a whole number.
    ax.text(bar.get_x() + bar.get_width() / 2.0, count + 0.1, "{:.0f}".format(count), ha="center", fontsize=10)
plt.tight_layout()
plt.savefig("results/plots/authors.pdf")
plt.close()

#### Text length

In [None]:
# Histogram of per-file text length in characters, where a file's text is all
# of its "text" rows joined with single spaces.
joined_texts = texts_df.groupby("filename")["text"].apply(lambda parts: " ".join(parts))
char_counts = joined_texts.apply(len)
char_counts.plot(kind="hist")
plt.tight_layout()
plt.savefig("results/plots/textlength.pdf")
plt.close()

In [None]:
# Character count of the longest file (rows joined with single spaces).
longest_candidates = texts_df.groupby("filename")["text"].apply(lambda parts: " ".join(parts))
longest_candidates.apply(len).max()

#### Which author where

In [None]:
# Count files per (author, country) pair, using one row per filename, then
# spread the countries into columns.
author_country_counts = (
    texts_df.drop_duplicates(subset=["filename"])
    .groupby(["author", "country"])
    .size()
    .reset_index()
)
# After reset_index() the size column is labelled 0, hence values=0.
a_c_table = author_country_counts.pivot_table(index=["author"], columns=["country"], values=0)

In [None]:
# Draw the author-by-country table as an un-annotated heatmap on a 40x40
# figure and write it to PDF.
plt.figure(figsize=(40, 40))
sns.heatmap(data=a_c_table, annot=False)
plt.tight_layout()
plt.savefig("results/plots/authors_to_countries.pdf")
plt.close()

#### Which topic where

In [None]:
# One-column frame holding each row's country, keyed by the original index.
countries = texts_df["country"].to_frame()

# Expand each row's topic collection into its own columns, then melt back to
# long form so that every (row index, topic) pair becomes one record.
topics_wide = texts_df["topics"].apply(lambda row_topics: pd.Series(list(row_topics)))
topics_long = topics_wide.reset_index().melt(id_vars="index").dropna()
# Keep only the originating row index and the topic value, indexed by row.
topics = topics_long[["index", "value"]].set_index("index")

In [None]:
# Attach each topic record to the country of the row it came from by joining
# on the shared row index, then give the two columns readable names.
t_c_df = topics.merge(countries, left_index=True, right_index=True)
t_c_df.columns = ["topic", "country"]

In [None]:
# Count records per (topic, country) pair and spread the countries into columns.
topic_country_counts = t_c_df.groupby(["topic", "country"]).size().reset_index()
# After reset_index() the size column is labelled 0, hence values=0.
t_c_table = topic_country_counts.pivot_table(index=["topic"], columns=["country"], values=0)

In [None]:
# Draw the topic-by-country table as an un-annotated heatmap on a 40x20
# figure and write it to PDF.
plt.figure(figsize=(40, 20))
sns.heatmap(data=t_c_table, annot=False)
plt.tight_layout()
plt.savefig("results/plots/topics_to_countries.pdf")
plt.close()

#### Which author which language

In [None]:
# Count files per (author, language) pair, using one row per filename, then
# spread the languages into columns.
author_language_counts = (
    texts_df.drop_duplicates(subset=["filename"])
    .groupby(["author", "language"])
    .size()
    .reset_index()
)
# After reset_index() the size column is labelled 0, hence values=0.
a_l_table = author_language_counts.pivot_table(index=["author"], columns=["language"], values=0)

In [None]:
# Draw the author-by-language table as an un-annotated heatmap on a 40x40
# figure and write it to PDF.
plt.figure(figsize=(40, 40))
sns.heatmap(data=a_l_table, annot=False)
plt.tight_layout()
plt.savefig("results/plots/authors_to_languages.pdf")
plt.close()

#### NDE

In [None]:
# Bar chart of how many rows carry each "nde" value.
rows_per_nde = texts_df.groupby("nde").size()
rows_per_nde.plot.bar()
plt.tight_layout()
plt.savefig("results/plots/ndes.pdf")
plt.close()

#### NDF

In [None]:
# Bar chart of how many rows carry each "ndf" value.
rows_per_ndf = texts_df.groupby("ndf").size()
rows_per_ndf.plot.bar()
plt.tight_layout()
plt.savefig("results/plots/ndfs.pdf")
plt.close()

#### NDE to NDF

In [None]:
# Count rows per (nde, ndf) pair; ndf values become the table's rows and nde
# values its columns.
nde_ndf_counts = texts_df.groupby(["nde", "ndf"]).size().reset_index()
# After reset_index() the size column is labelled 0, hence values=0.
nde_ndf_table = nde_ndf_counts.pivot_table(index=["ndf"], columns=["nde"], values=0)

In [None]:
# Draw the ndf-by-nde table as an un-annotated heatmap on a 10x10 figure and
# write it to PDF.
plt.figure(figsize=(10, 10))
sns.heatmap(data=nde_ndf_table, annot=False)
plt.tight_layout()
plt.savefig("results/plots/ndfs_to_ndes.pdf")
plt.close()

#### Author to NDF

In [None]:
# Count rows per (author, ndf) pair and spread the ndf values into columns.
# Unlike the author/country and author/language tables, this one counts every
# row of texts_df rather than one row per filename.
author_ndf_counts = texts_df.groupby(["author", "ndf"]).size().reset_index()
# After reset_index() the size column is labelled 0, hence values=0.
author_ndf_table = author_ndf_counts.pivot_table(index=["author"], columns=["ndf"], values=0)

In [None]:
# Draw the author-by-ndf table as an un-annotated heatmap on a 40x30 figure
# and write it to PDF.
plt.figure(figsize=(40, 30))
sns.heatmap(data=author_ndf_table, annot=False)
plt.tight_layout()
plt.savefig("results/plots/authors_to_ndfs.pdf")
plt.close()