In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm_notebook as tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from stop_words import get_stop_words

### Config

In [None]:
# Global matplotlib look-and-feel for every figure in this notebook.
# The seaborn style sheets were renamed to "seaborn-v0_8-*" in matplotlib 3.6
# and the old names were later removed, so use whichever name this install
# ships. If neither is listed, fall back to the original name so the original
# error from plt.style.use() is raised unchanged.
_whitegrid_candidates = ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
plt.style.use(
    next(
        (name for name in _whitegrid_candidates if name in plt.style.available),
        "seaborn-whitegrid",
    )
)

# Font type 42 (TrueType) for PostScript and PDF output.
plt.rc('ps',fonttype = 42)
plt.rc('pdf',fonttype = 42)
plt.rcParams.update({'font.size': 20})
# Use AFM / the 14 PDF core fonts when writing PS and PDF files.
plt.rcParams['ps.useafm'] = True
plt.rcParams['pdf.use14corefonts'] = True
# Draw minus signs as ASCII hyphens instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

### Load data

In [None]:
# Load the pre-processed texts table from its pickle file (path is relative to
# the notebook's working directory).
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
texts_df = pd.read_pickle("data/processed/texts.p")

### Prepare data

#### Fix authors

In [None]:
# Author spellings to rewrite: each pair is (spelling found in the data,
# spelling to use instead). Only exact, whole-value matches are replaced.
author_fix = dict(
    [
        ("Bachiller D. P. Gatell", "Bachiller D. P. Gatell."),
        ("Eliza Haywood", "Eliza Fowler Haywood"),
    ]
)
texts_df["author"] = texts_df["author"].replace(to_replace=author_fix)

In [None]:
# One row per (filename, author, language): all text pieces belonging to the
# same file are concatenated, separated by single spaces.
text_by_file_df = (
    texts_df
    .groupby(["filename", "author", "language"])["text"]
    .apply(" ".join)
    .to_frame()
)

In [None]:
# Per-file topics: keep only the topics that occur on every row of the file
# (set intersection across the group's "topics" values).
# Earlier experiment, kept for reference -- union of the per-file intersections:
#   set.union(*texts_df.groupby("filename")["topics"].agg(lambda x: set.intersection(*x)).tolist())
text_by_file_df["topics"] = (
    texts_df
    .groupby(["filename", "author", "language"])["topics"]
    .agg(lambda topic_sets: set.intersection(*topic_sets))
)

In [None]:
# Inspect the per-file frame: joined text plus the topics shared by all rows
# of each file, indexed by (filename, author, language).
text_by_file_df

### Topic Modeling

In [None]:
# Languages to fit a topic model for. Each entry is compared against the
# "language" index level of text_by_file_df and lower-cased for the
# stop-word lookup.
languages = ["German"]

In [None]:
def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays (each must support ``.argsort()``).
        feature_names: Sequence mapping a column index to its word.
        num_top_words: How many words to print per topic.

    Returns:
        None. For each topic, a header line, a space-separated word line
        (highest weight first) and a separator line are written to stdout.
    """
    separator = "---------------"
    for topic_no, weights in enumerate(model.components_):
        print("Topic {}:".format(topic_no))
        # argsort() is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:num_top_words]
        words = [feature_names[col] for col in strongest]
        print(" ".join(words))
        print(separator)

In [None]:
# Fit and print an LDA topic model for every configured language.
num_top_words = 15  # words printed per topic
lda_seed = 0        # fixed seed so the printed topics are reproducible across runs

for lang in languages:
    # Keep only the files whose "language" index level matches.
    in_lang = text_by_file_df.index.get_level_values("language") == lang
    lang_texts_df = text_by_file_df.loc[in_lang]
    if lang_texts_df.empty:
        # Fail with a clear message instead of a cryptic set.union() TypeError.
        raise ValueError("no documents found for language {!r}".format(lang))

    # All topics attached to any file of this language (currently informational
    # only: the number of LDA topics is fixed at 10 rather than derived from it).
    lang_topics = set.union(*lang_texts_df["topics"])
    num_topics = 10#len(lang_topics)

    stop_words = get_stop_words(lang.lower())

    # Bag-of-words counts; drop terms found in more than 80% of the documents
    # or in fewer than 2 documents, plus the language's stop words.
    tf_vectorizer = CountVectorizer(max_df=0.8, min_df=2, stop_words=stop_words)
    tf = tf_vectorizer.fit_transform(lang_texts_df["text"].tolist())
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(tf_vectorizer, "get_feature_names_out"):
        tf_feature_names = tf_vectorizer.get_feature_names_out()
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()

    lda = LatentDirichletAllocation(
        n_components=num_topics,
        learning_method="online",
        learning_offset=50,
        random_state=lda_seed,
    ).fit(tf)

    display_topics(lda, tf_feature_names, num_top_words)