In [None]:
import glob
import itertools
import json

import pandas as pd

from lxml import etree as ET

import textacy.tm
import textacy.vsm
from textacy import Corpus

from config import TEI_DIR, PATTERNS
from teipy import TeiReader

In [None]:
# Collect every TEI document of every "dhd_*" directory next to this one.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so
# the paths are sorted to keep the document order -- and with it every
# positional index derived from it further down -- stable between runs.
files = sorted(glob.glob("../dhd_*/TEI/*.xml"))

In [None]:
# Parse each TEI file and keep whatever `extract_md()` returns for it;
# the result has one entry per file, in the same order as `files`.
all_docs = [TeiReader(tei_path).extract_md() for tei_path in files]

In [None]:
# Tabulate the per-document records. The corpus-building cell reads the
# 'fulltext', 'xml_id' and 'title' columns of this frame.
# NOTE(review): assumes extract_md() returns a dict-like record with those keys -- confirm.
df = pd.DataFrame(all_docs)

In [None]:
# Display the metadata frame as a visual sanity check.
df

In [None]:
def _corpus_record(row):
    """Convert one row of the metadata frame into a ``(text, metadata)`` record.

    The metadata dict carries the document id, its title and a "year" value.
    NOTE(review): "year" is simply the last four characters of the title;
    this presumably relies on every title ending in a year -- confirm.
    """
    text = row["fulltext"]
    doc_id = row["xml_id"]
    title = row["title"]
    return text, {"id": doc_id, "title": title, "year": title[-4:]}


# Build the corpus with the German spaCy model, one record per row of `df`.
corpus = Corpus(
    "de_core_news_sm",
    data=[_corpus_record(row) for _, row in df.iterrows()],
)

In [None]:
# Persist the corpus to disk so the analysis cells can reload it from
# 'corpus.corpus' instead of re-parsing the TEI files.
corpus.save('corpus.corpus')

# Start here to analyze the saved corpus (the cells above only need to be run to (re)build `corpus.corpus`)

In [None]:
# Reload the corpus from 'corpus.corpus', using the same spaCy model name
# ("de_core_news_sm") it was built with.
corpus = Corpus.load("de_core_news_sm", 'corpus.corpus')

In [None]:
# Configure the tf-idf vectorizer that turns per-document term lists into a
# document-term matrix.
vectorizer = textacy.vsm.Vectorizer(
    # term weighting: linear term frequency scaled by smoothed idf
    tf_type="linear",
    apply_idf=True,
    idf_type="smooth",
    # L2-normalise the weights
    norm="l2",
    # document-frequency bounds for retained terms (see the textacy docs for
    # the int vs. float semantics), plus a cap on the vocabulary size
    min_df=3,
    max_df=0.95,
    max_n_terms=100000,
)

In [None]:
# Lazily yield, for every document in the corpus, its terms as strings
# (unigrams plus entities), then fit the vectorizer on them and build the
# document-term matrix in one pass.
terms_per_doc = (
    spacy_doc._.to_terms_list(ngrams=1, entities=True, as_strings=True)
    for spacy_doc in corpus
)
doc_term_matrix = vectorizer.fit_transform(terms_per_doc)

In [None]:
# Number of topics to extract. The cells that enumerate topics further down
# work with this same count.
N_TOPICS = 12

# NMF-based topic model.
model = textacy.tm.TopicModel("nmf", n_topics=N_TOPICS)

In [None]:
# Fit the topic model on the document-term matrix.
model.fit(doc_term_matrix)

In [None]:
# Transform the document-term matrix with the fitted model; the result is
# wrapped in a DataFrame indexed by document id in the following cell.
# NOTE(review): presumably shaped (n_docs, n_topics) -- confirm against the textacy docs.
doc_topic_matrix = model.get_doc_topic_matrix(doc_term_matrix)

In [None]:
# Label the rows of the doc-topic matrix with the document ids stored in the
# corpus metadata. NOTE: this rebinds `df`, which until here held the
# metadata frame built from the TEI files.
doc_ids = [spacy_doc._.meta["id"] for spacy_doc in corpus]
df = pd.DataFrame(doc_topic_matrix, index=doc_ids)

In [None]:
# Show the document ids that label the rows of the doc-topic frame.
df.index.tolist()

In [None]:
# Flatten the doc-topic frame into [row, column, weight] triples.
#
# Both positions are zero-based so that they index straight into the "docs"
# and "topics" label lists exported alongside these items. (Previously the
# column counter was incremented *before* the cell was recorded, which gave
# columns 1..n next to rows 0..m-1 and topic labels starting at "topic 0".)
#
# `.values.tolist()` also converts the numpy scalars to plain Python floats,
# which keeps the triples JSON-serialisable regardless of the matrix dtype.
items = [
    [row_pos, col_pos, weight]
    for row_pos, row_values in enumerate(df.values.tolist())
    for col_pos, weight in enumerate(row_values)
]

In [None]:
# Payload for the JSON export: the flattened matrix cells plus the labels of
# both axes. The number of topic labels is taken from the width of the
# doc-topic frame instead of being hard-coded, so it cannot drift from the
# number of topics the model was actually fitted with.
data = {
    "items": items,
    "docs": list(df.index),
    "topics": [f"topic {topic_idx}" for topic_idx in range(df.shape[1])],
}

In [None]:
# One single-entry dict per topic, mapping its "topic <n>" label to the top
# terms the model reports for it.
topic_terms = [
    {f"topic {topic_no}": terms}
    for topic_no, terms in model.top_topic_terms(
        vectorizer.id_to_term, topics=list(range(12))
    )
]

In [None]:
# Attach the per-topic term lists to the export payload.
data.update(topic_terms=topic_terms)

In [None]:
# Serialise the payload and write it to 'doc_topic_matrix.json'.
serialized = json.dumps(data)
with open('doc_topic_matrix.json', mode='w') as json_file:
    json_file.write(serialized)

In [None]:
# For every topic, print its index followed by the titles of its top
# documents, one per line.
for topic_no, doc_positions in model.top_topic_docs(doc_topic_matrix, topics=list(range(12))):
    titles = [corpus[position]._.meta["title"] for position in doc_positions]
    print(topic_no, *titles, sep="\n")