# Topic modelling
* load an existing corpus

In [None]:
import os
import json

import pandas as pd

from lxml import etree as ET

import textacy.tm
import textacy.vsm
from textacy import Corpus

from config import TEI_DIR, PATTERNS
from teipy import TeiReader

In [None]:
# Restore the previously saved textacy corpus from disk.
corpus = Corpus.load(
    "de_core_news_sm",  # language pipeline the corpus is loaded with
    "corpus.corpus",    # serialized corpus file (relative path)
)

In [None]:
# Configure the term weighting used to build the document-term matrix:
# linear term frequency, smoothed inverse document frequency, L2 norm.
vectorizer = textacy.vsm.Vectorizer(
    tf_type="linear",
    apply_idf=True,
    idf_type="smooth",
    norm="l2",
    # Document-frequency bounds and vocabulary cap. NOTE(review): per the
    # textacy docs an int is an absolute document count and a float a
    # proportion of documents -- confirm for the installed version.
    min_df=3,
    max_df=0.95,
    max_n_terms=100000,
)

In [None]:
# Fit the vectorizer on the corpus and build the document-term matrix in one
# pass. Each document contributes its unigrams and entities as plain strings;
# the generator keeps the term lists from being materialized all at once.
doc_term_matrix = vectorizer.fit_transform(
    (
        spacy_doc._.to_terms_list(ngrams=1, entities=True, as_strings=True)
        for spacy_doc in corpus
    )
)

In [None]:
# Set up the topic model: "nmf" selects the model type, with 12 topics.
model = textacy.tm.TopicModel(
    "nmf",
    n_topics=12,
)

In [None]:
# Fit the topic model on the document-term matrix produced by the vectorizer.
model.fit(doc_term_matrix)

In [None]:
# Express every document in terms of the fitted topics.
# NOTE(review): the result is used below as DataFrame data with one index
# entry per document, so it presumably has one row per document and one
# column per topic -- confirm against the textacy documentation.
doc_topic_matrix = model.get_doc_topic_matrix(doc_term_matrix)

## write document-topic-matrix into a dataframe
* rows: documents
* columns: topics

In [None]:
# Wrap the document-topic matrix in a DataFrame whose row labels are the
# document ids taken from each document's metadata; missing values become 0.
df = pd.DataFrame(
    doc_topic_matrix,
    index=[doc._.meta["id"] for doc in corpus],
).fillna(value=0)

### transform the dataframe into a JSON object to use with HighCharts
* and store it in ../cache/doc-topic-matrix.json

In [None]:
# Flatten the document-topic matrix into [topic index, document index, weight]
# triples for the chart. Iterating over the transposed frame yields one row
# per topic, each holding that topic's weight for every document.
#
# Both indices are 0-based so that they line up with the positions in the
# "topics" and "docs" lists below. (The document index previously started at
# 1, shifting every cell one position away from its document label.)
items = []
for topic_idx, (_, topic_weights) in enumerate(df.T.iterrows()):
    for doc_idx, weight in enumerate(topic_weights):
        # Weights are scaled by 1000 and truncated so the payload holds ints.
        items.append([topic_idx, doc_idx, int(weight * 1000)])

data = {
    "items": items,
    "docs": list(df.index),
    # Derive the labels from the frame's column count instead of hard-coding
    # the number of topics, so they stay in sync with the model.
    "topics": [f"topic {n}" for n in range(df.shape[1])],
}

In [None]:
# Make sure the output directory exists. exist_ok=True turns re-running this
# cell into a no-op, while real failures (e.g. missing permissions) still
# raise instead of being misreported as "already exists".
os.makedirs('../cache', exist_ok=True)

In [None]:
# Persist the chart payload as JSON in the cache directory.
with open('../cache/doc-topic-matrix.json', 'w') as json_file:
    json_file.write(json.dumps(data))