# Topic modelling
* load an existing corpus

In [None]:
import os
import json

import pandas as pd

from lxml import etree as ET

import textacy.tm
import textacy.vsm
from textacy import Corpus

from config import TEI_DIR, PATTERNS
from teipy import TeiReader

In [None]:
# Load the previously saved textacy corpus from the file 'corpus.corpus'.
# "de_core_news_sm" is the name of the (German) spaCy pipeline passed as the language argument.
corpus = Corpus.load("de_core_news_sm", 'corpus.corpus')

In [None]:
# TF-IDF style vectorizer configuration: linear term frequency, smoothed idf,
# L2-normalised rows, and document-frequency bounds (min_df / max_df) plus a
# cap on the vocabulary size.
vectorizer = textacy.vsm.Vectorizer(
    tf_type="linear",
    apply_idf=True,
    idf_type="smooth",
    norm="l2",
    min_df=3,
    max_df=0.95,
    max_n_terms=100000,
)

In [None]:
# Lazily produce, for every document in the corpus, its unigram and entity
# terms as strings; the vectorizer consumes the stream to learn the vocabulary
# and build the document-term matrix in one pass.
terms_per_doc = (
    doc._.to_terms_list(ngrams=1, entities=True, as_strings=True)
    for doc in corpus
)
doc_term_matrix = vectorizer.fit_transform(terms_per_doc)

In [None]:
# NMF topic model with 12 topics.
# NOTE(review): later cells also hard-code 12 topics (range(12)); keep them in sync when changing n_topics.
model = textacy.tm.TopicModel("nmf", n_topics=12)

In [None]:
# Fit the topic model on the document-term matrix built above.
model.fit(doc_term_matrix)

In [None]:
# Compute the document-topic weights for the same documents the model was fitted on
# (the notebook treats rows as documents and columns as topics).
doc_topic_matrix = model.get_doc_topic_matrix(doc_term_matrix)

## write document-topic-matrix into a dataframe
* rows: documents
* columns: topics

In [None]:
# Label each row of the document-topic matrix with the document's "id" from
# its corpus metadata, then replace any missing weights with 0.
doc_ids = [doc._.meta["id"] for doc in corpus]
df = pd.DataFrame(doc_topic_matrix, index=doc_ids)
df = df.fillna(value=0)

### transform the dataframe into a JSON object to use with HighCharts
* and store it in ../cache/doc-topic-matrix.json

In [None]:
# Flatten the document-topic matrix into [topic_position, doc_position, weight]
# triples for the chart. Weights are scaled by 1000 and truncated to int.
items = []
for row_count, (_, row) in enumerate(df.T.iterrows()):
    # NOTE(review): the document position is 1-based while the topic position
    # is 0-based. This is kept as-is because the chart code consuming the JSON
    # is not visible here -- confirm that it expects this offset.
    for cell_count, c in enumerate(row, start=1):
        items.append([row_count, cell_count, int(c * 1000)])

data = {
    "items": items,
    "docs": list(df.index),
    # Derive the labels from the matrix width instead of a hard-coded 12 so
    # they stay in step with the number of topic columns actually present.
    "topics": [f"topic {x}" for x in range(df.shape[1])],
}

In [None]:
# Sanity check: display the title stored in the first document's metadata.
corpus[0]._.meta['title']

In [None]:
# One single-entry dict per topic, mapping the label "topic <n>" to the
# top terms the model reports for that topic.
topic_terms = [
    {f"topic {topic_idx}": top_terms}
    for topic_idx, top_terms in model.top_topic_terms(
        vectorizer.id_to_term, topics=list(range(12))
    )
]

In [None]:
# Attach the per-topic term lists to the export payload under "topic_terms".
data["topic_terms"] = topic_terms

In [None]:
# Make sure the cache directory exists before the JSON export is written.
# exist_ok=True keeps re-runs quiet when the directory is already there, while
# genuine failures (e.g. missing permissions) now raise instead of being
# reported as "already exists" by a blanket OSError handler.
os.makedirs('../cache', exist_ok=True)

In [None]:
# Serialise the chart payload (matrix items, document ids, topic labels and
# topic terms) to the cache file.
cache_path = '../cache/doc-topic-matrix.json'
with open(cache_path, 'w') as json_file:
    json.dump(data, json_file)

# create a TEI document to store
* top n words of each topic
* documents/weights related to this topic

In [None]:
# Collect, per topic, its top terms plus one record per document
# (xml id, weight of the document for this topic, cleaned-up title).
cols = list(df.T.columns)

# Resolve every document title once instead of rescanning the corpus for each
# (topic, document) pair. The first document carrying a given id wins, which
# mirrors the former corpus.get(..., limit=1) lookup.
titles_by_id = {}
for doc in corpus:
    doc_id = doc._.meta.get("id")
    if doc_id not in titles_by_id:
        # Titles may span several lines in the source; flatten and trim them.
        titles_by_id[doc_id] = doc._.meta['title'].replace('\n', ' ').strip()

topics = {}
for i, row in df.T.iterrows():
    topic_key = f"topic {i}"
    topics[topic_key] = {
        # data["topic_terms"] is a list of single-entry dicts keyed by topic label.
        "top_terms": data["topic_terms"][i][topic_key],
        "docs": [
            {
                "xml_id": col,
                "value": row[col],
                # Ids are matched by their string form, as in the original lookup.
                "titel": titles_by_id[f"{col}"],
            }
            for col in cols
        ],
    }

In [None]:
# Dump every topic entry for a quick visual check.
for topic_name, topic_payload in topics.items():
    print(topic_name, topic_payload)

In [None]:
# Ask the project's TeiReader helper for a TEI skeleton with the given title
# and source description.
# NOTE(review): the result is fed to ET.fromstring in the next cell, so it is
# presumably serialized XML (str or bytes) -- confirm against teipy.
tei_stump = TeiReader.tei_stump(
    title="Topics",
    source_desc="erstellt mit textacy"
)

In [None]:
# Fill the TEI body with one <div> per topic: a <list> holding the topic's
# top terms, followed by a <table> with one <row> per document whose <cell>s
# carry the record values in insertion order.
TEI_NS = "http://www.tei-c.org/ns/1.0"


def _append_tei(parent, tag, text=None):
    """Create a TEI-namespaced element, optionally set its text, and append it to parent."""
    element = ET.Element(f"{{{TEI_NS}}}{tag}")
    if text is not None:
        element.text = text
    parent.append(element)
    return element


root = ET.fromstring(tei_stump)
body = root.find('.//tei:body', namespaces={'tei': TEI_NS})
for topic_name, topic in topics.items():
    topic_div = _append_tei(body, "div")
    topic_div.set('n', f"{topic_name}")
    term_list = _append_tei(topic_div, "list")
    doc_table = _append_tei(topic_div, "table")
    doc_table.set('n', f"{topic_name}")
    for term in topic['top_terms']:
        _append_tei(term_list, "item", text=term)
    for doc_record in topic['docs']:
        doc_row = _append_tei(doc_table, "row")
        # Only the values are written; the record keys are not serialised.
        for cell_value in doc_record.values():
            _append_tei(doc_row, "cell", text=f"{cell_value}")

In [None]:
# Destination of the generated TEI document: 'topic-model.xml' inside '../indices'.
# NOTE(review): no visible cell creates '../indices'; the write in the next
# cell fails if the directory is missing.
file = os.path.join('../indices', 'topic-model.xml')

In [None]:
# Write the pretty-printed, UTF-8 encoded TEI tree to disk as raw bytes.
with open(file, 'wb') as tei_out:
    serialized = ET.tostring(root, pretty_print=True, encoding='utf-8')
    tei_out.write(serialized)