# Create LDA for Corpus<a id='top'></a>

0. Download an available corpus or create a new one. For the latter, create a JSON file for each subcorpus/group of texts of your corpus; each text is then a line in a JSON file. One way to do this is to crawl websites using [scrapy](https://scrapy.org) with these flags: "-o result.json -t json" (see [sample crawlers](./scripts/scraper/spiders) and [example item file](./scripts/scraper/items.py)). An example JSON file is [here](./scripts/example.json).
1. [Prepare corpus for the LDA](#prepare). This notebook demonstrates how to load a (German) TEI XML file, extract metadata and texts, and filter out unwanted parts of speech (only nouns are kept). The result is then saved as a JSON file which can be used in the subsequent cells. You can also prepare your corpus externally; see my [example](./scripts/text.py), which is tailored to Russian texts. It removes all non-Cyrillic characters, removes all words which are not nouns, and reduces all nouns to their base form (nominative singular) using POS tagging. The result is again saved in a JSON file.
2. [Create LDA model for the corpus](#create)
3. [Compute topic distribution for corpus](#compute)
4. [Explore corpus](corpus.ipynb) (different notebook)

For copyright reasons, I cannot publish the scraped raw data. The results of the smoothing process in step 2 are [here](./projects/); they are used in the examples below.

In [None]:
import os
import sys
from gensim import corpora, models
import logging
import errno
import pandas as pd
from dateutil import parser
import pytz
import numpy as np
import json
import xml.etree.ElementTree as ET
import re
from tqdm import tqdm
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Global locations and file-name stems shared by all cells below.
experiment = "alpenwort_noun"
# The raw TEI input and the prepared JSON corpus live in the same directory.
raw_path = os.path.join("projects", experiment, "raw")
corpus_path = os.path.join("projects", experiment, "raw")
result_path = os.path.join("projects", experiment, "results")
# Base names (without extension) of the saved model and topic files.
model_name, topics_name = "model", "topics"

## Prepare corpus<a id='prepare'></a>

This cell demonstrates how to load a German TEI XML file, extract metadata and texts, and filter out unwanted parts of speech (POS).

[Back to top](#top)

In [None]:
# keep_only = "ADJ"
keep_only = "NOUN"

import spacy
!{sys.executable} -m spacy download de_core_news_sm
nlp = spacy.load('de_core_news_sm')

for xml_file in tqdm(sorted(os.listdir(raw_path))):
    output_json = []
    if xml_file.endswith(".xml"):
        # get TEI xml data
        tree = ET.parse(os.path.join(raw_path, xml_file))
        root = tree.getroot()
        text = []
        for text_node in root.findall(".//{*}text"):
            entry = {}
            entry["title"] = text_node.get("title")
            entry["url"] = xml_file
            entry["date"] = text_node.get("year")
            entry["author"] = text_node.get("author")
            entry["comment_count"] = 0
            entry["text"] = []
            for txt in text_node:
                # POS filtering
                if txt.text is not None and len(txt.text.split())> 3:
                    doc = nlp(txt.text)
                    for w in doc:
                        if w.pos_ == keep_only:
                            entry["text"].append(w.orth_)
            output_json.append(entry)

    with open(os.path.join(corpus_path, xml_file.split(".")[0] + ".json"), 'w') as outfile:
        json.dump(output_json, outfile)

## Create LDA model for corpus<a id='create'></a>

This cell creates the topic model for the specified corpus stored in JSON files.

[Back to top](#top)

In [None]:
# Number of latent topics the LDA model is trained with.
number_of_topics = 50

# exist_ok makes the call a no-op when the directory is already there
os.makedirs(result_path, exist_ok=True)

corpus_file = os.path.join(result_path, model_name + ".corp")
dictionary_file = os.path.join(result_path, model_name + ".dict")

# load corpus
try:
    # load prepared corpus
    corpus = corpora.MmCorpus(corpus_file)
    dictionary = corpora.Dictionary.load(dictionary_file)
except FileNotFoundError:
    # convert json corpus
    # The token lists are collected in a fresh list: if only the dictionary
    # file was missing, `corpus` is already an MmCorpus at this point and
    # cannot be appended to.
    texts = []
    for json_file in sorted(os.listdir(corpus_path)):
        print("File: ", json_file)
        if not json_file.endswith(".json"):
            continue
        # get data
        with open(os.path.join(corpus_path, json_file)) as json_data:
            data = json.load(json_data)
        for entry in data:
            text = entry["text"]
            # entries hold either a whitespace-separated string or a token list
            texts.append(text.split() if isinstance(text, str) else text)

    print("File extraction complete.")

    dictionary = corpora.Dictionary(texts)
    dictionary.save(dictionary_file)

    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize(corpus_file, corpus)

lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=number_of_topics, alpha='auto', eval_every=5, passes=20)

# find the first unused running number so earlier models are never overwritten
start = 1
while os.path.isfile(os.path.join(result_path, model_name + "-" + str(start) + ".lda")):
    start += 1

lda_file = os.path.join(result_path, model_name + "-" + str(start) + ".lda")
lda.save(lda_file)

print("LDA saved as", lda_file)

## Compute topic distribution for corpus<a id='compute'></a>

[Back to top](#top)

In [None]:
# entries published after max_date are ignored
# Both a naive and a UTC-aware cutoff are kept because naive and aware
# datetimes cannot be compared with each other.
utc = pytz.UTC
max_date = parser.parse("2014-12-31 23:59:59")
max_date_utc = utc.localize(max_date)

# pick the saved model with the highest running number ("<model_name>-<n>.lda")
number = 0
for f in os.listdir(result_path):
    try:
        number = max(number, int(f.split(model_name+"-")[1].split(".lda")[0]))
    except (IndexError, ValueError):
        # IndexError: not a numbered model file;
        # ValueError: the part after "<model_name>-" is not an integer
        continue
if number > 0:
    file_name = model_name + "-" + str(number) + ".lda"
else:
    # no numbered model found: fall back to the unnumbered file name
    file_name = model_name + ".lda"

# load LDA model and dictionary
dictionary = corpora.Dictionary.load(os.path.join(result_path, model_name + ".dict"))
model = models.LdaModel.load(os.path.join(result_path, file_name))

# new fields for compatibility, default values from
# https://radimrehurek.com/gensim/models/ldamodel.html
if not hasattr(model, "minimum_probability"):
    model.minimum_probability = 0.01
    model.minimum_phi_value = 0.01
    model.per_word_topics = False
    model.random_state = np.random.RandomState()

# output columns: metadata first, then one column per topic id
columns = ['group', 'url', 'date', 'comment_count', 'words']
columns.extend([str(topic) for topic in range(model.num_topics)])

result = []

# sort files
for json_file in sorted(os.listdir(corpus_path)):

    print("File: ", json_file)

    if not json_file.endswith(".json"):
        continue

    # get data
    with open(os.path.join(corpus_path, json_file)) as json_data:
        data = json.load(json_data)

    removed = 0
    too_short = 0

    for entry in data:
        # check if entry is within data range
        try:
            date = parser.parse(entry["date"])
        except (ValueError, TypeError, OverflowError):
            # Unparseable or missing (e.g. null) dates are reported but the
            # entry is kept, as before for badly formatted date strings.
            print("Wrong format", entry["date"])
        else:
            # compare against the cutoff of matching timezone awareness
            cutoff = max_date if date.utcoffset() is None else max_date_utc
            if date > cutoff:
                removed += 1
                continue

        # get topic distribution for entry
        text = entry["text"]
        if isinstance(text, str):
            # same whitespace tokenization as used when building the dictionary
            text = text.split()

        # filter too short entries
        if len(text) < 5:
            too_short += 1
            continue

        # topics the model does not report for this entry stay at 0
        topics = [0] * model.num_topics
        for (topic, prop) in model[dictionary.doc2bow(text)]:
            topics[topic] = prop

        line = {
            "group": json_file.split(".json")[0],
            "url": entry["url"],
            "date": entry['date'],
            "words": len(text),
            "comment_count": entry["comment_count"],
        }
        for counter, prop in enumerate(topics):
            line[str(counter)] = prop
        result.append(line)

    print("Total number of entries:", len(data))
    print("Removed because of date: ", removed)
    print("Removed because too short: ", too_short)
    print("Remaining:", (len(data) - removed - too_short))

print(columns)
# passing columns= fixes the column order and also works when result is empty
frame = pd.DataFrame(result, columns=columns)

# find the first unused running number so earlier results are never overwritten
start = 1
while os.path.isfile(os.path.join(result_path, topics_name + "-" +str(start)+ ".json")):
    start += 1

topics_file = os.path.join(result_path, topics_name + "-" + str(start) + ".json")
frame.to_json(topics_file, orient='split')
print ("Created", topics_file)