# In [None]:
#!/usr/bin/env python3

"""
Build a corpus for gensim from individual text files.
"""

# == imports ==

import os
import glob
from os.path import join
import re
import pandas as pd
from shutil import copyfile

from gensim import corpora
from gensim import models
import pyLDAvis
import pyLDAvis.gensim
from gensim.models.coherencemodel import CoherenceModel

import datetime
import logging

import warnings
warnings.simplefilter("ignore")

# == general parameters ==

# Root folder of the project; the trailing "" makes the path end with a separator.
workingdir = join("/", "media", "christof", "mydata", "Dropbox-alt", "0-Analysen", "2018", "poesia", "")

# Input files, all taken from the "data/sel" folder below the working directory.
# Disabled alternative input: join(workingdir, "data", "raw", "17sonetos.txt")
corpusfiles = [
    join(workingdir, "data", "sel", filename)
    for filename in (
        "17sonetos.txt",
        "17poesias.txt",
        "18poesias.txt",
        "19disco.txt",
        "19poesias.txt",
        "20poesias.txt",
        "21poesias.txt",
    )
]

# Stoplist with one entry per line; tokens listed there are dropped from the corpus.
stoplistfile = join(workingdir, "gensim", "stoplist-es_lemmas.txt")

# == model parameters ==

# Number of topics requested from the LDA model.
numtopics = 40


# == logging and resultsfolder ==

# Time stamp with second precision, used in the name of the results folder.
# isoformat() leaves out the fractional part when the microsecond is 0, so
# splitting its result on "." cannot be unpacked reliably; ask for second
# precision directly instead.
_now = datetime.datetime.now()
timestamp = _now.isoformat(timespec="seconds")
# Fractional part as a six-digit string; kept only so the name stays defined.
ms = "{:06d}".format(_now.microsecond)

# One results folder per run; the trailing "" makes the path end with a separator.
resultsfolder = join(workingdir, "gensim", timestamp + "_17sonetos", "")
os.makedirs(resultsfolder, exist_ok=True)

# Log messages of level INFO and above go to "modeling.log" in the results folder.
logging.basicConfig(
    filename=join(resultsfolder, "modeling.log"),
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)


# == functions == 

def read_corpusfiles(corpusfiles, stoplistfile):
    """
    Read the corpus files and tokenize them, one document per line.

    Every line of every corpus file is treated as one document. A line is
    split on runs of non-word characters; empty tokens and tokens listed in
    the stoplist file (one entry per line) are dropped. Tokens are compared
    to the stoplist as they are, without any case folding.

    Parameters:
        corpusfiles: paths of the UTF-8 text files to read, in order.
        stoplistfile: path of the UTF-8 stoplist file.

    Returns:
        A list of token lists, in file order and line order. Lines that end
        up without any token are kept as empty lists.
    """
    with open(stoplistfile, "r", encoding="utf8") as infile:
        # A set makes the membership test in the token filter constant-time.
        stoplist = set(infile.read().splitlines())
    # Compiled once; raw string so that "\W" is not read as a string escape.
    splitter = re.compile(r"\W+")
    listcorpus = []
    for corpusfile in corpusfiles:
        with open(corpusfile, "r", encoding="utf8") as infile:
            for text in infile.read().splitlines():
                listcorpus.append(
                    [token for token in splitter.split(text)
                     if token and token not in stoplist])
    return listcorpus


def copy_stoplistfile(stoplistfile, resultsfolder):
    """
    Copy the stoplist file into the results folder, keeping its file name.

    Parameters:
        stoplistfile: path of the stoplist file to copy.
        resultsfolder: existing folder that receives the copy.
    """
    basename = os.path.basename(stoplistfile)
    # Join the folder exactly once: joining it twice turns a relative results
    # folder into "<folder>/<folder>/<name>", which does not exist.
    copyfile(stoplistfile, join(resultsfolder, basename))


def build_vectorcorpus(listcorpus, resultsfolder):
    """
    Derive a gensim dictionary and a vectorized corpus from the token lists.

    The dictionary is built from all token lists and saved as "corpus.dict"
    in the results folder. Each token list is then converted with the
    dictionary's doc2bow(), and the size of the dictionary is printed.

    Returns:
        The dictionary and the list of converted documents (one entry per
        token list, in the same order).
    """
    dictionary = corpora.Dictionary(listcorpus)
    dictionary.save(join(resultsfolder, "corpus.dict"))
    vectors = []
    for tokens in listcorpus:
        vectors.append(dictionary.doc2bow(tokens))
    print("number of types", len(dictionary))
    return dictionary, vectors


def build_model_multicore(dictcorpus, vectorcorpus, numtopics, workingdir, timestamp,
                          passes=500, workers=3, outputfolder=None):
    """
    Train an LDA model with gensim's LdaMulticore and save it as "model.gensim".

    Parameters:
        dictcorpus: gensim dictionary, passed as id2word.
        vectorcorpus: vectorized corpus, passed as corpus.
        numtopics: number of topics to model.
        workingdir: not used; accepted so that existing calls keep working.
        timestamp: not used; accepted so that existing calls keep working.
        passes: number of passes over the corpus (default 500, as before).
        workers: number of worker processes (default 3, as before).
        outputfolder: folder the model is saved to. When None (the default),
            the module-level resultsfolder is used, as before.

    Returns:
        The trained model.
    """
    if outputfolder is None:
        # Previous behaviour: save to the results folder defined at module level.
        outputfolder = resultsfolder
    model = models.ldamulticore.LdaMulticore(
        corpus=vectorcorpus,
        id2word=dictcorpus,
        num_topics=numtopics,
        passes=passes,
        workers=workers,
        per_word_topics=True)
    model.save(join(outputfolder, "model.gensim"))
    return model


def build_model_singlecore(dictcorpus, vectorcorpus, numtopics, workingdir, timestamp,
                           passes=500, outputfolder=None):
    """
    Train an LDA model with gensim's LdaModel and save it as "model.gensim".

    The model is trained with alpha='auto', chunksize=1000 and
    update_every=1000.

    Parameters:
        dictcorpus: gensim dictionary, passed as id2word.
        vectorcorpus: vectorized corpus, passed as corpus.
        numtopics: number of topics to model.
        workingdir: not used; accepted so that existing calls keep working.
        timestamp: not used; accepted so that existing calls keep working.
        passes: number of passes over the corpus (default 500, as before).
        outputfolder: folder the model is saved to. When None (the default),
            the module-level resultsfolder is used, as before.

    Returns:
        The trained model.
    """
    if outputfolder is None:
        # Previous behaviour: save to the results folder defined at module level.
        outputfolder = resultsfolder
    model = models.ldamodel.LdaModel(
        corpus=vectorcorpus,
        id2word=dictcorpus,
        num_topics=numtopics,
        update_every=1000,
        chunksize=1000,
        passes=passes,
        alpha='auto',
        per_word_topics=True)
    model.save(join(outputfolder, "model.gensim"))
    return model



def get_topics(model, numtopics, resultsfolder, topn=500):
    """
    Collect the top words of every topic in one table and save it.

    For each topic number from 0 to numtopics - 1, model.show_topic() is
    asked for the topn highest-ranked (word, weight) pairs. The topics become
    the columns of a table (named "0", "1", ...), the words its rows; a word
    that is not among the top words of a topic gets the weight 0 there. The
    table is written to "topics.csv" in the results folder, tab-separated.

    Parameters:
        model: object providing show_topic(topicid, topn=...).
        numtopics: number of topics to export.
        resultsfolder: existing folder that receives "topics.csv".
        topn: number of words requested per topic (default 500, as before).

    Returns:
        The words-by-topics table as a pandas DataFrame.
    """
    columns = []
    for topicid in range(numtopics):
        # Turn the list of (word, weight) pairs into two parallel tuples.
        words, weights = zip(*model.show_topic(topicid, topn=topn))
        columns.append(pd.Series(weights, index=words, name=str(topicid)))
    topics = pd.concat(columns, axis=1, keys=[column.name for column in columns], sort=False)
    topics = topics.fillna(0)
    with open(join(resultsfolder, "topics.csv"), "w", encoding="utf8") as outfile:
        topics.to_csv(outfile, sep="\t")
    return topics
    


def visualize_model(model, dictcorpus, vectorcorpus, resultsfolder):
    """
    Save a pyLDAvis visualization of the model as an HTML file.

    The visualization data is prepared from the model, the vectorized corpus
    and the dictionary, with sort_topics=False, and written to
    "visualization.html" in the results folder.
    """
    htmlfile = join(resultsfolder, "visualization.html")
    prepared = pyLDAvis.gensim.prepare(model, vectorcorpus, dictcorpus, sort_topics=False)
    pyLDAvis.save_html(prepared, htmlfile)


def check_coherence(listcorpus, vectorcorpus, model, numtopics, resultsfolder):
    """
    Compute coherence scores for the model and save them as two tables.

    "coherences-model.csv" holds one score for the whole model per measure
    (c_v, c_npmi, u_mass, c_uci). "coherences-topics.csv" holds the c_v score
    of each topic, sorted from the highest to the lowest score. Both files
    are tab-separated and written to the results folder.
    """
    def coherence_for(measure):
        # All coherence models share the same inputs and differ only in the measure.
        return CoherenceModel(texts=listcorpus, model=model, corpus=vectorcorpus, coherence=measure, processes=3)

    # Whole-model coherence, one row per measure.
    modelscores = {}
    for measure in ("c_v", "c_npmi", "u_mass", "c_uci"):
        modelscores[measure] = coherence_for(measure).get_coherence()
    modeltable = pd.DataFrame.from_dict(modelscores, orient='index', columns=["score"])
    with open(join(resultsfolder, "coherences-model.csv"), "w", encoding="utf8") as outfile:
        modeltable.to_csv(outfile, sep="\t")

    # Per-topic coherence with c_v only; zip() pairs each score with its topic number.
    topicscores = coherence_for("c_v").get_coherence_per_topic()
    rows = list(zip(range(numtopics), topicscores))
    topictable = pd.DataFrame(rows, columns=["topic", "score"])
    topictable = topictable.sort_values(by="score", ascending=False)
    with open(join(resultsfolder, "coherences-topics.csv"), "w", encoding="utf8") as outfile:
        topictable.to_csv(outfile, sep="\t")


# == main == 

def main(workingdir, corpusfiles, stoplistfile, resultsfolder, numtopics):
    """
    Run the complete pipeline for one set of corpus files.

    Steps: copy the stoplist next to the results, read and tokenize the
    corpus, build dictionary and vectorized corpus, train the single-core
    LDA model, then export the topics, the visualization and the coherence
    scores to the results folder.
    """
    copy_stoplistfile(stoplistfile, resultsfolder)
    tokenlists = read_corpusfiles(corpusfiles, stoplistfile)
    dictionary, vectors = build_vectorcorpus(tokenlists, resultsfolder)
    # Alternatives to the training call below: build_model_multicore() with the
    # same arguments, or loading a saved model with
    # models.ldamodel.LdaModel.load(join(workingdir, "gensim", "2018-08-22T15:33:09_poesia-17", "model.gensim"))
    # NOTE(review): resultsfolder is passed in the position of the parameter
    # that build_model_singlecore names "timestamp" - confirm this is intended.
    model = build_model_singlecore(dictionary, vectors, numtopics, workingdir, resultsfolder)
    get_topics(model, numtopics, resultsfolder)
    visualize_model(model, dictionary, vectors, resultsfolder)
    check_coherence(tokenlists, vectors, model, numtopics, resultsfolder)


main(workingdir, corpusfiles, stoplistfile, resultsfolder, numtopics)
