In [89]:
'''this notebook heavily refers to the post https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/'''

In [128]:
import nltk
#nltk.download('stopwords')
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#for Stemming
from nltk.stem.snowball import DanishStemmer

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [129]:
import os,sys
parentPath = os.path.join(os.path.dirname(os.getcwd()),"tools")
sys.path.insert(0, parentPath)
from scripts_py import preprocessing as pre

#### Prepare stopwords

In [130]:
# NLTK Stop words 
# TODO change to our own stop words list
from nltk.corpus import stopwords
stop_words = stopwords.words('danish')
stop_words.extend(['paa','ved','saa','den'])

#### read in articles

In [131]:
data = list()
#add any articles as you like
#articles = ["aare01val_workid69870.txt","aakjaer01val_workid55565.txt","aakjaer01val_workid55662.txt"
 #           ,"aakjaer01val_workid55881.txt"]
articles=os.listdir(os.path.join(os.path.dirname(os.getcwd()),"ADL","plain"))
for ele in articles:
    article = os.path.join(os.path.dirname(os.getcwd()),"ADL","plain",ele)
    with open(article, encoding ="utf-8") as f:
        lines = f.readlines()
    data.append( ' '.join(lines))

#### Preprocessing

In [132]:
nalpha_content = [pre.re_nalpha(ele) for ele in data]
tokenized_content = [pre.tokenizer(ele) for ele in nalpha_content]

In [None]:
tokenized_content[:1]

In [134]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]
def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]
def stemmer(wordList):
    """stemming the words for later processing"""
    stemmer = DanishStemmer()
    return [stemmer.stem(tempWord) for tempWord in wordList]

In [135]:
data_words_nostops=remove_stopwords(tokenized_content)

In [136]:
bigram = gensim.models.Phrases(tokenized_content, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
#trigram = gensim.models.Phrases(bigram[tokenized_content], threshold=100)
#trigram_mod = gensim.models.phrases.Phraser(trigram)

In [137]:

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
data_stemmed = [stemmer(ele) for ele in data_words_bigrams]
data_stemmed=remove_stopwords(data_stemmed)

In [138]:
# Create Dictionary
id2word = corpora.Dictionary(data_stemmed)
# Create Corpus
texts = data_stemmed

In [139]:
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [144]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [145]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]


[(0,
  '0.082*"så" + 0.019*"sen" + 0.010*"få" + 0.010*"fortvivl" + 0.009*"måt" + '
  '0.008*"arv" + 0.007*"gå" + 0.006*"ff" + 0.006*"flad" + 0.006*"skæg"'),
 (1,
  '0.039*"oss" + 0.029*"met" + 0.005*"aldri" + 0.004*"hemming" + 0.003*"wor" + '
  '0.003*"sua" + 0.003*"siin" + 0.002*"føst" + 0.002*"ac" + 0.002*"liff_oc"'),
 (2,
  '0.025*"dit" + 0.010*"hjert" + 0.009*"himl" + 0.009*"kun" + 0.008*"hver" + '
  '0.007*"sorg" + 0.007*"sød" + 0.007*"gud" + 0.007*"liv" + 0.006*"evig"'),
 (3,
  '0.086*"hr" + 0.049*"seng" + 0.027*"led" + 0.017*"slaa" + 0.013*"spilled" + '
  '0.013*"ked" + 0.012*"dø" + 0.012*"saamænd" + 0.011*"nar" + '
  '0.011*"kammerat"'),
 (4,
  '0.026*"hand" + 0.024*"ey" + 0.022*"gud" + 0.016*"kand" + 0.014*"naad" + '
  '0.011*"ieg" + 0.008*"udi" + 0.008*"vel" + 0.008*"mand" + 0.008*"dend"'),
 (5,
  '0.055*"gansk" + 0.039*"vist" + 0.016*"grund" + 0.016*"tre" + 0.015*"frøk" + '
  '0.014*"aa" + 0.012*"mørk" + 0.011*"mindr" + 0.010*"fruen" + 0.009*"naa"'),
 (6,
  '0.026*"saae" + 0

In [126]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_stemmed, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.63565437474

Coherence Score:  0.436928849656


In [146]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis