In [1]:
from cophi_toolbox import preprocessing
from dariah_topics import postprocessing
from dariah_topics import visualization
import metadata_toolbox.utils as metadata
from pathlib import Path
import pandas as pd
import lda



In [2]:
# Folder holding the sample corpus, relative to the working directory.
path_to_corpus = Path('data') / 'grenzboten_sample'
# File name template; the fields in braces become the metadata columns.
pattern = '{author}_{year}_{title}'

In [3]:
# Path.glob() yields files in an arbitrary, filesystem-dependent order. Sorting
# keeps the document order -- and with it every positional index used further
# down (e.g. `corpus[0]`, `index=0`) -- stable across machines and re-runs.
meta = pd.concat([metadata.fname2metadata(str(path), pattern=pattern)
                  for path in sorted(path_to_corpus.glob('*.txt'))])
meta.head()  # preview: only the first five rows are displayed

Unnamed: 0,author,year,title
data\grenzboten_sample\Beck_1844_Tagebuch_56.txt,Beck,1844,Tagebuch_56
data\grenzboten_sample\Berto_1915_Kriegstagebuch_94.txt,Berto,1915,Kriegstagebuch_94
data\grenzboten_sample\Castelli_1846_Tagebuch_51.txt,Castelli,1846,Tagebuch_51
data\grenzboten_sample\Cleinom_1914_Kriegstagebuch_94.txt,Cleinom,1914,Kriegstagebuch_94
data\grenzboten_sample\Dix_1914_Kriegstagebuch_37.txt,Dix,1914,Kriegstagebuch_37


In [4]:
# Read every file referenced by the metadata index (the index holds the file paths).
corpus = [document for document in preprocessing.read_files(meta.index)]
corpus[0][:255]  # preview: first 255 characters of the first document

'Tagebuch von Karl Beck. Man spricht seit vierzehn Tagen von einem vollständigen Ministerwechsel und es circuliren im Publicum die verschiedensten Combinationen, wobei heute ganz andere Namen genannt werden, als gestern und morgen wieder andere, als heute.'

In [5]:
# One token list per document, in the same order as `corpus`.
tokenized_corpus = [[*preprocessing.tokenize(text)] for text in corpus]
tokenized_corpus[0][:13]  # preview: first 13 tokens of the first document

['tagebuch',
 'von',
 'karl',
 'beck',
 'man',
 'spricht',
 'seit',
 'vierzehn',
 'tagen',
 'von',
 'einem',
 'vollständigen',
 'ministerwechsel']

In [6]:
# Rows are labelled with the document titles taken from the metadata.
# NOTE(review): titles are not unique in this corpus (the preview below shows
# two rows called 'Kriegstagebuch_94') -- confirm that ambiguous labels are acceptable.
document_term_matrix = preprocessing.create_document_term_matrix(
    tokenized_corpus,
    meta['title'],
)
document_term_matrix.head()  # preview: first five documents

Unnamed: 0,die,der,und,in,den,von,zu,das,des,nicht,...,weitlinge,weitschichtige,welker,welscher,werthschätzung,wesentlicher,wichtigeren,widerliche,widersetzlichen,gasfrage
Tagebuch_56,90.0,92.0,84.0,70.0,30.0,26.0,25.0,16.0,25.0,23.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Kriegstagebuch_94,11.0,32.0,24.0,12.0,8.0,17.0,0.0,3.0,5.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Tagebuch_51,226.0,177.0,188.0,111.0,73.0,62.0,93.0,60.0,35.0,78.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Kriegstagebuch_94,39.0,48.0,34.0,28.0,15.0,25.0,4.0,5.0,11.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Kriegstagebuch_37,40.0,34.0,15.0,17.0,10.0,19.0,5.0,6.0,18.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Corpus-specific stopword candidates: the 100 most frequent tokens.
stopwords = preprocessing.list_mfw(
    document_term_matrix,
    most_frequent_tokens=100,
)

In [8]:
hapax_legomena = preprocessing.find_hapax_legomena(document_term_matrix)
# Every column of the matrix is one type, so the column count is the vocabulary size.
print("Total number of types in corpus:", len(document_term_matrix.columns))
print("Total number of hapax legomena:", len(hapax_legomena))

Total number of types in corpus: 24451
Total number of hapax legomena: 19757


In [9]:
# External stopword list, one entry per line.
path_to_stopwordlist = Path('data', 'stopwords', 'de.txt')
# Read inside a `with` block so the file handle is closed deterministically
# instead of being left open until garbage collection.
with path_to_stopwordlist.open('r', encoding='utf-8') as stopword_file:
    external_stopwords = [line.strip() for line in stopword_file]

In [10]:
# Everything that should be dropped from the matrix: most frequent words,
# hapax legomena and the external stopword list, combined with `+` in that order.
features = stopwords + hapax_legomena + external_stopwords
# NOTE(review): this rebinds `document_term_matrix`, so re-running the cell works
# on the already reduced matrix -- confirm that remove_features tolerates that.
document_term_matrix = preprocessing.remove_features(features, document_term_matrix=document_term_matrix)

In [11]:
# Column labels of the reduced matrix, i.e. the terms that were kept.
vocabulary = document_term_matrix.columns
vocabulary

Index(['franzosen', 'genommen', 'abgewiesen', 'südlich', 'berlin', 'lassen',
       'geschütze', 'englische', 'deutschland', 'januar',
       ...
       'schlechteste', 'dubatowka', 'palameix', 'verschlossene', 'eimer',
       'schicksale', 'eilwagen', 'klippe', 'rennt', 'zuschrieb'],
      dtype='object', length=4242)

In [12]:
# Plain integer array of the counts (the matrix preview above shows floats such as 90.0).
# NOTE(review): presumably the lda model expects integer counts -- confirm.
document_term_matrix_arr = document_term_matrix.values.astype(int)
document_term_matrix_arr

array([[ 0,  1,  0, ...,  0,  0,  0],
       [ 0,  9,  3, ...,  0,  0,  0],
       [ 1,  1,  1, ...,  0,  0,  0],
       ...,
       [ 6,  6, 19, ...,  0,  0,  0],
       [ 3,  2,  0, ...,  0,  0,  0],
       [ 1,  3,  0, ...,  0,  0,  0]])

In [13]:
%%time

model = lda.LDA(n_topics=10, n_iter=1000)
model.fit(document_term_matrix_arr)

Wall time: 9.63 s


In [14]:
# Topic overview for the fitted model; `vocabulary` supplies the term labels.
topics = postprocessing.show_topics(model=model, vocabulary=vocabulary)
topics

NameError: name 'model' is not defined

In [None]:
# NOTE(review): the titles serve as document labels here, but the metadata
# preview lists 'Kriegstagebuch_94' for two different authors -- confirm that
# duplicate labels are acceptable.
document_topics = postprocessing.show_document_topics(
    model=model,
    topics=topics,
    document_labels=meta['title'],
)
document_topics

# Visualizations

### `PlotDocumentTopics`

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from bokeh.io import output_notebook, show
# Configure bokeh to display its plots inside the notebook.
output_notebook()
# Render matplotlib figures inline as well.
%matplotlib inline

In [None]:
# Plotting helper built from the document-topic table.
# NOTE(review): the instance reuses the class name `PlotDocumentTopics`; the cells below rely on this name.
PlotDocumentTopics = visualization.PlotDocumentTopics(document_topics)

### `static_heatmap`

In [None]:
# Static heatmap of the document-topic table, requested without a colorbar.
static_heatmap = PlotDocumentTopics.static_heatmap(colorbar=False)

### `static_barchart_per_topic`

In [None]:
# Static bar chart for a single topic, selected by position.
static_barchart_per_topic = PlotDocumentTopics.static_barchart_per_topic(
    index=0,  # or index='abgewiesen südlich genommen'
    describer='Topic',
    alpha=None,
    figsize=(11, 7),
)

### `static_barchart_per_document`

In [None]:
# Static bar chart for a single document, selected by position.
static_barchart_per_document = PlotDocumentTopics.static_barchart_per_document(
    index=0,
    describer='Document',
)

### `interactive_heatmap`

In [None]:
# Interactive heatmap, handed to bokeh's show() for display in the notebook.
interactive_heatmap = PlotDocumentTopics.interactive_heatmap(
    width=800,
    height=550,
    colorbar=False,
)
show(interactive_heatmap, notebook_handle=True)

### `interactive_barchart_per_topic`

In [None]:
# Interactive bar chart for a single topic, selected by position.
interactive_barchart_per_topic = PlotDocumentTopics.interactive_barchart_per_topic(
    index=0,
    describer='Topic',
    width=800,
)
show(interactive_barchart_per_topic, notebook_handle=True)

### `interactive_barchart_per_document`

In [None]:
# Interactive bar chart for a single document, selected by position.
interactive_barchart_per_document = PlotDocumentTopics.interactive_barchart_per_document(
    index=0,
    describer='Document',
    width=800,
)
show(interactive_barchart_per_document, notebook_handle=True)