## Prerequisites

In [None]:
import pandas as pd
import string
import os
import re
import time

# input files
data_dir        = '../data/'
filename        = data_dir + 'Bundesregierung.csv'

# output files
corpus_dir      = '../corpus/'
dict_filename   = corpus_dir + 'gps.dict'
corpus_filename = corpus_dir + 'gps_bow.mm'

# ensure the output directory exists; exist_ok=True makes this a no-op when
# the directory is already there and avoids the check-then-create race of a
# separate os.path.exists() test
os.makedirs(corpus_dir, exist_ok=True)

In [None]:
# read the speeches into a DataFrame and report how long the import took
start_time = time.time()
df = pd.read_csv(filename)
print(len(df), 'speeches imported')
elapsed = time.time() - start_time
print("--- took %d:%.2d minutes ---" % divmod(elapsed, 60))

# preview the first rows
df.head()

## Data Preparation

* tokenize (split words, n-grams)
* convert to lower case
* remove punctuation (handle abbreviations, split sentences)
* lemmatize

Inspired by:
* gensim tutorial
* ...


In [None]:
# show the first 500 characters of the first speech
first_speech = df['text'].iloc[0]
print(first_speech[:500])

In [None]:
# define stop words
stoplist = 'und oder man'.split()

# define punctuation to be removed
punct_trans = str.maketrans({key: None for key in string.punctuation})

# runs of digits are stripped from every token; compiled once, not per word
_DIGITS_RE = re.compile(r'\d+')


def tokenize(text):
    """Tokenize a text and return a list of cleaned tokens.

    The text is lower-cased and split on whitespace. Every word then has
    all characters from ``string.punctuation`` and all digit runs removed.
    Words that equal a stop word *after* cleaning are dropped, so that
    e.g. ``"und,"`` is filtered just like ``"und"``.

    Words that consist solely of punctuation and/or digits are returned as
    empty strings; callers that do not want them must filter them out.

    Args:
        text: The string to tokenize.

    Returns:
        A list of cleaned, lower-case tokens in their original order.
    """
    cleaned = (
        _DIGITS_RE.sub('', word.translate(punct_trans))
        for word in text.lower().split()
    )
    # filter on the cleaned token: punctuation attached to a stop word
    # must not let it slip through
    return [token for token in cleaned if token not in stoplist]

In [None]:
# intermediate result: first 200 tokens of the first speech
first_tokens = tokenize(df['text'].iloc[0])
print(first_tokens[:200])

### Remove Infrequent Tokens (Single Occurrence)

In [None]:
from collections import defaultdict

# maps each token to its number of occurrences across all speeches
frequency = defaultdict(int)

start_time = time.time()
for speech in df['text']:
    for word in tokenize(speech):
        frequency[word] += 1
elapsed = time.time() - start_time
print("--- took %d:%.2d minutes ---" % divmod(elapsed, 60))

# number of tokens that were seen exactly once
once = sum(1 for count in frequency.values() if count == 1)

vocabulary_size = len(frequency)
print(vocabulary_size, "words in dictionary")
print(once, "words with one occurrence")
print(vocabulary_size - once, "words with multiple occurrence")

In [None]:
# keep, per speech, only the non-empty tokens that occur more than once overall
start_time = time.time()
texts = []
for speech in df['text']:
    kept = [word for word in tokenize(speech) if word and frequency[word] > 1]
    texts.append(kept)
elapsed = time.time() - start_time
print("--- took %d:%.2d minutes ---" % divmod(elapsed, 60))

In [None]:
# final result: first 200 retained tokens of the first speech
first_text = texts[0]
print(first_text[:200])

## Create Dictionary

In [None]:
from gensim import corpora

# build the gensim dictionary from the filtered token lists
print('Creating Dictionary')
start_time = time.time()
dictionary = corpora.Dictionary(texts)
elapsed = time.time() - start_time
print("--- took %d:%.2d minutes ---" % divmod(elapsed, 60))

In [None]:
# write the dictionary to dict_filename and report the elapsed time
print('Saving Dictionary')
start_time = time.time()
dictionary.save(dict_filename)
minutes, seconds = divmod(time.time() - start_time, 60)
print("--- took %d:%.2d minutes ---" % (minutes, seconds))

print(dictionary)

## Create Corpus

In [None]:
# convert every token list into its bag-of-words representation
print('Creating Corpus')
start_time = time.time()
corpus_bow = list(map(dictionary.doc2bow, texts))
elapsed = time.time() - start_time
print("--- took %d:%.2d minutes ---" % divmod(elapsed, 60))

In [None]:
# serialize the corpus to corpus_filename and report the elapsed time
print('Saving Corpus')
start_time = time.time()
corpora.MmCorpus.serialize(corpus_filename, corpus_bow)
minutes, seconds = divmod(time.time() - start_time, 60)
print("--- took %d:%.2d minutes ---" % (minutes, seconds))

# show the first ten documents
print(corpus_bow[:10])