### Import

In [1]:
import os
import sys
import numpy as pd
import pandas as pd
import gensim
import re
import nltk
from pprint import pprint
from gensim import corpora, models
from nltk import PorterStemmer
from tqdm import tqdm
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

### Download NLTK Data

In [2]:
#nltk.download('wordnet')

### Read all lines in a doc and cleaning it

In [3]:
def readtxt(filename):
    
    lines = []
    with open(filename,'r') as f:
        for line in f:
            line = line.rstrip()
            if (len(line) > 0):
                line = re.sub(r'[^a-zA-Z0-9_]+', ' ', line)
                line = line.lower()
                lines.append(line)
    lines = ' '.join(lines)
    return lines

### Load Database

In [4]:
database = pd.read_csv('../data/database.csv')

### Restrict to English and load a subsampling

In [5]:
documents = []
database = database[database['language'] == 'english']
subdata = database.sample(250)
for filename in subdata['filename'].values:
    doc = readtxt(filename)
    documents.append(doc)

### NLTK Preprocess helper

In [6]:
def preprocess(text):
    
    result = []
    stemmer = PorterStemmer() 
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            #token = WordNetLemmatizer().lemmatize(token, pos='v')
            result.append(token)
    return result

### Preprocess

In [7]:
documents_processed = []
for document in tqdm(documents):
    document = preprocess(document)
    documents_processed.append(document)

100%|██████████| 250/250 [00:25<00:00, 10.76it/s]


### Build Dictionary

In [8]:
dictionary = gensim.corpora.Dictionary(documents_processed)

### Filter out extremes

In [9]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

### Doc 2 Bow

In [10]:
bow_corpus = [dictionary.doc2bow(doc) for doc in documents_processed]

### Sanity Check

In [11]:
bowdoc = bow_corpus[0]
for i in range(len(bowdoc)):
    pass
    #print("Word {} (\"{}\") appears {} time.".format(bowdoc[i][0], dictionary[bowdoc[i][0]], bowdoc[i][1]))

### TF-iDF

In [12]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

### LDA

In [13]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)\

### Print Results

In [14]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.001*"shakespeare" + 0.001*"titanic" + 0.000*"clara" + 0.000*"wagner" + 0.000*"cask" + 0.000*"wharf" + 0.000*"casks" + 0.000*"ellis" + 0.000*"agin" + 0.000*"marie"
Topic: 1 Word: 0.001*"maurice" + 0.001*"polly" + 0.001*"sandy" + 0.001*"ginger" + 0.001*"jackson" + 0.000*"monsieur" + 0.000*"crane" + 0.000*"witches" + 0.000*"margaret" + 0.000*"israel"
Topic: 2 Word: 0.001*"jean" + 0.001*"thou" + 0.000*"writ" + 0.000*"foster" + 0.000*"thee" + 0.000*"maurice" + 0.000*"maid" + 0.000*"walter" + 0.000*"ships" + 0.000*"crows"
Topic: 3 Word: 0.001*"solomon" + 0.001*"metaphor" + 0.000*"healing" + 0.000*"eliza" + 0.000*"negroes" + 0.000*"missionary" + 0.000*"punch" + 0.000*"jimmy" + 0.000*"flora" + 0.000*"palmer"
Topic: 4 Word: 0.001*"morris" + 0.001*"footnote" + 0.000*"feller" + 0.000*"violin" + 0.000*"madame" + 0.000*"leon" + 0.000*"coventry" + 0.000*"carter" + 0.000*"jesus" + 0.000*"lavender"
Topic: 5 Word: 0.001*"thou" + 0.001*"thee" + 0.001*"aunt" + 0.001*"christ" + 0.001*"dic