In [1]:
import gensim
from gensim.parsing.preprocessing import STOPWORDS
import glob
import itertools
from nltk.tokenize import word_tokenize
import os
from tqdm import tqdm

In [2]:
import logging
# Only surface ERROR-and-above log records, each prefixed with timestamp and level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [3]:
def head(stream, n=10):
    """Return the first ``n`` items of ``stream`` as a list.

    Only the first ``n`` items are pulled from ``stream``; if it yields fewer,
    the returned list is simply shorter.
    """
    leading_items = itertools.islice(stream, n)
    return [item for item in leading_items]

In [4]:
def get_item_id(fn):
    """Given a filename, return just the unique id.

    The id is the final path component of ``fn`` up to (not including) its
    first ``.``; a name without any ``.`` is returned unchanged.
    """
    _, file_name = os.path.split(fn)
    item_id, _, _ = file_name.partition(".")
    return item_id

In [5]:
def tokenize(text):
    """Split ``text`` into word tokens and keep only the informative ones.

    A token is kept when its lowercased form is not in ``STOPWORDS``, it is
    purely alphabetic, and it is longer than one character. Kept tokens retain
    their original case and order.
    """
    kept = []
    for token in word_tokenize(text):
        if token.lower() in STOPWORDS:
            continue
        if token.isalpha() and len(token) > 1:
            kept.append(token)
    return kept

In [6]:
def text_stream(text_dir, encoding="utf-8"):
    """Lazily yield ``(item_id, tokens)`` for every ``*.txt`` file in ``text_dir``.

    Parameters
    ----------
    text_dir : str
        Directory that is globbed (non-recursively) for ``*.txt`` files.
    encoding : str, optional
        Text encoding used to read each file. Passed explicitly so decoding
        does not depend on the platform's locale default.

    Yields
    ------
    tuple
        ``(item_id, tokens)`` where ``item_id`` comes from ``get_item_id`` and
        ``tokens`` from ``tokenize``. Files are visited in whatever order
        ``glob.glob`` returns them.
    """
    for fn in glob.glob(f"{text_dir}/*.txt"):
        item_id = get_item_id(fn)
        with open(fn, 'r', encoding=encoding) as f:
            document = f.read()
        # Tokenize and yield after the file is closed so the handle is not held
        # open while the consumer is busy with the previous document.
        yield (item_id, tokenize(document))

In [7]:
# Directory that text_stream() globs for the per-item *.txt files.
text_dir = "texts"

In [8]:
# Peek at the first two (item_id, tokens) tuples to sanity-check the tokenizer.
head(text_stream(text_dir), n=2)

[('mc00456-001-bx0004-043-001',
  ['MORAL',
   'ASPECT',
   'VIVISECTION',
   'JANE',
   'VVHATELY',
   'instruction',
   'encouraga',
   'ment',
   'thought',
   'question',
   'Vivisection',
   'recall',
   'words',
   'persons',
   'eminent',
   'high',
   'qualities',
   'intellect',
   'moral',
   'character',
   'passed',
   'judgment',
   'Miss',
   'Jane',
   'VVhately',
   'daughter',
   'Archbishop',
   'respected',
   'trusted',
   'loved',
   'common',
   'degree',
   'large',
   'olrcle',
   'friends',
   'acquaintances',
   'preface',
   'short',
   'memoir',
   'sister',
   'published',
   'following',
   'tribute',
   'worth',
   'pen',
   'author',
   'Got',
   'ta',
   'Family',
   'fix',
   'quality',
   'especially',
   'characteristic',
   'truth',
   'perception',
   'rested',
   'entire',
   'truthfulness',
   'character',
   'true',
   'core',
   'mind',
   'heart',
   'True',
   'candid',
   'acknowledge',
   'difﬁculties',
   'thought',
   'memory',
   'tolera

In [9]:
# Show the id and the first ten tokens of each of the first five items.
for item_id, tokens in head(text_stream(text_dir), n=5):
    print(item_id, tokens[:10])

mc00456-001-bx0004-043-001 ['MORAL', 'ASPECT', 'VIVISECTION', 'JANE', 'VVHATELY', 'instruction', 'encouraga', 'ment', 'thought', 'question']
mc00456-001-bx0004-053-001 ['ecial', 'Repert', 'Emu', 'BM', 'OW', 'NATNNAL', 'ALTN', 'MEDHAL', 'CUMMWTEE', 'Repmft']
mc00344-001-lb0001_26-002-000 ['Sydney', 'Daily', 'Telegraph', 'August', 'Cattle', 'producers', 'want', 'meat', 'eXport', 'inquiry']
mc00456-001-bx0007-015-001 ['EDHWON', 'ABOMINABLE', 'SIN', 'Lord', 'Shaftesbury', 'VIVISECTION', 'APPEAL', 'Scientific', 'Ethical', 'Thinkers']
mc00456-001-bx0007-005-001 ['UNSOIENTIFIC', 'VIEW', 'VIVISECTION', 'LADY', 'PAGET', 'Reprinted', 'NATIONAL', 'REVIEW', 'September', 'years']


In [10]:
# The item ids could be pulled out of the full text_stream, but that would tokenize
# every document before we need the tokens, so derive them from the filenames instead.
# NOTE(review): this uses the same glob pattern as text_stream(); later cells pair
# item_ids[n] with the n-th corpus document, which presumably relies on glob returning
# the files in the same order both times — confirm.
item_ids = [get_item_id(fn) for fn in glob.glob(f"{text_dir}/*.txt")]
head(item_ids)

['mc00456-001-bx0004-043-001',
 'mc00456-001-bx0004-053-001',
 'mc00344-001-lb0001_26-002-000',
 'mc00456-001-bx0007-015-001',
 'mc00456-001-bx0007-005-001',
 'mc00344-001-bx0001_35-003-000',
 'mc00344-001-bx0001_38-004-000',
 'mc00456-001-bx0001-020-001',
 'aspca-scrapbooks-bx0001-002-001_0_20191213_759',
 'mc00344-001-bx0001_5-001-000']

In [11]:
# Token lists only (item ids dropped). This is a one-shot generator: once it has been
# consumed it is empty and must be re-created before being iterated again.
doc_stream = (tokens for _, tokens in text_stream(text_dir))

In [12]:
# Build the token <-> integer id mapping from every document's tokens (timed: this reads
# and tokenizes all of the text files).
%time id2word_items = gensim.corpora.Dictionary(doc_stream)

CPU times: user 3min 14s, sys: 959 ms, total: 3min 15s
Wall time: 3min 16s


In [13]:
# Summary of the dictionary: number of unique tokens plus a few examples.
print(id2word_items)

Dictionary(549504 unique tokens: ['ASPECT', 'Archbishop', 'Asiatic', 'Brain', 'Close']...)


In [14]:
# Filter words based on occurrence in docs
# https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.filter_extremes
# filter_extremes() prunes the dictionary IN PLACE and returns None, so its result must
# not be assigned. keep_n is spelled out (it is gensim's default) because it silently
# caps the vocabulary at the 100,000 most frequent surviving tokens.
id2word_items.filter_extremes(no_below=2, no_above=1.0, keep_n=100000)
# Keep the old name bound to something usable: it is the same (now filtered) dictionary.
id2words_items_filtered = id2word_items

In [13]:
# We're building this for use in the LDA model and so we can save it to disk for re-use
class ItemCorpus(object):
    """Streaming bag-of-words corpus over the ``*.txt`` files of one directory.

    Iterating yields one ``dictionary.doc2bow(tokens)`` vector per document, in the
    order ``text_stream`` produces them, so the texts are never all held in memory.

    Attributes
    ----------
    text_dir : str
        Directory handed to ``text_stream``.
    dictionary : object
        Mapping object whose ``doc2bow`` method converts a token list to a
        bag-of-words vector.
    item_ids : list
        Ids of the documents yielded by the most recent (possibly partial)
        iteration, in yield order. It is reset at the start of every iteration.
    """

    def __init__(self, text_dir, dictionary):
        self.text_dir = text_dir
        self.dictionary = dictionary
        # Defined up front so the attribute exists even before the first iteration.
        self.item_ids = []

    def __iter__(self):
        self.item_ids = []
        # Read from the directory this instance was constructed with, not from a
        # notebook-level global that happens to share the name.
        for item_id, tokens in text_stream(self.text_dir):
            self.item_ids.append(item_id)
            yield self.dictionary.doc2bow(tokens)

In [14]:
# Lazy bag-of-words view of the texts, encoded with the dictionary built above.
item_corpus = ItemCorpus(text_dir, id2word_items)

In [17]:
# Save serialized corpus for later use
# (timed: serializing iterates item_corpus, which re-reads and re-tokenizes every file).
%time gensim.corpora.MmCorpus.serialize("./items_bow_lg_full.mm", item_corpus)

CPU times: user 2min 52s, sys: 278 ms, total: 2min 52s
Wall time: 2min 52s


In [15]:
# Load the serialized corpus back from disk (same file as written by the serialize cell):
loaded_corpus = gensim.corpora.MmCorpus("items_bow_lg_full.mm")
print(loaded_corpus)


MmCorpus(983 documents, 100000 features, 2698762 non-zero entries)


In [19]:
# Switch to MulticoreLDa
%time lda_model = gensim.models.LdaMulticore(loaded_corpus, num_topics=40, id2word=id2word_items, passes=50)

CPU times: user 1h 8min 2s, sys: 7min 54s, total: 1h 15min 57s
Wall time: 20min 34s


In [20]:
# Persist the trained model so later sessions can skip the training step.
lda_model.save('animalturn_40_full.model')

In [16]:
# Reload the saved model from disk; this rebinds lda_model to the loaded copy.
# NOTE(review): the model was trained with LdaMulticore but is loaded via LdaModel — confirm this is intended.
lda_model = gensim.models.LdaModel.load("animalturn_40_full.model")

In [17]:
# List the topics as (topic_id, "weight*term + ...") pairs; -1 presumably requests all topics — confirm.
lda_model.print_topics(-1)

[(0,
  '0.019*"Mrs" + 0.019*"hon" + 0.014*"branch" + 0.014*"sec" + 0.012*"RSPCA" + 0.010*"Mr" + 0.009*"aux" + 0.007*"Miss" + 0.007*"animals" + 0.005*"Inspector"'),
 (1,
  '0.013*"rabbits" + 0.008*"trap" + 0.006*"true" + 0.006*"traps" + 0.005*"rabbit" + 0.005*"time" + 0.005*"fact" + 0.004*"use" + 0.004*"necessary" + 0.004*"certain"'),
 (2,
  '0.006*"Animals" + 0.006*"Humane" + 0.005*"Society" + 0.005*"Boston" + 0.005*"animals" + 0.004*"dog" + 0.004*"cts" + 0.003*"little" + 0.003*"Education" + 0.003*"birds"'),
 (3,
  '0.007*"animals" + 0.004*"man" + 0.004*"animal" + 0.004*"life" + 0.003*"time" + 0.003*"little" + 0.003*"like" + 0.003*"great" + 0.003*"birds" + 0.002*"cruelty"'),
 (4,
  '0.015*"Rooks" + 0.007*"ﬁeld" + 0.005*"Rook" + 0.004*"corn" + 0.004*"years" + 0.003*"crop" + 0.003*"glacier" + 0.003*"food" + 0.003*"cat" + 0.003*"winter"'),
 (5,
  '0.008*"pavement" + 0.007*"block" + 0.006*"wood" + 0.006*"pavements" + 0.006*"paving" + 0.006*"streets" + 0.005*"horses" + 0.005*"New" + 0.005*"

In [18]:
# TODO: write up what this means
# Topic mixture of the first corpus document, as (topic_id, probability) pairs.
lda_model[loaded_corpus[0]]

[(1, 0.05808185), (3, 0.38800886), (18, 0.01830658), (37, 0.5332675)]

In [23]:
import pyLDAvis
import pyLDAvis.gensim

In [24]:
# Let pyLDAvis render its visualisation inline in the notebook.
pyLDAvis.enable_notebook()

In [25]:
# Prepare the pyLDAvis view of the model from the model, the corpus and the dictionary.
pyLDAvis.gensim.prepare(lda_model, loaded_corpus, id2word_items)

In [26]:


# The visualization below follows https://www.kaggle.com/ykhorramz/lda-and-t-sne-interactive-visualization



In [27]:
def explore_topic(lda_model, topic_number, topn, output=True):
    """
    Return the ``topn`` highest-weighted terms of one topic, optionally printing them.

    lda_model    -- object whose ``show_topic(topic_number, topn=...)`` yields
                    ``(term, frequency)`` pairs
    topic_number -- id of the topic to inspect
    topn         -- how many terms to request
    output       -- when true, print each term left-aligned in a 20-character
                    column followed by its frequency to three decimals

    Returns the list of terms in the order the model reported them.
    """
    term_weights = list(lda_model.show_topic(topic_number, topn=topn))
    if output:
        for term, frequency in term_weights:
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))
    return [term for term, _ in term_weights]

In [28]:
# Print the ten strongest terms of every topic and keep the top five of each in
# topic_summaries (one list per topic, indexed by topic id).
topic_summaries = []
print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
# Walk over every topic the model has rather than a hard-coded count.
for i in range(lda_model.num_topics):
    print("Topic " + str(i) + " |---------------------\n")
    top_terms = explore_topic(lda_model, topic_number=i, topn=10, output=True)
    topic_summaries += [top_terms[:5]]
    # A bare `print` is a no-op in Python 3; call it to emit the blank separator line.
    print()

term                 frequency

Topic 0 |---------------------

Mrs                  0.019
hon                  0.019
branch               0.014
sec                  0.014
RSPCA                0.012
Mr                   0.010
aux                  0.009
Miss                 0.007
animals              0.007
Inspector            0.005
Topic 1 |---------------------

rabbits              0.013
trap                 0.008
true                 0.006
traps                0.006
rabbit               0.005
time                 0.005
fact                 0.005
use                  0.004
necessary            0.004
certain              0.004
Topic 2 |---------------------

Animals              0.006
Humane               0.006
Society              0.005
Boston               0.005
animals              0.005
dog                  0.004
cts                  0.004
little               0.003
Education            0.003
birds                0.003
Topic 3 |---------------------

animals              0.007
man

In [29]:
# Top-five terms per topic, one inner list per topic.
topic_summaries

[['Mrs', 'hon', 'branch', 'sec', 'RSPCA'],
 ['rabbits', 'trap', 'true', 'traps', 'rabbit'],
 ['Animals', 'Humane', 'Society', 'Boston', 'animals'],
 ['animals', 'man', 'animal', 'life', 'time'],
 ['Rooks', 'ﬁeld', 'Rook', 'corn', 'years'],
 ['pavement', 'block', 'wood', 'pavements', 'paving'],
 ['birds', 'bird', 'rights', 'species', 'time'],
 ['species', 'animals', 'years', 'year', 'time'],
 ['Aux', 'Streak', 'temple', 'Gaskoin', 'Stapledon'],
 ['animals', 'rate', 'response', 'responses', 'stimulus'],
 ['ﬁned', 'Typical', 'Eng', 'Man', 'labour'],
 ['dog', 'ASPCA', 'animals', 'New', 'York'],
 ['Society', 'Miss', 'President', 'Secretary', 'Humane'],
 ['horse', 'll', 'dog', 'Society', 'street'],
 ['whales', 'whaling', 'Japan', 'whale', 'Japanese'],
 ['animals', 'Animal', 'animal', 'research', 'Act'],
 ['animals', 'New', 'York', 'dog', 'school'],
 ['Belinda', 'Woodpigeons', 'Nlrs', 'HOUSING', 'animals'],
 ['animal', 'shall', 'person', 'animals', 'section'],
 ['Pearson', 'poems', 'Jean', 'M

In [24]:
import pandas as pd

In [25]:
# Item ids as a pandas Series.
# NOTE(review): not referenced again in the cells visible here — possibly leftover.
source_id = pd.Series(item_ids)

In [27]:
# Full topic distribution for one example document (index 200); minimum_probability=0
# asks for low-probability topics to be listed too instead of being filtered out.
lda_model.get_document_topics(loaded_corpus[200], minimum_probability=0)

[(0, 1.2416544e-05),
 (1, 1.2416544e-05),
 (2, 1.2416544e-05),
 (3, 0.98426884),
 (4, 1.2416544e-05),
 (5, 1.2416544e-05),
 (6, 1.2416544e-05),
 (7, 1.2416544e-05),
 (8, 1.2416544e-05),
 (9, 1.2416544e-05),
 (10, 1.2416544e-05),
 (11, 1.2416544e-05),
 (12, 1.2416544e-05),
 (13, 1.2416544e-05),
 (14, 1.2416544e-05),
 (15, 1.2416544e-05),
 (16, 1.2416544e-05),
 (17, 1.2416544e-05),
 (18, 0.012310334),
 (19, 1.2416544e-05),
 (20, 1.2416544e-05),
 (21, 1.2416544e-05),
 (22, 1.2416544e-05),
 (23, 1.2416544e-05),
 (24, 1.2416544e-05),
 (25, 1.2416544e-05),
 (26, 1.2416544e-05),
 (27, 1.2416544e-05),
 (28, 1.2416544e-05),
 (29, 1.2416544e-05),
 (30, 1.2416544e-05),
 (31, 1.2416544e-05),
 (32, 1.2416544e-05),
 (33, 0.0029614605),
 (34, 1.2416544e-05),
 (35, 1.2416544e-05),
 (36, 1.2416544e-05),
 (37, 1.2416544e-05),
 (38, 1.2416544e-05),
 (39, 1.2416544e-05)]

In [33]:
# Column headers for the CSV file: the item id first, then one "topic-<n>" column
# for each of the 40 topics.
headers = ["source_id"] + [f"topic-{i}" for i in range(40)]

In [34]:
# Sanity check: the last header should be the final topic column.
headers[-1]

'topic-39'

In [35]:
# Empty frame with one column per header; the rows are filled in by the next cell.
df = pd.DataFrame(columns=headers)
df

Unnamed: 0,source_id,topic-0,topic-1,topic-2,topic-3,topic-4,topic-5,topic-6,topic-7,topic-8,...,topic-30,topic-31,topic-32,topic-33,topic-34,topic-35,topic-36,topic-37,topic-38,topic-39


In [36]:
# Build the document-topic table in one go rather than appending to `df` row by row
# (row-wise .loc assignment is slow and leaves every column with object dtype).
# The result keeps the same layout: indexed by item id, "source_id" first, then one
# probability column per topic.
# NOTE(review): this assumes item_ids[n] describes loaded_corpus[n], i.e. that glob
# returned the files in the same order when the corpus was serialized — confirm.
n_topics = len(headers) - 1  # every header except "source_id" is a topic column
rows = []
for doc_index, item_id in enumerate(item_ids):
    # Place each probability by its topic id so that a topic missing from the returned
    # list (gensim may drop near-zero entries even with minimum_probability=0) cannot
    # shift the remaining values into the wrong columns.
    topic_probs = [0.0] * n_topics
    doc_topics = lda_model.get_document_topics(loaded_corpus[doc_index], minimum_probability=0)
    for topic_id, prob in doc_topics:
        topic_probs[topic_id] = prob
    rows.append([item_id] + topic_probs)
df = pd.DataFrame(rows, columns=headers, index=item_ids)

In [37]:
# Inspect the first few rows of the document-topic table.
df.head()

Unnamed: 0,source_id,topic-0,topic-1,topic-2,topic-3,topic-4,topic-5,topic-6,topic-7,topic-8,...,topic-30,topic-31,topic-32,topic-33,topic-34,topic-35,topic-36,topic-37,topic-38,topic-39
mc00456-001-bx0004-043-001,mc00456-001-bx0004-043-001,6.5e-05,0.058134,6.5e-05,0.388011,6.486887e-05,6.486887e-05,6.486887e-05,6.5e-05,6.486887e-05,...,6.486887e-05,6.5e-05,6.486887e-05,6.5e-05,6.5e-05,6.5e-05,6.486887e-05,0.53322,6.486887e-05,6.5e-05
mc00456-001-bx0004-053-001,mc00456-001-bx0004-053-001,1e-06,0.015049,1e-06,1e-06,1.174088e-06,1.174088e-06,1.174088e-06,1e-06,1.174088e-06,...,1.174088e-06,1e-06,1.174088e-06,1e-06,1e-06,1e-06,1.174088e-06,0.268608,1.174088e-06,1e-06
mc00344-001-lb0001_26-002-000,mc00344-001-lb0001_26-002-000,0.001245,0.002097,0.000603,0.008175,5.591592e-07,5.591592e-07,5.591592e-07,0.630573,5.591592e-07,...,5.591592e-07,0.001993,5.591592e-07,0.00415,0.063618,0.003218,5.591592e-07,0.015361,5.591592e-07,0.003627
mc00456-001-bx0007-015-001,mc00456-001-bx0007-015-001,2e-05,2e-05,2e-05,0.199515,2.028433e-05,2.028433e-05,2.028433e-05,2e-05,2.028433e-05,...,2.028433e-05,2e-05,2.028433e-05,2e-05,2e-05,2e-05,2.028433e-05,0.76935,2.028433e-05,2e-05
mc00456-001-bx0007-005-001,mc00456-001-bx0007-005-001,1.6e-05,1.6e-05,1.6e-05,0.485605,1.595253e-05,1.595253e-05,1.595253e-05,1.6e-05,1.595253e-05,...,1.595253e-05,1.6e-05,1.595253e-05,1.6e-05,1.6e-05,1.6e-05,1.595253e-05,0.494458,1.595253e-05,1.6e-05


In [38]:
# Write the document-topic probabilities to CSV (the index is written as the first column).
df.to_csv("doc_topic_probs_model_40.csv")

In [39]:
from sklearn.manifold import TSNE

In [44]:
# Topic-probability matrix only: drop the id column and convert to a NumPy array
# (one row per document, one column per topic).
lda_data = df.drop(["source_id"], axis=1).to_numpy()

In [45]:
# We're using the default perplexity=30 here, but it will need tuning since we don't yet
# know roughly how many clusters to expect.
# t-SNE is stochastic: fix the seed so the embedding (and the plot saved from it) can be
# regenerated on a re-run.
tsne = TSNE(random_state=42)

In [46]:
# Project each document's topic vector down to the t-SNE embedding coordinates.
tsne_embedding = tsne.fit_transform(lda_data)

In [47]:
# One row per document: embedding coordinates plus "hue", the index of the document's
# highest-probability topic (used for colouring the plot).
tsne_df = pd.DataFrame(tsne_embedding, columns=["x", "y"])
tsne_df["hue"] = lda_data.argmax(axis=1)

In [48]:
# Inspect the first few embedded points and their dominant topic.
tsne_df.head()

Unnamed: 0,x,y,hue
0,3.243427,14.538748,37
1,-17.078943,0.908913,24
2,-30.879898,-20.153193,7
3,-4.731989,23.075682,37
4,4.342294,11.940034,37


In [49]:
from bokeh.io import output_file, output_notebook, show
# Direct Bokeh output to the notebook so show() renders inline.
output_notebook()

In [51]:
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.palettes import viridis

In [52]:
# Build the 40-colour palette once; previously viridis(40) was regenerated for every point.
palette = viridis(40)

# Plot data: embedding coordinates, the item id for the hover tooltip, and a colour per
# point chosen by the document's dominant topic ("hue").
source = ColumnDataSource(
    data = dict(
        x = tsne_df.x,
        y = tsne_df.y,
        source_id = df["source_id"],
        colors = [palette[i] for i in tsne_df.hue],
        hue = tsne_df.hue
    )
)

In [53]:
# Hover tooltip rows as (label, "@column") pairs; the "@" names refer to columns of `source`.
tooltips = [
    ("source_id", "@source_id"),
    ("topic", "@hue")
]

In [54]:
# Title shown above the scatter plot.
plot_title = "tSNE embedding of 40 topic model - Animal Turn"

In [55]:
# 800x800 figure with the hover tooltips attached.
# NOTE(review): newer Bokeh releases renamed plot_width/plot_height to width/height —
# confirm against the installed version.
plot_tsne = figure(plot_width=800, plot_height=800, title=plot_title, tooltips=tooltips)

In [57]:
# One marker per document, filled with its dominant-topic colour from the "colors" column.
plot_tsne.scatter("x", "y", source=source, fill_color="colors", fill_alpha=0.8, size=10)

In [58]:
# Render the interactive plot.
show(plot_tsne)

In [59]:
from bokeh.io import save

In [60]:
# Write the plot to an HTML file; save() returns the path of the file it wrote.
save(plot_tsne, "animal_turn_tsne_40.html", title="animal_turn_tsne")

'/Users/csbaile3/projects/animal_turn/animal_turn_tsne_40.html'