In [11]:
import gensim
from gensim.parsing.preprocessing import STOPWORDS
import glob
import itertools
import nltk
from nltk.tokenize import word_tokenize
import os
from tqdm import tqdm

In [12]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [3]:
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


In [4]:
def head(stream, n=10):
    return list(itertools.islice(stream, n))

In [5]:
# Given a filename, return just the unique id
def get_item_id(fn):
    return os.path.split(fn)[1].split(".")[0]

In [6]:
def tokenize(text):
    tokens = [token for token in word_tokenize(text) if token.lower() not in STOPWORDS]
    cleaned = [token for token in tokens if token.isalpha()]
    cleaned_greater_1 = [token for token in cleaned if len(token) > 1]
    return cleaned_greater_1

In [7]:
def text_stream(text_dir):
    # yield each item ocr as (id, tokens) tuple
    for fn in glob.glob(f"{text_dir}/*.txt"):
        item_id = get_item_id(fn)
        with open(fn, 'r') as f:
            document = f.read()
            yield(item_id, tokenize(document))

In [8]:
# text_dir = "texts"
text_dir = "/gdrive/Shared drives/Data & Visualization Services/Workshops/Jumpstart/texts"

In [13]:
head(text_stream(text_dir), n=2)

[('mc00344_118824_20200131_1256',
  ['Stereotaxic',
   'instruments',
   'cat',
   'dog',
   'rabbit',
   'Stereotaxic',
   'instruments',
   'hold',
   'animals',
   'paralyzed',
   'curariform',
   'drugs',
   'general',
   'anesthesia',
   'cases',
   'general',
   'anesthesia',
   'allowed',
   'wear',
   'conscious',
   'animal',
   'kept',
   'instrument',
   'brain',
   'research',
   'storage',
   'refrigerator',
   'expired',
   'medications',
   'bag',
   'black',
   'rotted',
   'apples',
   'fed',
   'monkeys',
   'rewards',
   'experiments',
   'um',
   'um',
   'mu',
   'guns',
   'mom',
   'uuwmt',
   'unrﬂ',
   'ﬂwv',
   'NW',
   'rel',
   'lh',
   'UIt',
   'rqul',
   'whim',
   'new',
   'Int',
   'ot',
   'Ioluuon',
   'shou',
   'Lot',
   'Manon',
   'Duo',
   'medications',
   'expiration',
   'date',
   'September',
   'refrigerator',
   'Photographs',
   'Institute',
   'Behavioral',
   'Research',
   'Monkey',
   'deteriorating',
   'bandage',
   'stayed',
   'w

In [14]:
for item_id, tokens in head(text_stream(text_dir), n=5):
    print(item_id, tokens[:10])

mc00344_118824_20200131_1256 ['Stereotaxic', 'instruments', 'cat', 'dog', 'rabbit', 'Stereotaxic', 'instruments', 'hold', 'animals', 'paralyzed']
mc00456-001-bx0007-013-001 ['Abolition', 'Viviseotion', 'ARRANGEMENT', 'CLAUSES', 'Clause', 'Short', 'title', 'Deﬁnition', 'Vivisection', 'Prohibition']
mc00456-001-bx0002-009-001 ['Birds', 'protected', 'Society', 'Society', 'want', 'Birds', 'Protected', 'word', 'Protection', 'constantly']
mc00456-001-bx0006-034-001 ['TIMES', 'ARTICLE', 'RESULTS', 'EXPERIMENTS', 'LIVING', 'ANIMALS', 'OCTOBER', 'ANSWERED', 'PROFESSOR', 'LAWSON']
mc00456-001-bx0004-004-001 ['DEATH', 'Freeman', 'Cruelty', 'Sport', 'following', 'excerpts', 'representing', 'views', 'historian', 'Freeman']


In [15]:
# You could extract the item_ids from the full text_stream, but in order to not
# tokenize everything when we don't yet need to
item_ids = [get_item_id(fn) for fn in glob.glob(f"{text_dir}/*.txt")]
head(item_ids)

['mc00344_118824_20200131_1256',
 'mc00456-001-bx0007-013-001',
 'mc00456-001-bx0002-009-001',
 'mc00456-001-bx0006-034-001',
 'mc00456-001-bx0004-004-001',
 'mc00620-001-bx0001-034-001',
 'mc00620-001-bx0001-063-001',
 'mc00456-001-bx0002-043-001',
 'mc00456-001-bx0002-042-001',
 'mc00456-001-bx0002-023-001']

In [16]:
doc_stream = (tokens for _, tokens in text_stream(text_dir))

In [17]:
%time id2word_items = gensim.corpora.Dictionary(doc_stream)

CPU times: user 3min 41s, sys: 663 ms, total: 3min 41s
Wall time: 8min 20s


In [18]:
print(id2word_items)

Dictionary(545858 unique tokens: ['Behavioral', 'Detrick', 'Duo', 'Fort', 'Institute']...)


In [19]:
# Filter words based on occurence in docs
# https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.filter_extremes
id2words_items_filtered = id2word_items.filter_extremes(no_below=2, no_above=1)

In [20]:
# We're building this for use in the LDA model and so we can save it to disk for re-use
class ItemCorpus(object):
    def __init__(self, text_dir, dictionary):
        self.text_dir = text_dir
        self.dictionary = dictionary
        
    def __iter__(self):
        self.item_ids = []
        for item_id, tokens in text_stream(text_dir):
            self.item_ids.append(item_id)
            yield self.dictionary.doc2bow(tokens)

In [21]:
item_corpus = ItemCorpus(text_dir, id2word_items)

In [22]:
# Save serialized corpus for later use
%time gensim.corpora.MmCorpus.serialize("./items_bow_lg_full.mm", item_corpus)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


CPU times: user 3min 35s, sys: 457 ms, total: 3min 35s
Wall time: 3min 37s


In [23]:
# You can load models from disk as follows:
loaded_corpus = gensim.corpora.MmCorpus("./items_bow_lg_full.mm")
print(loaded_corpus)


MmCorpus(983 documents, 100000 features, 2695221 non-zero entries)


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [30]:
# Switch to MulticoreLDa
%time lda_model = gensim.models.LdaModel(loaded_corpus, num_topics=40, id2word=id2word_items, passes=50)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


CPU times: user 1h 11min 6s, sys: 27min 45s, total: 1h 38min 51s
Wall time: 51min 23s


In [25]:
lda_model.save('animalturn_40_full.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [26]:
lda_model.print_topics(-1)

[(0,
  '0.003*"Warfare" + 0.003*"blows" + 0.002*"Nations" + 0.002*"mallet" + 0.002*"dislocate" + 0.002*"indicates" + 0.002*"limbs" + 0.002*"violent" + 0.002*"arm" + 0.002*"intense"'),
 (1,
  '0.012*"experiments" + 0.006*"disease" + 0.006*"medical" + 0.005*"cases" + 0.005*"animals" + 0.004*"vivisection" + 0.004*"Vivisection" + 0.004*"human" + 0.004*"experiment" + 0.004*"case"'),
 (2,
  '0.013*"birds" + 0.008*"species" + 0.008*"food" + 0.007*"insects" + 0.007*"mice" + 0.007*"crops" + 0.006*"hawks" + 0.006*"young" + 0.006*"little" + 0.006*"partridges"'),
 (3,
  '0.004*"materials" + 0.003*"vitro" + 0.003*"birds" + 0.003*"INDEX" + 0.003*"shall" + 0.003*"whaling" + 0.003*"MATERIALS" + 0.003*"use" + 0.003*"available" + 0.003*"trade"'),
 (4,
  '0.004*"animals" + 0.003*"water" + 0.003*"acid" + 0.002*"animal" + 0.002*"set" + 0.002*"chemical" + 0.002*"effects" + 0.002*"use" + 0.002*"program" + 0.002*"studies"'),
 (5,
  '0.100*"birds" + 0.045*"bird" + 0.016*"Bird" + 0.015*"cat" + 0.013*"Conures" +

In [27]:
# Write up what this means
lda_model[loaded_corpus[0]]

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


[(4, 0.14879134),
 (8, 0.045974493),
 (16, 0.28724435),
 (30, 0.06475382),
 (33, 0.18565476),
 (36, 0.23535001),
 (38, 0.026908671)]

In [28]:
import pyLDAvis
import pyLDAvis.gensim

ModuleNotFoundError: ignored

In [None]:
pyLDAvis.enable_notebook()

In [None]:
pyLDAvis.gensim.prepare(lda_model, loaded_corpus, id2word_items)

In [None]:


#visualization following https://www.kaggle.com/ykhorramz/lda-and-t-sne-interactive-visualization#



In [None]:
def explore_topic(lda_model, topic_number, topn, output=True):
    """
    accepts an ldamodel, a topic number and topn terms of interest
    prints a formatted list of the topn terms
    """
    terms = []
    for term, frequency in lda_model.show_topic(topic_number, topn=topn):
        terms += [term]
        if output:
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))
    return terms

In [None]:
topic_summaries = []
print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
for i in range(20):
    print("Topic " + str(i) + " |---------------------\n")
    tmp = explore_topic(lda_model, topic_number=i, topn=10, output=True)
    topic_summaries += [tmp[:5]]
    print

term                 frequency

Topic 0 |---------------------

Mrs                  0.019
hon                  0.019
branch               0.014
sec                  0.014
RSPCA                0.012
Mr                   0.010
aux                  0.009
Miss                 0.007
animals              0.007
Inspector            0.005
Topic 1 |---------------------

rabbits              0.013
trap                 0.008
true                 0.006
traps                0.006
rabbit               0.005
time                 0.005
fact                 0.005
use                  0.004
necessary            0.004
certain              0.004
Topic 2 |---------------------

Animals              0.006
Humane               0.006
Society              0.005
Boston               0.005
animals              0.005
dog                  0.004
cts                  0.004
little               0.003
Education            0.003
birds                0.003
Topic 3 |---------------------

animals              0.007
man

In [None]:
topic_summaries

[['Mrs', 'hon', 'branch', 'sec', 'RSPCA'],
 ['rabbits', 'trap', 'true', 'traps', 'rabbit'],
 ['Animals', 'Humane', 'Society', 'Boston', 'animals'],
 ['animals', 'man', 'animal', 'life', 'time'],
 ['Rooks', 'ﬁeld', 'Rook', 'corn', 'years'],
 ['pavement', 'block', 'wood', 'pavements', 'paving'],
 ['birds', 'bird', 'rights', 'species', 'time'],
 ['species', 'animals', 'years', 'year', 'time'],
 ['Aux', 'Streak', 'temple', 'Gaskoin', 'Stapledon'],
 ['animals', 'rate', 'response', 'responses', 'stimulus'],
 ['ﬁned', 'Typical', 'Eng', 'Man', 'labour'],
 ['dog', 'ASPCA', 'animals', 'New', 'York'],
 ['Society', 'Miss', 'President', 'Secretary', 'Humane'],
 ['horse', 'll', 'dog', 'Society', 'street'],
 ['whales', 'whaling', 'Japan', 'whale', 'Japanese'],
 ['animals', 'Animal', 'animal', 'research', 'Act'],
 ['animals', 'New', 'York', 'dog', 'school'],
 ['Belinda', 'Woodpigeons', 'Nlrs', 'HOUSING', 'animals'],
 ['animal', 'shall', 'person', 'animals', 'section'],
 ['Pearson', 'poems', 'Jean', 'M

In [None]:
import pandas as pd

In [None]:
source_id = pd.Series(item_ids)

In [None]:
lda_model[loaded_corpus[100]]

[(0, 0.22041495),
 (3, 0.56582475),
 (13, 0.033205844),
 (18, 0.09440457),
 (37, 0.08468529)]

In [None]:
# Create headers for our csv file for each topic
headers = ["source_id"]
for i in range(20):
    headers.append(f"topic-{i}")

In [None]:
headers[:5]

['source_id', 'topic-0', 'topic-1', 'topic-2', 'topic-3']

In [None]:
df = pd.DataFrame(columns=headers)
df

Unnamed: 0,source_id,topic-0,topic-1,topic-2,topic-3,topic-4,topic-5,topic-6,topic-7,topic-8,...,topic-10,topic-11,topic-12,topic-13,topic-14,topic-15,topic-16,topic-17,topic-18,topic-19


In [None]:
for i in range(len(item_ids)):
    item_id = item_ids[i]
    new_row = [item_id]
    for k, i in lda_model[loaded_corpus[i]]:
        new_row.append(i)
    df.loc[item_id] = new_row    

ValueError: cannot set a row with mismatched columns