In [40]:
from gensim.test.utils import datapath
from gensim import corpora 
from gensim.models.ldamodel import LdaModel 
from gensim.corpora.dictionary import Dictionary

In [9]:
import glob
from htrc_features import FeatureReader

In [4]:
# NOTE(review): `datapath` is imported from gensim.test.utils, so this presumably
# resolves inside gensim's own test-data directory — confirm the model really
# lives there rather than next to the dictionary under ./models.
temp_file = datapath("PrelimTopicModel2")

# Load the LDA model stored at that path.
lda_model = LdaModel.load(temp_file)

In [10]:
# glob.glob returns matches in arbitrary (filesystem-dependent) order; sort them
# so that "the first volume" pulled from the reader is the same on every run.
paths = sorted(glob.glob('../data/testfiles/*.bz2'))

In [16]:
import string
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')
# ENGLISH_STOP_WORDS is imported from the public `text` module: the old
# `sklearn.feature_extraction.stop_words` module was deprecated and later removed.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Extra punctuation characters stripped in addition to string.punctuation.
ADD_PUNC = '”“’’–˙ˆ‘'
# Hand-picked tokens to drop (single letters, word fragments, abbreviations).
STOPWORDS = {'d', 'c', 'e', 's', 'œ', 'dhs', 'hk', 'nagy', 'eology', 'ey', 'g', 'ing', 'tion', 'er', 'rst', 'vol', 'ed'}
# Surnames that are filtered out like stopwords.
AUTHOR_NAMES = {'cruz', 'frederiks', 'nagy', 'snyder', 'nguyen', 'prior', 'cavanaugh', 'heyer', 'schmil', 'smith', 'groody', 'campese', 'izuzquiza', 'heimburger', 'myers', 'colwell', 'olofinjana', 'krabill', 'norton', 'theocharous', 'nacpil', 'nnamani', 'soares', 'thompson', 'zendher', 'ahn', 'haug', 'sarmiento', 'davidson', 'rowlands', 'strine', 'zink', 'jimenez'}
# Final stopword set = custom tokens + author names + scikit-learn's English list.
STOPWORDS = STOPWORDS.union(AUTHOR_NAMES)
STOPWORDS = STOPWORDS.union(ENGLISH_STOP_WORDS)
# Translation table that deletes ASCII punctuation, digits and the ADD_PUNC characters.
PUNCDIG_TRANSLATOR = str.maketrans('', '', string.punctuation+string.digits+ADD_PUNC)

def text_clean(text):
    """Tokenize ``text`` and return the list of cleaned, lemmatized tokens.

    A token is kept only if it is longer than two characters and is not in
    STOPWORDS. Kept tokens have punctuation/digits removed through
    PUNCDIG_TRANSLATOR and, if anything is left, are lemmatized.

    NOTE(review): the stopword/length filter is applied to the raw token, i.e.
    before punctuation and digits are stripped, so a token may still end up
    shorter than three characters or equal to a stopword after stripping.
    Confirm this matches how the dictionary was built before changing it.
    """
    kept = []
    for token in nltk.word_tokenize(text):
        # Skip short tokens (two characters or fewer) and stopwords.
        if len(token) <= 2 or token in STOPWORDS:
            continue
        stripped = token.translate(PUNCDIG_TRANSLATOR)
        if stripped:
            kept.append(lemmatizer.lemmatize(stripped))
    return kept

# Module-level lemmatizer instance used by text_clean and volume_parser.
lemmatizer = WordNetLemmatizer()

def volume_parser(vol):
    """Convert a volume into one list of cleaned, lemmatized tokens per page.

    For every page returned by ``vol.pages()`` the 'body' token list is read
    (``case=False, pos=False``) and its 'count' column is turned into a dict.
    Each token that is longer than two characters and not in STOPWORDS has
    punctuation/digits removed and, if non-empty, is lemmatized and repeated
    as many times as its count.

    Returns a list with one (possibly empty) token list per page.
    """
    pages = []
    for page in vol.pages():
        token_counts = page.tokenlist('body', case=False, pos=False).to_dict()['count']
        page_tokens = []
        for index_key, freq in token_counts.items():
            # The token is the third element of the index key
            # (presumably (page, section, token) — TODO confirm).
            token = index_key[2]
            # Skip short tokens (two characters or fewer) and stopwords.
            if len(token) <= 2 or token in STOPWORDS:
                continue
            token = token.translate(PUNCDIG_TRANSLATOR)
            if token:
                # Repeat the lemma so the page keeps its term frequencies.
                page_tokens.extend([lemmatizer.lemmatize(token)] * freq)
        pages.append(page_tokens)
    return pages

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sgoodwin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sgoodwin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [30]:
def vol_gen(paths):
    """Lazily yield every volume that a FeatureReader built from ``paths`` provides.

    The reader is only constructed when the first volume is requested.
    """
    reader = FeatureReader(paths)
    yield from reader.volumes()

In [31]:
# Build the volume generator and pull only its first volume for inspection.
vol_generation = vol_gen(paths)
vol = next(vol_generation)

In [32]:
vol.title

'The 12-year Reich : a social history of Nazi Germany, 1933-1945 / Richard Grunberger.'

In [33]:
# One list of cleaned tokens per page of the volume.
vol_list = volume_parser(vol)

In [41]:
# Load the gensim Dictionary from its text-format file.
# NOTE(review): the path is relative to the notebook's working directory.
corpus_dict = Dictionary.load_from_text('./models/corpus_dictionary_2')

In [42]:
# Convert each page's token list into the dictionary's bag-of-words form.
corpus_list = [corpus_dict.doc2bow(text) for text in vol_list]

In [57]:
def analyze_corpus_with_model(other_corpus, lda_model,
                              target_topics=(0, 1, 3, 5, 6, 11), min_prop=.04):
    """Find documents whose first listed topic is one of the topics of interest.

    Parameters
    ----------
    other_corpus : iterable
        Documents in the form accepted by ``lda_model[doc]`` (bag-of-words).
    lda_model : object
        Anything indexable by a document such that ``lda_model[doc][0]`` is a
        sequence of ``(topic_id, proportion)`` pairs (presumably an LdaModel
        created with per-word topics enabled — confirm).
    target_topics : container of int, optional
        Topic ids that count as a potential match.
    min_prop : float, optional
        A match requires the topic's proportion to be strictly greater than
        this value.

    Returns
    -------
    list of (doc_num, topic_num, prop_topic) tuples, in document order.

    NOTE(review): only the FIRST (topic, proportion) pair of each document is
    inspected. In the recorded model output the pairs appear in ascending
    topic-id order, so this is not necessarily the dominant topic — confirm
    that this is intended.
    """
    pot_match = []
    for doc_num, doc in enumerate(other_corpus):
        doc_topics = lda_model[doc][0]
        # A document for which the model reports no topic cannot match;
        # indexing [0] on an empty list would raise IndexError.
        if not doc_topics:
            continue
        topic_num, prop_topic = doc_topics[0]
        if topic_num in target_topics and prop_topic > min_prop:
            pot_match.append((doc_num, topic_num, prop_topic))
    return pot_match

In [58]:
# NOTE(review): despite its name this list is not sorted — the function returns
# matches in document order.
sorted_list = analyze_corpus_with_model(corpus_list, lda_model)

In [60]:
# Potential match with the largest topic proportion (last tuple element).
max(sorted_list, key=lambda x: x[-1])

(482, 6, 0.7523192)

In [45]:
# NOTE(review): same expression as the preceding cell (recorded under a different
# execution count); consider removing one of the two.
max(sorted_list, key=lambda x: x[-1])

(482, 6, 0.752285)

In [46]:
import timeit
def wrapper(func, *args, **kwargs):
    """Bind ``func`` to the given arguments and return a zero-argument callable.

    Calling the returned object invokes ``func(*args, **kwargs)`` and returns
    its result, which makes it suitable for passing to ``timeit.timeit``.
    """
    return lambda: func(*args, **kwargs)

In [47]:
# Zero-argument callable that finds the match with the highest proportion via max().
wrapped = wrapper(max, sorted_list, key=lambda x: x[-1])

In [48]:
# Time the max() approach; the result is the total seconds over all repetitions
# (timeit's default repetition count applies since `number` is not passed).
timeit.timeit(wrapped)

17.595497417999468

In [49]:
def list_sort(lst):
    """Return a new list with the items of ``lst`` ordered by their last element.

    The input is left untouched; ordering is ascending.
    """
    def last_element(item):
        return item[-1]

    ordered = list(lst)
    ordered.sort(key=last_element)
    return ordered

In [50]:
# Zero-argument callable that sorts the matches by proportion, for timing.
wrapped_sort = wrapper(list_sort, sorted_list)

In [51]:
# Time the full-sort approach for comparison with the max() timing.
timeit.timeit(wrapped_sort)

20.28719097300018

In [53]:
# Inspect the raw model output for a single page (index 100) of the corpus.
vector = lda_model[corpus_list[100]]

In [54]:
vector 

([(1, 0.25644043),
  (5, 0.120921254),
  (6, 0.0846427),
  (10, 0.12059769),
  (12, 0.25669894),
  (21, 0.113198526)],
 [(5, [5]),
  (11, [1, 6]),
  (16, [1, 12, 21, 5]),
  (22, [12, 5]),
  (26, [10]),
  (28, [1]),
  (38, [21]),
  (51, [1, 21, 5]),
  (56, [12, 5, 21]),
  (67, [10, 21, 5, 12, 6]),
  (68, [12, 5]),
  (76, [10, 12, 21]),
  (79, [6]),
  (89, [12, 21])],
 [(5, [(5, 0.9999911)]),
  (11, [(1, 1.7476194), (6, 0.2523553)]),
  (16, [(1, 0.42299974), (5, 0.018924559), (12, 0.3835322), (21, 0.17453243)]),
  (22, [(5, 0.27588192), (12, 0.7240828)]),
  (26, [(10, 0.9999945)]),
  (28, [(1, 0.9903614)]),
  (38, [(21, 0.99999326)]),
  (51, [(1, 0.9021005), (5, 0.019916726), (21, 0.07797805)]),
  (56, [(5, 0.11033381), (12, 0.8119678), (21, 0.07768639)]),
  (67,
   [(5, 0.18650338),
    (6, 0.062213406),
    (10, 0.36962354),
    (12, 0.09858051),
    (21, 0.28307194)]),
  (68, [(5, 0.2804311), (12, 0.7195571)]),
  (76, [(10, 0.51883733), (12, 0.43491545), (21, 0.046241388)]),
  (79, [(