# Prepare notebook

In [1]:
import matplotlib
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
import pickle
import pyLDAvis
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import string
from wordcloud import WordCloud
from plotting_utilities import *

In [2]:
# Fetch the NLTK data packages for this notebook. Per the captured log, packages
# that are already current are reported as up-to-date rather than re-fetched.
nltk.download('stopwords')
nltk.download('wordnet')  # presumably backs the WordNetLemmatizer imported above — TODO confirm
# Last expression of the cell: its return value is what the cell displays.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/acraig/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/acraig/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/acraig/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [6]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

def print_top_words(model, feature_names, n_top_words = 10):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterated row by row; each row is one topic's weights and must
        support ``argsort()`` (e.g. a NumPy array).
    feature_names : sequence of str
        Term labels, indexed by column position of ``components_``.
    n_top_words : int, default 10
        How many terms to list per topic.

    Prints one line per topic, followed by a single blank line.
    """
    for topic_number, term_weights in enumerate(model.components_):
        # argsort is ascending, so the reversed tail slice yields the
        # n_top_words largest weights, heaviest first.
        ranked_ids = term_weights.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[term_id] for term_id in ranked_ids]
        print(" Topic #%d: " % topic_number + " ".join(top_terms))
    print()


# Import data

In [3]:
# Load both pickled tables, Goodreads first.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
goodreads_books = pd.read_pickle('20191128_goodreads_book_data.pkl')
google_books = pd.read_pickle('20191128_google_book_data.pkl')

# Keep three Google Books fields, prefix them with "google_", and attach them
# to the Goodreads rows with DataFrame.join's defaults (index-aligned join).
google_fields = google_books[['isbn', 'categories', 'description']].add_prefix('google_')
books = goodreads_books.join(google_fields)

# Parse Goodreads shelves

In [194]:
# Collapse each book's shelves (an iterable of strings) into a single
# space-delimited string, one "document" per book.
shelves_strings = books['goodreads_shelves'].map(' '.join)

In [213]:
# Settings shared by the CountVectorizer and TfidfVectorizer cells.
# The token pattern allows hyphens inside a token, so shelf names such as
# "non-fiction" stay in one piece.
pattern = "(?u)\\b[\\w-]+\\b"
# Document-frequency bounds passed to both vectorizers as max_df / min_df.
max_df, min_df = 0.9, 4

In [214]:
### Bag of words
# Presence/absence (binary=True) of each unigram shelf token per book; the
# 'audiobook' token is excluded as a stop word.
word_bag_cv = CountVectorizer(
    binary=True,
    ngram_range=(1, 1),
    token_pattern=pattern,
    stop_words=['audiobook'],
    min_df=min_df,
    max_df=max_df,
)
word_bag_counts = word_bag_cv.fit_transform(shelves_strings)

# Names of the n tokens with the largest column totals. argpartition places
# the n largest in the last n positions but does not order them internally.
n = 10
words = word_bag_cv.get_feature_names()
counts = word_bag_counts.toarray().sum(axis=0)
[words[token_id] for token_id in np.argpartition(counts, -n)[-n:]]

['animals',
 'philosophy',
 'biology',
 'nature',
 'memoir',
 'science',
 'biography',
 'psychology',
 'non-fiction',
 'history']

In [215]:
### TF-IDF
# Same tokenisation and document-frequency bounds as the bag-of-words cell,
# with IDF weighting applied on top of binary term presence.
tfidf_v = TfidfVectorizer(
    binary=True,
    use_idf=True,
    token_pattern=pattern,
    stop_words=['audiobook'],
    min_df=min_df,
    max_df=max_df,
)
tfidf_weights = tfidf_v.fit_transform(shelves_strings)

# Names of the n tokens with the largest summed TF-IDF weight. argpartition
# places the n largest in the last n positions without ordering them.
n = 10
weighted_words = tfidf_v.get_feature_names()
weights = tfidf_weights.toarray().sum(axis=0)
[weighted_words[token_id] for token_id in np.argpartition(weights, -n)[-n:]]

['animals',
 'philosophy',
 'biology',
 'psychology',
 'nature',
 'memoir',
 'biography',
 'science',
 'non-fiction',
 'history']

In [222]:
# Number of topics/components requested from every LDA and NMF model in this notebook.
n_components = 18


In [223]:
# Fit LDA model
# Online LDA on the binary bag-of-words matrix; fit() hands back the
# estimator, so construction and fitting are chained.
lda = LatentDirichletAllocation(
    n_components=n_components,
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=0,
).fit(word_bag_counts)

# List each topic's strongest terms (print_top_words' default of ten).
print("\nTopics in LDA model:")
print_top_words(lda, word_bag_cv.get_feature_names())


Topics in LDA model:
 Topic #0: non-fiction self-help biology natural-history psychology geology personal-development evolution leadership science
 Topic #1: classics fiction mythology fantasy literature plays short-stories drama japan theatre
 Topic #2: non-fiction history science biography feminism nature psychology birds art writing
 Topic #3: non-fiction nature history animals memoir science biography natural-history reference queer
 Topic #4: poetry plays classics japan literature japanese-literature drama buddhism autobiography theatre
 Topic #5: animals health medicine science nature china economics philosophy cooking comedy
 Topic #6: non-fiction politics biography psychology history memoir medieval personal-development art social-justice
 Topic #7: science non-fiction nature biology history psychology animals natural-history philosophy evolution
 Topic #8: historical-fiction africa china natural-history drama plants animals science-nature neuroscience memoir
 Topic #9: folklo

In [224]:
# Build the pyLDAvis payload for the LDA model fitted on the bag-of-words matrix.
#
# pyLDAvis.prepare expects probability distributions and per-document token
# counts, so both are derived from the model and the document-term matrix:
#   * components_ holds unnormalised topic-term pseudo-counts, so each topic
#     row is rescaled to sum to 1 before being passed as topic_term_dists;
#   * doc_lengths is the number of vocabulary tokens per document (row sums of
#     the matrix), not the character length of the joined shelf string. A plain
#     array also avoids index alignment against the `books` index.
# The sparse matrix is used directly; densifying it first is unnecessary.
lda_data = pyLDAvis.prepare(
    topic_term_dists = lda.components_ / lda.components_.sum(axis = 1)[:, np.newaxis],
    doc_topic_dists = lda.transform(word_bag_counts),
    doc_lengths = np.asarray(word_bag_counts.sum(axis = 1)).ravel(),
    vocab = word_bag_cv.get_feature_names(),
    term_frequency = np.asarray(word_bag_counts.sum(axis = 0)).ravel(),
    sort_topics=True)
pyLDAvis.display(lda_data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [225]:
# Fit LDA model
# Same hyper-parameters as the bag-of-words fit, now on the TF-IDF matrix.
# This rebinds `lda`, replacing the model fitted on word_bag_counts.
# NOTE(review): LDA is usually fitted on count data; confirm that TF-IDF
# weights are the intended input here.
lda = LatentDirichletAllocation(
    n_components=n_components,
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=0,
).fit(tfidf_weights)

# List each topic's strongest terms (print_top_words' default of ten).
print("\nTopics in LDA model:")
print_top_words(lda, tfidf_v.get_feature_names())


Topics in LDA model:
 Topic #0: natural-history biology nature science animals non-fiction geology art medicine art-history
 Topic #1: plays classics fiction drama theatre mystery mythology school history japanese-literature
 Topic #2: feminism fiction art non-fiction queer folklore microhistory psychology australia race
 Topic #3: nature science biology animals non-fiction natural-history environment evolution history plants
 Topic #4: microhistory history science neuroscience physics autobiography sociology natural-history asia non-fiction
 Topic #5: food cooking history non-fiction science microhistory animals travel health nature
 Topic #6: astronomy physics science non-fiction business economics psychology memoir lgbt biography
 Topic #7: philosophy non-fiction classics science history psychology sociology neuroscience politics physics
 Topic #8: historical-fiction natural-history technology africa business literature science essays china drama
 Topic #9: fiction short-stories fo

In [226]:
# Build the pyLDAvis payload for the LDA model fitted on the TF-IDF matrix.
#
# pyLDAvis.prepare expects probability distributions and per-document lengths
# on the same scale as term_frequency, so both come from the model and matrix:
#   * components_ holds unnormalised topic-term pseudo-counts, so each topic
#     row is rescaled to sum to 1 before being passed as topic_term_dists;
#   * doc_lengths is the total TF-IDF weight per document (row sums), matching
#     term_frequency (column sums), instead of the character length of the
#     joined shelf string. A plain array also avoids index alignment against
#     the `books` index.
# The sparse matrix is used directly; densifying it first is unnecessary.
lda_data = pyLDAvis.prepare(
    topic_term_dists = lda.components_ / lda.components_.sum(axis = 1)[:, np.newaxis],
    doc_topic_dists = lda.transform(tfidf_weights),
    doc_lengths = np.asarray(tfidf_weights.sum(axis = 1)).ravel(),
    vocab = tfidf_v.get_feature_names(),
    term_frequency = np.asarray(tfidf_weights.sum(axis = 0)).ravel(),
    sort_topics=True)
pyLDAvis.display(lda_data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [227]:
# Fit the NMF model (generalized Kullback-Leibler divergence)
# Multiplicative-update solver with KL loss and mixed L1/L2 regularisation,
# fitted on the TF-IDF matrix.
nmf = NMF(
    n_components=n_components,
    solver='mu',
    beta_loss='kullback-leibler',
    alpha=.1,
    l1_ratio=.5,
    max_iter=1000,
    random_state=1,
)
nmf.fit(tfidf_weights)

# List each component's strongest terms (print_top_words' default of ten).
print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
print_top_words(nmf, tfidf_v.get_feature_names())


Topics in NMF model (generalized Kullback-Leibler divergence):
 Topic #0: science nature non-fiction natural-history animals biography plants birds science-nature technology
 Topic #1: biography memoir non-fiction autobiography biography-memoir history italy australia poetry school
 Topic #2: non-fiction psychology business science self-help personal-development education leadership mental-health sociology
 Topic #3: history non-fiction historical ancient-history china egypt medieval medieval-history asia war
 Topic #4: philosophy non-fiction religion buddhism self-help spirituality poetry politics war drama
 Topic #5: classics biology philosophy popular-science ancient-history school egypt comedy cooking design
 Topic #6: fiction classics mythology fantasy literature japan drama plays japanese-literature historical-fiction
 Topic #7: essays non-fiction feminism politics race african-american sociology social-justice american-history education
 Topic #8: health medicine medical non-fi