# Prepare notebook

In [1]:
import matplotlib
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
import pickle
import pyLDAvis
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import string
from wordcloud import WordCloud
from plotting_utilities import *

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/acraig/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/acraig/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/acraig/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

def print_top_words(model, feature_names, n_top_words = 10):
    for topic_idx, topic in enumerate(model.components_):
        message = " Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


# Import data

In [4]:
books = pd.read_pickle('20191128_goodreads_book_data.pkl').join(
        pd.read_pickle('20191128_google_book_data.pkl')[['isbn', 'categories', 'description']].add_prefix('google_'))

In [5]:
features = pd.DataFrame(index = books.index)

# Parse Goodreads shelves

In [6]:
shelves = pd.get_dummies(books.goodreads_shelves.apply(pd.Series).stack()).sum(level=0)
shelves = shelves.reindex(books.index)
features = features.join(shelves.add_prefix('shelf_'))

# Parse Google books categories

In [7]:
categories = pd.get_dummies(books.google_categories.apply(pd.Series).stack()).sum(level=0)
categories = categories.reindex(books.index)
features = features.join(categories.add_prefix('category_'))

# Parse descriptions 

In [8]:
# Prepare textual data
alphanumeric_filter = re.compile(r'[\W_-]+')
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
wnl = WordNetLemmatizer()

In [9]:
def anonymize(text, name):
    names = name.strip().replace(',', '').split(' ')
    for n in names:
        text = text.replace(n, '')
    return text

In [10]:
books['gr_description'] =books[['goodreads_description', 'Author']].fillna('')\
                                    .apply(lambda r: anonymize(r['goodreads_description'].lower(),
                                                               r['Author'].lower()), axis =1)

books['gr_description'] =books.gr_description.fillna('')\
                                     .map(lambda text: alphanumeric_filter.sub(' ', text).lower().strip())\
                                     .map(lambda text: ' '.join([wnl.lemmatize(token) for token in wpt.tokenize(text) 
                                                                 if (token not in stop_words) and 
                                                                    (nltk.pos_tag([token])[0][1] == 'NN') and 
                                                                    (len(token)> 2)
                                                               ]))



In [11]:
books['gb_description'] =books[['google_description', 'Author']].fillna('')\
                                    .apply(lambda r: anonymize(r['google_description'].lower(),
                                                               r['Author'].lower()), axis =1)

books['gb_description'] =books.gb_description.fillna('')\
                                     .map(lambda text: alphanumeric_filter.sub(' ', text).lower().strip())\
                                     .map(lambda text: ' '.join([wnl.lemmatize(token) for token in wpt.tokenize(text) 
                                                                 if (token not in stop_words) and 
                                                                    (nltk.pos_tag([token])[0][1] == 'NN') and 
                                                                    (len(token)> 2)
                                                               ]))



# Parse bios 

In [12]:
# Prepare textual data
alphanumeric_filter = re.compile(r'[\W_]+')
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
wnl = WordNetLemmatizer()


books['gr_bio'] =books.goodreads_author_bio.fillna('')\
                                     .map(lambda x: alphanumeric_filter.sub(' ', x).lower().strip())\
                                     .map(lambda text: ' '.join([wnl.lemmatize(token) for token in wpt.tokenize(text) 
                                                                 if (token not in stop_words) and 
                                                                    (nltk.pos_tag([token])[0][1] == 'NN') and 
                                                                    (len(token)> 2)
                                                               ]))

# Combine text data & select words of interest

In [13]:
books['text_metadata'] = books[['gr_description', 'gb_description', 'gr_bio']].sum(axis =1)

In [14]:
books['text_metadata_mapped'] = books['text_metadata'].apply(lambda s: s.replace('sopher', 'sophy')
                                                                        .replace('trist', 'try')
                                                                        .replace('gist', 'gy')
                                                                        .replace('pist', 'py')
                                                                        .replace('trist', 'try')
                                                                        .replace('scientist', 'science')
                                                                        .replace('historian', 'history')
                                                                        .replace('physicist', 'physics')
                                                                        .replace('chemist', 'chemistry')
                                                                        .replace('economist', 'economics'))

In [15]:
### TF-IDF 
max_df = 1.0
n = 1000


tfidf_v = TfidfVectorizer(use_idf=True, stop_words = ['audiobook', 
                                                      'new', 'york', 'time', 'times', 'america',
                                                      'today', 'year',
                                                      'bestselling', 'bestseller', 'award', 'winning', 'prize',
                                                      'book', 'series', 'story', 'edition',
                                                      'way', 'born', 'work', 'become', 'set', 'include'],
                          max_df=max_df,
                          binary= True)
tfidf_weights = tfidf_v.fit_transform(books['text_metadata_mapped'])

weights = tfidf_weights.toarray().sum(axis = 0)
weighted_words = tfidf_v.get_feature_names()

idx = np.argpartition(weights, -n)[-n:]

In [16]:
features = features.join(pd.DataFrame(data = tfidf_weights.toarray()[:, idx],
                                     columns = [weighted_words[i] for i in idx],
                                     index = books.index).add_prefix('text_'))

# Pre-processed publisher data

In [17]:
publishers= pd.get_dummies(pd.read_pickle('20191128_normalized_publisher.pkl'))

In [18]:
features = features.join(publishers)

# A priori filtering of features with little coverage

In [23]:
features.shape

(610, 1510)

In [19]:
coverage = (features.sum()/len(features))

In [26]:
selected_features = coverage[coverage>=0.005].index

In [27]:
selected_features.tolist()

['shelf_adult',
 'shelf_adventure',
 'shelf_africa',
 'shelf_african-american',
 'shelf_american-history',
 'shelf_ancient-history',
 'shelf_animals',
 'shelf_anthropology',
 'shelf_archaeology',
 'shelf_art',
 'shelf_art-history',
 'shelf_asia',
 'shelf_astronomy',
 'shelf_audiobook',
 'shelf_australia',
 'shelf_autobiography',
 'shelf_biography',
 'shelf_biography-memoir',
 'shelf_biology',
 'shelf_birds',
 'shelf_books-about-books',
 'shelf_brain',
 'shelf_buddhism',
 'shelf_business',
 'shelf_chemistry',
 'shelf_china',
 'shelf_classics',
 'shelf_comedy',
 'shelf_cooking',
 'shelf_design',
 'shelf_dogs',
 'shelf_drama',
 'shelf_ecology',
 'shelf_economics',
 'shelf_education',
 'shelf_egypt',
 'shelf_environment',
 'shelf_essays',
 'shelf_evolution',
 'shelf_fairy-tales',
 'shelf_fantasy',
 'shelf_feminism',
 'shelf_fiction',
 'shelf_folklore',
 'shelf_food',
 'shelf_genetics',
 'shelf_geology',
 'shelf_health',
 'shelf_historical',
 'shelf_historical-fiction',
 'shelf_history',
 '

In [28]:
features[selected_features].shape

(610, 356)