# Cookbook Processing

This notebook outlines the data collection and preparation for a set of cookbooks in the HathiTrust Digital Library. Analysis is in the other notebooks.

In [4]:
import os
import warnings

import dask.dataframe as dd
import pandas as pd
import spacy
from dask.delayed import delayed
from dask.diagnostics import ProgressBar
from htrc import workset
from htrc_features import FeatureReader, utils
# Load spaCy's 'en' model once at start-up; lemmatize() calls this shared `nlp` object.
nlp = spacy.load('en')

In [275]:
# Fetch the volume ids of the HathiTrust collection, skipping the first entry.
collection = workset.load_hathitrust_collection(
    'https://babel.hathitrust.org/cgi/mb?a=listis&c=494231066'
)[1:]  # First work is a dud

# Map every id to its local feature-file path and keep only the files that exist.
paths = [
    feature_path
    for feature_path in ('../features/' + utils.id_to_rsync(htid) for htid in collection)
    if os.path.exists(feature_path)
]

# Report how much of the workset is covered by local feature files.
"Of a %d volume workset, %d have feature files" % (len(collection), len(paths))

'Of a 2016 volume workset, 1781 have feature files'

# Save Metadata

In [343]:
# Volume attributes to copy into the metadata table (one column each).
meta_attrs = ['id', 'rights_attributes', 'title', 'year', 'classification', 'author', 'pub_place', 'page_count']

# Build one Series per volume, indexed by attribute name, then stack them into a frame.
fr = FeatureReader(paths)
metas = [
    pd.Series([getattr(vol, field) for field in meta_attrs], index=meta_attrs)
    for vol in fr.volumes()
]
all_meta = pd.DataFrame(metas)

# Reduce the container-valued columns to plain strings:
# the first author (or '' when there is none) and the first 'lcc' entry (or '' when absent).
all_meta['author'] = all_meta['author'].apply(
    lambda authors: authors[0] if len(authors) != 0 else ''
)
all_meta['classification'] = all_meta['classification'].apply(
    lambda classes: classes['lcc'][0] if 'lcc' in classes else ''
)
all_meta.sample(5)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 12.9 µs


Unnamed: 0,id,rights_attributes,title,year,classification,author,pub_place,page_count
1174,uc1.31822031039282,pd,Natural food recipes / Mrs. B. Stanford Claunch,1939,TX837 .C5,"Claunch, B. Stanford Mrs",cau,52
280,uc1.31822031043284,ic,Cesar salads / by Judy Hogness ; graphics: R. ...,1977,,"Hogness, Judy",cau,84
1169,uc1.31822031042138,pd,"Namco brand crab, the world's finest",1927,,"Bradley, Alice 1875-1946",cau,16
967,uc1.31822031039365,pd,Kampkookery and useful hints for the motor camper,1927,,American Gas Machine Company,mnu,36
1519,uc1.31822031043268,ic,"The Sonoma Dried Tomato cookbook, or, What to ...",1992,,"Waltenspiel, Ronald",cau,156


# Determine a vocabulary

Here, I count every word in the collection by part of speech (POS), then use those counts to trim the word list down to the interesting words.

In [14]:
# Multithread with dask
def tokenlist_from_path(path):
    '''Load the volume-level token counts for a single feature file.

    Reads the first volume that FeatureReader yields for `path`, requests its
    token list with ``pages=False`` (i.e. not broken out per page), and drops
    the 'section' column.

    Args:
        path: Location of a feature file, passed straight to FeatureReader.

    Returns:
        A DataFrame with the columns ['token', 'pos', 'count']. When the file
        cannot be read or the token list has an unexpected layout, a warning
        is issued and an empty frame with those columns is returned, so a
        single bad volume does not abort the whole batch.
    '''
    expected_columns = ['token', 'pos', 'count']
    try:
        df = (FeatureReader(path)
                 .first()
                 .tokenlist(pages=False)
                 .reset_index()
                 # `axis` has to be a keyword: pandas 2.0 removed the positional form.
                 .drop('section', axis=1))
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if df.columns.tolist() != expected_columns:
            raise ValueError('unexpected tokenlist columns: %r' % df.columns.tolist())
    except Exception as err:
        # Best effort: report the failure instead of hiding it, then fall back to
        # an empty frame. KeyboardInterrupt/SystemExit are deliberately not caught.
        warnings.warn('Skipping %s: %r' % (path, err))
        df = pd.DataFrame(columns=expected_columns)
    return df

# One lazy task per feature file; dask stitches the per-volume frames into one dataframe.
delayed_dfs = [
    delayed(tokenlist_from_path)(feature_path)
    for feature_path in paths
]
ddf = dd.from_delayed(delayed_dfs)

# Sum the counts of every (token, pos) pair across all volumes, then sort the index.
with ProgressBar():
    total_word_counts = (
        ddf.groupby(['token', 'pos'])
           .sum()
           .compute()
           .sort_index()
    )

[########################################] | 100% Completed | 11min 35.2s


In [337]:
# Stoplist and lemmatize. This is mostly for normalizing and filtering the verbs,
# but it doesn't do anything regrettable to the nouns.
def lemmatize(word):
    '''Return the lemma of `word` according to the shared spaCy pipeline `nlp`.

    Only the first token that `nlp` produces for `word` is used, so input that
    is split into several tokens is represented by the lemma of its first piece.

    Args:
        word: The string to lemmatize.

    Returns:
        The lemma string, or '' when `nlp` produces no tokens at all (instead of
        raising IndexError); empty lemmas are filtered out later in the notebook.
    '''
    doc = nlp(word)
    if len(doc) == 0:
        return ''
    return doc[0].lemma_

# Part-of-speech tags to keep: nouns, verbs, and adjectives
pos_filter = (
    ['NNP', 'NN', 'NNS', 'NNPS']
    + ['VB', 'VBN', 'VBD', 'VBZ', 'VBG', 'VBP']
    + ['JJ', 'JJR', 'JJS']
)

# Boolean mask over total_word_counts: True where the token is a spaCy stopword.
stoplist = total_word_counts.index.get_level_values('token').isin(spacy.en.STOPWORDS)

# Pass 1: drop stopwords, keep (token, pos) pairs seen more than 20 times,
# restrict to the wanted POS tags, and attach each token's lemma.
pass1 = total_word_counts[~stoplist].query('count > 20')
pass1 = pass1.loc[(slice(None), pos_filter),].reset_index()
pass1['lemma'] = pass1['token'].apply(lemmatize)

# Pass 2: discard empty lemmas, total the counts per lemma,
# and keep the lemmas seen more than 100 times.
pass2 = (
    pass1[pass1['lemma'] != ""]
        .groupby('lemma', as_index=False)[['count']]
        .sum()
        .query('count > 100')
)

# Final dictionary: lemmas ordered by descending count, keyed by a 0-based integer id
dictionary = (
    pass2.sort_values('count', ascending=False)['lemma']
         .reset_index(drop=True)
         .to_frame('token')
)
dictionary.insert(0, 'id', range(len(dictionary)))
dictionary.head()

Unnamed: 0,id,token
0,0,cup
1,1,add
2,2,water
3,3,salt
4,4,cook


In [282]:
# Give every volume an integer key as well: `volid` is the volume's position in
# `collection`, `htid` is the id stored there.
volids = (
    pd.Series(collection)
      .reset_index()
      .rename(columns={'index': 'volid', 0: 'htid'})
)

# Lookup table of the form {htid: {'volid': n}}
volids_dict = volids.set_index('htid').to_dict(orient='index')

In [360]:
# Save all the processed info.
# Make sure the output directory exists; HDFStore does not create it.
os.makedirs('cookbooks', exist_ok=True)

# `complevel` is given explicitly: pandas treats a complevel of None/0 as
# "no compression", so `complib` on its own would leave the file uncompressed.
with pd.HDFStore('cookbooks/ref.h5', mode='w', complib='blosc', complevel=9) as store:
    store.append('metadata', all_meta)
    # Only (token, pos) pairs seen more than 10 times are persisted.
    store.append('global_frequencies', total_word_counts.query('count>10'))
    store.append('dictionary', dictionary)
    store.append('volids', volids)

# Save Page-level integer-keyed token counts

In [356]:
# Spot-check: draw 10 random rows from the corpus-wide (token, pos) counts.
a = total_word_counts.sample(10)
a

Unnamed: 0_level_0,Unnamed: 1_level_0,count
token,pos,Unnamed: 2_level_1
GROGAN,NNP,2
山己​シ,NN,1
Flueggea,NNP,3
Strawberry-nut,NNP,1
ΕκφωιΜκο,NN,1
AÑÜS,NNP,1
左,NN,1
dressheat,NN,1
cnﬁrns,NNS,1
​勢​,NN,1


In [359]:
# Raw (token, pos) index tuples of the sample; the repr escapes invisible characters such as \u200b.
a.index.values

array([('GROGAN', 'NNP'), ('山己\u200bシ', 'NN'), ('Flueggea', 'NNP'),
       ('Strawberry-nut', 'NNP'), ('ΕκφωιΜκο', 'NN'), ('AÑÜS', 'NNP'),
       ('左', 'NN'), ('dressheat', 'NN'), ('cnﬁrns', 'NNS'),
       ('\u200b勢\u200b', 'NN')], dtype=object)

In [348]:
# Number of lemmas that made it into the final dictionary.
len(dictionary)

13529

In [None]:
def page_token_freq(path):
    ''' Get token frequencies for words in the dictionary, per page and with volumes and tokens integer-keyed

    Reads the first volume that FeatureReader yields for `path`, keeps the tokens
    whose POS tag is in the notebook-level `pos_filter`, lemmatizes them, sums
    the counts per (page, lemma), and keeps only the lemmas present in the
    notebook-level `dictionary`.

    Args:
        path: Location of a feature file, passed straight to FeatureReader.

    Returns:
        A DataFrame with the columns ['page', 'id', 'count', 'volid'], where `id`
        is the lemma's key in `dictionary` and `volid` is the volume's key in
        `volids_dict`. When anything goes wrong for this volume, a warning is
        issued and an empty frame with those columns is returned, so a single
        bad volume does not abort the whole batch.
    '''
    out_columns = ['page', 'id', 'count', 'volid']
    try:
        vol = FeatureReader(path).first()
        # `axis` has to be a keyword: pandas 2.0 removed the positional form.
        tl = vol.tokenlist().reset_index().drop('section', axis=1)
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if tl.columns.tolist() != ['page', 'token', 'pos', 'count']:
            raise ValueError('unexpected tokenlist columns: %r' % tl.columns.tolist())

        # isin() tolerates tags that never occur in this volume, whereas selecting a
        # list of labels with .loc raises KeyError for missing labels in pandas >= 2.0.
        tl = tl[tl['pos'].isin(pos_filter)]

        # Lemmatize every distinct token once rather than once per row.
        lemma_of = {token: lemmatize(token) for token in tl['token'].unique()}
        tl = tl.assign(lemma=tl['token'].apply(lemma_of.get))

        tl_folded = (tl.groupby(['page', 'lemma'], as_index=False)[['count']].sum()
                       .rename(columns={'lemma': 'token'}))
        # Inner join on the lemma text: lemmas missing from the dictionary are dropped.
        tl_ids = pd.merge(tl_folded, dictionary, on='token')[['page', 'id', 'count']]
        tl_ids['volid'] = volids_dict[vol.id]['volid']
    except Exception as err:
        # Best effort: report the failure instead of hiding it, then fall back to
        # an empty frame. KeyboardInterrupt/SystemExit are deliberately not caught.
        warnings.warn('Skipping %s: %r' % (path, err))
        return pd.DataFrame(columns=out_columns)
    return tl_ids

# Compute the page-level counts of every volume lazily and write them out under the key 'counts'.
# NOTE(review): the '*' in the file name is presumably expanded per partition by dask — confirm.
with ProgressBar():
    delayed_dfs = [
        delayed(page_token_freq)(feature_path)
        for feature_path in paths
    ]
    ddf = dd.from_delayed(delayed_dfs)
    ddf.to_hdf('cookbooks/cookbook.*.h5', 'counts')

[########################################] | 100% Completed |  0.6s
[###################################     ] | 88% Completed |  1hr 14min  5.4s