## Prepare info for page selection

In [0]:
!pip install htrc-feature-reader

Collecting htrc-feature-reader
  Downloading https://files.pythonhosted.org/packages/49/c6/e5916b4c27d00c600c08da44637732b657c3b2c84f0c2e7cbeb0d2b11b7a/htrc-feature-reader-1.99.tar.gz
Collecting ujson
[?25l  Downloading https://files.pythonhosted.org/packages/16/c4/79f3409bc710559015464e5f49b9879430d8f87498ecdc335899732e5377/ujson-1.35.tar.gz (192kB)
[K     |████████████████████████████████| 194kB 9.8MB/s 
[?25hCollecting pymarc
[?25l  Downloading https://files.pythonhosted.org/packages/67/3e/1c4b239d179b2a24e8288ad4ae8f87a667bf5acb4c7907c68e3539ab9284/pymarc-3.1.13.tar.gz (214kB)
[K     |████████████████████████████████| 215kB 17.1MB/s 
Building wheels for collected packages: htrc-feature-reader, ujson, pymarc
  Building wheel for htrc-feature-reader (setup.py) ... [?25l[?25hdone
  Created wheel for htrc-feature-reader: filename=htrc_feature_reader-1.99-cp36-none-any.whl size=13605 sha256=7f3d23a9ed365e0ae5a7d7d51b479a64489caa4221aee91530c8ed4ae81117b7
  Stored in directory: /r

In [0]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
import pandas as pd
import os
import gzip
from   htrc_features import FeatureReader, utils as frutils
from   nltk.stem import WordNetLemmatizer
import time
import csv

# Full corpus data can be large; make it easy to stash outside GitHub/Google
bigDir = '.' # Base directory for large files
htrcefDir = os.path.join(bigDir, 'htrcef') # HTRC-EF JSONs

In [0]:
htids = [line.strip() for line in open("scifi_htids.txt")]
print("N. scifi novels:", len(htids))

N. scifi novels: 331


In [0]:
# Download the extracted features files for all volumes in the corpus
frutils.download_file(htids=htids, outdir=htrcefDir)

(0, None)

In [0]:
import csv

def creating_terms_list(csv_path):

    with open(csv_path, 'r', encoding='utf-8') as csv_file:
        dict_csv = csv.DictReader(csv_file)
        list_terms = [row["term"] for row in dict_csv]

        return list_terms

In [0]:
# Extract urban terms collected by hand
urbanterms = creating_terms_list("urbanterms.csv")
print(len(urbanterms))

78


## Extract page information

In [0]:
# Functions to work with EF volumes
def encode_volid(volid, direction='path'):
    '''
    Transform htid into filename encoded version and vice versa
    '''
    encoding_fixes = {'+':':', '=':'/'}
    if direction=='path':
        encoding_fixes = {v:k for k,v in encoding_fixes.items()}
    for key in encoding_fixes:
        volid = volid.replace(key, encoding_fixes[key])
    return(volid)


# Penn treebank tags to keep
pos_to_include = [
    'FW',  # foreign
    'JJ',  # adjectives
    'JJR',
    'JJS',
    'MD',  # modal
    'NN',  # nouns (not proper)
    'NNS',
    'RB',  # adverbs
    'RBR',
    'RBS',
    'VB',  # verbs
    'VBD',
    'VBG',
    'VBN',
    'VBP',
    'VBZ'
]


# Translate Penn->WordNet PoS tags
#  Need WordNet PoS tags for lemmatizer
def get_wordnet_pos(treebank_tag):
    from nltk.corpus import wordnet
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('M'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    

In [0]:
#Lemmatize the token and create a lemmas dict to have unique lemmas
lemmatizer = WordNetLemmatizer() # Initialize lemmatizer

def dict_extraction(token, urbterms):
    
    pages = set()
    page = token.Index[0]
    word = token.Index[1]
    pos = get_wordnet_pos(token.Index[2])
    lem_word = lemmatizer.lemmatize(word, pos=pos)
    if lem_word in urbanterms: #here to include non lemmatized versions
        pages.add(page)
            
    return pages

In [0]:
%%time
urban_pages = dict()
for volid in htids:
    vol = FeatureReader(os.path.join(htrcefDir,f'{encode_volid(volid)}.json.bz2')).first()
    skip_last = vol.page_count - 10
    vol_df = vol.tokenlist(case=False, section='body').loc[10:skip_last].groupby(level=[0, 2, 3]).sum()
    htid_pages = set()
    for token in vol_df.itertuples():
        token_pages = dict_extraction(token, urbanterms)
        htid_pages.update(token_pages)
    
    urban_pages[volid] = htid_pages

CPU times: user 5min 26s, sys: 5.07 s, total: 5min 31s
Wall time: 5min 32s


In [0]:
print(urban_pages)

{'nyp.33433076024060': {129, 132, 133, 144, 145, 275, 276, 149, 150, 277, 24, 153, 26, 27, 28, 278, 280, 281, 286, 34, 290, 37, 166, 39, 168, 165, 293, 294, 296, 45, 48, 304, 305, 180, 53, 54, 55, 310, 313, 189, 190, 191, 193, 66, 195, 196, 74, 205, 213, 217, 90, 228, 101, 229, 103, 104, 105, 230, 107, 108, 231, 110, 120}, 'inu.30000042750632': {129, 131, 137, 11, 13, 17, 145, 21, 23, 151, 25, 153, 27, 155, 29, 157, 37, 171, 45, 173, 47, 175, 177, 51, 179, 53, 57, 59, 61, 189, 193, 67, 195, 71, 73, 201, 75, 203, 81, 209, 85, 213, 87, 247, 93, 223, 99, 101, 103, 105, 233, 111, 241, 115, 119, 123, 127}, 'njp.32101021206436': {256, 321, 322, 324, 326, 329, 205, 79, 143, 335, 146, 19, 211, 213, 22, 23, 214, 215, 216, 340, 345, 346, 225, 162, 228, 292, 293, 40, 109, 46, 301, 302, 303, 304, 119, 315, 125, 191}, 'nyp.33433074954656': {130, 131, 10, 13, 142, 15, 143, 144, 146, 21, 153, 154, 156, 29, 157, 158, 159, 34, 35, 36, 39, 167, 169, 172, 47, 51, 52, 53, 54, 56, 57, 58, 59, 185, 191, 192

In [0]:
with open(os.path.join(bigDir,"pagesdata.csv"),'w', encoding='utf-8', newline='') as data:
    writer = csv.writer(data)
    writer.writerow(("htid", "page", "term"))
    for htid, pages in urban_pages.items():
        for page in pages:
            writer.writerow((htid, page))