# Extraction of all tokens for all books in the science fiction and the random corpora.

Code for extracting tokens from the corpora using the HTRC Extracted Features files and the HTRC Feature Reader

### 1. Extract htids for both the scifi and random corpus

In [0]:
!pip install htrc-feature-reader



In [0]:
# Fetch the WordNet corpus data; the WordNetLemmatizer imported below is
# the consumer of this data in this notebook.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
import pandas as pd
import os
import sys
import glob
import gzip
from   htrc_features import FeatureReader, utils as frutils
from   nltk.stem import WordNetLemmatizer
import time
import csv

# Directories for input and output (relative to the working directory)
figDir = 'figures'
resultsDir = 'results'
inputDir = 'inputs'

# Full corpus data can be large; make it easy to stash outside GitHub/Google
# by pointing bigDir somewhere else.
bigDir = '.' # Base directory for large files
htrcefDir = os.path.join(bigDir, 'htrcef') # HTRC-EF JSONs (download target and read location)

In [0]:
def creating_htid_list(csv_path):
    '''
    Return the values of the "htid" column of a CSV file, in file order.

    csv_path: path to a UTF-8 encoded CSV file with a header row that
        contains an "htid" column.

    Returns a list with one htid per data row (empty if the file only has
    a header). Raises KeyError if a data row exists but the header has no
    "htid" column.
    '''
    # newline='' hands raw line endings to the csv module, as its
    # documentation requires; otherwise newlines embedded in quoted fields
    # may be misinterpreted.
    with open(csv_path, 'r', encoding='utf-8', newline='') as csv_file:
        return [row["htid"] for row in csv.DictReader(csv_file)]

In [0]:
# Extract htids for the scifi corpus from its metadata CSV ("htid" column)
# and print how many volumes were listed.
scifi_htids = creating_htid_list("scifi_metadata_htids.csv")
print(len(scifi_htids))

331


In [0]:
# Extract htids for the random corpus from its metadata CSV ("htid" column)
# and print how many volumes were listed.
# NOTE(review): the "_BUG" suffix in the filename looks like a debugging or
# placeholder file -- confirm this is the intended input before a full run.
random_htids = creating_htid_list("random_metadata_htids_BUG.csv")
print(len(random_htids))

15874


In [0]:
# Download the extracted features files for all volumes in the scifi corpus
# into htrcefDir.
frutils.download_file(htids=scifi_htids, outdir=htrcefDir)

(0, None)

In [0]:
# Download the extracted features files for all volumes in the random corpus
# into the same htrcefDir used for the scifi corpus.
frutils.download_file(htids=random_htids, outdir=htrcefDir)

(0, None)

### 2. Preparation to clean the tokens: removal of stopwords, punctuation, lemmatization

In [0]:
# Prepare the set of stop words (one word per line in the stoplist file).
stoplist_file = 'stopwords-underwood-goldstone.txt'
# Read through a context manager so the file handle is closed right away,
# and decode explicitly as UTF-8 like the other text files in this notebook.
with open(stoplist_file, encoding='utf-8') as stoplist_handle:
    # A set gives O(1) membership tests in the per-token filtering loop.
    stoplist = {line.strip() for line in stoplist_handle}
print("Words in stoplist:", len(stoplist))

Words in stoplist: 6048


In [0]:
# Functions to work with EF volumes
def encode_volid(volid, direction='path'):
    '''
    Convert an htid to its filename-safe form, or back again.

    With direction='path' (the default) ':' becomes '+' and '/' becomes '='.
    Any other direction value applies the reverse mapping ('+' -> ':',
    '=' -> '/'). All other characters are left untouched.
    '''
    if direction == 'path':
        char_map = {':': '+', '/': '='}
    else:
        char_map = {'+': ':', '=': '/'}
    # The source and target characters never overlap, so a single
    # translate pass is equivalent to replacing them one after another.
    return volid.translate(str.maketrans(char_map))


# Penn Treebank part-of-speech tags to keep; tokens carrying any other tag
# are dropped when the token list is filtered. Grouped by word class.
pos_to_include = [
    'FW',                                      # foreign
    'JJ', 'JJR', 'JJS',                        # adjectives
    'MD',                                      # modal
    'NN', 'NNS',                               # nouns (not proper)
    'RB', 'RBR', 'RBS',                        # adverbs
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',   # verbs
]


# Map a Penn Treebank tag onto the WordNet PoS constant that the
# lemmatizer expects.
def get_wordnet_pos(treebank_tag):
    '''
    Return the WordNet PoS constant for a Penn Treebank tag.

    Tags starting with 'J' map to adjective, 'V' or 'M' (modal) to verb,
    'R' to adverb; every other tag falls back to noun.
    '''
    from nltk.corpus import wordnet

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith(('V', 'M')):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN
    

### 3. Core functions
Extraction of the tokens from the dataframe created by the FeatureReader; cleaning of the tokens; creation of the list of lists; and combination of the two separate results, one for each corpus.

In [0]:
# Lemmatize one token row and add its count to the running per-lemma totals
def dict_extraction(token, lemmas):
    '''
    Fold a single token row into the lemmas tally.

    token: row tuple whose Index holds (word, Penn Treebank tag) and which
        exposes a count field.
    lemmas: dict mapping lemma -> accumulated count; updated in place.

    Returns the same lemmas dict.
    '''
    surface_form, treebank_tag = token.Index[0], token.Index[1]
    # Uses the module-level lemmatizer instance.
    lemma = lemmatizer.lemmatize(surface_form, pos=get_wordnet_pos(treebank_tag))
    # Different surface forms can share a lemma, so accumulate.
    lemmas[lemma] = lemmas.get(lemma, 0) + token.count
    return lemmas

In [0]:
# Extract lemmatized tokens for each volume
# Single shared lemmatizer; dict_extraction reads this module-level name.
lemmatizer = WordNetLemmatizer() # Initialize lemmatizer


def vol_extraction(volid, corpus):
    '''
    Extract the lemma counts of one volume from its Extracted Features file.

    volid: htid of the volume; its encoded form names the .json.bz2 file
        read from htrcefDir.
    corpus: corpus label passed by the caller; not used inside this function.

    Returns (year, vol_lemmas), where vol_lemmas is a generator yielding
    [lemma, count, occurrences per 100k tokens] lists. The per-100k
    denominator is the sum of counts after the PoS filter, i.e. before
    stopwords and single-occurrence tokens are dropped.
    '''
    ef_path = os.path.join(htrcefDir, f'{encode_volid(volid)}.json.bz2')
    vol = FeatureReader(ef_path).first()
    # Upper bound of the page slice: leave out the last ten pages of the book
    last_page = vol.page_count - 10
    year = vol.year

    # Body section only, case-folded tokens
    body_tokens = vol.tokenlist(case=False, section='body')
    # Keep the middle of the book: label slice on the first index level
    middle_tokens = body_tokens.loc[10:last_page]
    # Keep only tokens tagged with one of the whitelisted PoS tags
    tagged_tokens = middle_tokens.query('pos in @pos_to_include')
    # Collapse to one row per index levels 2 and 3 (read downstream as
    # word and PoS tag), summing counts across pages
    vol_df = tagged_tokens.groupby(level=[2, 3]).sum()

    # Denominator for the occurrences-per-100k figure
    total_tokens = vol_df["count"].sum()

    lemmas = dict()
    for token in vol_df.itertuples():
        # Drop stopwords first, then anything that does not occur more than once
        if token.Index[0] in stoplist:
            continue
        if token.count > 1:
            lemmas = dict_extraction(token, lemmas)

    vol_lemmas = (
        [lemma, freq, round((freq / total_tokens * 100000), 2)]
        for lemma, freq in lemmas.items()
    )
    return year, vol_lemmas

In [0]:
# Extract lemmatized tokens for the entire corpus
def corpus_extraction(htids_list, corpus):
    '''
    Run vol_extraction over every volume and group the results by year.

    htids_list: iterable of htids to process.
    corpus: corpus label, forwarded unchanged to vol_extraction.

    Returns a nested dict {year: {htid: [[lemma, count, per_100k], ...]}}.
    '''
    results = {}
    for volid in htids_list:
        year, vol_lemmas = vol_extraction(volid, corpus)
        # Create the per-year bucket on first sight, then store the
        # materialised lemma rows under the volume id.
        volumes_of_year = results.setdefault(year, {})
        volumes_of_year[volid] = list(vol_lemmas)
    return results

In [0]:
# Extract lemmas for the scifi corpus and print the elapsed time in seconds.
start1 = time.perf_counter()
scifi_lemmas = corpus_extraction(scifi_htids, "scifi")
end1 = time.perf_counter()
print(end1 - start1)

208.35502319199986


In [0]:
###CHANGE CSV BEFORE RUNNING
# NOTE(review): presumably this refers to the CSV passed to
# creating_htid_list when building random_htids -- confirm before a full run.
# Extract lemmas for the random corpus and print the elapsed time in seconds.
start = time.perf_counter()
random_lemmas = corpus_extraction(random_htids, "random")
end = time.perf_counter()
print(end - start)

9140.770264236002


In [0]:
# Gather both corpora under their labels: {corpus: {year: {htid: rows}}}
combined = {
    "scifi": scifi_lemmas,
    "random": random_lemmas,
}

### 4. Writing the combined results to a CSV file and compressing it with gzip

In [0]:
# Flatten the nested {corpus: {year: {htid: [[lemma, count, per_100k], ...]}}}
# mapping into one CSV row per (corpus, year, volume, lemma).
with open(os.path.join(bigDir, "termsdata.csv"), 'w', encoding='utf-8', newline='') as termsdata:
    writer = csv.writer(termsdata)
    writer.writerow(["corpus", "year", "htid", "lem_word", "count", "occurs_100k"])
    for corpus, years in combined.items():
        for year, volumes in years.items():
            for volume, lemmas in volumes.items():
                for lemma in lemmas:
                    # lemma is [lem_word, count, occurs_100k]; the two
                    # numeric fields are written as strings
                    writer.writerow(
                        [corpus, year, volume, lemma[0]]
                        + [str(number) for number in lemma[1:3]]
                    )


In [0]:
import gzip
import os
import shutil

# Compress the terms CSV. The path is built from bigDir, the same base
# directory the CSV is written to, so the two cells keep agreeing when
# bigDir is moved away from the working directory.
csv_path = os.path.join(bigDir, 'termsdata.csv')
with open(csv_path, 'rb') as f_in, gzip.open(csv_path + '.gz', 'wb') as f_out:
    # Stream the file in chunks instead of loading it all into memory
    shutil.copyfileobj(f_in, f_out)

In [0]:
# Colab only: mount Google Drive at /content/drive (prompts for an
# authorization code on first use).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
