# 0 - Preflight checks

Import all of the relevant packages that will be used later.

Define the helper functions that will be used for cleaning and tagging in the main part of the code.

In [0]:
import csv
import os
import re
import string
import time

import nltk
from nltk.collocations import *
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

import pandas as pd
import spacy

In [0]:
def encode_volid(volid, direction='path'):
    """
    Convert an htid to its filename-safe form, or back again.

    With direction='path' (the default) every ':' becomes '+' and every
    '/' becomes '='. Any other value of `direction` applies the reverse
    mapping ('+' -> ':' and '=' -> '/').
    """
    if direction == 'path':
        table = str.maketrans({':': '+', '/': '='})
    else:
        table = str.maketrans({'+': ':', '=': '/'})
    return volid.translate(table)

def find_page(page_n):
    """
    Left-pad a page number string with zeros to a width of 8 characters.

    The padded value is used as the base name of the page file to open.
    Strings that are already 8 characters or longer are returned unchanged.
    """
    return page_n.rjust(8, "0")


# Prepare list of urbanterms: the values of the "term" column, in file order.
# newline='' is what the csv module asks for when it is handed a file object,
# so that newlines embedded in quoted fields are parsed correctly.
with open("urbanterms.csv", 'r', encoding='utf-8', newline='') as csv_file:
    dict_csv = csv.DictReader(csv_file)
    urbanterms = [row["term"] for row in dict_csv]


# Prepare set of stop words (one word per line in the stoplist file).
stoplist_file = 'stopwords-underwood-goldstone.txt'
# Use a context manager so the file handle is closed, and read it as UTF-8
# like the other input files in this notebook.
with open(stoplist_file, encoding='utf-8') as stoplist_fh:
    # Skip blank lines so the empty string does not end up in the set.
    stoplist = {line.strip() for line in stoplist_fh if line.strip()}


def cleanText(text):
    """
    A quick cleaner function to remove any unwanted noise from novels.

    Steps, in order:
      1. delete every ASCII punctuation character (string.punctuation,
         which includes the apostrophe and the underscore);
      2. delete any remaining character that is not a word character,
         an apostrophe or whitespace (e.g. curly quotes, dashes);
      3. collapse every run of whitespace (spaces, tabs, newlines) into a
         single space and trim both ends;
      4. lowercase the result.

    Returns the cleaned string; an empty input yields an empty string.
    """
    # strip punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r"[^\w\d'\s]+", '', text)
    # whitespace: a single "  " -> " " replace pass leaves runs of three or
    # more spaces (and tabs) behind, so collapse every run with a regex.
    text = re.sub(r"\s+", " ", text).strip()
    # lowercase
    return text.lower()


def context_extraction(tags, keyword, numwords=5):
    """
    Collect the tags surrounding every occurrence of `keyword`.

    Parameters
    ----------
    tags : sequence of tuples
        Tagged tokens; the first element of each tuple is compared with
        `keyword`.
    keyword : str
        Value to look for in position 0 of each tag.
    numwords : int, optional
        Number of tags to take on each side of a match (default 5).

    Returns
    -------
    list
        For each match, up to `numwords` tags before it followed by up to
        `numwords` tags after it, concatenated in order of occurrence. The
        matching tag itself is not included. Overlapping windows are not
        deduplicated.
    """
    context = []
    for position, tag in enumerate(tags):
        if tag[0] != keyword:
            continue
        # Clamp at 0: a negative slice start would count from the end of the
        # list and silently drop the left context of matches near the start.
        left_start = max(0, position - numwords)
        context.extend(tags[left_start:position])
        context.extend(tags[position + 1:position + numwords + 1])
    return context


def spacyTags(token):
    """
    Reduce a spaCy token to the two attributes used downstream.

    Returns a `(lemma_, pos_)` tuple read from the given token.
    """
    return (token.lemma_, token.pos_)

# 1 - Create htid - page dictionary

From the CSV file, create a dictionary with htids as keys and sets of page numbers as values.

In [0]:
# Map each htid to the set of page numbers (kept as strings) listed for it.
htids_pages = dict()
# newline='' is what the csv module asks for when it is handed a file object.
with open("pagesdata.csv", 'r', encoding='utf-8', newline='') as csv_file:
    dict_csv = csv.DictReader(csv_file)
    for row in dict_csv:
        # setdefault creates the set on first sight of an htid, so the add
        # does not have to be repeated in two branches.
        htids_pages.setdefault(row["htid"], set()).add(str(row["page"]))

print("N. scifi novels:", len(htids_pages))

# 2 - Create dictionary with keywords, htids and contexts

Point this bit in the direction of the novel you want to study.

spaCy is happier to tag each page than the whole book.

In [0]:
# Load the English pipeline with the NER and parser components disabled.
try:
    nlp = spacy.load('en', disable=["ner", "parser"])
except OSError:
    # The 'en' shortcut link only exists on spaCy v2 installs; spaCy v3 removed
    # shortcut links, so fall back to the full package name.
    # NOTE(review): assumes 'en' pointed at en_core_web_sm - confirm.
    nlp = spacy.load('en_core_web_sm', disable=["ner", "parser"])

In [0]:
# Build kwic_dict[keyword][htid][bucket][lemma] -> count, where bucket is one of
# "NOUNS", "ADJS", "VERBS" and lemma is a word found in the context window
# around an occurrence of the keyword.

# Directory holding one sub-directory of page files per volume.
WORKSET_DIR = "/media/secure_volume/workset"
# spaCy coarse POS tags that are counted, mapped to their bucket name.
POS_BUCKETS = {"NOUN": "NOUNS", "ADJ": "ADJS", "VERB": "VERBS"}

# Create every keyword/htid entry up front so volumes without any match still
# appear in the result with empty buckets.
kwic_dict = {
    keyword: {
        htid: {"NOUNS": dict(), "ADJS": dict(), "VERBS": dict()}
        for htid in htids_pages
    }
    for keyword in urbanterms
}

start = time.perf_counter()
for htid, pages in htids_pages.items():
    volume_dir = f"{WORKSET_DIR}/{encode_volid(htid)}"

    for page_n in pages:
        page = find_page(page_n)
        # Read as UTF-8, like the other input files in this notebook.
        with open(f"{volume_dir}/{page}.txt", encoding="utf-8") as infile:
            page_text = infile.read()

        # Tagging does not depend on the keyword, so each page is cleaned and
        # tagged once and the result is reused for every keyword.
        clean = cleanText(page_text)  # cleaning
        doc = nlp(clean)  # tokenization
        # Skip the first 5 and the last 4 tokens of the page.
        # NOTE(review): presumably meant to drop running headers/footers - confirm.
        last_token = len(doc) - 5 + 1
        # Compare the token's text with the stoplist: a spaCy Token object is
        # never equal to a string, so testing the Token itself filters nothing.
        tags = [
            spacyTags(token)
            for token in doc[5:last_token]
            if token.text not in stoplist
        ]

        # Iterate over kwic_dict (unique keywords) rather than urbanterms so a
        # term listed twice in the CSV is not counted twice.
        for keyword, per_volume in kwic_dict.items():
            buckets = per_volume[htid]
            for lemma, pos in context_extraction(tags, keyword):
                bucket_name = POS_BUCKETS.get(pos)
                if bucket_name is None:
                    continue  # other parts of speech are not counted
                bucket = buckets[bucket_name]
                bucket[lemma] = bucket.get(lemma, 0) + 1

# Total wall-clock time for the whole pass, in seconds.
elapsed = time.perf_counter() - start
print(elapsed)

# 3 - Create csv file to store the results

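This section writes the keyword-in-context counts to a neat, easy-to-read CSV file (`kwicdata.csv`) with the columns keyword, htid, POS, word and count, which can then be loaded as a dataframe with pandas.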

In [0]:
# Write one CSV row per (keyword, htid, POS bucket, word) with its count.
# `bigDir` is not defined in any cell visible in this notebook; use it when it
# exists and otherwise write into the current working directory.
output_dir = globals().get("bigDir", ".")
with open(os.path.join(output_dir, "kwicdata.csv"), 'w', encoding='utf-8', newline='') as kwicdata:
    writer = csv.writer(kwicdata)
    writer.writerow(("keyword", "htid", "POS", "word", "count"))
    for keyword, htids in kwic_dict.items():
        # Iterate the per-keyword mapping (`htids`), not the loop variable `htid`.
        for htid, POSS in htids.items():
            for POS, words in POSS.items():
                for word, count in words.items():
                    # Include the keyword so each row matches the 5-column header.
                    writer.writerow((keyword, htid, POS, word, count))
