In [1]:
import lxml.etree
import os
import re
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from collections import Counter
# One-time environment setup: uncomment the two lines below to install spaCy
# and the small English model if they are not available yet.
# !pip install spacy
#!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz
import spacy
# Load the small English spaCy pipeline once; later cells use the shared `nlp`
# object to tokenize the book texts.
nlp = spacy.load('en_core_web_sm')
from tqdm import tqdm


# Make the xml_files folder the working directory, so that relative file names
# used in later cells are resolved against it.
os.chdir('C:/Users/Dora Sperling/Desktop/Thesis/xml_files')

In [2]:
# Prefix -> namespace URI mapping handed to the find()/findall() calls below,
# so that XPath expressions can address TEI elements with the short "tei:" prefix.
NSMAP = {
    'tei': 'http://www.tei-c.org/ns/1.0',
}

def get_alltext_from_plist(plist) -> str:
    """Join the ``.text`` of every element in ``plist`` into one string.

    Elements whose ``.text`` is not a ``str`` (for example ``None``) are
    skipped. Only the ``.text`` attribute of each element is read; nothing
    else on the element is visited. The collected pieces are separated by a
    single space, and an empty ``plist`` yields ``""``.
    """
    return " ".join(
        node.text for node in plist if isinstance(node.text, str)
    )

def clean_text(
    string: str,
    punctuation: str = r'''!()[]{};:-'"‘’“”\,<>./?@#$%^&*_~''') -> str:
    """Strip punctuation, lower-case the text and normalize its whitespace.

    Parameters
    ----------
    string : str
        The text to clean.
    punctuation : str
        Characters to delete outright (nothing is inserted in their place, so
        "second-hand" becomes "secondhand").

    Returns
    -------
    str
        The lower-cased text with every ``punctuation`` character removed,
        em dashes and ellipsis characters turned into word breaks, runs of
        whitespace collapsed to one space, and no leading or trailing
        whitespace. An empty or whitespace-only input yields ``""``.
    """
    # Delete all punctuation characters in a single pass over the text.
    # Calling str.replace once per punctuation occurrence would rescan the
    # whole string each time, which is quadratic on book-length input.
    string = string.translate(str.maketrans('', '', punctuation))
    # Em dashes and ellipses sit between words, so they become spaces
    # instead of being deleted (which would glue the neighbours together).
    string = string.replace('—', ' ').replace('…', ' ')
    # split() with no argument drops leading/trailing whitespace and treats
    # any run of whitespace as one separator, so re-joining with a single
    # space fully normalizes the spacing.
    return ' '.join(string.lower().split())

In [4]:
def get_metadata_df(directory):
    '''
    Takes the directory containing the Pratchett full text files.
    Returns them as a dataframe with basic metadata and the full tokenized text.

    Every entry in ``directory`` whose name ends in ".xml" is parsed as a TEI
    document. Files are opened through their full path, so the result does not
    depend on the current working directory.

    Parameters
    ----------
    directory : str or path-like
        Folder that holds the XML files.

    Returns
    -------
    pandas.DataFrame
        One row per XML file, ordered by ``age_at_publication`` (compared as
        numbers) with a fresh 0..n-1 index, and these columns:

        - ``title``: text of the first ``tei:title`` under ``sourceDesc``
        - ``year_of_publication``: ``when`` attribute of the first ``tei:age``
          under ``particDesc`` (kept as the attribute string)
        - ``age_at_publication``: ``value`` attribute of that same element
          (kept as the attribute string)
        - ``YA``: True when the title is in the hard-coded list of books for
          younger readers
        - ``number_of_words``: number of whitespace-separated words in ``text``
        - ``text``: tokenized text after ``clean_text``
        - ``untokenized_text``: paragraph text as returned by
          ``get_alltext_from_plist``

    Raises
    ------
    AttributeError, KeyError
        If a file lacks the title element, the age element, or one of the age
        element's ``when``/``value`` attributes.
    '''
    # Titles are matched exactly; a set makes the membership test direct.
    ya_novels = {"Johnny and the Bomb", 'Only You Can Save Mankind', 'Johnny and the Dead',
                 'The Wee Free Men', 'A Hat Full of Sky', "Wintersmith", 'Nation', 'I Shall Wear Midnight',
                 "The Shepherd's Crown", 'The Amazing Maurice and His Educated Rodents'}
    # Fixed column order of the returned frame (also used when no file matches).
    columns = ['title', 'year_of_publication', 'age_at_publication', 'YA',
               'number_of_words', 'text', 'untokenized_text']
    records = []

    # The context manager releases the directory handle once the loop is done.
    with os.scandir(directory) as entries:
        for file in tqdm(entries):
            if not file.name.endswith(".xml"):
                continue

            # file.path already includes `directory`; file.name alone would
            # only resolve if the working directory happened to be `directory`.
            root = lxml.etree.parse(file.path).getroot()

            # Get the book's title
            title = root.find("./tei:teiHeader/tei:fileDesc/tei:sourceDesc//tei:title",
                              namespaces=NSMAP).text

            # The same <age> element carries both the publication year ('when')
            # and the author's age at publication ('value'); look it up once.
            age_element = root.find("./tei:teiHeader/tei:profileDesc/tei:particDesc//tei:age",
                                    namespaces=NSMAP)
            year_of_publication = age_element.attrib['when']
            age_at_publication = age_element.attrib['value']

            # Get a list of all the paragraph elements and join their text
            plist = root.findall("./tei:text//tei:p", namespaces=NSMAP)
            untokenized_text = get_alltext_from_plist(plist)

            # Only the token texts are needed, so run the tokenizer alone
            # instead of the remaining pipeline components.
            doc = nlp.make_doc(untokenized_text)
            tokenized_text = ' '.join(token.text for token in doc)

            # Get rid of all punctuation
            cleantext = clean_text(tokenized_text)

            records.append({
                'title': title,
                'year_of_publication': year_of_publication,
                'age_at_publication': age_at_publication,
                # Whether the book is in the list of books for younger readers
                'YA': title in ya_novels,
                # split() without a separator counts 0 words for an empty text
                # (split(" ") would report 1 because it returns ['']).
                'number_of_words': len(cleantext.split()),
                'text': cleantext,
                'untokenized_text': untokenized_text,
            })

    df = pd.DataFrame(records, columns=columns)

    # Do not include a default index column, sort by Terry Pratchett's age in
    # ascending order. The ages are attribute strings, so they are converted to
    # numbers for the comparison only; the stored values stay unchanged.
    df = df.sort_values(by='age_at_publication', key=pd.to_numeric).reset_index(drop=True)
    return df

In [5]:
# Build the metadata table from the XML folder, report its (rows, columns)
# shape, then display the table itself as the cell output.
metadata_df = get_metadata_df(directory='C:/Users/Dora Sperling/Desktop/Thesis/xml_files')
print(metadata_df.shape)
metadata_df

47it [05:50,  7.45s/it]

(45, 7)





Unnamed: 0,title,year_of_publication,age_at_publication,YA,number_of_words,text,untokenized_text
0,The Color Of Magic,1983,35.58,False,66492,in a distant and secondhand set of dimensions ...,"In A distant and secondhand set of dimensions,..."
1,The Light Fantastic,1986,38.1,False,45998,the sun rose slowly as if it was nt sure it wa...,"The sun rose slowly, as if it wasn’t sure it w..."
2,Equal Rites,1987,38.72,False,67372,this is a story about magic and where it goes ...,This is a story about magic and where it goes ...
3,Mort,1987,39.51,False,73987,this is the bright candlelit room where the li...,This is the bright candlelit room where the li...
4,Sourcery,1988,40.08,False,79536,there was a man and he had eight sons apart fr...,There was a man and he had eight sons. Apart f...
5,Wyrd Sisters,1988,40.54,False,86320,the wind howled lightning stabbed at the earth...,The wind howled. Lightning stabbed at the eart...
6,Pyramids,1989,41.13,False,88272,nothing but stars scattered across the blackne...,"Nothing but stars, scattered across the blackn..."
7,Guards! Guards!,1989,41.53,False,99811,this is where the dragons went they lie not de...,This is where the dragons went. They lie... No...
8,Eric,1990,42.3,False,34887,the bees of death are big and black they buzz ...,"The bees of Death are big and black, they buzz..."
9,Moving Pictures,1990,42.51,False,98721,watch this is space it s sometimes called the ...,Watch... This is space. It’s sometimes called ...


In [6]:
# Keep every row whose title is not exactly 'The World of Poo', then show the
# resulting (rows, columns) shape.
# NOTE(review): the recorded shape is (45, 7) both before and after this
# filter, so no row seems to match this exact title string -- confirm the
# spelling/capitalisation used in the XML header.
metadata_df = metadata_df.loc[metadata_df['title'] != 'The World of Poo']
metadata_df.shape

(45, 7)

In [7]:
# Switch to the thesis folder and write the table there as a UTF-8 CSV file,
# leaving out the DataFrame index column.
os.chdir('C:/Users/Dora Sperling/Desktop/Thesis')
metadata_df.to_csv(
    'pratchett_metadata.csv',
    index=False,
    encoding='utf8',
)