In [2]:
import pandas as pd
import nltk
import spacy
import locationtagger
import json
import os
import re

# essential entity models downloads
nltk.downloader.download('maxent_ne_chunker')
nltk.downloader.download('words')
nltk.downloader.download('treebank')
nltk.downloader.download('maxent_treebank_pos_tagger')
nltk.downloader.download('punkt')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\hanna\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\hanna\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\hanna\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     C:\Users\hanna\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hanna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hanna\AppData\Roaming\nltk_data...
[nltk_data]   Package avera

True

In [3]:
df = pd.DataFrame([], columns=['book_id', 'hierarchy', 'language', 'nr_pages', 'word_count'])

path = 'data/transcripts/no_doubles/'

for filename in os.listdir(path):
    book = json.load(open(path + filename))
    data = {'book_id': filename.replace('.json', ''),
            'hierarchy': book['hierarchy'],
            'language': book['language'],
            'nr_pages': len(book['pages'])}
    
    
    # get the word count per book:
    data['word_count'] = 0
    for page in book['pages']:
        # if there are multiple transcriptions, we don't know which one to use
        if len(page) > 1:
            print('MULTIPLE TRANSCRIPTIONS FOUND: ' + filename)
            break
        else:
            p = page[0]
            for sentence in p:
                # remove all non-alphabetic characters
                data['word_count'] += len(re.sub('[^a-zA-Z ]+', '', sentence).split())

    data=pd.DataFrame(data, index=[0])
    df = pd.concat([df, data], axis=0, ignore_index=True)

In [8]:
meta_df = pd.read_csv('data/travel_df.csv', index_col =False)

# John Bell example exploration

In [55]:
# Getting all the texts by John Bell in a dataframe

john_bell_df = meta_df[meta_df['author'] == 'Bell, John']

path = 'data/transcripts/no_doubles/'

#dictionary containing the bookid and a list of text 

john_bell_texts = {}


for bookid in john_bell_df.book_id:
    filename = path + str(bookid) + '.json'

    book = json.load(open(filename))

    text = []

    for page in book['pages']:
        # if there are multiple transcriptions, we don't know which one to use
        if len(page) > 1:
            print('MULTIPLE TRANSCRIPTIONS FOUND: ' + filename)
            break
        else:
            if(page != ''):
                text += page

    john_bell_texts[bookid] = {'text': text}

In [63]:
# building a dictionary mapping page numbers to mentions of cities


for id in john_bell_texts: 
    
    for i in range(len(john_bell_texts[id]['text'])):
        # print(john_bell_texts[id])
        text = john_bell_texts[id]['text'][i]

        if(text != ''):
            # find locations per page

            place_entity = locationtagger.find_locations(text = text)

            #add cities to dictionary

            john_bell_texts[id]['cities'][i] = place_entity.cities


In [65]:
john_bell_texts[856820]['cities']

dict_values([[], [], [], ['Edinburgh', 'London', 'Bell', 'Royal', 'Strand'], [], ['Bell'], [], ['Rome', 'Roman', 'Paris', 'Italy'], ['Italy'], ['Florence', 'England'], ['Florence', 'Rome', 'Canova', 'German'], [], [], [], ['Paris', 'Fontainebleau', 'Italy'], ['Lyons', 'Paris', 'Macon', 'Fontainebleau', 'Saone'], ['Lyons', 'Macon', 'Saone'], ['Lyons', 'Savoy', 'Autun', 'Mont', 'Saone'], ['Lyons', 'Roman', 'Mont', 'Saone'], ['Lyons', 'Roman', 'Paris', 'Forum', 'Philip', 'Caracal'], ['Lyons', 'Leglise', 'Saone'], ['Lyons', 'Paris', 'Temple', 'Notre Dame'], ['Manchester', 'Lyons', 'Bridge', 'Ville'], ['Lyons', 'Clovis'], ['Lyons'], ['Lyons', 'Pierre', 'Pont', 'Saone', 'Nine'], ['Lyons', 'Roman', 'Le Pont'], ['Lyons', 'Morrow', 'Ed', 'Field'], ['Lyons'], ['Lyons', 'Bron', 'Dauphin', 'Belair'], ['Lyons'], ['Lyons', 'Roman', 'Romans'], ['Lyons', 'Leglise', 'German'], ['Lyons', 'Leglise'], ['Four', 'English'], [], ['Lyons', 'Roman', 'Eastern', 'Pont'], ['Lyons', 'Paris', 'Savoy', 'Mont', 'Saon