In [1]:
from bs4 import BeautifulSoup
import spacy
# check out spaCy documentation here: https://spacy.io/usage/linguistic-features

In [2]:
# Load the `en_core_web_sm` pipeline, then run it over a single sample sentence.
nlp = spacy.load('en_core_web_sm')
doc = nlp(
    "Mohammed, Sultan of Carizme, reigned in Sogdiana, when it was invaded (A.D. 1218) by Zingis and his Moguls."
)
# doc = nlp("Mohammed, Sultan of Carizme, reigned in Sogdiana, when it was invaded (A.D. 1218) by Zingis and his Mongols.")

In [3]:
# document level
# Pair each named entity's surface text with the label the pipeline assigned to it.
ents = [
    (entity.text, entity.label_)
    for entity in doc.ents
]
print(ents)


# for more on label conventions for en_core_web_sm, see: https://spacy.io/models/en#en_core_web_sm

[('Mohammed', 'PERSON'), ('Sultan', 'PERSON'), ('Carizme', 'PERSON'), ('Sogdiana', 'GPE'), ('A.D. 1218', 'DATE'), ('Zingis', 'PERSON'), ('Moguls', 'NORP')]


In [4]:
# Open the file in binary mode so BeautifulSoup works out the document's
# encoding itself, instead of Python decoding it with the platform's default
# text encoding (which differs between operating systems).
with open('gibbonfortm.xml', 'rb') as gibbon_fh:
  soup = BeautifulSoup(gibbon_fh, 'lxml')

FileNotFoundError: [Errno 2] No such file or directory: 'gibbonfortm.xml'

In [None]:

# Collect every <div> in the parsed document, then keep only the ones whose
# 'subtype' attribute is exactly 'chapter'.
div_list = soup.findAll('div')
chapter_list = [div for div in div_list if div.get('subtype') == 'chapter']


In [None]:
chapter_number = ### YOUR CODE HERE ###

# Select the chapter whose 'n' attribute matches chapter_number.
# The attribute value is compared as a string so chapter_number may be given
# as either 1 or '1'.
matching_chapters = [
  chapter for chapter in chapter_list
  if chapter.get('n') == str(chapter_number)
]

# Fail loudly here rather than leaving `text` undefined (or still holding a
# value from an earlier run) when no chapter carries the requested number.
if not matching_chapters:
  raise ValueError(f'No chapter with n={chapter_number!r} found in chapter_list')

# If several chapters share the same number, the last one wins.
text = matching_chapters[-1].get_text()

In [None]:
# Preview the first 500 characters of the selected chapter's text.
text[:500]

In [None]:
# Flatten the chapter onto a single line: every newline becomes one space.
full_text = ' '.join(text.split('\n'))

In [None]:
# Preview the first 500 characters after newlines were replaced with spaces.
full_text[:500]

In [None]:
# Create a new string called text_seg that only contains the first 10000 characters of the chapter you are working with

### YOUR CODE HERE ###


In [None]:
# Run an NLP analysis on the text segment, generate a list of the named entities in the text, and print out the list of named entities

### YOUR CODE HERE ###

In [None]:
# Populate the empty list below with only the named entities classed as 'GPE' or 'LOC'


places = []

### YOUR CODE HERE ###

In [None]:
# Display the list of place entities collected in the previous cell.
places

In [None]:
# Keep only the first element of each entry in `places`; repeated names are
# kept so they can be counted later.
place_names_duplicates = [entry[0] for entry in places]

print(len(place_names_duplicates))

In [None]:
# Display every extracted place name, repeats included.
place_names_duplicates

In [None]:
# Populate the empty list below with the identified place names but remove any duplicates so each name occurs only once

place_names = []

### YOUR CODE HERE ###

In [None]:
# For each name in place_names, count how often it occurs among all mentions.
# The result is index-aligned with place_names.
place_counts = [place_names_duplicates.count(place) for place in place_names]


In [None]:
# sanity check: the two lengths should be equal, because place_counts gets
# exactly one entry per name in place_names
print(len(place_names), len(place_counts))

In [None]:
# Map each place name to its number of mentions.
counts_dict = {place: count for place, count in zip(place_names, place_counts)}
# counts_dict


### Pause for discussion!

Why would we want to include the counts of mentions of places for mapping purposes?

In your groups, identify another feature you might want to include on the map. Sketch out a process for extracting that information and including it as an additional column in the .csv file you use to generate the map. This can be in general terms or using some pseudo-code. Be prepared to share!

In [None]:
import requests
import time

def search_peripleo(name, fuzzy=False, datasets=('pleiades'), from_date=-3000, to_date=2000, retry_attempts=10):
    """
    Query the Peripleo search API for a place name and return the parsed JSON response.

    API details: https://github.com/pelagios/peripleo/blob/main/README.md

    :param name: place name to search
    :param fuzzy: if True, '~' is appended to the query to request a fuzzy search
    :param datasets: which dataset(s) to search; sent as the 'datasets' query parameter.
        Note that the default ``('pleiades')`` is the plain string 'pleiades' --
        the parentheses alone do not make it a tuple.
    :param from_date: start date for search; sent as the 'from' query parameter
    :param to_date: end date for search; sent as the 'to' query parameter
    :param retry_attempts: how many additional requests to make after a failed one
    :return: the response body parsed from JSON
    :raises Exception: if the request still times out, or still returns a
        non-200 status code, once all retries have been used up
    """
    # Built once and reused for every attempt, so a retry repeats exactly the
    # same search (same fuzzy flag, datasets and date range).
    params = {
        'query': name,
        'types': 'place',
        'from': from_date,
        'to': to_date,
        'datasets': datasets
    }

    if fuzzy:
        params['query'] = params['query'] + '~'

    gz_url = 'http://peripleo.pelagios.org/peripleo/search'  # baseurl for peripleo search

    attempts_left = retry_attempts
    while True:
        # pause before every request to avoid dos'ing the Peripleo server
        time.sleep(0.3)

        try:
            # NOTE(review): with timeout=None requests waits indefinitely, so the
            # Timeout branch below will rarely if ever fire -- consider passing a
            # finite timeout here.
            response = requests.get(gz_url, params=params, timeout=None)
        except requests.exceptions.Timeout as e:
            # retry the API call on request timeout while attempts remain
            print(e)
            if attempts_left > 0:
                attempts_left -= 1
                continue
            raise Exception('Timeout after specified retries.')

        if response.status_code == 200:
            return response.json()

        # for any response where the status code is not 200 ('success') retry the API call
        if attempts_left > 0:
            attempts_left -= 1
            continue
        raise Exception('Status code: ' + str(response.status_code))

In [None]:
# match all place names against the gazetteer
peripleo_results = []
num_places = len(place_names)
prev_disp_percent = 0
for i, place in enumerate(place_names):
    print(f'matching {place} against peripleo (Pleiades) gazetteer...')
    # search has default time bounds and default dataset is Pleiades. look at the code for details
    try:
        peripleo_results.append({'token': place, 'results': search_peripleo(place, fuzzy=False)})
    except Exception as err:
        # One failed lookup should not abort the whole run: report which place
        # failed and why, then carry on with the next one.
        print(f'lookup failed for {place}: {err!r}')

    # Count this place as checked whether or not the lookup succeeded, and use
    # i + 1 so the figure reaches 100 after the last place.
    disp_percent = (i + 1) * 100 // num_places
    if disp_percent > prev_disp_percent:
        print('\n' + str(disp_percent) + '% of identified places have been checked against gazetteer...\n')
        prev_disp_percent = disp_percent

print('complete!')


In [None]:
# Number of places for which a gazetteer response was stored.
len(peripleo_results)

In [None]:
# Simple georesolution: for every place with at least one gazetteer hit, take
# the first hit as the match; places with no hits are dropped.
peripleo_places = [
    {'token': lookup['token'], 'place': lookup['results']['items'][0]}
    for lookup in peripleo_results
    if len(lookup['results']['items']) > 0
]
peripleo_places[:5]

In [None]:
import pandas as pd


# One list per output column; the loop below appends one value to each list
# for every matched place, so all six stay the same length.
tokens = []
names = []
longs = []
lats = []
identifiers = []
counts = []


for p in peripleo_places:
    token = p['token']
    place = p['place']

    # Look the count up outside the try block: a place without geo_bounds must
    # still get its own count, not an undefined one or the previous row's.
    count = counts_dict.get(token, 0)

    # average the minimum and maximum longitudes and latitudes
    try:
        bounds = place['geo_bounds']
        long = (bounds['max_lon'] + bounds['min_lon']) / 2
        lat = (bounds['max_lat'] + bounds['min_lat']) / 2
    except KeyError:
        # no usable bounding box in the gazetteer record: mark coordinates unknown
        long = '?'
        lat = '?'

    tokens.append(token)
    names.append(place['names'])
    longs.append(long)
    lats.append(lat)
    identifiers.append(place['identifier'])
    counts.append(count)
    

In [None]:
# Assemble the per-place lists into one table, one row per matched place.
place_df = pd.DataFrame({
    'token': tokens,
    'names': names,
    'latitude': lats,
    'longitude': longs,
    'identifier': identifiers,
    'counts': counts,
})

In [None]:
# Write the table of matched places to gibb_places.csv in the working directory.
place_df.to_csv('gibb_places.csv')

# Now bring this into Google Maps to see if it worked!

### Pause for discussion!

Why might some of the places you expect to appear on the map be missing?

In your groups, experiment with different gazetteers and/or different time frames in the function `search_peripleo()`.

Try mapping your new data. What changed? Why do you think you did or did not get different results? Share with the class!