# Preliminary checks on the json file

In [1]:
# Location of the Geography JSON export, relative to this notebook's folder
filepath = "../data/GeographyOmega.json"

test

## Check format of json in case of big files 

In [2]:
%%bash
# Peek at the first 10 lines of the json file to see how it is laid out

head -n 10 ../data/GeographyOmega.json

[
    {
        "book_ID": 5,
        "chaps": [
            {
                "chap_ID": 1,
                "secs": [
                    {
                        "sec_ID": 1,
                        "textstring": "",


In [3]:
%%bash
# Print only the lines that start with exactly N spaces followed by a double
# quote, i.e. the keys sitting at one indentation level of the json file.
# A quick way to see the structure of a big file (N = 8 below).

grep -E '^ {8}"' ../data/GeographyOmega.json
# Variants for deeper indentation levels (N = 24 and N = 32):
#grep -E '^ {24}"' ../data/GeographyOmega.json
#grep -E '^ {32}"' ../data/GeographyOmega.json

        "book_ID": 5,
        "chaps": [
        "prev_book_ID": 4


## Import json file, access chapters, sections, and tops or tops_table

In [4]:
import json
from pprint import pprint

# JSON text is UTF-8 by specification and this file contains non-ASCII place
# names, so name the encoding explicitly instead of relying on the platform
# default (which is not UTF-8 everywhere).
with open(filepath, encoding='utf-8') as data_file:
    data = json.load(data_file)

# Sanity check: first book -> first chapter -> section at index 2 ->
# entry at index 1 of its "tops_table"
pprint(data[0]["chaps"][0]["secs"][2]["tops_table"][1])

{'ID': '5.01.02.02',
 'identical_places': [None],
 'lat_dec': 43.08,
 'lat_deg': 43,
 'lat_hemisphere': 'N',
 'lat_min': 0.08,
 'loc_lat': 40.984,
 'loc_long': 29.026,
 'loc_name': 'Kadıköy',
 'long_dec': 56.08,
 'long_deg': 56,
 'long_min': 0.08,
 'prev_sec_ID': '5.01.02.01',
 'top_type': '413 - Polis, an der Kueste gelegen',
 'toponym': '',
 'toponym_ge': 'Chalkedon*'}


# Better accessible data with pandas framework

In [5]:
# For data handling use pandas
import pandas as pd
from pandas.io.json import json_normalize

# For mathematical issues use numpy
import numpy as np

# Provides an interface to leaflet.js for drawing interactive maps
import folium
from folium import plugins
from folium.map import *

# Note: The feature group layer control needs the development version 0.3.0 of folium
# Install by cloning into a folder:  
#      git clone https://github.com/python-visualization/folium.git
# change into the new folder folium and then 
#      pip3 install -e .
# also install new dependency
#      pip3 install branca
# Requires restarting the jupyter kernel
# Check version by
#      print(folium.__version__)
# Output should be 0.3.0.dev

In [6]:
# Open the json file with pandas and flatten the places of the first chapter
# of the first book into one dataframe (one row per place).

# JSON is UTF-8 by specification and the data contains non-ASCII place names,
# so do not rely on the platform default encoding.
with open(filepath, encoding='utf-8') as data_file:
    res = pd.read_json(data_file)

# Sections of the first chapter of the first book
secs = res["chaps"][0][0]["secs"]

# A section stores its places either under 'tops' or under 'tops_table'.
# Collect one frame per section and concatenate once at the end:
# DataFrame.append inside a loop copies the whole frame on every iteration
# and no longer exists in pandas >= 2.0.
frames = []
for i, sec in enumerate(secs):
    key = 'tops' if 'tops' in sec else 'tops_table'
    res_tmp = json_normalize(sec[key])
    res_tmp['sec'] = i + 1  # 1-based section number
    frames.append(res_tmp)
result = pd.concat(frames) if frames else pd.DataFrame()

# rebuild dataframe with less information
result = result[['sec','loc_name','loc_lat','loc_long','toponym_ge','lat_dec','long_dec']].reset_index(drop=True)

# Treat empty strings as missing values. A literal replacement is used on
# purpose: pandas does not apply an empty pattern as a regex anyway, so this
# is what the former replace(r'', np.nan, regex=True) effectively did.
result = result.replace('', np.nan)

# for dataframe including more information try the following
#result["loc_lat"] = result["loc_lat"].replace('',None)
#result["loc_long"] = result["loc_long"].replace('',None)

# Drop every row that has a missing value in ANY of the kept columns
# (names as well as coordinates), then renumber the rows from 0.
res = result.dropna().reset_index(drop=True)

# Comment out the subtraction below to see the original coordinates.
# NOTE: Meridian was different for Ptolemy (Ferro-Meridian), chosen correction
# is 27° 44′ 00″ = 27.73 in decimal degrees.
FERRO_MERIDIAN_OFFSET = 27.73
res['long_dec'] = res['long_dec'] - FERRO_MERIDIAN_OFFSET

# NOTE2: There seems to be a systematic error in latitude as well?!
# Subtracting the mean difference would be one option:
#res['lat_dec'] = res['lat_dec'].apply(lambda s: s)

`chaps` is a nested list, therefore
```python
res["chaps"][0][j]
```
gives the (j+1)-th chapter (the index `j` is zero-based).

Every chapter contains a list of sections; the first chapter, for example, has
```python
len(res["chaps"][0][0]["secs"])
```
sections, which can be accessed by their index.

Since some sections contain `tops` and others `tops_table`, the key has to be switched accordingly by

```python
if 'tops' in ...
```

A new column with the section number is added to the dataframe by
```python
res_tmp['sec'] = i + 1
```

In [7]:
# Average offset (historical minus verified) per coordinate axis
lat_dif = res['lat_dec'].sub(res['loc_lat']).mean()
long_dif = res['long_dec'].sub(res['loc_long']).mean()
print('Mean difference in latitude:', lat_dif, ' and longitude:', long_dif)

Mean difference in latitude: 1.82242826087  and longitude: -0.0552804347826


In [8]:
# Build an interactive map with four kinds of layers:
#   * clustered markers at the verified coordinates,
#   * clustered markers at the historical coordinates,
#   * one polyline per section connecting its historical positions,
#   * one group of displacement lines (verified -> historical) per section.

# Define globally opacity and weight for lines
kw = dict(opacity=0.4, weight=2)
kwb = dict(opacity=0.4, weight=3)

# Draw map centered on the verified coordinates of the row with index 5 and zoomed.
# Possible tiles include Openstreet Map (default), Stamen Terrain, Stamen Toner,
# Stamen Watercolor among others.
toponym_map = folium.Map(location=[res["loc_lat"][5], res["loc_long"][5]], zoom_start=8)
# The label shown in the layer control now matches the tiles that are loaded
# (it used to read 'Stamen Toner' while loading 'Stamen Terrain').
toponym_map.add_tile_layer(name='Stamen Terrain', tiles='Stamen Terrain')

# Feature Groups allow in connection with LayerControl to choose, which features should be visible on map

# Clustered markers FOR VERIFIED COORDINATES.
# Iterate over ALL rows: the former range(len(res)-1) dropped the last place.
marker = FeatureGroup(name='Verified coordinates')
marker_cluster = folium.MarkerCluster().add_to(marker)
for lat, lon, sec, name in zip(res["loc_lat"], res["loc_long"], res["sec"], res["toponym_ge"]):
    folium.Marker([lat, lon],
                  popup='Sec:' + str(sec) + ' ' + name,
                  icon=folium.Icon(icon='ok')).add_to(marker_cluster)

# Clustered markers FOR HISTORICAL COORDINATES (same rows as above)
markerHist = FeatureGroup(name='Historical coordinates')
marker_cluster_hist = folium.MarkerCluster().add_to(markerHist)
for lat, lon, sec, name in zip(res["lat_dec"], res["long_dec"], res["sec"], res["toponym_ge"]):
    folium.Marker([lat, lon],
                  popup='Sec:' + str(sec) + ' ' + name + ': Historical',
                  icon=folium.Icon(icon='pencil', color='red')).add_to(marker_cluster_hist)

#TODO: Find way to redraw map on historical coordinates, maybe by change of tiles?!

# Sorted list of the section numbers present in the data
secList = sorted(set(res['sec'].tolist()))

# One colour per section; the palette is reused cyclically so that more than
# five sections no longer end in a KeyError.
palette = ('red', 'green', 'blue', 'orange', 'black')
colorDict = {sec: palette[k % len(palette)] for k, sec in enumerate(secList)}

# Connect all historical locations of one section; one feature group per section
for sec in secList:
    sec_rows = res.loc[res['sec'] == sec]
    hist = list(zip(sec_rows['lat_dec'], sec_rows['long_dec']))
    # consecutive pairs of points form the segments of the section's line
    segments = [[hist[k], hist[k + 1]] for k in range(len(hist) - 1)]
    # Not called `result` any more: that name belongs to the dataframe built
    # in the loading cell and must not be overwritten here.
    section_line = folium.MultiPolyLine(locations=segments, color=colorDict[sec], **kwb)
    feature = FeatureGroup(name='Connect places in Sec: ' + str(sec))
    feature.add_children(section_line)
    toponym_map.add_children(feature)

# Feature groups for the displacement vectors between verified and historical coordinates
for sec in secList:
    sec_rows = res.loc[res['sec'] == sec]
    veri = zip(sec_rows['loc_lat'], sec_rows['loc_long'])
    hist = zip(sec_rows['lat_dec'], sec_rows['long_dec'])
    displacement_feature = FeatureGroup(name='Difference vectors in Sec: ' + str(sec))
    for verified_pos, historical_pos in zip(veri, hist):
        displacement_feature.add_children(
            folium.PolyLine(locations=[verified_pos, historical_pos], color=colorDict[sec], **kw))
    toponym_map.add_children(displacement_feature)


toponym_map.add_children(marker)
toponym_map.add_children(markerHist)

toponym_map.add_children(folium.map.LayerControl())

# save map as interactive html file
toponym_map.save('toponyme.html')
#draw map in jupyter
toponym_map

## Testing bokeh for plotting arrows

no maps added yet

In [10]:
import numpy as np
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook
# Scatter plot of verified (red circles) and historical (black triangles)
# positions, each pair joined by a line. No map tiles are drawn.
output_notebook()

# Use every row of the dataframe: the former len(res)-1 left out the last place.
N = len(res)

# dtype=float gives plain numeric arrays even if pandas stores a column as object
lat = np.asarray(res["loc_lat"], dtype=float)
long = np.asarray(res["loc_long"], dtype=float)

latHis = np.asarray(res["lat_dec"], dtype=float)
longHis = np.asarray(res["long_dec"], dtype=float)

radii = 0.03   # circle radius, given to p.circle(radius=...)
sizes = 10     # triangle size, given to p.triangle(size=...)

# Position dependent hex colours; computed but not used by the glyphs below
colors = ["#%02x%02x%02x" % (r,g,150) for r, g in zip(np.floor(50+2*lat).astype(int),np.floor(30+2*long).astype(int))]

p = figure(title="Test", x_axis_label='long', y_axis_label='lat')
p.circle(long, lat, radius=radii, fill_color='#FF0000', fill_alpha=0.6, line_color=None)
p.triangle(longHis, latHis, size=sizes, fill_color='#000000', fill_alpha=0.6, line_color=None)


# One line per place from its verified to its historical position
for i in range(N):
    p.line(x=[long[i], longHis[i]], y=[lat[i], latHis[i]], line_alpha=0.4)

show(p)