In [149]:
import re

import pandas as pd
import xmltodict
from convertbng.util import convert_bng, convert_lonlat

# Read the whole XML export and parse it into nested dict/list structures;
# later cells navigate `doc` by key (e.g. doc['adlibXML']).
with open('input.xml') as fd:
    doc = xmltodict.parse(fd.read())

In [150]:
def parse_repro(repro_ref):
    """Return the bare file names of one or more reproduction references.

    Parameters
    ----------
    repro_ref : str, iterable of (str or None), or None
        A single backslash-separated path, a collection of such paths,
        or None when the record carries no reference at all.

    Returns
    -------
    list of str
        For every non-None reference, the text after its last backslash
        (the whole string when it contains no backslash). An empty list
        when `repro_ref` is None.
    """
    if repro_ref is None:
        # A missing/empty element previously crashed with TypeError.
        return []
    if isinstance(repro_ref, str):
        # A lone reference arrives as a plain string, not a one-item list.
        repro_ref = [repro_ref]

    return [ref.rsplit('\\', 1)[-1] for ref in repro_ref if ref is not None]

In [151]:
def parse_stratigraphy_type(strat_t):
    """Flatten a stratigraphy-type node into a comma-separated label string.

    Parameters
    ----------
    strat_t : None, str, dict, or iterable of those
        A dict is expected to hold its label at ``strat_t['text']['#text']``.
        An iterable is processed element by element.

    Returns
    -------
    str or None
        None for None input; the label itself for a str or dict; for an
        iterable, the distinct labels joined with ", " in first-seen order
        (None entries are skipped).
    """
    if strat_t is None:
        return None
    if isinstance(strat_t, str):
        # Iterating a string yields one-character strings, so without this
        # guard a bare string would recurse without end.
        return strat_t
    if isinstance(strat_t, dict):
        return strat_t['text']['#text']

    labels = (parse_stratigraphy_type(entry) for entry in strat_t)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is stable across runs (a set's iteration order is not).
    unique_labels = dict.fromkeys(label for label in labels if label is not None)
    return ", ".join(unique_labels)

In [152]:
def parse_list(item):
    """Collapse a multi-valued field into a single "; "-separated string.

    Parameters
    ----------
    item : object
        A string, an iterable of strings, or anything else.

    Returns
    -------
    object
        `item` unchanged when it is a string; the elements joined with
        "; " when it is an iterable of strings; otherwise `item` itself
        (this covers None and iterables holding non-string entries,
        for which ``str.join`` raises TypeError).
    """
    if isinstance(item, str):
        return item
    try:
        return "; ".join(item)
    except TypeError:
        # Not joinable (None, numbers, mixed content): hand it back as-is.
        # Only TypeError is absorbed; there is deliberately no `finally:
        # return`, which would silently discard every other exception.
        return item

In [153]:
# 5x5 grid letters, missing I
alphabet = 'ABCDEFGHJKLMNOPQRSTUVWXYZ'


def _digits_to_offset(digits):
    """Scale one half of a reference's numeric part to a float offset.

    The digits are right-padded with zeros to five places; any digits
    beyond the fifth become the fractional part.
    """
    padded = '{:0<5}'.format(digits)
    return float('{}.{}'.format(padded[:5], padded[5:]))


def grid2xy(false_easting, false_northing, gridsizes, grid_ref):
    '''Convert a lettered grid reference to (easting, northing) coordinates.

    Parameters
    ----------
    false_easting, false_northing : number
        Offsets subtracted from the easting and northing.
    gridsizes : sequence of number
        Square size for each leading letter, outermost first; its length
        is the number of letters expected at the start of `grid_ref`.
    grid_ref : str
        Letters followed by an even number of digits (first half easting,
        second half northing). Whitespace is ignored and letters may be
        in either case.

    Returns
    -------
    tuple of float
        ``(easting, northing)``.

    Raises
    ------
    TypeError
        If `grid_ref` is not a string.
    ValueError
        If a letter is not one of the 25 grid letters, or the numeric part
        is not an even-length run of digits.
    '''
    if not isinstance(grid_ref, str):
        raise TypeError('grid_ref must be a string, got {!r}'.format(grid_ref))

    # References are often written "SO 44 89"; left in place, the blanks
    # shift the digit split and float() quietly accepts the wrong number.
    ref = ''.join(grid_ref.split()).upper()
    n_letters = len(gridsizes)
    letters, digits = ref[:n_letters], ref[n_letters:]

    if len(letters) < n_letters:
        raise ValueError('grid reference {!r} needs {} leading letter(s)'
                         .format(grid_ref, n_letters))
    if len(digits) % 2 or not all(ch in '0123456789' for ch in digits):
        # An odd count cannot be split into equal easting/northing halves.
        raise ValueError('grid reference {!r} must end in an even number of digits'
                         .format(grid_ref))

    # false easting and northing
    easting = -false_easting
    northing = -false_northing

    # convert letter(s) to easting and northing offset
    for letter, size in zip(letters, gridsizes):
        idx = alphabet.find(letter)
        if idx < 0:
            raise ValueError('invalid grid letter {!r} in {!r}'.format(letter, grid_ref))
        row_from_top, col = divmod(idx, 5)
        easting += col * size
        # Letters run left-to-right from the top row, northings count from the bottom.
        northing += (4 - row_from_top) * size

    # numeric components of grid reference
    half = len(digits) // 2
    easting += _digits_to_offset(digits[:half])
    northing += _digits_to_offset(digits[half:])

    return easting, northing

# Two grid letters (no I) followed by an even number of digits.
_BRITISH_REF = re.compile(r'[A-HJ-Z]{2}(?:[0-9]{2})*')


def british2xy(grid_ref):
    """Convert a two-letter grid reference to (easting, northing), or None.

    Parameters
    ----------
    grid_ref : object
        Normally a string such as 'SO4489'; whitespace inside it is ignored.

    Returns
    -------
    tuple of float or None
        The result of `grid2xy` with a 1000000/500000 false origin and
        500000/100000 square sizes. None when `grid_ref` is not a string,
        contains no 'S', or is not two grid letters followed by an even
        number of digits (placeholders such as 'unknown', free text).
    """
    if not isinstance(grid_ref, str):
        # e.g. None from an empty XML element; `'S' in None` would raise.
        return None

    ref = ''.join(grid_ref.split())
    # NOTE(review): the 'S' test is kept from the original so existing results
    # do not change, but it also rejects well-formed references with no 'S'
    # in either letter - confirm whether that restriction is intended.
    if 'S' not in ref:
        return None
    if _BRITISH_REF.fullmatch(ref) is None:
        # Text that merely contains an 'S' used to reach grid2xy and crash.
        return None

    false_easting = 1000000
    false_northing = 500000
    gridsizes = [500000, 100000]
    return grid2xy(false_easting, false_northing, gridsizes, ref)

def irish2xy(grid_ref):
    """Convert a single-letter grid reference via `grid2xy`.

    Uses a zero false origin and one letter level whose squares are
    100000 units wide; returns whatever `grid2xy` returns.
    """
    origin_e, origin_n = 0, 0
    square_sizes = [100000]
    return grid2xy(origin_e, origin_n, square_sizes, grid_ref)

In [154]:
adlib_records = doc['adlibXML']['recordList']['record']
# xmltodict returns a lone child element as a dict rather than a one-item
# list; wrap it so the loop below always iterates over records, not keys.
if isinstance(adlib_records, dict):
    adlib_records = [adlib_records]

results = []

for n in adlib_records:
    # Look the grid reference up once; it feeds both the raw column and coords.
    gridref = n.get('field_coll.gridref', 'unknown')
    # Records without any reproduction get an empty list instead of a KeyError.
    repro = n.get('reproduction.reference')
    results.append({
        'priref': n['priref'][0],
        'repro_ref': parse_repro(repro if repro is not None else []),
        'object_number': n['object_number'],
        'object_name': parse_list(n.get('object_name')),
        'os_gridref': gridref,
        'strat_unit': parse_list(n.get('stratigraphy.unit', [])),
        'strat_type': parse_stratigraphy_type(n.get('stratigraphy.type')),
        'taxonomy': parse_list(n.get('taxonomy.scientific_name')),
        'description': n['description'],
        'coll_name': parse_list(n.get('field_coll.name')),
        'coll_place': parse_list(n.get('field_coll.place')),
        'coords': british2xy(gridref)
    })

df = pd.DataFrame(results)

In [155]:
# Preview the first five parsed records.
df.head(n=5)

Unnamed: 0,coll_name,coll_place,coords,description,object_name,object_number,os_gridref,priref,repro_ref,strat_type,strat_unit,taxonomy
0,unknown,Marshwood; Marshbrook; Shropshire,"(344000.0, 289000.0)",Fossilised shell remains of the Ordovician bra...,Brachiopod,G.00055.001,SO4489,51824,[G.00055.001.jpg],"Chronostratigraphy, Lithostratigraphy",Ordovician; Caradoc Series; Marshbrookian Stag...,Dalmanella unguis
1,unknown,Marshwood; Marshbrook; Shropshire,"(344000.0, 289000.0)",Fossilised shell remains of the Ordovician bra...,Brachiopod,G.00055.002,SO4489,655139,[G.00055.002.jpg],"Chronostratigraphy, Lithostratigraphy",Ordovician; Caradoc Series; Marshbrookian Stag...,Dalmanella unguis
2,unknown,Marshbrook,"(344000.0, 289000.0)","Fossilised remains of Ordovician brachiopod, S...",Brachiopod,G.00093,SO4489,51902,[G.00093.jpg],Chronostratigraphy,Ordovician Period; Caradoc Series; Longvillian...,Strophomena grandis
3,unknown,Cheney Longville,"(341000.0, 285000.0)",Fossilised remains of single valve of Ordovici...,Brachiopod,G.00094,SO4185,51903,[G.00094.jpg],Chronostratigraphy,Ordovician Period; Caradoc Series; Aeronian St...,Kjaerina typa
4,unknown,Marshwood,,Fossilised remains of shell of the Ordovician ...,Brachiopod,G.00096.002,unknown,54696,[G.00096.002.jpg],"Chronostratigraphy, Lithostratigraphy",Ordovician Period; Caradoc Series; Cheney Long...,Dalmanella wattsi


In [156]:
# Each `coords` cell is an (easting, northing) tuple or missing. Expanding the
# cells explicitly always yields two float columns, even when no record has
# coordinates (where .apply(pd.Series) produces a frame with no columns), and
# avoids building one Series object per row.
coord_pairs = [
    pair if isinstance(pair, tuple) else (float('nan'), float('nan'))
    for pair in df['coords']
]
df[['easting', 'northing']] = pd.DataFrame(
    coord_pairs, index=df.index, columns=['easting', 'northing']
)

In [157]:
# Work on an independent frame holding only the rows that have an easting,
# so the conversion below never sees missing coordinates.
has_easting = df['easting'].notna()
copy = df.loc[has_easting, ['easting', 'northing']].copy()
copy.head(1)

Unnamed: 0,easting,northing
0,344000.0,289000.0


In [163]:
# convert_lonlat takes the eastings and northings as two parallel sequences
# and hands back the longitudes first, then the latitudes.
copy_e, copy_n = (list(copy[axis].values) for axis in ('easting', 'northing'))

lons, lats = convert_lonlat(copy_e, copy_n)
copy['long'] = lons
copy['lat'] = lats

In [169]:
print(type(copy))
# `copy` only has the rows with coordinates; reindexing against df's index
# leaves NaN in every row that was dropped before the conversion.
for column in ('lat', 'long'):
    df[column] = copy[column].reindex(df.index)

<class 'pandas.core.frame.DataFrame'>


In [170]:
# Display the frame, now carrying the easting/northing and lat/long columns.
df

Unnamed: 0,coll_name,coll_place,coords,description,object_name,object_number,os_gridref,priref,repro_ref,strat_type,strat_unit,taxonomy,easting,northing,lat,long
0,unknown,Marshwood; Marshbrook; Shropshire,"(344000.0, 289000.0)",Fossilised shell remains of the Ordovician bra...,Brachiopod,G.00055.001,SO4489,51824,[G.00055.001.jpg],"Chronostratigraphy, Lithostratigraphy",Ordovician; Caradoc Series; Marshbrookian Stag...,Dalmanella unguis,344000.0,289000.0,52.496032,-2.826285
1,unknown,Marshwood; Marshbrook; Shropshire,"(344000.0, 289000.0)",Fossilised shell remains of the Ordovician bra...,Brachiopod,G.00055.002,SO4489,655139,[G.00055.002.jpg],"Chronostratigraphy, Lithostratigraphy",Ordovician; Caradoc Series; Marshbrookian Stag...,Dalmanella unguis,344000.0,289000.0,52.496032,-2.826285
2,unknown,Marshbrook,"(344000.0, 289000.0)","Fossilised remains of Ordovician brachiopod, S...",Brachiopod,G.00093,SO4489,51902,[G.00093.jpg],Chronostratigraphy,Ordovician Period; Caradoc Series; Longvillian...,Strophomena grandis,344000.0,289000.0,52.496032,-2.826285
3,unknown,Cheney Longville,"(341000.0, 285000.0)",Fossilised remains of single valve of Ordovici...,Brachiopod,G.00094,SO4185,51903,[G.00094.jpg],Chronostratigraphy,Ordovician Period; Caradoc Series; Aeronian St...,Kjaerina typa,341000.0,285000.0,52.459759,-2.869758
4,unknown,Marshwood,,Fossilised remains of shell of the Ordovician ...,Brachiopod,G.00096.002,unknown,54696,[G.00096.002.jpg],"Chronostratigraphy, Lithostratigraphy",Ordovician Period; Caradoc Series; Cheney Long...,Dalmanella wattsi,,,,
5,unknown,Meadowtown,,Fossilised remains of the Ordovician graptolit...,Graptolite,G.00106,Unknown,51907,[G.00106.jpg],"Chronostratigraphy, Biostratigraphy",Ordovician,Diplograptus foliaceus,,,,
6,unknown,Unknown,,"Fossilised remains of the Silurian tabulate, c...",CORAL,G.00113,unknown,51917,"[G.00113a.jpg, G.00113b.jpg]","Chronostratigraphy, Lithostratigraphy",Silurian Period; Much Wenlock Series; Much Wen...,Heliolites interstinctus,,,,
7,unknown,Unknown,,Fossilised remains of the Silurian rugose cora...,CORAL,G.00118,Unknown,50807,"[G.00118a.jpg, G.00118b.jpg, G.00118c.jpg]","Chronostratigraphy, Lithostratigraphy",Silurian Period; Wenlock Series; Much Wenlock ...,Kodonophyllum truncatum,,,,
8,"Lloyd, Dr. Thomas",Unknown,,"Fossilised remains of Silurian coral, Ketophyl...",CORAL,G.00123,unknown,51925,"[G.00123c.jpg, G.00123a.jpg, G.00123b.jpg]","Chronostratigraphy, Lithostratigraphy",Silurian Period; Wenlock Series; Much Wenlock ...,Ketophyllum turbinatum,,,,
9,unknown,Unknown,,Fossilised shell remains of the Silurian brach...,Brachiopod,G.00133,unknown,51963,[G.00133.jpg],"Chronostratigraphy, Biostratigraphy",Silurian Period; Ludlow,Atrypa reticularis,,,,
