In [13]:
import xmltodict
import pandas as pd
from convertbng.util import convert_bng, convert_lonlat

In [14]:
""""
Parses filepath (may be string or list) to get the image filename(s).
Returns list of filenames. 
""""
def parse_repro(repro_ref):
    """Extract image filename(s) from one or more reproduction references.

    Each reference is treated as a backslash-separated path and only the part
    after the last backslash is kept.

    Args:
        repro_ref: A single path string, an iterable of path strings
            (``None`` entries are skipped), or ``None``.

    Returns:
        list of str: One filename per non-``None`` reference; an empty list
        when ``repro_ref`` itself is ``None``.
    """
    # A missing reference (None) means "no images"; without this guard the
    # comprehension below would fail trying to iterate over None.
    if repro_ref is None:
        return []
    if isinstance(repro_ref, str):
        repro_ref = [repro_ref]

    return [ref.rsplit("\\", 1)[-1] for ref in repro_ref if ref is not None]

In [15]:
""""
Gets stratigraphy type, de-dupes and returns as comma delimited string. 
Param strat_t might be dict or list of dicts.
""""
def parse_stratigraphy_type(strat_t):
    """Return the stratigraphy type(s) as a de-duplicated, comma-delimited string.

    Args:
        strat_t: ``None``, a plain string, a dict carrying the label at
            ``['text']['#text']``, or a list of such values.

    Returns:
        ``None`` when ``strat_t`` is ``None``; the label itself for a string
        or a single dict; otherwise the distinct labels joined with ``", "``
        in the order they first appear.
    """
    if strat_t is None:
        return None
    if isinstance(strat_t, str):
        # Iterating a string would recurse on its characters forever.
        return strat_t
    if isinstance(strat_t, dict):
        return strat_t['text']['#text']

    # De-duplicate while keeping first-seen order so the output is stable
    # between runs (set ordering of strings is not), and skip None entries,
    # which str.join cannot handle.
    unique_labels = []
    for entry in strat_t:
        label = parse_stratigraphy_type(entry)
        if label is not None and label not in unique_labels:
            unique_labels.append(label)
    return ", ".join(unique_labels)

In [16]:
""""
Generic list/str concatenation. 
Returns semicolon-delimited str. 
""""
def parse_list(item):
    """Join a list of strings into one semicolon-delimited string.

    Args:
        item: A string, an iterable of strings, or anything else.

    Returns:
        ``item`` unchanged when it is already a string; the elements joined
        with ``"; "`` when it is an iterable of strings; otherwise ``item``
        unchanged (e.g. ``None``, or a list containing non-string entries).
    """
    if isinstance(item, str):
        return item
    try:
        return "; ".join(item)
    except TypeError:
        # Not iterable, or contains non-string elements: pass it through.
        # No `finally: return` here, as that would silently swallow any
        # other exception raised while joining.
        return item

In [92]:
"""
Conversion of OS BNG references to northing + easting
Adapted from: https://snorfalorpagus.net/blog/2014/08/12/converting-british-national-grid-and-irish-grid-references-a-practical-example/
"""
import re

# Letters of the 5x5 lettering grid, read row by row; 'I' is not used
alphabet = 'ABCDEFGHJKLMNOPQRSTUVWXYZ'

def grid2xy(false_easting, false_northing, gridsizes, grid_ref):
    """Turn a lettered grid reference into an (easting, northing) pair.

    The first ``len(gridsizes)`` characters of ``grid_ref`` are grid letters;
    the remaining characters are split in half, the first half giving the
    easting digits and the second half the northing digits.

    Args:
        false_easting: Amount subtracted from the easting.
        false_northing: Amount subtracted from the northing.
        gridsizes: Cell size for each successive grid letter.
        grid_ref: Grid reference string, e.g. ``'SO4489'``.

    Returns:
        tuple of int: ``(easting, northing)``, truncated to integers.
    """
    letter_count = len(gridsizes)

    # Start from the negated false origin
    x_total = -false_easting
    y_total = -false_northing

    # Each letter selects a cell in a 5x5 square of the given size. The
    # alphabet runs from the top row downwards, so the row is flipped with
    # 4 - ... to count upwards from the bottom.
    for position, cell_size in enumerate(gridsizes):
        rows_from_top, column = divmod(alphabet.index(grid_ref[position]), 5)
        x_total += column * cell_size
        y_total += (4 - rows_from_top) * cell_size

    # Everything after the letters is the numeric part
    digits = grid_ref[letter_count:]
    split_at = len(digits) // 2

    def _axis_text(part):
        # Right-pad to five digits; any digits beyond the fifth become the
        # fractional part of the number
        padded = part.ljust(5, '0')
        return padded[:5] + '.' + padded[5:]

    east_text = _axis_text(digits[:split_at])
    north_text = _axis_text(digits[split_at:])

    x_total += float(east_text)
    y_total += float(north_text)

    return int(x_total), int(y_total)

# Two grid letters (never 'I', which the lettering alphabet omits) followed by
# an even number of digits, so they split equally into easting and northing.
# Compiled once rather than on every call.
_VALID_GRID_REF = re.compile(r'[A-HJ-Z]{2}(?:[0-9]{2})+')

def british2xy(grid_ref):
    """Convert an OS British National Grid reference to (easting, northing).

    Args:
        grid_ref: Grid reference such as ``'SO4489'``. Anything that is not a
            string consisting solely of two grid letters followed by an even,
            non-zero number of digits is treated as invalid.

    Returns:
        tuple of int ``(easting, northing)`` from ``grid2xy``, or ``None``
        for invalid input (e.g. ``'unknown'``, ``None``, trailing text, the
        letter ``I``, or an odd number of digits).
    """
    # The whole string must match: a prefix-only match would let values with
    # trailing text through to float() inside grid2xy, which raises.
    if not isinstance(grid_ref, str) or _VALID_GRID_REF.fullmatch(grid_ref) is None:
        return None

    false_easting = 1000000
    false_northing = 500000
    gridsizes = [500000, 100000]
    return grid2xy(false_easting, false_northing, gridsizes, grid_ref)

In [93]:
# Read in adlib export xml, parse + write to dataframe

with open('input.xml') as fd:
    doc = xmltodict.parse(fd.read())

adlib_records = doc['adlibXML']['recordList']['record']
# xmltodict yields a single dict (not a one-item list) when an element occurs
# only once, so an export with exactly one <record> would otherwise make the
# loop below iterate over that record's keys.
if isinstance(adlib_records, dict):
    adlib_records = [adlib_records]

results = []

for record in adlib_records:
    # Look the grid reference up once; it feeds both the raw column and the
    # coordinate conversion.
    gridref = record.get('field_coll.gridref', 'unknown')
    results.append({
        'priref': record['priref'][0],
        'repro_ref': parse_repro(record['reproduction.reference']),
        'object_number': record['object_number'],
        'object_name': parse_list(record.get('object_name')),
        'os_gridref': gridref,
        'strat_unit': parse_list(record.get('stratigraphy.unit', [])),
        'strat_type': parse_stratigraphy_type(record.get('stratigraphy.type')),
        'taxonomy': parse_list(record.get('taxonomy.scientific_name')),
        'description': record['description'],
        'coll_name': parse_list(record.get('field_coll.name')),
        'coll_place': parse_list(record.get('field_coll.place')),
        # Only strings can be grid references; a None (empty element) or a
        # list (repeated element) would make the regex match raise.
        'coords': british2xy(gridref) if isinstance(gridref, str) else None
    })

df = pd.DataFrame(results)

# Split coordinate tuple into easting + northing
df[['easting', 'northing']] = df['coords'].apply(pd.Series)
df.head()

Unnamed: 0,coll_name,coll_place,coords,description,object_name,object_number,os_gridref,priref,repro_ref,strat_type,strat_unit,taxonomy,easting,northing
0,unknown,Marshwood; Marshbrook; Shropshire,"(344000, 289000)",Fossilised shell remains of the Ordovician bra...,Brachiopod,G.00055.001,SO4489,51824,[G.00055.001.jpg],"Lithostratigraphy, Chronostratigraphy",Ordovician; Caradoc Series; Marshbrookian Stag...,Dalmanella unguis,344000.0,289000.0
1,unknown,Marshwood; Marshbrook; Shropshire,"(344000, 289000)",Fossilised shell remains of the Ordovician bra...,Brachiopod,G.00055.002,SO4489,655139,[G.00055.002.jpg],"Lithostratigraphy, Chronostratigraphy",Ordovician; Caradoc Series; Marshbrookian Stag...,Dalmanella unguis,344000.0,289000.0
2,unknown,Marshbrook,"(344000, 289000)","Fossilised remains of Ordovician brachiopod, S...",Brachiopod,G.00093,SO4489,51902,[G.00093.jpg],Chronostratigraphy,Ordovician Period; Caradoc Series; Longvillian...,Strophomena grandis,344000.0,289000.0
3,unknown,Cheney Longville,"(341000, 285000)",Fossilised remains of single valve of Ordovici...,Brachiopod,G.00094,SO4185,51903,[G.00094.jpg],Chronostratigraphy,Ordovician Period; Caradoc Series; Aeronian St...,Kjaerina typa,341000.0,285000.0
4,unknown,Marshwood,,Fossilised remains of shell of the Ordovician ...,Brachiopod,G.00096.002,unknown,54696,[G.00096.002.jpg],"Lithostratigraphy, Chronostratigraphy",Ordovician Period; Caradoc Series; Cheney Long...,Dalmanella wattsi,,


In [19]:
# Drop rows without coordinates.
# .copy() detaches the result from df, so the column assignments below write
# to an independent frame instead of a slice of df (which pandas flags with
# SettingWithCopyWarning).
copy = df[['easting', 'northing']].dropna(subset=['easting'], how='all').copy()
copy_e = list(copy['easting'].values)
copy_n = list(copy['northing'].values)

# Convert easting/northing to lat/long; the result is unpacked as
# longitudes first, then latitudes
copy['long'], copy['lat'] = convert_lonlat(copy_e, copy_n)
copy.head()

Unnamed: 0,easting,northing,long,lat
0,344000.0,289000.0,-2.826285,52.496032
1,344000.0,289000.0,-2.826285,52.496032
2,344000.0,289000.0,-2.826285,52.496032
3,341000.0,285000.0,-2.869758,52.459759
10,350000.0,274000.0,-2.735666,52.361775


In [20]:
# Bring lat/long back onto the main df, aligned on its index (rows that were
# dropped for lacking coordinates get NaN), then add a placeholder url column
# to be filled once the image urls have been retrieved
for column in ('lat', 'long'):
    df[column] = copy[column].reindex(df.index)
df['url'] = 'NaN'

In [94]:
# Preview the first rows of the main dataframe.
# NOTE(review): the saved output under this cell has no lat/long/url columns,
# so it appears to predate the cell that adds them - re-run to refresh.
df.head()

Unnamed: 0,coll_name,coll_place,coords,description,object_name,object_number,os_gridref,priref,repro_ref,strat_type,strat_unit,taxonomy,easting,northing
0,unknown,Marshwood; Marshbrook; Shropshire,"(344000, 289000)",Fossilised shell remains of the Ordovician bra...,Brachiopod,G.00055.001,SO4489,51824,[G.00055.001.jpg],"Lithostratigraphy, Chronostratigraphy",Ordovician; Caradoc Series; Marshbrookian Stag...,Dalmanella unguis,344000.0,289000.0
1,unknown,Marshwood; Marshbrook; Shropshire,"(344000, 289000)",Fossilised shell remains of the Ordovician bra...,Brachiopod,G.00055.002,SO4489,655139,[G.00055.002.jpg],"Lithostratigraphy, Chronostratigraphy",Ordovician; Caradoc Series; Marshbrookian Stag...,Dalmanella unguis,344000.0,289000.0
2,unknown,Marshbrook,"(344000, 289000)","Fossilised remains of Ordovician brachiopod, S...",Brachiopod,G.00093,SO4489,51902,[G.00093.jpg],Chronostratigraphy,Ordovician Period; Caradoc Series; Longvillian...,Strophomena grandis,344000.0,289000.0
3,unknown,Cheney Longville,"(341000, 285000)",Fossilised remains of single valve of Ordovici...,Brachiopod,G.00094,SO4185,51903,[G.00094.jpg],Chronostratigraphy,Ordovician Period; Caradoc Series; Aeronian St...,Kjaerina typa,341000.0,285000.0
4,unknown,Marshwood,,Fossilised remains of shell of the Ordovician ...,Brachiopod,G.00096.002,unknown,54696,[G.00096.002.jpg],"Lithostratigraphy, Chronostratigraphy",Ordovician Period; Caradoc Series; Cheney Long...,Dalmanella wattsi,,


In [53]:
"""
Box integration: need to set up an app with OAuth 2.0 with JWT in advance.
Current app linked to s.vincent@nhm.ac.uk: https://nhm.app.box.com/developers/console/app/569998
Before each run + every 60 mins thereafter, need to generate a new temp access token + update app.cfg
"""
# Import two classes from the boxsdk module - Client and OAuth2
from boxsdk import Client, OAuth2

# Define client ID, client secret, and developer token.
CLIENT_ID = None
CLIENT_SECRET = None
ACCESS_TOKEN = None

# Read app info from text file: one value per line, in the order
# client id, client secret, access token.
# readline() keeps the trailing newline, so each value is stripped; otherwise
# the line break would be sent to Box as part of the credential.
with open('app.cfg', 'r') as app_cfg:
    CLIENT_ID = app_cfg.readline().strip()
    CLIENT_SECRET = app_cfg.readline().strip()
    ACCESS_TOKEN = app_cfg.readline().strip()

In [54]:
from boxsdk.network.default_network import DefaultNetwork

# Create OAuth2 object from the client id, client secret and access token
# read from app.cfg. The access token is handed straight to the SDK, so no
# interactive login step follows.
oauth2 = OAuth2(CLIENT_ID, CLIENT_SECRET, access_token=ACCESS_TOKEN)

# Create the authenticated client
# NOTE(review): DefaultNetwork() is passed as Client's second positional
# argument - confirm this matches the Client signature of the installed
# boxsdk version.
client = Client(oauth2, DefaultNetwork())

In [55]:
# Access LudlowImages folder
folder_id = 48687719592
# Number of items requested per page of the folder listing
page_size = 500

# Get metadata for image folder; total_count bounds the paging loop below
ludlow = client.folder(folder_id).get()
limit = ludlow['item_collection']['total_count']

url_results = {}
off_set = 0

# Page through the folder, recording (id, filename, shared download link)
# for every file, keyed by filename; non-file items are skipped
while off_set < limit:
    items = client.folder(folder_id).get_items(limit=page_size, offset=off_set)
    for f in items:
        if f['type'] == 'file':
            file_id, file_name = f['id'], f['name']
            download_url = client.file(file_id=file_id).get_shared_link_download_url()
            url_results[file_name] = (file_id, file_name, download_url)

    off_set += page_size

<Box Folder - 48687719592 (LudlowImages)>
499
999
1499
1786


In [95]:
# Get filename column and fill w dummy values
image_ref = pd.DataFrame(df['repro_ref'].copy())
image_ref['url'] = 'NaN'

# For every row, look each of its filenames up in the box folder listing.
# A matched file is popped from url_results, so a file is linked to at most
# one record and whatever remains in url_results afterwards matched nothing.
for index, row in df.iterrows():
    matched_urls = [
        url_results.pop(imagefile)[2]
        for imagefile in row['repro_ref']
        if imagefile in url_results
    ]
    if matched_urls:
        # Write with a single .loc[row, column] assignment. The chained form
        # image_ref.loc[index]['url'] = ... can assign to a temporary copy
        # and leave image_ref unchanged.
        # Multiple image links are concatenated with "; ".
        image_ref.loc[index, 'url'] = "; ".join(matched_urls)

image_ref.head()

Unnamed: 0,repro_ref,url
0,[G.00055.001.jpg],
1,[G.00055.002.jpg],
2,[G.00093.jpg],
3,[G.00094.jpg],
4,[G.00096.002.jpg],


In [85]:
# Get all the records with at least one image and dump to csv for Portal upload
df['url'] = image_ref['url'].reindex(df.index)
# A url that is still the 'NaN' placeholder string means no image was matched
has_image = df['url'] != 'NaN'
df1 = df.drop(['coords', 'easting', 'northing'], axis=1).loc[has_image]
df1.to_csv('ludlow_out.csv')

In [83]:
# Extract failed data to send back to ludlow

# Get all images which didn't match to a record + dump to csv,
# indexed by file_ref
orphan_images = pd.DataFrame.from_dict(
    url_results, orient='index', columns=['BoxId', 'file_ref', 'BoxURL']
).set_index('file_ref')
orphan_images.to_csv('ludlow_orphan_images.csv')

# Get all records which haven't been matched to an image (url is still the
# 'NaN' placeholder string) + dump to csv
no_image = df['url'] == 'NaN'
df2 = df.drop(['coords', 'easting', 'northing'], axis=1).loc[no_image]
df2.to_csv('ludlow_recs_no_image_match.csv')