In [259]:
import re

import pandas as pd
import xmltodict
from convertbng.util import convert_bng, convert_lonlat

# Parse the Adlib XML export into nested dicts/lists.
# The file is opened in binary mode and the handle is passed straight to
# xmltodict, so decoding is left to the XML parser rather than to the
# platform's default text encoding, and the document is not first read
# into an intermediate string.
with open('input.xml', 'rb') as fd:
    doc = xmltodict.parse(fd)

In [260]:
def parse_repro(repro_ref):
    """Return the bare file names from one or more reproduction references.

    Args:
        repro_ref: A single reference string, an iterable of reference
            strings (``None`` entries are skipped), or ``None``.

    Returns:
        list: For each reference, the text after its last backslash (the
        whole reference if it contains none). Empty when ``repro_ref`` is
        ``None``.
    """
    if repro_ref is None:
        # No reference at all: yield no files instead of raising TypeError.
        return []
    if isinstance(repro_ref, str):
        # A lone reference arrives as a plain string; treat it as a one-item list.
        repro_ref = [repro_ref]

    return [ref.rsplit("\\", 1)[-1] for ref in repro_ref if ref is not None]

In [261]:
def parse_stratigraphy_type(strat_t):
    """Flatten a stratigraphy type value into a comma-separated label string.

    Args:
        strat_t: ``None``, a string, a dict holding its label at
            ``['text']['#text']``, or an iterable (possibly nested) of these.

    Returns:
        ``None`` if ``strat_t`` is ``None``; the label itself for a dict or a
        string; otherwise the distinct labels joined with ", " in the order
        they were first seen.
    """
    if strat_t is None:
        return strat_t
    if isinstance(strat_t, str):
        # Strings are iterable, so without this guard they would recurse forever.
        return strat_t
    if isinstance(strat_t, dict):
        return strat_t['text']['#text']

    labels = (parse_stratigraphy_type(entry) for entry in strat_t)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is the same on every run (set iteration order is not).
    # None labels are dropped because str.join cannot take them.
    unique_labels = dict.fromkeys(label for label in labels if label is not None)
    return ", ".join(unique_labels)

In [262]:
def parse_list(item):
    """Collapse a field value into a single "; "-separated string.

    Args:
        item: A string, an iterable of strings, or any other value.

    Returns:
        ``item`` unchanged if it is a string; its elements joined with "; "
        if it is an iterable of strings; otherwise ``item`` unchanged (for
        example ``None`` or a list containing non-string entries).
    """
    if isinstance(item, str):
        return item
    try:
        return "; ".join(item)
    except TypeError:
        # Not iterable, or holds non-string entries: pass it through as-is.
        # Only TypeError is handled; the old `return` inside `finally`
        # silently discarded every other exception as well.
        return item

In [263]:
# 5x5 grid letters, missing I
alphabet = 'ABCDEFGHJKLMNOPQRSTUVWXYZ'

def grid2xy(false_easting, false_northing, gridsizes, grid_ref):
    '''Convert a lettered grid reference to integer (easting, northing).

    Args:
        false_easting: Amount subtracted from the easting before the letter
            and digit offsets are added.
        false_northing: Amount subtracted from the northing in the same way.
        gridsizes: Square size for each leading letter, outermost first; its
            length is the number of letters the reference starts with.
        grid_ref: Reference such as 'SO4489'. Whitespace is ignored and
            letters may be in either case.

    Returns:
        Tuple ``(easting, northing)`` of ints (truncated with ``int``), in
        the same units as ``gridsizes``.

    Raises:
        ValueError: If a leading character is not in ``alphabet`` or the
            remainder is not numeric.
        IndexError: If the reference has fewer characters than ``gridsizes``.
    '''
    # Accept 'so 44 89' as well as 'SO4489'. Embedded spaces used to end up
    # inside the digit halves and silently produced wrong coordinates.
    grid_ref = ''.join(grid_ref.split()).upper()

    # false easting and northing
    easting = -false_easting
    northing = -false_northing

    # convert letter(s) to easting and northing offset
    for position, size in enumerate(gridsizes):
        idx = alphabet.index(grid_ref[position])
        # Letters fill the 5x5 square row by row; the row number is flipped
        # so that 'A' lands in row 4 and 'V' in row 0.
        band, col = divmod(idx, 5)
        row = 4 - band
        easting += col * size
        northing += row * size

    def _digits_to_offset(part):
        # Left-justify to five digits ('44' -> 44000); any digits past the
        # fifth become the fractional part ('123456' -> 12345.6).
        padded = '{:0<5}'.format(part)
        return float('{}.{}'.format(padded[:5], padded[5:]))

    # numeric components of grid reference: first half easting, second half
    # northing (with an odd digit count the easting gets the shorter half)
    digits = grid_ref[len(gridsizes):]
    half = len(digits) // 2
    easting += _digits_to_offset(digits[:half])
    northing += _digits_to_offset(digits[half:])

    return int(easting), int(northing)

def british2xy(grid_ref):
    """Convert a British grid reference to (easting, northing) via grid2xy.

    Args:
        grid_ref: Reference such as 'SO4489'. Whitespace and letter case are
            ignored. Any other value (placeholder text, ``None``) is rejected.

    Returns:
        The ``(easting, northing)`` tuple from ``grid2xy``, or ``None`` when
        ``grid_ref`` is not a string, is not two grid letters followed only
        by digits, or does not contain the letter 'S'.
    """
    if not isinstance(grid_ref, str):
        return None

    ref = ''.join(grid_ref.split()).upper()

    # Two grid letters (the grid alphabet has no 'I') followed by digits only.
    # This rejects placeholders such as 'unknown' and free text that merely
    # contains an 'S', which previously crashed inside grid2xy.
    if not re.fullmatch(r'[A-HJ-Z]{2}[0-9]*', ref):
        return None

    # Kept from the original filter: only references containing 'S' are converted.
    if 'S' not in ref:
        return None

    false_easting = 1000000
    false_northing = 500000
    gridsizes = [500000, 100000]
    return grid2xy(false_easting, false_northing, gridsizes, ref)

def irish2xy(grid_ref):
    """Convert a one-letter grid reference to coordinates via grid2xy.

    Uses no false origin (both offsets are 0) and a single letter square
    of size 100000.
    """
    return grid2xy(
        false_easting=0,
        false_northing=0,
        gridsizes=[100000],
        grid_ref=grid_ref,
    )

In [264]:
# Flatten every Adlib record into one row of plain values.
adlib_records = doc['adlibXML']['recordList']['record']

results = []

for record in adlib_records:
    # Looked up once; used both as the raw column and as the coordinate source.
    gridref = record.get('field_coll.gridref', 'unknown')
    row_values = {
        'priref': record['priref'][0],
        'repro_ref': parse_repro(record['reproduction.reference']),
        'object_number': record['object_number'],
        'object_name': parse_list(record.get('object_name')),
        'os_gridref': gridref,
        'strat_unit': parse_list(record.get('stratigraphy.unit', [])),
        'strat_type': parse_stratigraphy_type(record.get('stratigraphy.type')),
        'taxonomy': parse_list(record.get('taxonomy.scientific_name')),
        'description': record['description'],
        'coll_name': parse_list(record.get('field_coll.name')),
        'coll_place': parse_list(record.get('field_coll.place')),
        'coords': british2xy(gridref),
    }
    results.append(row_values)

df = pd.DataFrame(results)
# Split the (easting, northing) tuples into two columns; rows whose coords
# are None end up as NaN in both.
df[['easting', 'northing']] = df['coords'].apply(pd.Series)

# List the image file names attached to each record.
for repro_files in df['repro_ref']:
    print(repro_files)

['G.00055.001.jpg']
['G.00055.002.jpg']
['G.00093.jpg']
['G.00094.jpg']
['G.00096.002.jpg']
['G.00106.jpg']
['G.00113a.jpg', 'G.00113b.jpg']
['G.00118a.jpg', 'G.00118b.jpg', 'G.00118c.jpg']
['G.00123c.jpg', 'G.00123a.jpg', 'G.00123b.jpg']
['G.00133.jpg']
['G.00145.jpg']
['G.00147.jpg']
['G.00148.jpg']
['G.00151.jpg']
['G.00160.jpg']
['G.00182.jpg']
['G.00183.jpg']
['G.00184.jpg']
['G.00185.jpg']
['G.00186.jpg']
['G.00187.jpg']
['G.00188.jpg']
['G.00191a.jpg', 'G.00191b.jpg']
['G.00192.001.jpg']
['G.00194.jpg']
['G.00196.jpg']
['G.00197.jpg']
['G.00202.jpg']
['G.00214.jpg']
['G.00269.jpg']
['G.00272.jpg']
['G.00273.jpg']
['G.00309.jpg']
['G.00317.jpg']
['G.00380.jpg']
['G.00386.jpg']
['G.00387.jpg']
['G.00400.001b.jpg', 'G.00400.001a.jpg']
['G.00400.002b.jpg', 'G.00400.002a.jpg']
['G.00408.002.jpg']
['G.00443.jpg']
['G.00468.jpg']
['G.00471.jpg']
['G.00483.jpg']
['G.00489.jpg']
['G.00491.jpg']
['G.00496.jpg']
['G.00566.jpg']
['G.00604.jpg']
['G.00841.jpg']
['G.00922.jpg']
['G.01032.jpg'

['G.12008.jpg']
['G.12011.jpg']
['G.12014b.jpg', 'G.12014a.jpg']
['G.12015.jpg']
['G.12017b.jpg', 'G.12017c.jpg']
['G.12020.JPG']
['G.12026.jpg']
['G.12027.jpg']
['G.12028.jpg']
['G.12030.001c.jpg', 'G.12030.001d.jpg', 'G.12030.001a.jpg', 'G.12030.001b.jpg']
['G.12030.002.jpg']
['G.12036a.jpg', 'G.12036b.jpg']
['G.12038.jpg']
['G.12040.jpg']
['G.12042a.jpg', 'G.12042b.jpg']
['G.12044a.jpg', 'G.12044b.jpg']
['G.12046.001.jpg']
['G.12046.002.jpg']
['G.12047a.jpg', 'G.12047b.jpg']
['G.12051.jpg']
['G.12086.001.jpg', 'G.12086.001b.jpg']
['G.12086.002a.jpg', 'G.12086.002b.jpg']
['G.12091.001a.jpg', 'G.12091.001b.jpg']
['G.12091.002a.jpg', 'G.12091.002b.jpg']
['G.12091.003a.jpg', 'G.12091.003b.jpg']
['G.12091.004a.jpg', 'G.12091.004b.jpg']
['G.12091.005a.jpg', 'G.12091.005b.jpg']
['G.12091.006a.jpg', 'G.12091.006b.jpg']
['G.12091.007a.jpg', 'G.12091.007b.jpg']
['G.12091.008a.jpg', 'G.12091.008b.jpg']
['G.12096.001.jpg']
['G.12096.002.jpg']
['G.12103.001a.jpg']
['G.12103.002.jpg']
['G.12121.0

In [265]:
# Keep only the rows that have an easting, then convert their grid
# coordinates to longitude/latitude.
has_easting = df['easting'].notna()
copy = df.loc[has_easting, ['easting', 'northing']].copy()
copy_e = list(copy['easting'].values)
copy_n = list(copy['northing'].values)
lon_lat = convert_lonlat(copy_e, copy_n)
copy['long'], copy['lat'] = lon_lat
copy.head()

Unnamed: 0,easting,northing,long,lat
0,344000.0,289000.0,-2.826285,52.496032
1,344000.0,289000.0,-2.826285,52.496032
2,344000.0,289000.0,-2.826285,52.496032
3,341000.0,285000.0,-2.869758,52.459759
10,350000.0,274000.0,-2.735666,52.361775


In [266]:
# Spot-check the longitude computed for the row labelled 10.
copy.loc[10, 'long']

-2.7356662900000002

In [267]:
# Bring lat/long back onto the full table, aligned on the index; rows that
# were dropped for lacking coordinates get NaN.
df['lat'] = copy['lat'].reindex(df.index)
df['long'] = copy['long'].reindex(df.index)
# Placeholder string; the column is overwritten with image URLs later on.
df['url'] = 'NaN'

In [268]:
# Display the assembled table, now including the easting/northing, lat/long
# and placeholder url columns.
df

Unnamed: 0,coll_name,coll_place,coords,description,object_name,object_number,os_gridref,priref,repro_ref,strat_type,strat_unit,taxonomy,easting,northing,lat,long,url
0,unknown,Marshwood; Marshbrook; Shropshire,"(344000, 289000)",Fossilised shell remains of the Ordovician bra...,Brachiopod,G.00055.001,SO4489,51824,[G.00055.001.jpg],"Chronostratigraphy, Lithostratigraphy",Ordovician; Caradoc Series; Marshbrookian Stag...,Dalmanella unguis,344000.0,289000.0,52.496032,-2.826285,
1,unknown,Marshwood; Marshbrook; Shropshire,"(344000, 289000)",Fossilised shell remains of the Ordovician bra...,Brachiopod,G.00055.002,SO4489,655139,[G.00055.002.jpg],"Chronostratigraphy, Lithostratigraphy",Ordovician; Caradoc Series; Marshbrookian Stag...,Dalmanella unguis,344000.0,289000.0,52.496032,-2.826285,
2,unknown,Marshbrook,"(344000, 289000)","Fossilised remains of Ordovician brachiopod, S...",Brachiopod,G.00093,SO4489,51902,[G.00093.jpg],Chronostratigraphy,Ordovician Period; Caradoc Series; Longvillian...,Strophomena grandis,344000.0,289000.0,52.496032,-2.826285,
3,unknown,Cheney Longville,"(341000, 285000)",Fossilised remains of single valve of Ordovici...,Brachiopod,G.00094,SO4185,51903,[G.00094.jpg],Chronostratigraphy,Ordovician Period; Caradoc Series; Aeronian St...,Kjaerina typa,341000.0,285000.0,52.459759,-2.869758,
4,unknown,Marshwood,,Fossilised remains of shell of the Ordovician ...,Brachiopod,G.00096.002,unknown,54696,[G.00096.002.jpg],"Chronostratigraphy, Lithostratigraphy",Ordovician Period; Caradoc Series; Cheney Long...,Dalmanella wattsi,,,,,
5,unknown,Meadowtown,,Fossilised remains of the Ordovician graptolit...,Graptolite,G.00106,Unknown,51907,[G.00106.jpg],"Chronostratigraphy, Biostratigraphy",Ordovician,Diplograptus foliaceus,,,,,
6,unknown,Unknown,,"Fossilised remains of the Silurian tabulate, c...",CORAL,G.00113,unknown,51917,"[G.00113a.jpg, G.00113b.jpg]","Chronostratigraphy, Lithostratigraphy",Silurian Period; Much Wenlock Series; Much Wen...,Heliolites interstinctus,,,,,
7,unknown,Unknown,,Fossilised remains of the Silurian rugose cora...,CORAL,G.00118,Unknown,50807,"[G.00118a.jpg, G.00118b.jpg, G.00118c.jpg]","Chronostratigraphy, Lithostratigraphy",Silurian Period; Wenlock Series; Much Wenlock ...,Kodonophyllum truncatum,,,,,
8,"Lloyd, Dr. Thomas",Unknown,,"Fossilised remains of Silurian coral, Ketophyl...",CORAL,G.00123,unknown,51925,"[G.00123c.jpg, G.00123a.jpg, G.00123b.jpg]","Chronostratigraphy, Lithostratigraphy",Silurian Period; Wenlock Series; Much Wenlock ...,Ketophyllum turbinatum,,,,,
9,unknown,Unknown,,Fossilised shell remains of the Silurian brach...,Brachiopod,G.00133,unknown,51963,[G.00133.jpg],"Chronostratigraphy, Biostratigraphy",Silurian Period; Ludlow,Atrypa reticularis,,,,,


In [269]:
# Import two classes from the boxsdk module - Client and OAuth2
from boxsdk import Client, OAuth2

# Define client ID, client secret, and developer token.
CLIENT_ID = None
CLIENT_SECRET = None
ACCESS_TOKEN = None

# Read app info from text file: the first three lines of app.cfg hold the
# client ID, client secret and developer token, in that order.
# readline() keeps the trailing newline, so each value is stripped; otherwise
# the line break would be sent as part of the credential.
with open('app.cfg', 'r') as app_cfg:
    CLIENT_ID = app_cfg.readline().strip()
    CLIENT_SECRET = app_cfg.readline().strip()
    ACCESS_TOKEN = app_cfg.readline().strip()
    
# see tutorial at http://opensource.box.com/box-python-sdk/tutorials/intro.html

In [270]:
from boxsdk.network.default_network import DefaultNetwork

# Build the OAuth2 credentials object. The developer token is passed as the
# access token, so no interactive authorisation step is run here.
oauth2 = OAuth2(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    access_token=ACCESS_TOKEN,
)

# Client used for the Box API calls that follow.
client = Client(oauth2, DefaultNetwork())

In [271]:
# Access LudlowImages folder
LUDLOW_FOLDER_ID = 48687719592
PAGE_SIZE = 500  # items requested per get_items call

ludlow = client.folder(LUDLOW_FOLDER_ID).get()
limit = ludlow['item_collection']['total_count']

print(limit)

# Maps file name -> (file id, file name, shared-link download URL).
url_results = {}
off_set = 0

while off_set < limit:
    # This call used to read the folder id from `res[0]['id']`, but `res` is
    # not defined in any cell visible here, so a fresh kernel raised
    # NameError. Page through the same folder that `limit` was taken from.
    # NOTE(review): assumes `res[0]` referred to this LudlowImages folder - confirm.
    items = client.folder(LUDLOW_FOLDER_ID).get_items(limit=PAGE_SIZE, offset=off_set)

    for f in items:
        # Sub-folders and other non-file entries have no image URL.
        if f['type'] != 'file':
            continue

        url_results[f['name']] = (f['id'], f['name'], client.file(file_id=f['id']).get_shared_link_download_url())
        print(f['name'])

    off_set += PAGE_SIZE
    print(len(url_results))
    


1787
G.00055.001.jpg
G.00055.002.jpg
G.00093.jpg
G.00094.jpg
G.00096.002.jpg
G.00106.jpg
G.00113a.jpg
G.00113b.jpg
G.00118a.jpg
G.00123a.jpg
G.00123b.jpg
G.00123c.jpg
G.00133.jpg
G.00145.jpg
G.00147.jpg
G.00148.jpg
G.00151.jpg
G.00160.jpg
G.00191a.jpg
G.00191b.jpg
G.00192.001.jpg
G.00194.jpg
G.00196.jpg
G.00197.jpg
G.00202.jpg
G.00214.jpg
G.00269.jpg
G.00272.jpg
G.00273.jpg
G.00309.jpg
G.00317.jpg
G.00380.jpg
G.00386.jpg
G.00387.jpg
G.00400.001a.jpg
G.00400.001b.jpg
G.00400.002a.jpg
G.00400.002b.jpg
G.00408.002.jpg
G.00436.jpg
G.00443.jpg
G.00468.jpg
G.00471.jpg
G.00483.jpg
G.00489.jpg
G.00491.jpg
G.00496.jpg
G.00566.jpg
G.00604.jpg
G.00841.jpg
G.00922.jpg
G.01036.jpg
G.01037.jpg
G.01087.jpg
G.01123.jpg
G.01134.jpg
G.01136.jpg
G.01379a.jpg
G.01379b.jpg
G.01386.jpg
G.01396.jpg
G.01455.jpg
G.01465.jpg
G.01587.jpg
G.01614.jpg
G.01616.jpg
G.01617.jpg
G.01624.jpg
G.01627.jpg
G.01651.jpg
G.01652.jpg
G.01746.jpg
G.01750.jpg
G.01787.002.jpg
G.01790.jpg
G.01790b.jpg
G.01857.jpg
G.01864.jpg
G.01

G.07522.jpg
G.07543.001a.jpg
G.07543a.jpg
G.07543b.jpg
G.07544.jpg
G.07545.jpg
G.07547a.jpg
G.07547b.jpg
G.07551a.jpg
G.07551b.jpg
G.07620.001.jpg
G.07620.002.jpg
G.07620.002b.jpg
G.07620.003.jpg
G.07620.003b.jpg
G.07642.jpg
G.07646a.jpg
G.07646b.jpg
G.07721.jpg
G.07727.jpg
G.07728.jpg
G.07819.002a.jpg
G.07819.002b.jpg
G.07819a.jpg
G.07819b.jpg
G.07825.jpg
G.07827.001.jpg
G.07827.002.jpg
G.07838a.jpg
G.07838b.jpg
G.07843.jpg
G.07852.jpg
G.07860.jpg
G.07861.jpg
G.07866a.jpg
G.07866b.jpg
G.07877.002.jpg
G.07884.jpg
G.07898.001.jpg
G.07905.001.jpg
G.07906.001.jpg
G.07906.002.jpg
G.07906.jpg
G.07921.002.jpg
G.07921.002b.jpg
G.07930.jpg
G.07931.jpg
G.07932.jpg
G.07946.jpg
G.07951.jpg
G.07963.jpg
G.07970.jpg
G.08030.001.jpg
G.08036.002a.jpg
G.08036.002b.jpg
G.08040.jpg
G.08057.jpg
G.08072.jpg
G.08073b.jpg
G.08074a.jpg
G.08074b.jpg
G.08084a.jpg
G.08084b.jpg
G.08087.jpg
G.08127.001.jpg
G.08127.002.jpg
G.08128a.jpg
G.08128b.jpg
G.08129.001.jpg
G.08129.002.jpg
G.08133.jpg
G.08133a.jpg
G.08136.00

G.13664a.jpg
G.13664b.jpg
G.13664c.jpg
G.13675.jpg
G.13683.jpg
G.13685.jpg
G.13697.jpg
G.13715.jpg
G.13716.jpg
G.13717.jpg
G.13721.jpg
G.13725a.jpg
G.13725b.jpg
G.13725c.jpg
G.13725d.jpg
G.13726.jpg
G.13727.jpg
G.13728.001.jpg
G.13728.002.jpg
G.13732a.jpg
G.13735.jpg
G.13736a.jpg
G.13736b.jpg
G.13737.jpg
G.13738.jpg
G.13767.jpg
G.13786.jpg
G.13793.jpg
G.13799.jpg
G.13801.jpg
G.13812.jpg
G.13815.jpg
G.13817.jpg
G.13820.jpg
G.13828.jpg
G.13830.jpg
G.13832.jpg
G.13837.jpg
G.13847.jpg
G.13852.jpg
G.13858.jpg
G.13859.jpg
G.13861.jpg
G.13864.jpg
G.13868.jpg
G.13869a.jpg
G.13869b.jpg
G.13871.jpg
G.13872.jpg
G.13887.jpg
G.13899.jpg
G.13928.jpg
G.13939.jpg
G.14016.jpg
G.14028a.jpg
G.14048a.jpg
G.14048b.jpg
G.14095.jpg
G.14096.jpg
G.14101.jpg
G.14102.jpg
G.14134.jpg
G.14135.jpg
G.14136a.jpg
G.14146.jpg
G.14166.jpg
G.14188.jpg
G.14190.jpg
G.14192.jpg
G.14202.jpg
G.14207.jpg
G.14213a.jpg
G.14213b.jpg
G.14214.jpg
G.14215.jpg
G.14216.jpg
G.14223.jpg
G.14232.jpg
G.14245b.jpg
G.14260.jpg
G.14261.jpg
G

In [272]:
print(len(url_results))

from numpy import nan
image_ref = pd.DataFrame(df['repro_ref'].copy())

# For each record, join the download URLs of its image files that were found
# in url_results with "; "; records with no match keep the 'NaN' placeholder.
# The column is built in one pass and assigned whole: the previous row-by-row
# `image_ref.loc[index]['url'] = ...` was chained assignment, which writes to
# a temporary object and is not guaranteed to update image_ref.
image_ref['url'] = [
    "; ".join(str(url_results[name][2]) for name in filenames if name in url_results) or 'NaN'
    for filenames in image_ref['repro_ref']
]


1786


In [273]:
# Copy the collected image URLs onto the main table (aligned on the index),
# then write the table out as CSV.
df['url'] = image_ref['url'].reindex(df.index)

df.to_csv('ludlow_out.csv')