# Automatic extraction from pdf

This tool uses regular expressions to extract soil science information from PDF files. Note that it can only extract information from the running text of a PDF, not from its tables.

This tool is experimental.

In [4]:
import re
#import spacy
import ipywidgets as widgets
import os
import pandas as pd
import numpy as np
import tempfile
import json
import time
from ipyleaflet import Map, Marker
import base64
from IPython.display import HTML

pd.set_option('display.max_colwidth', 0) # and then each column will be just as big as it needs to be

datadir = '../data/kunsat/'

In [5]:
# get text from pdf
def pdf2text(fo):
    """Convert the raw bytes of a PDF into text with the `pdftotext` tool.

    Parameters
    ----------
    fo : bytes
        Content of the PDF file.

    Returns
    -------
    str
        Extracted text, or '' when the conversion failed (tool missing,
        unreadable PDF, undecodable output).
    """
    import subprocess  # local import: only this function needs it

    text = ''
    with tempfile.TemporaryDirectory() as td:
        fname = os.path.join(td, 'fname.pdf')
        with open(fname, 'wb') as f:
            f.write(fo)
        try:
            # argument list (no shell) so the path is never interpreted
            # by a shell; pdftotext writes 'fname.txt' next to the pdf
            subprocess.run(['pdftotext', fname], check=False)
        except OSError as err:
            # typically pdftotext is not installed
            print('pdf2text: could not run pdftotext:', err)
            return text
        fpath = fname[:-4] + '.txt'
        if os.path.exists(fpath):
            try:
                with open(fpath, 'r', encoding='utf8') as f:
                    text = f.read()
            except (OSError, UnicodeDecodeError) as err:
                print('pdf2text: error with', err)
            else:
                if len(text) < 100:
                    print('pdf2text: document too small')
    return text

# test
# with open('../data/from-kunsat/papers/alagna2016.pdf', 'rb') as f:
#     fo = f.read()
# text = pdf2text(fo)
# len(text)

In [6]:
# extract abstract
def getAbstract(doc, fname=''):
    """Split the text of a paper into its abstract and its body.

    The start of the abstract is searched with keywords ('Abstract',
    'Summary', ...) and its end with keywords that usually follow it
    ('Keywords', 'Introduction', ...). When no end keyword is found, the
    text is cut in paragraphs instead. The reference list (everything
    after the last 'References' heading) is removed from the body.

    Parameters
    ----------
    doc : str
        Full text of the document.
    fname : str, optional
        Name of the document, only used in the diagnostic messages.

    Returns
    -------
    body : str
        Text after the abstract keyword, without the abstract and the
        references. Falls back to the whole text after the abstract
        keyword when less than 60% of it would remain.
    abstract : str
        Detected abstract ('' if nothing could be detected).
    """
    # detect the start of the abstract
    splitdoc = re.split('a b s t r a c t|abstract|ABSTRACT|Abstract|A B S T R A C T|Summary|s u m m a r y|SUMMARY', doc)
    foundStart = len(splitdoc) > 1
    # if no abstract keyword is found, take all
    text = '\n'.join(splitdoc[1:]) if foundStart else splitdoc[0]

    # detect the end of the abstract
    splitEnd = re.split('Published|Introduction|Copyright|©|Keywords|keywords|KEYWORDS|KEY WORDS|Citation', text)
    abstract = ''
    if len(splitEnd) == 1:
        # no end keyword: regroup paragraphs so that each chunk is
        # longer than 10 characters
        ps = []
        a = ''
        for p in text.split('\n\n'):
            a = a + '\n\n' + p
            if len(a) > 10:
                ps.append(a)
                a = ''
        if foundStart:
            abstract = ps[0] if ps else ''
        else:
            # neither start nor end detected: blindly take the second
            # chunk (the first one when there is only one)
            abstract = ps[1] if len(ps) > 1 else (ps[0] if ps else '')
            print('abstract = first paragraph', fname)
    else:
        # first piece between two keywords that is long enough
        abstract = next((s for s in splitEnd if len(s) > 50), '')

    # cleaning up the abstract (slice instead of index: may be empty)
    if abstract[:1] in (':', '.'):
        abstract = abstract[1:]
    abstract = abstract.strip()

    # edge case (if we have two first large paragraphs)
    ts = abstract.split('\n\n')
    if len(ts) > 1 and len(ts[0]) > 800 and len(ts[1]) > 800:
        # keep the one that starts like a sentence
        abstract = ts[0] if ts[0][0] in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' else ts[1]

    # remove the abstract from the body
    body = text.replace(abstract, '') if abstract else text

    # remove the references; the groups are non-capturing so that
    # re.split() only returns the pieces of text around the heading
    parts = re.split(r'\n\s?r(?:é|e)f(?:é|e)rences?\n|\n\s?r ?e ?f ?e ?r ?e ?n ?c ?e ?s?\n',
                     body, flags=re.IGNORECASE)
    if len(parts) > 1:  # at least one 'reference' heading found
        body = '\nReferences\n'.join(parts[:-1])
    # else: old papers parsed by OCR often have no reference heading

    # failsafe for badly detected abstract
    if text:
        ratio = len(body) / len(text)
        if ratio < 0.6:
            print('getAbstract: abstract probably not well detected for', fname, '(ratio: {:.2f})'.format(ratio))
            body = text
    return body, abstract

# test
# doc, abstract = getAbstract(text)
# len(doc), len(abstract)

In [7]:
# extract soil type

# load soils from keyword trees
# Each keyword tree is a JSON file read relative to `datadir`.
# NOTE(review): the structure of these files is only known from how they
# are indexed below ('children'/'value' keys, or a list of dicts with
# 'principal_qualifiers'/'supplementary_qualifiers') - confirm against
# the files themselves.
with open(datadir + '../keyword-trees/SoilTypeFAO.json', 'r') as f:
    soilTypeFAO = json.load(f)
with open(datadir + '../keyword-trees/SoilTypeQualifiersFAO.json', 'r') as f:
    soilTypeQualifiersFAO = json.load(f)
with open(datadir + '../keyword-trees/SoilTypeGER.json', 'r') as f:
    soilTypeGER = json.load(f)
with open(datadir + '../keyword-trees/SoilTypeWRB.json', 'r') as f:
    soilTypeWRB = json.load(f)
with open(datadir + '../keyword-trees/SoilTypeQualifiersWRB.json', 'r') as f:
    soilTypeQualifiersWRB = json.load(f)
with open(datadir + '../keyword-trees/SoilTypeSpecifiersWRB.json', 'r') as f:
    soilTypeSpecifiersWRB = json.load(f)
with open(datadir + '../keyword-trees/SoilTypeUSDA.json', 'r') as f:
    soilTypeUSDA = json.load(f)
with open(datadir + '../keyword-trees/SoilTypeQualifiersUSDA.json', 'r') as f:
    soilTypeQualifiersUSDA = json.load(f)

# soilvoc: flat, lower-case list of the soil type names found in the
# first three levels of the FAO, WRB and USDA trees (GER is loaded but
# left out of the list).
# NOTE(review): soilvocq and soilvoc are not used by the code visible in
# this notebook.
soilvoct = [soilTypeFAO, soilTypeWRB, soilTypeUSDA] # soilTypeGER,
soilvocq = [soilTypeQualifiersFAO, soilTypeQualifiersWRB,
            soilTypeSpecifiersWRB, soilTypeQualifiersUSDA]
soilvoc = []
for a in soilvoct:
    for b in a['children']:
        soilvoc.append(b['value'])
        if 'children' in b.keys():
            for c in b['children']:
                soilvoc.append(c['value'])
                if 'children' in c.keys():
                    for d in c['children']:
                        soilvoc.append(d['value'])
soilvoc = [a.lower() for a in soilvoc]
# also accept the singular form of every name ending with 's'
soilvoc += [a[:-1] for a in soilvoc if a[-1] == 's']
# list.remove() drops one occurrence per call and raises ValueError when
# the value is absent, so these three lines rely on the content of the
# trees. NOTE(review): presumably 'arent(s)' is dropped because it
# matches inside common words such as 'parent' - confirm.
soilvoc.remove('arent')
soilvoc.remove('arent')
soilvoc.remove('arents')

# FAO pattern
# two optional qualifiers followed by a soil type (singular or plural);
# the names are inserted as-is in the regular expression (not escaped)
qualifiers = [dic['value'].strip() for dic in soilTypeQualifiersFAO['children']]
soiltypes = [dic['value'].strip() for dic in soilTypeFAO['children']]
soiltypes += [a + 's' for a in soiltypes]
faoPattern = '({:s})?\s?({:s})?\s?({:s})'.format(
        '|'.join(qualifiers), '|'.join(qualifiers), '|'.join(soiltypes))

# WRB pattern
# same structure; the qualifiers are the de-duplicated union of the
# principal and supplementary qualifiers of every entry
qualifiers = []
for dic in soilTypeQualifiersWRB:
    qualifiers.extend(dic['principal_qualifiers'])
    qualifiers.extend(dic['supplementary_qualifiers'])
qualifiers = list(set(qualifiers))
qualifiers = [a.strip() for a in qualifiers]
soiltypes = [dic['value'].strip() for dic in soilTypeWRB['children']]
soiltypes += [a + 's' for a in soiltypes]
wrbPattern = '({:s})?\s?({:s})?\s?({:s})'.format(
    '|'.join(qualifiers), '|'.join(qualifiers), '|'.join(soiltypes))

# USDA pattern
# soil types are collected over three levels of the tree and their
# singular form is added; `qualifiers` and `soiltypes` are reused names,
# only the three *Pattern strings are read by getSoilType()
soiltypes = []
for a in soilTypeUSDA['children']:
    soiltypes.append(a['value'])
    if 'children' in a.keys():
        for b in a['children']:
            soiltypes.append(b['value'])
            if 'children' in b.keys():
                for c in b['children']:
                    soiltypes.append(c['value'])
soiltypes += [a[:-1] for a in soiltypes if a[-1] == 's']
qualifiers = [dic['value'] for dic in soilTypeQualifiersUSDA]
usdaPattern = '({:s})?\s?({:s})?\s?({:s})'.format(
    '|'.join(qualifiers), '|'.join(qualifiers), '|'.join(soiltypes))

# TODO 'podzolic soil' -> add adjective version of word too for WRB

def getSoilType(doc):
    """Return the distinct soil type names (WRB, FAO, USDA) found in doc.

    Each name is made of up to two qualifiers and a soil type, as matched
    by wrbPattern, faoPattern and usdaPattern. The order of the returned
    list is not defined.
    """
    flat = doc.replace('\n', ' ')
    found = []
    for pattern in (wrbPattern, faoPattern, usdaPattern):
        found.extend(re.findall(pattern, flat))
    # every match is a (qualifier, qualifier, soil type) tuple in which
    # the missing qualifiers are empty strings
    names = set()
    for groups in found:
        names.add(' '.join(groups).replace('  ', ' ').strip())
    return list(names)

# test
# getSoilType(doc)

[]

In [8]:
# extract soil texture
# texture classes, longest names first so that the regex alternation
# prefers e.g. 'sandy clay loam' over 'sand'
voc = [
    'sandy clay loam', 'silty clay loam',
    'loamy sand', 'sandy loam', 'clay loam', 'sandy clay',
    'silty clay', 'silt loam',
    'sand', 'silt', 'clay', 'loam'
]


def getSoilTexture(doc):
    """Return the sorted, distinct soil texture classes named in doc.

    Expressions in which 'sand', 'clay' or 'silt' do not describe the
    texture of the soil (contact sand used under the permeameter,
    'sand content', ...) are removed before matching. Matching is
    case-insensitive and not restricted to whole words.
    """
    text = doc.lower()
    # remove sand occurrences usually applied for better contact with the
    # permeameter ('moist\s+sand' used to be mistyped as 'moist\s+\sand',
    # which never matched 'moist sand')
    text = re.sub(r'(fine\s+sand|moist\s+sand|contact\ssand|sand\s+pad|washed\s+sand|(sand|clay|silt)\s+content)', '', text)
    matches = re.findall('|'.join(voc), text)  # maybe case would help here actually
    return sorted(set(matches))

# test
# getSoilTexture(doc)

['clay', 'clay loam', 'loam', 'sand', 'silt']

In [9]:
# extract rainfall
def getRainfall(doc):
    """Extract rainfall amounts from doc and return them in mm, as strings.

    A value is searched after an expression such as 'annual rainfall' or
    'average ... precipitation'. Ranges ('800-900 mm', '800 to 900 mm')
    are replaced by their rounded mean, 'X and Y' gives two values and
    values in cm are converted to mm (rounded to the unit). A comma
    inside a number is treated as a thousands separator.
    """
    text = doc.replace('\n', ' ').lower()
    text = re.sub(r'\([a-z\s]+\., \d{4}\)', '', text)  # remove citation in parenthesis
    text = re.sub(r'[^0-9a-z.,\s\-–]', '', text)  # remove other characters
    # groups: expression, optional 'NNc' value, amount, unit
    # NOTE(review): '[.-\–]' is a character range (from '.' to '–');
    # it was probably meant to be '[.\-–]'. Left unchanged.
    matches = re.findall(
        r'((?:cumulated|annual|average)[a-z\s]+(?:rainfall|rain|precipitation))'
        r'(?:[a-z\s]+)?(\d+[.-\–]?\d+c)?[a-z\s]+'
        r'(\d+\.?,?\d+(?:-|–|\sand\s|\sto\s)?(?:\d+)?)\s?(m\s?m|cm)', text)
    umatches = []
    for match in matches:
        # the unit is 'mm', 'm m' or 'cm'; only cm needs a conversion
        factor = 10 if match[-1] == 'cm' else 1
        # drop the thousands separator before any float conversion
        raw = match[-2].replace(',', '')
        if re.search(r'\sand\s', raw):
            # two distinct amounts
            parts = [p.strip() for p in re.split(r'\sand\s', raw) if p.strip()]
            isRange = False
        else:
            parts = [p.strip() for p in re.split(r'-|–|to', raw) if p.strip()]
            isRange = len(parts) > 1
        if isRange:
            mean = sum(float(p) for p in parts) / len(parts)
            umatches.append('{:.0f}'.format(mean * factor))
        elif factor == 1:
            umatches.extend(parts)  # values in mm are kept as written
        else:
            umatches.extend('{:.0f}'.format(float(p) * factor) for p in parts)
    return umatches

# test
# getRainfall(text)

['855']

In [10]:
# extract altitude/elevation
def getElevation(text):
    """Return the context of every 'altitude'/'elevation' mention in text.

    Each item is the lower-cased mention with up to 20 characters of
    context on both sides. The value itself is not parsed.
    """
    doc = text.replace('\n', ' ').lower()
    # max(0, ...): a negative start would be counted from the end of the
    # string and give an empty or wrong snippet for early mentions
    return [doc[max(0, m.start() - 20):m.end() + 20]
            for m in re.finditer(r'(altitude|elevation)', doc)]

# test
# getElevation(doc)

[]

In [None]:
# extract diameter
def getDiameter(doc):
    """Extract diameters mentioned in doc and return them in cm.

    Both 'diameter ... 20 cm' and '20 cm ... diameter' forms are
    recognised. Values in mm are divided by 10 and radii are doubled.
    NOTE(review): getMetaData() labels these values 'Disk diameter [mm]'
    although they are expressed in cm here - confirm the intended unit.
    """
    text = doc.replace('\n', ' ').lower()
    # integer or decimal number; single digits ('5 cm') are accepted,
    # which the former '\d+\.?\d+' did not allow
    number = r'(\d+(?:\.\d+)?)'
    keywordFirst = re.findall(r'(radius|diameter)[a-z\s]+' + number + r'\s?(cm|mm)', text)
    valueFirst = re.findall(number + r'\s?(cm|mm)[a-z\s]+(radius|diameter)', text)
    found = [(kind, value, unit) for kind, value, unit in keywordFirst]
    found += [(kind, value, unit) for value, unit, kind in valueFirst]
    umatches = []
    for kind, value, unit in found:
        val = float(value)
        if unit == 'mm':
            val = val / 10
        if kind == 'radius':
            val = val * 2
        umatches.append(val)
    return umatches

# test
# getDiameter(doc)

[]

In [None]:
# extract tensions
def getTensions(doc):
    """Extract a series of tensions from doc and return them in mm.

    Only the first list of at least two integers followed by 'mm' or
    'cm' is considered, and only if it contains a comma (' and ' counts
    as a comma). Values are returned as positive floats; cm are
    converted to mm.
    """
    text = doc.replace('\n', ' ').lower()
    text = text.replace(' and ', ', ')
    matches = re.findall(r'((?:(?:-?\d+),?\s){2,})\s?(mm|cm)', text)
    if not matches:
        return []
    values, unit = matches[0]
    if ',' not in values:
        return []
    # pick the numbers directly: splitting on ',' fails when two values
    # are only separated by a space or when the list ends with a comma
    tt = [abs(float(v)) for v in re.findall(r'-?\d+', values)]
    if unit == 'cm':
        tt = [10 * t for t in tt]
    return tt

# test
# getTensions(doc)

[120.0, 60.0, 30.0, 10.0]

In [None]:
# extract abbreviations
def getAbbreviations(text):
    """List the upper-case abbreviations given in parentheses in text.

    For an abbreviation of n letters, the meaning is made of the tokens
    (split on spaces and newlines) found just before its first
    occurrence; n + 1 tokens are taken, the last one being empty when a
    space precedes the parenthesis.

    Returns
    -------
    pandas.DataFrame
        Columns 'abbreviation' and 'meaning', without duplicates.
    """
    rows = []
    for abrv in re.findall(r'(\([A-Z]*\))', text):
        if len(abrv) <= 3:  # at least two letters between the parentheses
            continue
        before = text.split(abrv)[0]
        tokens = re.split(' |\\n', before)
        rows.append({'abbreviation': abrv[1:-1],
                     'meaning': ' '.join(tokens[-(len(abrv)-1):])})
    # built in one go: DataFrame.append() was removed in pandas 2.0
    dfabb = pd.DataFrame(rows, columns=['abbreviation', 'meaning'])
    dfabb = dfabb.drop_duplicates().reset_index(drop=True)
    return dfabb

# test
# getAbbreviations(doc)

Unnamed: 0,abbreviation,meaning
0,BEST,of Soil Transfer parameters
1,PI,pressure inﬁltrometer
2,SFH,simpliﬁed falling head
3,TI,tension inﬁltrometer
4,MDI,mini disk inﬁltrometer
5,BB,bottomless bucket
6,PSD,particle size distribution
7,TPD,with the TwoPonding-Depth


In [None]:
# extract coordinates

# convert to decimal degree
def dms2dec(out):
    # string, deg, symbol, minute, symbol, second, symbol, direction
    deg = float(out[1]) if out[1] != '' else 0
    mnt = float(out[3]) if out[3] != '' else 0
    sec = float(out[5]) if out[5] != '' else 0
    dec = deg + mnt/60 + sec/60/60
    dec = -dec if (out[-1] == 'S')|(out[-1] == 'W')|(out[-1] == 'O') else dec
    return dec

def getCoordinates(text):
    text = text.replace('\n',' ').replace('\x04','*').replace('\x03','*')
    text = text.replace('\x01','*').replace(',','.') # \x04 is EOT End Of Transmission
    latTime = re.findall("(([+-]?[1-8]?\d|[+-]?90)([°◦*oO])\s?(\d{1,2})(.)\s?(\d{1,2}(\.\d+)?)?.?.?\s?(latitude\s)?([NS]))", text)
    lonTime = re.findall("(([+-]?180|[+-]?1[0-7]\d|[+-]?[1-9]?\d)([°◦*oO])\s?(\d{1,2}?)(.)\s?(\d{1,2}(\.\d+)?)?.?.?\s?(longitude\s)?([WEO]))", text)

    # sometimes the symbol for minutes is transformed in a zero
    if len(latTime) > 0:
        for l in range(len(latTime)):
            a = latTime[l]
            if a[3] == ' ':  # we would expect the symbol for minute
                latTime[l] = a[:2] + (latTime[l][2][:-1],) + a[3:]  # remove last symbol
    if len(lonTime) > 0:
        for l in range(len(lonTime)):
            a = lonTime[l]
            if a[3] == ' ':  # we would expect the symbol for minute
                lonTime[l] = a[:2] + (lonTime[l][2][:-1],) + a[3:]  # remove last symbol

    # try some popular edge cases
    if len(latTime) == 0:
        latTime = re.findall("((\d+)(8)(\d{2,}))(['0V9])\s?(\d+\.\d+)?()\s?(latitude\s)?([NS])", text)
        lonTime = re.findall("((\d+)(8)(\d{2,}))(['0V9])\s?(\d+\.\d+)?()\s?(longitude\s)?([WOE])", text)
        
    if len(latTime) == 0:
        latTime = re.findall("(([NS])\s(\d+)(8)(\d{2,}))(['0V9])\s?(\d+\.\d+)?()", text)
        lonTime = re.findall("(([WOE])\s(\d+)(8)(\d{2,}))(['0V9])\s?(\d+\.\d+)?()", text)
        latTime = [a[1:] + (a[-1],) for a in latTime]
        lonTime = [a[1:] + (a[-1],) for a in lonTime]

    # convert to decimal degree
    lats = [dms2dec(a) for a in latTime]
    lons = [dms2dec(a) for a in lonTime]

    # try to match decimal notation
    if len(lats) == 0:
        latDeg = re.findall("([+-]?(([1-8]?\d|90)\.\d+)[°◦*oO⬚]?\s?([NS]))", text)
        lonDeg = re.findall("([-+]?((180|1[0-7]\d|[1-9]?\d)\.\d+)[°◦*oO⬚]?\s?([WEO]))", text)
        lats = [float(a[1]) if a[3] == 'N' else -float(a[1]) for a in latDeg]
        lons = [float(a[1]) if a[3] == 'E' else -float(a[1]) for a in lonDeg]

    # edge case of decimal notation
    if len(lats) == 0:
        latDeg = re.findall("(lat\.\s.(([+-]?[1-8]?\d|[+-]?90)\.\d+)[°◦*oO⬚]?)", text)
        lonDeg = re.findall("(long\.\s.(([-+]?180|[-+]?1[0-7]\d|[-+]?[1-9]?\d)\.\d+)[°◦*oO⬚]?)", text)
        lats = [float(a[1]) for a in latDeg]
        lons = [float(a[1]) for a in lonDeg]
        
    # rounding for easier comparison
    lats = [np.round(a, 5) for a in lats]
    lons = [np.round(a, 5) for a in lons]
    
    return [(a, b) for a, b in zip(lats, lons)]

# show map
def showMap(coords, out):
    """Display a world map with one marker per coordinate in `out`.

    coords is an iterable of locations passed as-is to Marker and out is
    the output widget in which the map is displayed.
    """
    world = Map(center=(0,0), zoom=2)
    for location in coords:
        world.add_layer(Marker(location=location))
    with out:
        display(world)

# test
# coords = getCoordinates(doc)
# print(coords)
# out = widgets.Output()
# showMap(coords, out)
# out

[(38.10694, 13.35167)]


In [23]:
# extract other practices information
dfsyn = pd.read_csv(datadir + '../dfsyn.csv')  # load original words and their synonyms

def getPractices(doc):
    """Count the occurrences of every lemma of dfsyn in doc.

    Each distinct value of dfsyn['lemma'] is used as a regular
    expression on the lower-cased document.

    Returns
    -------
    pandas.DataFrame
        Columns 'practice' and 'occurence', one row per lemma found at
        least once, sorted by decreasing number of occurrences.
    """
    lowered = doc.lower()  # done once, not for every lemma
    rows = []
    for lemma in dfsyn['lemma'].unique():
        count = len(re.findall(lemma, lowered))
        if count > 0:
            rows.append({'practice': lemma, 'occurence': count})
    # built in one go: DataFrame.append() was removed in pandas 2.0
    dfpra = pd.DataFrame(rows, columns=['practice', 'occurence'])
    dfpra = dfpra.sort_values('occurence', ascending=False).reset_index(drop=True)
    return dfpra

# test
# getPractices(doc)

Unnamed: 0,practice,occurence
0,crop,4


In [13]:
# could add automatic DOI detection and cross-ref info? but not really NLP

In [24]:
# your new extraction function
def getInfo(text):
    """Template of an extraction function: return the first number of text.

    Returns the first run of digits as a string, or '' when the text
    contains no digit (instead of raising an IndexError).
    """
    found = re.search(r'\d+', text)
    return found.group(0) if found else ''

In [67]:
def prepareDownload(df, name='Download File', fname='df.csv'):
    """Build an HTML button that downloads df as a csv file.

    The csv is embedded in the link as a base64 data URI, so no file is
    left on the server.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to export (written without its index).
    name : str
        Label of the button.
    fname : str
        Name proposed for the downloaded file.

    Returns
    -------
    IPython.display.HTML
    """
    with tempfile.TemporaryDirectory() as td:
        # join (not concatenate) so that the file is created inside the
        # temporary directory and removed with it
        fpath = os.path.join(td, os.path.basename(fname))
        df.to_csv(fpath, index=False)  # takes some time but csv takes more time to render in base64
        with open(fpath,  'rb') as f:
            data = f.read()
    payload = base64.b64encode(data).decode()

    html_button = '''
    <a download="{fname}" href="data:text/csv;base64,{payload}" download>
    <button class="p-Widget jupyter-widgets jupyter-button widget-button mod-info">{name}</button>
    </a>
    '''.format(payload=payload, fname=fname, name=name)

    return HTML(html_button)

#DownloadDownload(df)

In [57]:
# extract metadata
def getMetaData(fo):
    """Run every extraction function on the pdf given as bytes.

    Returns
    -------
    dfmeta : pandas.DataFrame
        Long table with the columns 'metadata' and 'value', one row per
        extracted value. Each coordinate gives three rows (latitude,
        longitude and the formatted pair).
    dfabb : pandas.DataFrame
        Result of getAbbreviations().
    dfpra : pandas.DataFrame
        Result of getPractices().
    abstract : str
        Abstract detected by getAbstract().
    coords : str
        Coordinates joined with ', ' as one string.
    """
    text = pdf2text(fo)
    doc, abstract = getAbstract(text)
    # label of the metadata -> function applied to the document
    # NOTE(review): getDiameter() converts mm to cm, which does not agree
    # with the '[mm]' of its label - confirm the intended unit.
    extractors = {
        'Soil type (WRB/USDA)': getSoilType,
        'Soil texture (USDA)': getSoilTexture,
        'Disk diameter [mm]': getDiameter,
        'Tensions [mm]': getTensions,
        'Elevation [masl]': getElevation,
        'Rainfall [mm/year]': getRainfall,
        'Coordinates [degree]': getCoordinates,
        # add new function here
    }
    rows = []  # [metadata, value] pairs
    for label, extractor in extractors.items():
        values = extractor(doc)
        if label != 'Coordinates [degree]':
            rows.extend([label, value] for value in values)
            continue
        coords = ', '.join(str(pair) for pair in values)
        for lat, lon in values:
            rows.append(['Latitude [degree]', lat])
            rows.append(['Longitude [degree]', lon])
            rows.append([label, '({:.5f}, {:.5f})'.format(lat, lon)])
    dfmeta = pd.DataFrame(rows, columns=['metadata', 'value'])
    dfabb = getAbbreviations(doc)
    dfpra = getPractices(doc)
    return dfmeta, dfabb, dfpra, abstract, coords

# test
# with open('../data/from-kunsat/papers/alagna2016.pdf', 'rb') as f:
#     fo = f.read()
# getMetaData(fo)[0]

- download all tables as .csv (using the html data tab)
- download all converted text (as zip)
- drop-down menu to change which pdf to display
- nice way of adding a new function

In [78]:
# NOTE(review): leftover of an interactive session. In file order this
# cell comes before the cell that defines `dropdown` and `showDocument`,
# so it only works if that cell has already been run.
dropdown.unobserve(showDocument)

In [96]:
# interface

# create global variables
# Results for all the uploaded documents: rebound by uploadBtnFunc() and
# read by showDocument(). They start as empty frames without columns.
dfmetas = pd.DataFrame()      # extracted metadata
dfabbs = pd.DataFrame()       # abbreviations
dfpras = pd.DataFrame()       # practices
dfabstracts = pd.DataFrame()  # abstracts

# function automatically called on upload for processing pdfs
def uploadBtnFunc(_):
    """Process the uploaded pdfs (callback of the upload button).

    Every uploaded pdf is passed to getMetaData(); the results are
    gathered in the global frames dfmetas, dfabbs, dfpras and
    dfabstracts, download buttons are displayed and the first document
    is shown. Files that are not pdfs are skipped.

    NOTE(review): relies on the `data` and `metadata` attributes of
    FileUpload and on a dict-like `value` - presumably the ipywidgets 7
    interface, confirm before upgrading.
    """
    global dfmetas, dfabbs, dfpras, dfabstracts
    download.clear_output()
    output.clear_output()
    # The handler is registered with names='value', so it has to be
    # removed with the same name. Otherwise it would fire when the
    # options of the dropdown are replaced below.
    try:
        dropdown.unobserve(showDocument, names='value')
    except ValueError:
        pass  # not registered yet (first upload)
    t0 = time.time()
    with download:
        print('processing...', end='')
    n = len(uploadBtn.data)
    if n == 0:
        return

    # local accumulators: the global frames stay valid DataFrames if the
    # processing of a document fails
    metas, abbs, pras, abstracts, options = [], [], [], [], []
    for i, (info, content) in enumerate(zip(uploadBtn.metadata, uploadBtn.data)):
        if info['type'] != 'application/pdf':
            with download:
                print('only PDF file accepted')
            continue
        fname = info['name']
        dfmeta, dfabb, dfpra, abstract, coords = getMetaData(content)
        dfmeta['fname'] = fname
        metas.append(dfmeta)
        dfabb['fname'] = fname
        abbs.append(dfabb)
        dfpra['fname'] = fname
        pras.append(dfpra)
        abstracts.append([fname, abstract])
        with download:
            print('\rprocessing...', i+1, '/', n, end='')
        options.append(fname)

    if options:  # pd.concat() raises on an empty list
        # create large dataframe
        dfmetas = pd.concat(metas, axis=0)
        dfabbs = pd.concat(abbs, axis=0)
        dfpras = pd.concat(pras, axis=0)
        dfabstracts = pd.DataFrame(abstracts, columns=['fname', 'abstract'])
        dropdown.options = options

        with download:
            display(prepareDownload(dfmetas, 'Metadata', 'dfmetadata.csv'))
            display(prepareDownload(dfabbs, 'Abbreviations', 'dfabbreviations.csv'))
            display(prepareDownload(dfpras, 'Practices', 'dfpractices.csv'))
            display(prepareDownload(dfabstracts, 'Abstracts', 'dfabstracts.csv'))
            print('done ({:.2f}s)'.format(time.time() - t0))

    dropdown.observe(showDocument, names='value')
    if options:
        showDocument({'new': options[0]})
    uploadBtn.value.clear()

def showDocument(a):
    """Display the results of one document (callback of the dropdown).

    a is a change dictionary whose 'new' item is the name of the
    document. The metadata, abbreviations and practices tables, a map of
    the coordinates and the abstract are written in `output`.
    """
    fname = a['new']
    output.clear_output()
    if 'fname' not in dfmetas.columns:
        return  # nothing has been processed yet
    with output:
        ofDoc = dfmetas['fname'].eq(fname)
        display(dfmetas[ofDoc])
        display(dfabbs[dfabbs['fname'] == fname])
        display(dfpras[dfpras['fname'] == fname])
        lats = dfmetas[ofDoc & dfmetas['metadata'].eq('Latitude [degree]')]['value']
        # longitudes used to be read from the 'Latitude [degree]' rows
        lons = dfmetas[ofDoc & dfmetas['metadata'].eq('Longitude [degree]')]['value']
        coords = [(float(lat), float(lon)) for lat, lon in zip(lats, lons)]
        showMap(coords, output)
        print('ABSTRACT:')
        print(dfabstracts[dfabstracts['fname'] == fname]['abstract'])
        
# upload button: uploadBtnFunc() is called when its value changes
uploadBtn = widgets.FileUpload(description='Upload pdf', multiple=True)
uploadBtn.observe(uploadBtnFunc, names='value')

# choice of the document to show; its options and its observer are set
# by uploadBtnFunc()
dropdown = widgets.Dropdown(description='Show')

# `download` receives the progress messages and the download buttons,
# `output` the results of the selected document
download = widgets.Output()
output = widgets.Output()

# layout
widgets.VBox([uploadBtn, download,  dropdown, output])

VBox(children=(FileUpload(value={}, description='Upload pdf', multiple=True), Output(), Dropdown(description='…

In [89]:
showDocument({'new':'alagna2016.pdf'})

alagna2016.pdf


In [71]:
dfmetas[dfmetas['fname'].eq('alagna2016.pdf')]

Unnamed: 0,metadata,value,fname
0,Soil texture (USDA),clay,alagna2016.pdf
1,Soil texture (USDA),clay loam,alagna2016.pdf
2,Soil texture (USDA),loam,alagna2016.pdf
3,Soil texture (USDA),sand,alagna2016.pdf
4,Soil texture (USDA),silt,alagna2016.pdf
5,Tensions [mm],120.0,alagna2016.pdf
6,Tensions [mm],60.0,alagna2016.pdf
7,Tensions [mm],30.0,alagna2016.pdf
8,Tensions [mm],10.0,alagna2016.pdf
9,Rainfall [mm/year],855,alagna2016.pdf
