In [335]:
import xml.etree.ElementTree as ET
import glob,os
import pandas as pd
import os

In [311]:
# Directory that is globbed for the *.xml metadata files parsed below.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
DIR = "/Users/damoncrockett/ivpy-datasets/ycba/metadata/"

In [312]:
# glob.glob() returns matches in arbitrary (filesystem-dependent) order; sort
# so that row order and lidofiles[0] are the same on every run.
lidofiles = sorted(glob.glob(os.path.join(DIR,"*.xml")))

In [313]:
# Prefix -> namespace URI map passed to every ElementTree find()/findall() call.
ns = dict(
    lido='http://www.lido-schema.org',
    OA='http://www.openarchives.org/OAI/2.0/',
)

In [314]:
# Fields extracted from each record, in output column order. lidoparse() fills
# a field when a conceptID's type attribute equals its name; 'subjectConcept'
# is filled separately with the comma-joined subject terms.
cols = ['Genre', 'Object name', 'Classification',
        'medium', 'support', 'subjectConcept']

In [319]:
def _first_descendant(element, path, namespaces):
    """Walk down from `element`, taking the first child that matches each tag in `path`.

    Returns the element reached, or None if `element` is None or any step
    along the way has no matching child.
    """
    for tag in path:
        if element is None:
            return None
        element = element.find(tag, namespaces)
    return element


def _typed_terms(parent, tag, namespaces):
    """Yield (conceptID type attribute, term text) for each `tag` child of `parent`.

    Yields nothing when `parent` is None; children lacking a conceptID or a
    term element are skipped.
    """
    if parent is None:
        return
    type_attr = '{%s}type' % namespaces['lido']
    for entry in parent.findall(tag, namespaces):
        concept_id = entry.find('lido:conceptID', namespaces)
        term = entry.find('lido:term', namespaces)
        if concept_id is None or term is None:
            continue
        yield concept_id.get(type_attr), term.text


def lidoparse(lidofile, columns=None, namespaces=None):
    """Extract selected descriptive-metadata fields from one OAI-PMH/LIDO XML record.

    Parameters
    ----------
    lidofile : path or file object accepted by ET.parse.
    columns : field names to extract; defaults to the module-level `cols`.
    namespaces : prefix -> URI map containing the 'lido' and 'OA' prefixes;
        defaults to the module-level `ns`.

    Returns
    -------
    dict with one key per entry of `columns` (None where the record has no
    value) plus 'subjectConcept', the comma-joined subject terms ('' if none).

    Only the first element is followed at each level of the hierarchy (first
    eventSet, first subjectSet, ...). A record that lacks one of the wrapper
    elements no longer raises AttributeError: the fields that would have come
    from it are simply left empty. Errors raised by ET.parse itself (e.g.
    malformed XML) still propagate.
    """
    if columns is None:
        columns = cols
    if namespaces is None:
        namespaces = ns

    datarow = dict.fromkeys(columns)

    root = ET.parse(lidofile).getroot()
    descriptive = _first_descendant(
        root,
        ['OA:GetRecord', 'OA:record', 'OA:metadata',
         'lido:lido', 'lido:descriptiveMetadata'],
        namespaces,
    )

    # (path to the wrapper, tag of the repeated child) for every place a
    # conceptID/term pair is read from. Order matters: a later source
    # overwrites an earlier one that reports the same type.
    typed_sources = [
        (['lido:objectClassificationWrap', 'lido:objectWorkTypeWrap'],
         'lido:objectWorkType'),
        (['lido:objectClassificationWrap', 'lido:classificationWrap'],
         'lido:classification'),
        (['lido:eventWrap', 'lido:eventSet', 'lido:event',
          'lido:eventMaterialsTech', 'lido:materialsTech'],
         'lido:termMaterialsTech'),
    ]
    for path, tag in typed_sources:
        parent = _first_descendant(descriptive, path, namespaces)
        for concept_type, term in _typed_terms(parent, tag, namespaces):
            if concept_type in columns:
                datarow[concept_type] = term

    #----object relation [CHECK WHETHER MULTIPLE SUBJECTS WITHIN SET]
    subject = _first_descendant(
        descriptive,
        ['lido:objectRelationWrap', 'lido:subjectWrap',
         'lido:subjectSet', 'lido:subject'],
        namespaces,
    )
    subject_terms = []
    if subject is not None:
        for concept in subject.findall('lido:subjectConcept', namespaces):
            term = concept.find('lido:term', namespaces)
            # An empty <lido:term/> has text None, which ','.join would reject.
            if term is not None and term.text is not None:
                subject_terms.append(term.text)
    datarow['subjectConcept'] = ','.join(subject_terms)

    return datarow

In [324]:
# Empty frame with one column per extracted field; rows come from the parsing loop below.
df = pd.DataFrame(columns=cols)

In [325]:
# Number of metadata files found by the glob above.
n = len(lidofiles)

In [326]:
# Parse every metadata file. Rows are collected in plain lists and the frame is
# built once at the end instead of being grown one row at a time.
# Each row is labelled with the file's position in `lidofiles`; a file that
# fails to parse is reported together with its path and gets no row, which
# leaves a gap at that label.
parsed_rows = []
parsed_labels = []
for i in range(n):
    try:
        parsed_rows.append(lidoparse(lidofiles[i]))
        parsed_labels.append(i)
    except Exception as e:
        # Single parenthesised argument: valid on both Python 2 and Python 3.
        print("%s: %s" % (lidofiles[i], e))

df = pd.DataFrame(parsed_rows, index=parsed_labels, columns=cols)

In [328]:
# Per-column non-null counts: shows how sparsely each field is populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31262 entries, 0 to 31261
Data columns (total 6 columns):
Genre             19286 non-null object
Object name       20316 non-null object
Classification    31262 non-null object
medium            15628 non-null object
support           19007 non-null object
subjectConcept    31262 non-null object
dtypes: object(6)
memory usage: 1.7+ MB


In [334]:
# Record which XML file each row came from. Rows are labelled by position in
# `lidofiles`, so look each path up by label: assigning the whole list would
# raise a length-mismatch error as soon as one file had failed to parse.
df['datafile'] = [lidofiles[i] for i in df.index]

In [338]:
# Each record's image path: the images directory, then the XML file's base
# name with its last four characters (the ".xml" suffix) swapped for ".jpg".
PRE = '/Users/damoncrockett/ivpy-datasets/ycba/images/'
df['imagefile'] = [
    ''.join([PRE, os.path.basename(xmlpath)[:-4], ".jpg"])
    for xmlpath in df.datafile
]

In [339]:
# Write the combined table as UTF-8 CSV, without the index column.
# NOTE(review): absolute, machine-specific output path.
df.to_csv("/Users/damoncrockett/ivpy-datasets/ycba/metadata.csv",index=False,encoding='utf-8')

### Work

In [288]:
# Explore the structure of the first record: list the root element's children.
tree = ET.parse(lidofiles[0])
root = tree.getroot()
for child in root:
    print(child.tag)  # parenthesised so the cell runs on Python 2 and 3

{http://www.openarchives.org/OAI/2.0/}responseDate
{http://www.openarchives.org/OAI/2.0/}request
{http://www.openarchives.org/OAI/2.0/}GetRecord


In [289]:
# Children of OA:GetRecord.
tmp = root.find('OA:GetRecord',ns)
for child in tmp:
    print(child.tag)  # parenthesised so the cell runs on Python 2 and 3

{http://www.openarchives.org/OAI/2.0/}record


In [290]:
# Children of OA:record. Resolved from `root` by full path so the cell gives
# the same result however often (or in whatever order) it is re-run.
tmp = root.find('OA:GetRecord/OA:record',ns)
for child in tmp:
    print(child.tag)  # parenthesised so the cell runs on Python 2 and 3

{http://www.openarchives.org/OAI/2.0/}header
{http://www.openarchives.org/OAI/2.0/}metadata


In [291]:
# Children of OA:metadata. Resolved from `root` by full path so the cell gives
# the same result however often (or in whatever order) it is re-run.
tmp = root.find('OA:GetRecord/OA:record/OA:metadata',ns)
for child in tmp:
    print(child.tag)  # parenthesised so the cell runs on Python 2 and 3

{http://www.lido-schema.org}lido


In [292]:
# Children of lido:lido. Resolved from `root` by full path so the cell gives
# the same result however often (or in whatever order) it is re-run.
tmp = root.find('OA:GetRecord/OA:record/OA:metadata/lido:lido',ns)
for child in tmp:
    print(child.tag)  # parenthesised so the cell runs on Python 2 and 3

{http://www.lido-schema.org}lidoRecID
{http://www.lido-schema.org}category
{http://www.lido-schema.org}descriptiveMetadata
{http://www.lido-schema.org}administrativeMetadata


In [293]:
# Children of lido:descriptiveMetadata. Resolved from `root` by full path so
# the cell gives the same result however often it is re-run.
tmp = root.find(
    'OA:GetRecord/OA:record/OA:metadata/lido:lido/lido:descriptiveMetadata',
    ns)
for child in tmp:
    print(child.tag)  # parenthesised so the cell runs on Python 2 and 3

{http://www.lido-schema.org}objectClassificationWrap
{http://www.lido-schema.org}objectIdentificationWrap
{http://www.lido-schema.org}eventWrap
{http://www.lido-schema.org}objectRelationWrap


In [294]:
# Children of lido:objectRelationWrap. Resolved from `root` by full path so
# the cell gives the same result however often it is re-run.
tmp = root.find(
    'OA:GetRecord/OA:record/OA:metadata/lido:lido/lido:descriptiveMetadata'
    '/lido:objectRelationWrap',
    ns)
for child in tmp:
    print(child.tag)  # parenthesised so the cell runs on Python 2 and 3

{http://www.lido-schema.org}subjectWrap
{http://www.lido-schema.org}relatedWorksWrap


In [295]:
# Children of lido:subjectWrap. Resolved from `root` by full path so the cell
# gives the same result however often it is re-run.
tmp = root.find(
    'OA:GetRecord/OA:record/OA:metadata/lido:lido/lido:descriptiveMetadata'
    '/lido:objectRelationWrap/lido:subjectWrap',
    ns)
for child in tmp:
    print(child.tag)  # parenthesised so the cell runs on Python 2 and 3

{http://www.lido-schema.org}subjectSet


In [296]:
# Children of lido:subjectSet. Resolved from `root` by full path so the cell
# gives the same result however often it is re-run.
tmp = root.find(
    'OA:GetRecord/OA:record/OA:metadata/lido:lido/lido:descriptiveMetadata'
    '/lido:objectRelationWrap/lido:subjectWrap/lido:subjectSet',
    ns)
for child in tmp:
    print(child.tag)  # parenthesised so the cell runs on Python 2 and 3

{http://www.lido-schema.org}subject


In [297]:
# Children of lido:subject. Resolved from `root` by full path so the cell
# gives the same result however often it is re-run.
tmp = root.find(
    'OA:GetRecord/OA:record/OA:metadata/lido:lido/lido:descriptiveMetadata'
    '/lido:objectRelationWrap/lido:subjectWrap/lido:subjectSet/lido:subject',
    ns)
for child in tmp:
    print(child.tag)  # parenthesised so the cell runs on Python 2 and 3

{http://www.lido-schema.org}subjectConcept
{http://www.lido-schema.org}subjectConcept
{http://www.lido-schema.org}subjectConcept
{http://www.lido-schema.org}subjectConcept
