# Extract metadata from source TEI files

In [7]:
# Collect the paths of every XML file in ../tei (relative to the current
# working directory).
import glob
tei_files = glob.glob("../tei/*.xml")
# Display how many files matched the pattern.
len(tei_files)

852

In [9]:
# Display the first collected path.
tei_files[0]

'../tei/A36762.xml'

In [10]:
import xml.etree.ElementTree as ET

In [59]:
def extract_metadata(file: str) -> dict:
    """Extract bibliographic metadata from the header of one TEI XML file.

    Args:
        file: Path to the TEI XML file.

    Returns:
        A dict that always contains ``filepath``, ``filename``, ``ep_id``,
        ``header_title`` (``None`` when the header has no title element or
        the element is empty) and ``authors`` (a possibly empty list of the
        ``ep:name`` texts). The keys ``ep_title``, ``ep_genre``,
        ``ep_subgenre`` and ``author_birth`` are only present when the
        corresponding element exists in the file.

        Despite its name, ``author_birth`` holds the full text of the source
        description's author element (e.g. a name followed by life dates);
        the key is kept as-is for compatibility with existing consumers.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    # Kept local so the function does not rely on a module-level import.
    import os

    root = ET.parse(file).getroot()
    xmlns = {"tei": "http://www.tei-c.org/ns/1.0", "ep" : "http://earlyprint.org/ns/1.0" }
    ep_header = "tei:teiHeader/tei:xenoData/ep:epHeader/"

    # basename copes with whichever path separator glob produced.
    filename = os.path.basename(file)
    metadata = {
        "filepath": file,
        "filename": filename,
        "ep_id": filename.split(".xml")[0],
    }

    def add_if_present(key, path):
        # Optional fields: the key is left out entirely when the element
        # is absent, so callers can distinguish "missing" from "empty".
        node = root.find(path, xmlns)
        if node is not None:
            metadata[key] = node.text

    # A header without a title must not abort the whole extraction.
    header_title = root.find("tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title", xmlns)
    metadata["header_title"] = header_title.text if header_title is not None else None

    add_if_present("ep_title", ep_header + "ep:title")

    # Every ep:name under every ep:author, in document order; empty name
    # elements are skipped so the list only ever contains strings.
    metadata["authors"] = [
        name.text
        for name in root.findall(ep_header + "ep:author/ep:name", xmlns)
        if name.text is not None
    ]

    add_if_present("ep_genre", ep_header + "ep:genre")
    add_if_present("ep_subgenre", ep_header + "ep:subgenre")
    add_if_present(
        "author_birth",
        "tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:biblFull/tei:titleStmt/tei:author",
    )

    return metadata

In [60]:
# Run the extractor on the first collected file and display the resulting dict.
extract_metadata(tei_files[0])

{'filepath': '../tei/A36762.xml',
 'filename': 'A36762.xml',
 'ep_id': 'A36762',
 'header_title': "Psyche debauch'd a comedy : as it was acted at the Theatre-Royal / by T.D.",
 'ep_title': 'Psyche Debauched',
 'authors': ['Duffett, Thomas'],
 'author_birth': 'Duffett, Thomas.'}

In [61]:
# Run the extractor on one specific file, addressed directly by its path.
extract_metadata("../tei/A59990.xml")

{'filepath': '../tei/A59990.xml',
 'filename': 'A59990.xml',
 'ep_id': 'A59990',
 'header_title': 'The Politician',
 'ep_title': 'The Politician',
 'authors': ['Shirley, James'],
 'ep_genre': 'play',
 'ep_subgenre': 'tragedy',
 'author_birth': 'Shirley, James, 1596-1666.'}

In [63]:
# Build one metadata record per TEI file, in the order the paths were collected.
metadata = [extract_metadata(tei_path) for tei_path in tei_files]

In [64]:
# Display the first extracted record.
metadata[0]

{'filepath': '../tei/A36762.xml',
 'filename': 'A36762.xml',
 'ep_id': 'A36762',
 'header_title': "Psyche debauch'd a comedy : as it was acted at the Theatre-Royal / by T.D.",
 'ep_title': 'Psyche Debauched',
 'authors': ['Duffett, Thomas'],
 'author_birth': 'Duffett, Thomas.'}

In [74]:
import csv

# Write one CSV row per extracted record.
# newline='' is what the csv module requires of the file object it writes to
# (otherwise platforms that translate "\n" end up with extra blank rows), and
# the explicit UTF-8 encoding keeps non-ASCII titles and names from depending
# on the platform's default encoding.
with open('../meta/author_titles.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["id","filename","title","authors","author_full"])
    for item in metadata:
        # Records without a source-description author get an empty string;
        # setdefault also stores it back on the record, as the original
        # in-place assignment did.
        author_full = item.setdefault("author_birth", "")
        # extract_metadata only sets "ep_title" when the element exists, so
        # fall back to an empty cell instead of raising KeyError.
        title = item.get("ep_title", "")
        # Skip missing names so join never sees a non-string value.
        authors_concat = ";".join(name for name in item["authors"] if name is not None)
        writer.writerow([item["ep_id"], item["filename"], title, authors_concat, author_full])