In [1]:
import glob
import pandas as pd
import bs4 as bs
from bs4 import BeautifulSoup
import lxml as lx
from bs4 import NavigableString

#ETL of original xml files into pandas DataFrames.

In [2]:
xml_filenames=glob.glob("*.xml")
df=pd.DataFrame(xml_filenames,columns=['xml_filenames'])

strings=[]
notBefore=[]
notAfter=[]

for filenm in xml_filenames:
    infile = open(filenm,"r")
    contents = infile.read()
    soup = BeautifulSoup(contents, "xml")

    
    #get strings
    
    edition=soup.find(type='edition')
    
    if edition is None:
        strings.append("no value") #no text; these are often drawings 
        print(filenm, ' found no text')
    elif 'lang' not in edition.attrs: #no greek text
        strings.append("no value")
        print(filenm, ' found no lang')
    elif edition['lang']=='la': #is latin
        strings.append("no value")
        print(filenm, ' inscription is latin')

    else:
        this_string=""
        text=edition.find_all(['ab','lg'])
        for section in text:
            for child in section.descendants:
                if child.parent.name == "note" or child.name=="note": #omit english notes
                    continue
                elif child.name=="gap": #include original word spacing
                    this_string=this_string+" "
                elif type(child) is NavigableString and child.string is not None: #omit usage error
                    if child.string != "No text" or "o text":
                        this_string=this_string+child.string
        if this_string is not "":
            strings.append(this_string)
        else:
            strings.append("no value")
            print(filenm, ' found no text')
            
            
    #get date info
    
    dsoup=soup.find(n="date")

    #earliest possible date of inscription
    tag=dsoup.find(notBefore=True)
    if tag is None:
        notBefore.append('no value')
    else:
        notBefore.append(tag.get('notBefore'))

    #latest possible date 
    tag=dsoup.find(notAfter=True)
    if tag is None:
        notAfter.append('no value')
    else:
        notAfter.append(tag.get('notAfter'))

        
#takes about 20s on home computer

iAph120315.xml  found no text
iAph020011.xml  found no text
iAph150225.xml  found no text
iAph100009.xml  found no text
iAph100022.xml  found no text
iAph100005.xml  found no text
iAph020012.xml  found no text
iAph080062.xml  found no text
iAph020524.xml  inscription is latin
iAph100019.xml  found no text
iAph110210.xml  found no text
iAph010401.xml  found no text
iAph010201.xml  inscription is latin
iAph100023.xml  found no text
iAph100020.xml  found no text
iAph100018.xml  found no text
iAph040304.xml  inscription is latin
iAph080093.xml  found no text
iAph110063.xml  inscription is latin
iAph040005.xml  found no text
iAph080401.xml  found no text
iAph100008.xml  found no text
iAph080091.xml  found no text
iAph100016.xml  found no text
iAph140005.xml  found no lang
iAph090115.xml  found no text
iAph110201.xml  found no text
iAph080601.xml  found no text
iAph080053.xml  found no text
iAph040002.xml  found no text
iAph050001.xml  found no text
iAph060003.xml  found no text
iAph020401.x

In [3]:
df['string']=strings
df['date_notBefore']=notBefore
df['date_notAfter']=notAfter
df=df[df['string']!="no value"]

df.set_value(938,'date_notAfter','0225') #fixing a typo specific to this set

df.to_csv('collected_inscription_data.csv')


In [4]:
df[df['xml_filenames']=='iAph010003.xml']
#test for inclusion of strings from all ab sections in "type='edition'"
#(010003 has two,separated by a period)

Unnamed: 0,xml_filenames,string,date_notBefore,date_notAfter
463,iAph010003.xml,λαιιβ οἱ οἰκοδόμοι. ἐπιμεληθέντος,1,1000


In [5]:
df[df['xml_filenames']=='iAph080270.xml']
#no ab section. extensive grk text in 'lg'. confirming the greek text is loaded 

Unnamed: 0,xml_filenames,string,date_notBefore,date_notAfter
88,iAph080270.xml,\n\n τύμβος Ἰορδά\nνοιο τὸν Εὐλαλί\nου ἀπὸ φύτ...,401,600


In [6]:
df[df['xml_filenames']=='iAph100006.xml']
#both greek and english sections in 'edition'. test for exclusion of english text nested in 'note' sections.


Unnamed: 0,xml_filenames,string,date_notBefore,date_notAfter
374,iAph100006.xml,μοστρατου\n \nΒ\nΙ\nΟ\nΙ\nΛ σιέων \n\nἈπολ...,101,500


In [7]:
#saving data in this intermediate form for use as needed:


#separating out undated inscriptions for later use
undated=df[(df['date_notAfter']=="no value")|(df['date_notBefore']=="no value")]
undated.to_csv('undated_inscriptions.csv')


#removing undated inscriptions from set in current use; converting dates to numeric value
#for sorting
dated=df[(df['date_notAfter']!="no value")&(df['date_notBefore']!="no value")]
dated['date_notBefore']=pd.to_numeric(dated['date_notBefore'])
dated['date_notAfter']=pd.to_numeric(dated['date_notAfter'])


#separating out inscriptions with a possible date range greater than 100yrs
weakly_dated=dated[dated['date_notAfter']-dated['date_notBefore']>100]
dated=dated[dated['date_notAfter']-dated['date_notBefore']<100]


#saving all sets
weakly_dated.to_csv('weakly_dated_inscriptions.csv')
dated.to_csv('dated_inscriptions.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [8]:
#the data currently loaded:
dated

Unnamed: 0,xml_filenames,string,date_notBefore,date_notAfter
4,iAph080216.xml,ΛΗΣ,1,100
5,iAph110065.xml,ἕτερος δὲ οὐδεὶς ἕξει ἐξουσίαν ἐνθάψε τινὰ ...,301,400
6,iAph090039.xml,Ἰουλίαν Σεβαστὴν Σεβαστοῦ θυγατέρα Ἥραν,14,69
15,iAph010189.xml,ἡ πόλις ον Λικίννιον Οὐαλεριανὸν υἱὸν καὶ ἀδελ...,253,260
19,iAph080604.xml,τοῦτο\n νικᾳ.,301,400
22,iAph120511.xml,ἡ βουλή,1,100
24,iAph110057.xml,τὸν ἀξιολογώτατον πρῶτον ἄρχοντα δὶς ταμ...,212,300
25,iAph050119.xml,\n Παλλάδιος ἐ \nποίει καὶ ἀνέθηκεν,301,350
26,iAph120706.xml,\nἡ βουλὴ καὶ ὁ δῆμος ἐτείμησαν\n ταῖς καλλίστ...,34,67
34,iAph150315.xml,οὐδεὶς δὲ ἕτερος ἕξει ἐξουσίαν ἐνθαψαι τινα...,101,200


In [20]:
#narrowing it down further - arbitrary minimum length
testset=dated[dated['string'].apply(len)>400]
testset.to_csv('testset.csv')
#this is our only usable source of test data!  text samples may be useful for
#characterizing vowel distribution at a date but may not be good representations of the period 
#for test purposes 

In [21]:
testset

Unnamed: 0,xml_filenames,string,date_notBefore,date_notAfter
37,iAph110060.xml,παῖδα τῶν εὖ γεγονότων νείκησαντα ἐνδόξως...,241,241
44,iAph120320.xml,ἡ ἐπικειμένη τῷ θωρακείῳ σορός ἐστιν Σωτείρ...,101,200
60,iAph110110.xml,ισλ δαπανήσαντα ἀπὸ δηνάρια μυρίων εἰρηναρχή...,201,300
93,iAph120646.xml,Σαλλουστίαν Φροντεῖναν Σαλλουστίου Ῥούφου συνκ...,101,199
96,iAph120920.xml,ἔδοξε τῇ ἱερᾷ ξυστικῇ περιπολιστικῇ συνόδῳ τῶ...,138,169
120,iAph120322.xml,ὑπὸ Νεικομάχου ἀρχιερέως καὶ ἐν τῇ ...,168,233
128,iAph130005.xml,ἡ βουλὴ καὶ ὁ δῆμος καὶ ἡ γερουσία ἐτείμησαν...,151,200
162,iAph130147.xml,τοῦ Φοίβου φύσι δὲ Μενάνδρου β τοῦ Ἀγαθόποδος...,168,250
175,iAph080100.xml,αὐτοκράτωρ Καῖσαρ Μᾶρκος Ἀντώνιος Γορδιανὸς ...,238,244
178,iAph121111.xml,ἡ βουλὴ καὶ ὁ δῆμος ὁ Ἀφροδισιέων καὶ ἡ γερου...,101,200
