# Parse scielo XML

In [19]:
from lxml import etree

class Scielo_XML(object):
    """Extract metadata, references and body text from one SciELO article XML.

    The document is parsed once in the constructor; every ``get_*`` method
    then runs an absolute XPath query against the parsed tree and returns
    plain dicts / lists / strings.

    Citation and name fields are read by child position, not by tag name.
    A child that is missing yields ``None`` for that field instead of raising.
    """

    def __init__(self, url_xml):
        """Parse the document and cache its article identifier.

        Args:
            url_xml: Source handed unchanged to ``etree.parse``.

        Raises:
            IndexError: If the document has no
                ``/article/front/article-meta/article-id`` element.
        """
        self.tree = etree.parse(url_xml)
        self.scielo_id = self.tree.xpath('/article/front/article-meta/article-id')[0].text

    @staticmethod
    def _child_text(element, index):
        """Return the text of the child at ``index``, or None if there is no such child."""
        if index < len(element):
            return element[index].text
        return None

    def get_title(self):
        """Return ``{'scielo_id', 'title'}``; title is ``''`` when no title element exists."""
        matches = self.tree.xpath('/article/front/article-meta/title-group/article-title')
        return {
            'scielo_id': self.scielo_id,
            'title': matches[0].text if matches else '',
        }

    def get_abstract(self):
        """Return ``{'scielo_id', 'abstract'}`` for the ``xml:lang="pt"`` abstract.

        The abstract text is taken from the first child of the abstract
        element. It is ``''`` when there is no such abstract or when the
        abstract element has no children.
        """
        matches = self.tree.xpath('/article/front/article-meta/abstract[@xml:lang="pt"]')
        if matches and len(matches[0]) > 0:
            abstract = matches[0][0].text
        else:
            # Previously a childless abstract raised IndexError.
            abstract = ''
        return {'scielo_id': self.scielo_id, 'abstract': abstract}

    def get_keywords(self):
        """Return the text of every ``kwd`` element whose ``lng`` attribute is ``"pt"``."""
        found = self.tree.xpath('/article/front/article-meta/kwd-group/kwd[@lng="pt"]')
        return [kwd.text for kwd in found]

    def get_autores(self):
        """Return one dict per author ``name`` element of the article.

        Keys: ``scielo_id``, ``seq`` (1-based position, as a string),
        ``aff_id`` (``'Af'`` + seq), ``surname`` (first child's text) and
        ``given_names`` (second child's text). A missing child gives ``None``.
        """
        nomes = self.tree.xpath('/article/front/article-meta/contrib-group/contrib[@contrib-type="author"]/name')
        autores = []
        for seq, nome in enumerate(nomes, start=1):
            autores.append({
                'scielo_id': self.scielo_id,
                'seq': str(seq),
                # NOTE(review): aff_id is derived from the author's position,
                # not read from the XML - confirm this matches the data.
                'aff_id': 'Af' + str(seq),
                'surname': self._child_text(nome, 0),
                'given_names': self._child_text(nome, 1),
            })
        return autores

    def get_referencias(self):
        """Return one dict per ``journal`` or ``book`` reference.

        Every dict carries ``scielo_id``, ``ref_id`` and ``citation_type``.
        Journal entries add ``article_title``, ``article_lang``, ``source``,
        ``year``, ``volume`` and ``page_range`` (citation children 1-5);
        book entries add ``source``, ``year``, ``publisher_loc`` and
        ``publisher_name`` (citation children 1-4). References of any other
        citation type, and ``ref`` elements without children, are left out.
        """
        text = self._child_text
        referencias = []
        for ref in self.tree.xpath('/article/back/ref-list/ref'):
            if len(ref) == 0:
                continue
            citation = ref[0]
            citation_type = citation.get('citation-type')
            # A fresh dict per reference, so nothing leaks between iterations.
            referencia = {
                'scielo_id': self.scielo_id,
                'ref_id': ref.get('id'),
                'citation_type': citation_type,
            }
            if citation_type == 'journal':
                referencia['article_title'] = text(citation, 1)
                # article_lang is the concatenation of all attribute values
                # of the title child (empty when that child is missing).
                title_attrs = citation[1].attrib.values() if len(citation) > 1 else ()
                referencia['article_lang'] = ''.join(title_attrs)
                referencia['source'] = text(citation, 2)
                referencia['year'] = text(citation, 3)
                referencia['volume'] = text(citation, 4)
                referencia['page_range'] = text(citation, 5)
            elif citation_type == 'book':
                referencia['source'] = text(citation, 1)
                referencia['year'] = text(citation, 2)
                referencia['publisher_loc'] = text(citation, 3)
                referencia['publisher_name'] = text(citation, 4)
            else:
                continue
            referencias.append(referencia)
        return referencias

    def get_autores_ref(self):
        """Return ``{'scielo_id', 'surname', 'given_names'}`` for every reference author.

        Looks at each child of every ``person-group`` of type ``author``
        inside an ``nlm-citation``. Children that have no sub-elements carry
        no name parts and are skipped (they used to raise IndexError).
        """
        entradas = self.tree.xpath('/article/back/ref-list/ref/nlm-citation/person-group[@person-group-type="author"]/*')
        ref_autores = []
        for entrada in entradas:
            if len(entrada) == 0:
                continue
            ref_autores.append({
                'scielo_id': self.scielo_id,
                'surname': self._child_text(entrada, 0),
                'given_names': self._child_text(entrada, 1),
            })
        return ref_autores

    def get_referencia_autores(self):
        """Return one dict per person listed in the leading person-group of each reference.

        Keys: ``scielo_id``, ``ref_id``, ``citation_type``, ``person_group``
        (the group's ``person-group-type``), ``seq`` (1-based, as a string),
        ``surname``, ``given_names`` and ``source``.

        ``source`` is the text of the citation's second child.
        NOTE(review): for journal citations laid out the way
        ``get_referencias`` reads them, that child is the article title
        rather than the journal - confirm this is intended.

        A reference is skipped when it has no citation child, when the
        citation is empty, or when the citation's first child has no
        ``person-group-type`` attribute. Group members without sub-elements
        are skipped as well.
        """
        ref_autores = []
        for ref in self.tree.xpath('/article/back/ref-list/ref'):
            if len(ref) == 0 or len(ref[0]) == 0:
                continue
            citation = ref[0]
            grupo = citation[0]
            person_group = grupo.get('person-group-type')
            if person_group is None:
                # The first child is not a person group. The original
                # `!= []` / `!= {}` comparisons never detected this case.
                continue
            ref_id = ref.get('id')
            citation_type = citation.get('citation-type')
            source = self._child_text(citation, 1)
            seq = 1
            for autor in grupo:
                if len(autor) == 0:
                    continue
                ref_autores.append({
                    'scielo_id': self.scielo_id,
                    'ref_id': ref_id,
                    'citation_type': citation_type,
                    'person_group': person_group,
                    'seq': str(seq),
                    'surname': self._child_text(autor, 0),
                    'given_names': self._child_text(autor, 1),
                    'source': source,
                })
                seq += 1
        return ref_autores

    def get_texto(self):
        """Return the concatenated direct text of every ``/article/body`` element.

        Only each body's own leading text is used; text nested in child
        elements is not collected.
        """
        corpos = self.tree.xpath('/article/body')
        return ''.join(corpo.text for corpo in corpos if corpo.text is not None)

Exemplo de utilização:

# Example 1: parse one article and print the text of its body.
url_xml = 'file:///home/neilor/SCIELO_DADOS/dados/artigos_xml/S1415-47571998000100002.xml'
xml = Scielo_XML(url_xml)
print(xml.get_texto())

# Example 2: parse another article and print each reference-author record,
# followed by a dashed separator line.
url_xml = 'file:///home/neilor/SCIELO_DADOS/dados/artigos_xml/S0034-73292013000200005.xml'
xml = Scielo_XML(url_xml)
refs = xml.get_referencia_autores()
print("============================")
separator = '-----------------------------------------'
for ref in refs:
    print(ref)
    print(separator)