In [1]:
import os
import pandas as pd
from tqdm import tqdm
import xml.etree.ElementTree as et

In [2]:
def obtain_xml_files(target_directory):
    """Recursively collect the paths of all ``.xml`` files under a directory.

    The result is returned in a deterministic order (directories are visited
    in sorted order and file names are sorted within each directory), so that
    anything derived from it -- e.g. the row order of an exported table --
    is reproducible regardless of the filesystem's enumeration order.

    Args:
        target_directory: Root directory to walk.

    Returns:
        list[str]: Paths (``target_directory`` joined with the relative
        location) of every file whose name ends with ``'.xml'``. Empty if
        nothing matches or the directory cannot be walked.
    """
    xml_files = []
    for root, dirs, files in os.walk(target_directory):
        # With the default top-down walk, reordering ``dirs`` in place
        # controls the order in which subdirectories are visited.
        dirs.sort()
        xml_files.extend(
            os.path.join(root, name)
            for name in sorted(files)
            if name.endswith('.xml')
        )
    return xml_files

In [3]:
def _first_child(parent, qualified_tag):
    """Return the first direct child of ``parent`` named ``qualified_tag``.

    Raises:
        IndexError: If ``parent`` has no such child. The message names the
            missing element and its parent.
    """
    child = parent.find(qualified_tag)
    # Compare against None explicitly: an Element without children is falsy.
    if child is None:
        raise IndexError(
            "missing element " + qualified_tag + " under " + parent.tag
        )
    return child


def _typology_target(profile_desc, tag, expected_scheme):
    """Return the ``target`` of the first ``textClass/catRef`` in ``profile_desc``.

    Raises:
        ValueError: If the ``catRef`` ``scheme`` attribute differs from
            ``expected_scheme``.
        KeyError: If the ``catRef`` lacks a ``scheme`` or ``target`` attribute.
        IndexError: If ``textClass`` or ``catRef`` is missing.
    """
    text_class = _first_child(profile_desc, tag('textClass'))
    cat_ref = _first_child(text_class, tag('catRef'))
    scheme = cat_ref.attrib['scheme']
    if scheme != expected_scheme:
        raise ValueError(
            "Typological Scheme Error: expected " + expected_scheme
            + ", found " + scheme
        )
    return cat_ref.attrib['target']


def extract_type_info(xml_files, namespace='http://www.tei-c.org/ns/1.0'):
    """Build a table of text bodies and typology metadata from TEI XML files.

    Each file is parsed in full, and every ``TEI`` element that is a direct
    child of the file's root element yields one row. Files whose root has no
    direct ``TEI`` children contribute no rows.

    Args:
        xml_files: Iterable of paths to XML files.
        namespace: XML namespace URI used to qualify every element name.

    Returns:
        pandas.DataFrame: One row per ``TEI`` element, with columns

        * ``title_id`` -- text of ``teiHeader/fileDesc/titleStmt/title``.
        * ``title_content`` -- text of ``name`` inside the title found under
          ``teiHeader/fileDesc/sourceDesc/biblFull/fileDesc/titleStmt``.
        * ``body`` -- the text of each ``text/body/p`` element, every
          paragraph prefixed by a tab and separated by newlines.
        * ``domain`` -- text of ``biblFull/profileDesc/textDesc/domain``.
        * ``source_type`` -- ``target`` of the ``catRef`` under
          ``biblFull/profileDesc/textClass`` (scheme ``#Source_typology``).
        * ``carolina_type`` -- ``target`` of the ``catRef`` under
          ``teiHeader/profileDesc/textClass`` (scheme ``#Carolina_typology``).

    Raises:
        IndexError: If a required element is missing from a document.
        KeyError: If a ``catRef`` lacks its ``scheme`` or ``target`` attribute.
        ValueError: If a ``catRef`` uses an unexpected typology scheme.

    Side effects:
        Prints ``!@ <title_id>`` for every document containing a paragraph
        without text; such paragraphs are left out of ``body``.
    """
    # The registry is global and only influences serialization; the call is
    # kept so the function's module-level side effect stays unchanged.
    et.register_namespace('', namespace)

    def tag(tag_name):
        """Qualify ``tag_name`` with the namespace in ElementTree notation."""
        return '{' + namespace + '}' + tag_name

    data = {'title_id':[], 'title_content':[], 'body':[], 'domain':[], 'source_type':[], 'carolina_type':[]}
    for xml_file in tqdm(xml_files):
        root = et.parse(xml_file).getroot()
        for document in root.findall(tag('TEI')):
            header = _first_child(document, tag('teiHeader'))
            header_file_desc = _first_child(header, tag('fileDesc'))
            header_title_stmt = _first_child(header_file_desc, tag('titleStmt'))
            title_id = _first_child(header_title_stmt, tag('title')).text

            text = _first_child(document, tag('text'))
            body = _first_child(text, tag('body'))
            paragraph_texts = [p.text for p in body.findall(tag('p'))]
            if any(p_text is None for p_text in paragraph_texts):
                # Paragraphs with no leading text are dropped and the
                # document is reported so it can be inspected later.
                print('!@', title_id)
                paragraph_texts = [t for t in paragraph_texts if t is not None]
            content = "\t" + "\n\t".join(paragraph_texts)

            # Metadata describing the original source lives in biblFull.
            source_desc = _first_child(header_file_desc, tag('sourceDesc'))
            bibl_full = _first_child(source_desc, tag('biblFull'))
            source_profile_desc = _first_child(bibl_full, tag('profileDesc'))
            text_desc = _first_child(source_profile_desc, tag('textDesc'))
            domain = _first_child(text_desc, tag('domain')).text

            source_file_desc = _first_child(bibl_full, tag('fileDesc'))
            source_title_stmt = _first_child(source_file_desc, tag('titleStmt'))
            source_title = _first_child(source_title_stmt, tag('title'))
            title_content = _first_child(source_title, tag('name')).text

            source_type = _typology_target(
                source_profile_desc, tag, "#Source_typology"
            )

            header_profile_desc = _first_child(header, tag('profileDesc'))
            carolina_type = _typology_target(
                header_profile_desc, tag, "#Carolina_typology"
            )

            # Append only after every field was extracted, so that all
            # columns keep the same length even if a document raises.
            data['body'].append(content)
            data['title_id'].append(title_id)
            data['title_content'].append(title_content)
            data['source_type'].append(source_type)
            data['carolina_type'].append(carolina_type)
            data['domain'].append(domain)
    return pd.DataFrame(data)
    

In [4]:
# Gather every .xml file under the corpus directory (path is relative to the
# notebook's working directory) and parse them into a single DataFrame.
# The recorded progress bar below shows 397 files taking about 22 minutes.
xml_files = obtain_xml_files('Carolina_balanceado')
type_df = extract_type_info(xml_files)

100%|████████████████████████████████████████████████████████████████████████████████████████| 397/397 [22:12<00:00,  3.36s/it]


In [5]:
# Preview the first ten rows to sanity-check the extracted columns.
type_df.head(10)

Unnamed: 0,title_id,title_content,body,domain,source_type,carolina_type
0,WIK000865919fs,Usuário(a):Cesarsbraz,"\tSou Cesar Braz, nascido em São Bernardo do C...",Virtual,#USER_PAGE_VIR_W,#WIKIS
1,WIK000865927fs,Programas-Quadro para pesquisa e desenvolvimen...,\tOs Programas-Quadro de Pesquisa e Desenvolvi...,Instructional,#VOCABULARY_ENTRY_INS_W,#WIKIS
2,WIK000865931fs,Usuário(a):Tartarugaboladona/Testes,\tCostumes: eles costumam fazer cavalhadas e a...,Virtual,#USER_PAGE_VIR_W,#WIKIS
3,WIK000865940fs,Usuário(a):SirEdimon/Testes/2,"\tMirian Silva da Paixão (Riacho de Santana, 2...",Virtual,#USER_PAGE_VIR_W,#WIKIS
4,WIK000865943fs,Usuário(a):Abelcardoso,"\tAbel Marques de Vasconcelos Cardoso, nasceu ...",Virtual,#USER_PAGE_VIR_W,#WIKIS
5,WIK000865944fs,Usuário(a):Jpoa/Testes,"\tArmando Tavares de Sousa (Belém (Lisboa), 19...",Virtual,#USER_PAGE_VIR_W,#WIKIS
6,WIK000865947fs,Antena 1 Vitória,\tAntena 1 Vitória é uma emissora de rádio bra...,Instructional,#VOCABULARY_ENTRY_INS_W,#WIKIS
7,WIK000865949fs,Salmito,"\tJoão Salmito Filho (Fortaleza, 15 de outubro...",Instructional,#VOCABULARY_ENTRY_INS_W,#WIKIS
8,WIK000865950fs,Euroscola,"\tO Concurso Euroscola, criado em 1990, é um p...",Instructional,#VOCABULARY_ENTRY_INS_W,#WIKIS
9,WIK000865952fs,Censo da Índia de 2011,"\tTrabalho em andamento, West Sikkim O 15º Cen...",Instructional,#VOCABULARY_ENTRY_INS_W,#WIKIS


In [6]:
# (rows, columns): one row per TEI document, six extracted fields.
type_df.shape

(1113556, 6)

In [7]:
# Persist the table to CSV in the working directory; index=False leaves the
# DataFrame's row index out of the file.
type_df.to_csv('carolina_balanced_typologies.csv', index=False)