In [1]:
import os
import polars as pl
from tqdm import tqdm
import xml.etree.ElementTree as et

In [2]:
def obtain_xml_files(target_directory):
    """Recursively collect the paths of all ``.xml`` files under a directory.

    Args:
        target_directory: Root directory to walk.

    Returns:
        A list of paths (each directory joined with the file name), in the
        order ``os.walk`` yields them. The suffix check is case-sensitive, so
        a file ending in ``.XML`` is not included.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(target_directory)
        for name in names
        if name.endswith('.xml')
    ]

In [3]:
def _first_child(parent, namespace, *names):
    """Follow ``names`` downward from ``parent``, one nesting level per name.

    At every step the first direct child carrying that namespace-qualified tag
    is taken (the same element ``findall(tag)[0]`` would select).

    Raises:
        IndexError: if a step has no matching child; the message names the
            missing tag and the element that was searched.
    """
    node = parent
    for name in names:
        child = node.find('{' + namespace + '}' + name)
        # Compare with None explicitly: an Element without children can
        # evaluate as false, so a plain truth test would be wrong here.
        if child is None:
            raise IndexError('missing <' + name + '> element under <' + node.tag + '>')
        node = child
    return node


def _typology_target(profile_desc, namespace, expected_scheme):
    """Return the ``target`` of the first ``textClass/catRef`` in ``profile_desc``.

    Raises:
        ValueError: if the catRef ``scheme`` attribute differs from
            ``expected_scheme``.
        KeyError: if the ``scheme`` or ``target`` attribute is absent.
    """
    cat_ref = _first_child(profile_desc, namespace, 'textClass', 'catRef')
    scheme = cat_ref.attrib['scheme']
    if scheme != expected_scheme:
        raise ValueError("Typological Scheme Error: expected " + expected_scheme + ", found " + scheme)
    return cat_ref.attrib['target']


def _extract_document(document, namespace):
    """Pull the six output fields out of a single ``TEI`` element as a dict."""
    header = _first_child(document, namespace, 'teiHeader')
    file_desc = _first_child(header, namespace, 'fileDesc')
    title_id = _first_child(file_desc, namespace, 'titleStmt', 'title').text

    body = _first_child(document, namespace, 'text', 'body')
    paragraphs = body.findall('{' + namespace + '}p')
    # Paragraphs without text (p.text is None) are skipped. Only p.text is
    # used, so text that follows a nested child element is not included.
    content = "\t" + "\n\t".join(p.text for p in paragraphs if p.text is not None)

    # Metadata about the original source lives under sourceDesc/biblFull.
    bibl_full = _first_child(file_desc, namespace, 'sourceDesc', 'biblFull')
    source_profile = _first_child(bibl_full, namespace, 'profileDesc')
    domain = _first_child(source_profile, namespace, 'textDesc', 'domain').text
    title_content = _first_child(
        bibl_full, namespace, 'fileDesc', 'titleStmt', 'title', 'name'
    ).text
    source_type = _typology_target(source_profile, namespace, "#Source_typology")

    # The second typology comes from the profileDesc directly under teiHeader.
    header_profile = _first_child(header, namespace, 'profileDesc')
    carolina_type = _typology_target(header_profile, namespace, "#Carolina_typology")

    return {
        'title_id': title_id,
        'title_content': title_content,
        'body': content,
        'domain': domain,
        'source_type': source_type,
        'carolina_type': carolina_type,
    }


def extract_type_info(xml_files, namespace='http://www.tei-c.org/ns/1.0'):
    """Parse TEI XML files into a DataFrame with one row per ``TEI`` element.

    Each file is parsed and every ``TEI`` element that is a direct child of
    the file's root element contributes one row.

    Args:
        xml_files: Iterable of paths to XML files.
        namespace: XML namespace URI used to qualify every tag that is looked up.

    Returns:
        A polars DataFrame with the columns ``title_id``, ``title_content``,
        ``body``, ``domain``, ``source_type`` and ``carolina_type`` (in that
        order). ``body`` is the paragraph texts joined with ``"\\n\\t"`` and
        prefixed with a tab.

    Raises:
        IndexError: if a required element is missing; the message contains
            the file path and the missing tag.
        ValueError: if a ``catRef`` does not carry the expected scheme.
        KeyError: if a ``catRef`` lacks the ``scheme`` or ``target`` attribute.
    """
    # Kept from the original implementation: this changes global ElementTree
    # state and only influences how trees are serialized, not how they are parsed.
    et.register_namespace('', namespace)
    tei_tag = '{' + namespace + '}TEI'
    data = {'title_id':[], 'title_content':[], 'body':[], 'domain':[], 'source_type':[], 'carolina_type':[]}
    for xml_file in tqdm(xml_files):
        tree = et.parse(xml_file)
        for document in tree.findall(tei_tag):
            try:
                record = _extract_document(document, namespace)
            except IndexError as err:
                # Same exception type as before, but say which file was malformed.
                raise IndexError(f"{xml_file}: {err}") from err
            # Append only after the whole record was extracted, so a failing
            # document never leaves the columns with different lengths.
            for column, value in record.items():
                data[column].append(value)
    return pl.DataFrame(data)
    

In [4]:
# Corpus root to scan; a relative path, so it is resolved against the
# notebook's current working directory.
CORPUS_DIR = 'Carolina_balanceado'

# Collect every .xml file under the corpus root, then parse them all into a
# single DataFrame.
xml_files = obtain_xml_files(CORPUS_DIR)
type_df = extract_type_info(xml_files)

100%|█████████████████████████████████████████████████████████| 396/396 [04:28<00:00,  1.48it/s]


In [7]:
# Preview ten random rows. The seed is fixed so that re-running the cell shows
# the same rows; an unseeded sample changes on every execution.
type_df.sample(10, seed=42)

title_id,title_content,body,domain,source_type,carolina_type
str,str,str,str,str,str
"""WIK000958146gk""","""Leila Barros é escolhida procu…","""	Em sessão remota nesta terça-…","""Journalistic""","""#NEWS_JOU_W""","""#WIKIS"""
"""DAT000503374cw""",,"""	@passozx poxa :(""","""Virtual""","""#TWEET_VIR_W""","""#DATASETS_AND_OTHER_CORPORA"""
"""DAT000951987ei""",,"""	bom dia, bebê! como você está…","""Virtual""","""#TWEET_VIR_W""","""#DATASETS_AND_OTHER_CORPORA"""
"""DAT000326787aw""","""Cresce discriminação contra mu…","""	Recém-saída da faculdade, Ang…","""Journalistic""","""#NEWS_JOU_W""","""#DATASETS_AND_OTHER_CORPORA"""
"""DAT000483843cv""",,"""	O Nani já não dá mortais? :(""","""Virtual""","""#TWEET_VIR_W""","""#DATASETS_AND_OTHER_CORPORA"""
"""WIK000146593am""","""Agostinho da Silva""","""	George Agostinho Baptista da …","""Instructional""","""#VOCABULARY_ENTRY_INS_W""","""#WIKIS"""
"""WIK000227187bd""","""Clássico Matuto""","""	O Clássico Mestre Vitalino ou…","""Instructional""","""#VOCABULARY_ENTRY_INS_W""","""#WIKIS"""
"""DAT000500658cw""",,"""	Já não existe :( https://t.co…","""Virtual""","""#TWEET_VIR_W""","""#DATASETS_AND_OTHER_CORPORA"""
"""WIK000438212cr""","""Discussão:A Torre Negra""","""	Category:!Artigos com qualida…","""Virtual""","""#DISCUSSION_VIR_W""","""#WIKIS"""
"""DAT000743457dr""",,"""	@flossombrazil O fandom tá cr…","""Virtual""","""#TWEET_VIR_W""","""#DATASETS_AND_OTHER_CORPORA"""


In [8]:
# Sanity check on the extracted table: (number of rows, number of columns).
type_df.shape

(1111829, 6)

In [9]:
# Write the extracted table to a Parquet file (relative path, so it lands in
# the notebook's current working directory).
OUTPUT_PARQUET = 'carolina_balanced_typologies.parquet'
type_df.write_parquet(OUTPUT_PARQUET)