In [6]:
import os
import pickle

from lxml import etree

In [2]:
# Directory of input documents; every entry is later parsed as XML.
input_dir = '../data/CAPP/CAPP_flat'
# os.listdir returns names in arbitrary order, so sort them to make the
# processing order (and the key order of the dicts built from it) reproducible.
filenames = sorted(os.listdir(input_dir))

# Load schema

In [3]:
# Compile the XSD that every document is validated against.
# The path is handed straight to lxml instead of a text-mode file object:
# lxml expects bytes (or a filename) so that it can honour the encoding
# declared in the XML itself rather than the platform's default encoding.
xmlschema_doc = etree.parse('capp.xsd')
xmlschema = etree.XMLSchema(xmlschema_doc)

# Parse CAPP

In [4]:
def parse_document(filename):
    """Parse one XML file from ``input_dir`` and validate it against the schema.

    Args:
        filename: Name of a file located inside ``input_dir``.

    Returns:
        The parsed lxml element tree.

    Raises:
        ValueError: If the document does not validate against ``xmlschema``.
            The offending document is printed first, and the exception
            carries ``xmlschema.error_log.filter_from_errors()``.
    """
    input_path = os.path.join(input_dir, filename)

    # Let lxml open the file itself. Going through a text-mode file object
    # makes Python decode the bytes with the platform's default encoding
    # before lxml sees them, which ignores the encoding declared in the XML.
    tree = etree.parse(input_path)

    if not xmlschema.validate(tree):
        # Dump the whole document so the failure can be inspected in place.
        print(etree.tostring(tree, pretty_print=True, encoding='unicode'))
        raise ValueError(xmlschema.error_log.filter_from_errors())

    return tree

In [5]:
# Parse and validate every input file, keyed by its file name stripped of the
# extension. A counter is printed every 1000 files as a progress marker.
trees = {}
for i, filename in enumerate(filenames):
    ident, _ = os.path.splitext(filename)
    tree = trees[ident] = parse_document(filename)

    if not i % 1000:
        print(i)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000


# Convert to Python objects
NB: lxml objects cannot be pickled, so each tree is first converted to plain Python objects.

In [52]:
# Tags that objectify() converts to a single {tag: text} entry; they are
# asserted to carry no attributes.
leaf_tags = [
    'ID', 'ANCIEN_ID', 'ORIGINE', 'URL', 'NATURE', 'TITRE',
    'DATE_DEC', 'JURIDICTION', 'NUMERO', 'SOLUTION', 'FORMATION',
    'FORM_DEC_ATT', 'DATE_DEC_ATT', 'SIEGE_APPEL', 'JURI_PREM',
    'LIEU_PREM', 'DEMANDEUR', 'DEFENDEUR', 'PRESIDENT', 'AVOCAT_GL',
    'AVOCATS', 'RAPPORTEUR', 'ECLI',
]

# Tags that objectify() hands to a dedicated objectify_<TAG> helper.
special_tags = [
    'NUMEROS_AFFAIRES',
    'PUBLI_BULL',
    'BLOC_TEXTUEL',
    'SOMMAIRE',
    'CITATION_JP',
    'LIENS',
]

def objectify_NUMEROS_AFFAIRES(node):
    """Collect the text of each child of ``node`` into a list.

    Returns:
        ``{'NUMEROS_AFFAIRES': [...]}`` with one entry per child, in document
        order. The text is taken as-is (not stripped; ``None`` when absent).
    """
    return {'NUMEROS_AFFAIRES': [entry.text for entry in node]}

def objectify_PUBLI_BULL(node):
    """Flatten a PUBLI_BULL element into two dictionary entries.

    Returns:
        A dict with ``'PUBLI_BULL_publie'`` (the required ``publie``
        attribute) and ``'PUBLI_BULL_text'`` (the element text, stripped when
        non-empty, otherwise left as found).

    Raises:
        KeyError: If the ``publie`` attribute is missing.
    """
    published = node.attrib['publie']
    raw_text = node.text
    return {
        'PUBLI_BULL_publie': published,
        # Empty or missing text is passed through untouched.
        'PUBLI_BULL_text': raw_text.strip() if raw_text else raw_text,
    }

def objectify_BLOC_TEXTUEL(node):
    """Serialise the single child of ``node`` back to an XML string.

    The element is asserted to have exactly one child; that child is
    serialised with ``encoding='unicode'`` and stored under
    ``'BLOC_TEXTUEL'``.
    """
    assert len(node) == 1
    return {'BLOC_TEXTUEL': etree.tostring(node[0], encoding='unicode')}

def objectify_SOMMAIRE(node):
    """Convert the SCT and ANA children of ``node`` into a list of dicts.

    Each kept child yields ``child_type``, ``ID`` and ``text``; SCT children
    additionally yield ``TYPE``. Children with any other tag are skipped.

    Raises:
        KeyError: If a kept child lacks a required attribute.
    """
    entries = []
    for element in node:
        kind = element.tag
        if kind != 'SCT' and kind != 'ANA':
            continue

        entry = {'child_type': kind, 'ID': element.attrib['ID']}
        if kind == 'SCT':
            entry['TYPE'] = element.attrib['TYPE']
        entry['text'] = element.text
        entries.append(entry)

    return {'SOMMAIRE': entries}

def objectify_CITATION_JP(node):
    """Serialise the first child of ``node``, if there is one.

    Returns:
        ``{}`` when ``node`` has no children, otherwise
        ``{'CITATION_JP': <serialised first child>}``.
    """
    if len(node) > 0:
        # NOTE(review): encoding='utf-8' makes lxml return bytes here, while
        # objectify_BLOC_TEXTUEL uses encoding='unicode' (str) - confirm the
        # difference is intended before relying on the stored type.
        return {'CITATION_JP': etree.tostring(node[0], encoding='utf-8')}
    return {}

def objectify_LIENS(node):
    """Convert each child of a LIENS element into a dict.

    Every child contributes one dict holding its own attributes plus its text
    under the ``'text'`` key.

    Returns:
        ``{'LIENS': [...]}`` with one dict per child, in document order.
    """
    liens = []
    for child in node:
        # Read from the child itself: taking attrib/text from the parent
        # ``node`` would yield identical copies of the parent's data for
        # every child.
        lien = dict(child.attrib)
        lien['text'] = child.text
        liens.append(lien)
    return {'LIENS': liens}

    
def objectify(node):
    """Recursively convert an XML element into plain Python data.

    * A tag listed in ``leaf_tags`` becomes ``{tag: stripped_text}`` and is
      asserted to carry no attributes.
    * A tag listed in ``special_tags`` is delegated to its dedicated
      ``objectify_<TAG>`` helper.
    * Any other element is treated as a container: the dicts produced by its
      children are merged into one, and a key produced twice trips an
      assertion.

    Raises:
        ValueError: If a tag is in ``special_tags`` but has no helper.
    """
    tag = node.tag
    attributes = dict(node.attrib)
    stripped = node.text
    if stripped:
        stripped = stripped.strip()

    # Plain leaves: just the text.
    if tag in leaf_tags:
        assert not attributes
        return {tag: stripped}

    # Elements with a bespoke representation.
    if tag in special_tags:
        handlers = {
            'NUMEROS_AFFAIRES': objectify_NUMEROS_AFFAIRES,
            'PUBLI_BULL': objectify_PUBLI_BULL,
            'BLOC_TEXTUEL': objectify_BLOC_TEXTUEL,
            'SOMMAIRE': objectify_SOMMAIRE,
            'CITATION_JP': objectify_CITATION_JP,
            'LIENS': objectify_LIENS,
        }
        handler = handlers.get(tag)
        if handler is None:
            raise ValueError()
        return handler(node)

    # Containers: flatten the children's entries into a single dict.
    merged = {}
    for child in node:
        for key, value in objectify(child).items():
            assert key not in merged
            merged[key] = value
    return merged

In [56]:
# Convert every parsed tree into plain Python data, keyed like `trees`.
# A counter is printed after every 1000 converted documents.
python_trees = {}
i = 0  # stays 0 when there is nothing to convert
for i, (ident, tree) in enumerate(trees.items(), start=1):
    root = tree.getroot()
    python_tree = python_trees[ident] = objectify(root)

    if not i % 1000:
        print(i)


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000


In [58]:
# Persist the converted documents (the `python_trees` dict) to disk.
with open('../data/CAPP_obj.pickle', 'wb') as f:
    pickle.dump(python_trees, f)

In [None]:
# To reload the converted documents in a later session.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('../data/CAPP_obj.pickle', 'rb') as f:
    python_trees = pickle.load(f)