Extraemos: 

```python
{'file': xml, 'date': fecha publicacion, 'item_ids': [lista con los ids]}
```

El nombre del pdf es: id.pdf

In [1]:
import os
import xml.etree.ElementTree as ET
from datetime import datetime
import json

In [9]:
def parse_xml_to_dict(file_path):
    """Parse one summary XML file into a nested dictionary.

    The publication date is read from the file name, which is expected to
    look like ``<prefix>-YYYYMMDD.xml`` (e.g. ``sumario-20230809.xml``).
    When the name does not follow that pattern the date falls back to the
    sentinel ``"Fecha no encontrada"`` instead of raising, so a single
    oddly named file does not abort the processing of a whole folder.

    Args:
        file_path: Path to the XML file to parse.

    Returns:
        A dict with the keys ``"file"`` (base name of the file), ``"date"``
        (``dd/mm/YYYY`` string or the sentinel above) and
        ``"departamentos"``: a list of
        ``{"departamento": name, "items": [[item_id, url_pdf], ...]}``.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    root = ET.parse(file_path).getroot()

    # Derive the date from the file name: the chunk between the first "-"
    # and the following "." must be a YYYYMMDD date.
    file_name = os.path.basename(file_path)
    try:
        date_str = file_name.split("-")[1].split(".")[0]
        date = datetime.strptime(date_str, "%Y%m%d").strftime("%d/%m/%Y")
    except (IndexError, ValueError):
        # No "-" in the name, or the extracted chunk is not a valid date.
        date = "Fecha no encontrada"

    result = {
        "file": file_name,
        "date": date,
        "departamentos": []
    }

    # Walk the fixed hierarchy diario > seccion > departamento.
    for diario in root.findall('diario'):
        for seccion in diario.findall('seccion'):
            for departamento in seccion.findall('departamento'):
                dep_name = departamento.get('nombre', 'Nombre no encontrado')
                items = []

                # Only items nested inside an <epigrafe> are collected.
                for epigrafe in departamento.findall('epigrafe'):
                    for item in epigrafe.findall('item'):
                        item_id = item.get('id', 'ID no encontrado')
                        # Look the child up once and reuse the element.
                        url_node = item.find('urlPdf')
                        url_pdf = url_node.text if url_node is not None else "URL no encontrada"
                        items.append([item_id, url_pdf])

                result["departamentos"].append({
                    "departamento": dep_name,
                    "items": items
                })

    return result

In [10]:
def process_xml_folder(folder_path):
    """Parse every ``.xml`` file found directly inside *folder_path*.

    Entries are visited in the order ``os.listdir`` yields them; files whose
    name does not end in ``.xml`` are ignored.

    Args:
        folder_path: Directory containing the XML files.

    Returns:
        A list with one ``parse_xml_to_dict`` result per XML file.
    """
    return [
        parse_xml_to_dict(os.path.join(folder_path, entry))
        for entry in os.listdir(folder_path)
        if entry.endswith('.xml')
    ]

In [11]:
# Folder that holds the XML files to process
folder_path = 'xmls2023'

# Parse the whole folder into a list of dictionaries (one per XML file)
result_list = process_xml_folder(folder_path)

# Pretty-print the list as JSON; ensure_ascii=False keeps accented
# characters readable instead of emitting \uXXXX escapes
json_result_list = json.dumps(
    result_list,
    ensure_ascii=False,
    indent=4,
)
print(json_result_list)


[
    {
        "file": "sumario-20230809.xml",
        "date": "09/08/2023",
        "departamentos": [
            {
                "departamento": "MINISTERIO DE ASUNTOS EXTERIORES, UNIÓN EUROPEA Y COOPERACIÓN",
                "items": [
                    [
                        "BOE-A-2023-18210",
                        "/boe/dias/2023/08/09/pdfs/BOE-A-2023-18210.pdf"
                    ]
                ]
            },
            {
                "departamento": "MINISTERIO PARA LA TRANSICIÓN ECOLÓGICA Y EL RETO DEMOGRÁFICO",
                "items": [
                    [
                        "BOE-A-2023-18211",
                        "/boe/dias/2023/08/09/pdfs/BOE-A-2023-18211.pdf"
                    ]
                ]
            },
            {
                "departamento": "MINISTERIO DE HACIENDA Y FUNCIÓN PÚBLICA",
                "items": [
                    [
                        "BOE-A-2023-18212",
                        "/boe/dias/2023/08/09/pd

In [4]:
# Persist the collected records to disk as indented JSON
import json
with open('xml_id_fecha.json', 'w') as out_file:
    json.dump(data, out_file, indent=4)

In [10]:
import json
# Load the previously saved records back into memory
with open('xml_id_fecha.json', 'r') as in_file:
    data = json.loads(in_file.read())

In [11]:
# Keep only the records of xml_id_fecha.json whose date falls within a range
from datetime import datetime
date_format = '%d/%m/%Y'  # e.g. "02/10/2012"
start_date = datetime.strptime('01/08/2019', date_format)
end_date = datetime.strptime('31/08/2019', date_format)

# Some records carry the placeholder "Fecha no encontrada" instead of a real
# date; the placeholder check comes first so those are dropped before any
# attempt to parse them.
filtered_data = [
    record
    for record in data
    if record['date'] != "Fecha no encontrada"
    and start_date <= datetime.strptime(record['date'], date_format) <= end_date
]

In [12]:
# Write the date-filtered subset to its own indented JSON file
with open('xml_id_fecha_filtered_08-2019.json', 'w') as out_file:
    json.dump(filtered_data, out_file, indent=4)

In [3]:
# Define the course data with necessary corrections for easier manipulation
courses_data = [
    {"name": "Linear Algebra and Geometry I", "credits": 12, "grade": 6.30},
    {"name": "Infinitesimal Calculation", "credits": 12, "grade": 5.30},
    {"name": "Informatics (Computer Science)", "credits": 12, "grade": 5.50},
    {"name": "Basic Algebra", "credits": 6, "grade": 7.00},
    {"name": "Descriptive Statistics", "credits": 6, "grade": 6.50},
    {"name": "Physics I", "credits": 6, "grade": 6.50},
    {"name": "Numerical Calculus I", "credits": 6, "grade": 5.50},
    {"name": "Discrete Mathematics", "credits": 6, "grade": 5.00},
    {"name": "Optional Credits for Mobility Students I", "credits": 6, "grade": 7.20},
    {"name": "Optional Credits for Mobility Students II", "credits": 6, "grade": 5.50},
    {"name": "Optional Credits for Mobility Students III", "credits": 6, "grade": 5.00},
    {"name": "Optional Credits for Mobility Students IV", "credits": 6, "grade": 7.80},
    {"name": "Topology", "credits": 6, "grade": 7.50},
    {"name": "Economics", "credits": 12, "grade": 9.00},
    {"name": "Linear Algebra and Geometry II", "credits": 6, "grade": 6.20},
    {"name": "Differentiation of Functions of Several Variables", "credits": 6, "grade": 8.90},
    {"name": "Elements of Probability and Statistics", "credits": 6, "grade": 6.00},
    {"name": "Series of Functions and Integral of Lebesgue", "credits": 6, "grade": 8.40},
    {"name": "Numerical Calculus II", "credits": 6, "grade": 5.60},
    {"name": "Ordinary Differential Equations", "credits": 6, "grade": 6.70},
    {"name": "Physics II", "credits": 6, "grade": 7.50},
    {"name": "Integration of Functions of Several Variables", "credits": 6, "grade": 5.00},
    {"name": "Probability Theory", "credits": 6, "grade": 5.00},
    {"name": "Statistical Sampling", "credits": 12, "grade": 5.70},
    {"name": "Extension of Differential Equations", "credits": 6, "grade": 5.30},
    {"name": "Algebraic Structures", "credits": 6, "grade": 5.50},
    {"name": "Functions of a Complex Variable", "credits": 6, "grade": 6.00},
    {"name": "Local Geometry of Curves and Surfaces", "credits": 6, "grade": 5.20},
    {"name": "Statistical Inference", "credits": 6, "grade": 7.00},
    {"name": "Demography", "credits": 6, "grade": 7.60},
    {"name": "Geometry and Topology of Surfaces", "credits": 6, "grade": 6.40},
    {"name": "Mathematical Modeling", "credits": 6, "grade": 6.80},
    {"name": "Linear Models and Design of Experiments", "credits": 6, "grade": 6.70},
    {"name": "Mathematical Programming", "credits": 6, "grade": 5.30},
    {"name": "Introduction to Databases", "credits": 12, "grade": 5.40},
    {"name": "Statistical Inference II", "credits": 6, "grade": 6.50},
    {"name": "Survey Design", "credits": 6, "grade": 7.10},
    {"name": "Operations Research Models", "credits": 6, "grade": 5.00},
    {"name": "Operations Research II", "credits": 6, "grade": 6.50},
    {"name": "Multivariate Data Analysis", "credits": 6, "grade": 5.90},
    {"name": "Computational Statistics I", "credits": 6, "grade": 6.20},
    {"name": "Official Statistics", "credits": 6, "grade": 6.50},
    {"name": "Coding Theory and Cryptography", "credits": 6, "grade": 8.30},
    {"name": "Applied Geometry", "credits": 6, "grade": 5.90},
    {"name": "Design of Experiments", "credits": 6, "grade": 5.30},
    {"name": "Computational Statistics II", "credits": 6, "grade": 5.70},
    {"name": "Geometry of the Statistical Methods", "credits": 6, "grade": 9.20},
    {"name": "Time Series", "credits": 6, "grade": 6.50},
]

# Maximum number of ECTS credits the selection may add up to.
# Change this single value to compute the average over a different cap.
CREDIT_LIMIT = 228

# Sort the courses by grade, best first. sorted() is stable, so courses with
# equal grades keep their original relative order.
sorted_courses = sorted(courses_data, key=lambda x: x["grade"], reverse=True)

# Running totals for the selected courses
total_credits = 0
total_weighted_grades = 0
selected_courses = []

# Greedy selection: walk the courses from best to worst grade and take each
# one that still fits under CREDIT_LIMIT. A course that would overshoot the
# cap is skipped, but the loop keeps going, so a later (smaller) course may
# still be added.
for course in sorted_courses:
    if total_credits + course["credits"] <= CREDIT_LIMIT:
        selected_courses.append(course)
        total_credits += course["credits"]
        total_weighted_grades += course["credits"] * course["grade"]

# Credit-weighted average grade of the selected courses
weighted_average_grade = total_weighted_grades / total_credits
selected_courses, weighted_average_grade


([{'name': 'Geometry of the Statistical Methods', 'credits': 6, 'grade': 9.2},
  {'name': 'Economics', 'credits': 12, 'grade': 9.0},
  {'name': 'Differentiation of Functions of Several Variables',
   'credits': 6,
   'grade': 8.9},
  {'name': 'Series of Functions and Integral of Lebesgue',
   'credits': 6,
   'grade': 8.4},
  {'name': 'Coding Theory and Cryptography', 'credits': 6, 'grade': 8.3},
  {'name': 'Optional Credits for Mobility Students IV',
   'credits': 6,
   'grade': 7.8},
  {'name': 'Demography', 'credits': 6, 'grade': 7.6},
  {'name': 'Topology', 'credits': 6, 'grade': 7.5},
  {'name': 'Physics II', 'credits': 6, 'grade': 7.5},
  {'name': 'Optional Credits for Mobility Students I',
   'credits': 6,
   'grade': 7.2},
  {'name': 'Survey Design', 'credits': 6, 'grade': 7.1},
  {'name': 'Basic Algebra', 'credits': 6, 'grade': 7.0},
  {'name': 'Statistical Inference', 'credits': 6, 'grade': 7.0},
  {'name': 'Mathematical Modeling', 'credits': 6, 'grade': 6.8},
  {'name': 'Ord

# Incluimos departamentos

It doesn't work

In [2]:
import os
import xml.etree.ElementTree as ET

def extract_data_from_xml(directory):
    """Collect date, departments and items from every ``.xml`` file in a folder.

    Args:
        directory: Folder whose ``.xml`` files are parsed; other entries are
            ignored.

    Returns:
        A list with one dict per XML file, shaped as
        ``{'file': name, 'date': text, 'departamentos': [...]}``, where each
        department is ``{'departamento': name, 'items': [(id, url_pdf), ...]}``.
    """
    records = []
    for filename in os.listdir(directory):
        if not filename.endswith('.xml'):
            continue

        root = ET.parse(os.path.join(directory, filename)).getroot()

        # Publication date: text of the first <fecha> found anywhere in the tree
        fecha_node = root.find('.//fecha')
        if fecha_node is None:
            date = "date not found"
        else:
            date = fecha_node.text

        # Every <departamento> at any depth; each one may hold several <item>
        # elements at any depth below it, from which id and urlPdf are taken.
        departamentos_data = []
        for departamento in root.findall('.//departamento'):
            items_data = []
            for item in departamento.findall('.//item'):
                url_node = item.find('urlPdf')
                url_pdf = "URL not found" if url_node is None else url_node.text
                items_data.append((item.get('id'), url_pdf))
            departamentos_data.append({
                'departamento': departamento.get('nombre'),
                'items': items_data,
            })

        records.append({
            'file': filename,
            'date': date,
            'departamentos': departamentos_data,
        })

    return records

# Extract the records from every XML file in the 'xmls' folder
data = extract_data_from_xml('xmls')

# Persist the extracted records to disk as indented JSON
import json
with open('xml_departamento_id_fecha.json', 'w') as out_file:
    json.dump(data, out_file, indent=4)