## Uso simple

In [1]:
# Four sample lines of text; none of them carries a trailing newline.
lines = [
    "En este film velado en blanca noche",
    "El hijo tenaz de tu enemigo",
    "El muy verdugo cena distinguido",
    "Una noche de cristal que se hace añicos",
]

# Dump the lines to a scratch file. print() appends the "\n" that the
# strings themselves lack, so every item ends up on its own line.
with open("/tmp/archivoprueba", "wt", encoding="utf8") as fh:
    for line in lines:
        print(line, file=fh)

# Read the whole file back in a single call and show it.
with open("/tmp/archivoprueba", "rt", encoding="utf8") as fh:
    print(fh.read())

En este film velado en blanca noche
El hijo tenaz de tu enemigo
El muy verdugo cena distinguido
Una noche de cristal que se hace añicos



In [2]:
# Iterating over the file object yields one line at a time, each one still
# carrying its trailing newline (visible in the repr and counted by len()).
with open("/tmp/archivoprueba", "rt", encoding="utf8") as fh:
    for line in fh:
        print(f"{len(line)} {line!r}")

36 'En este film velado en blanca noche\n'
28 'El hijo tenaz de tu enemigo\n'
32 'El muy verdugo cena distinguido\n'
40 'Una noche de cristal que se hace añicos\n'


In [3]:
# Load every line (newlines included) and keep the one with most characters;
# on a tie max() returns the first of the equally long lines.
with open("/tmp/archivoprueba", "rt", encoding="utf8") as fh:
    all_lines = fh.readlines()
longest = max(all_lines, key=len)
print("Más larga:", f"{longest!r}")

Más larga: 'Una noche de cristal que se hace añicos\n'


## Procesamiento del archivo Maintainers

Cada línea del archivo tiene un formato poco rígido: el paquete, 
uno o más espacios, y quien(es) lo mantiene(n) (en su gran mayoría 
una sola entidad). Se lo puede descargar de [aquí](http://ftp.uk.debian.org/debian/indices/Maintainers)

En la parte de los mantenedores tenemos que soportar los siguientes 
casos (cada uno con un ejemplo):

- el caso más usado de nombre (con espacios) y mail:
    `Debian Games Team <pkg-games-devel@lists.alioth.debian.org>`
    
- lo mismo pero con una coma al final:
    `Daniel Baumann <daniel.baumann@progress-linux.org>,`
    
- múltiples mantenedores:
    `Russell Coker <russell@coer.com.au>, Taihsiang Ho <e@tai21828.me>`
    
- nombres con coma:
    `TransNexus, Inc. <support@transnexus.com>`
    
- sólo el mail (lo consideramos el nombre):
    `<joe@coldsystems.com>`

In [4]:
from collections import Counter

# Maps each maintainer name to the number of packages it appears on.
names_counter = Counter()

with open("Maintainers", "rt", encoding="utf8") as fh:
    for line in fh:
        # Split on whitespace at most once, so the text holding all the
        # maintainers stays intact.
        fields = line.strip().split(maxsplit=1)
        if len(fields) < 2:
            # Blank line, or a package with no maintainer text: nothing to count.
            continue
        package, maintainers = fields

        # Split on "greater-than + comma" rather than on a bare comma, so
        # names that contain a comma are not cut in half.
        for maintainer in maintainers.split(">,"):
            if not maintainer.strip():
                # Empty piece left behind by a trailing ">," on the line.
                continue

            # Everything before the first "<" is the name, the rest is the mail.
            name, separator, _ = maintainer.partition("<")
            if not separator or not name.strip():
                # No separate name was given: the name *is* the mail.
                name = maintainer.replace("<", "").replace(">", "")

            name = name.strip()
            if name:
                names_counter[name] += 1

for name, quantity in names_counter.most_common(5):
    print(f"{quantity:3d}  {name}")

9002  Debian Kernel Team
7392  Debian GCC Maintainers
6544  Debian Rust Maintainers
5614  Debian Python Team
5010  Debian Haskell Group


## Trabajando con CSV

In [5]:
import csv

# Two rows mixing the values that usually break hand-made CSV handling:
# empty strings, numbers, both kinds of quotes, delimiters and a newline.
original = [
    ["algún texto", 234, "", 'con comilla (")'],
    ["con tilde (')", 14.3, "coma y punto y coma (,;)", "con newline (\n)"],
]

# Files handed to the csv module must be opened with newline="": the writer
# emits its own "\r\n" terminators, and platform newline translation would
# otherwise turn them into "\r\r\n" on Windows.
# NOTE: the encoding is left at the platform default on purpose, to match the
# other cells that read this same file.
with open("/tmp/prueba.csv", "wt", newline="") as fh:
    writer = csv.writer(fh)
    writer.writerows(original)

# Show the raw file content as plain text (not parsed as CSV).
with open("/tmp/prueba.csv", "rt") as fh:
    print(fh.read())
        

algún texto,234,,"con comilla ("")"
con tilde ('),14.3,"coma y punto y coma (,;)","con newline (
)"



In [6]:
# Naive approach: treat every physical line as a record and cut it at each
# comma, without any knowledge of CSV quoting rules.
with open("/tmp/prueba.csv", "rt") as fh:
    for line in fh:
        pieces = line.split(",")
        print(pieces)

['algún texto', '234', '', '"con comilla ("")"\n']
["con tilde (')", '14.3', '"coma y punto y coma (', ';)"', '"con newline (\n']
[')"\n']


In [7]:
# Let the csv module do the parsing. The file is opened with newline="" as
# the csv documentation requires, so line endings inside quoted fields reach
# the reader untouched.
with open("/tmp/prueba.csv", "rt", newline="") as fh:
    reader = csv.reader(fh)
    for row in reader:
        print(row)

['algún texto', '234', '', 'con comilla (")']
["con tilde (')", '14.3', 'coma y punto y coma (,;)', 'con newline (\n)']


In [8]:
# Rewrite the file quoting every non-numeric field. newline="" is what the
# csv module expects from the files it is given (it writes its own "\r\n").
with open("/tmp/prueba.csv", "wt", newline="") as fh:
    writer = csv.writer(fh, quoting=csv.QUOTE_NONNUMERIC)
    writer.writerows(original)

# With the same quoting on the reader, unquoted fields come back as floats.
with open("/tmp/prueba.csv", "rt", newline="") as fh:
    reader = csv.reader(fh, quoting=csv.QUOTE_NONNUMERIC)
    for row in reader:
        print(row)

['algún texto', 234.0, '', 'con comilla (")']
["con tilde (')", 14.3, 'coma y punto y coma (,;)', 'con newline (\n)']


In [9]:
# Access the fields by position; QUOTE_NONNUMERIC makes the reader convert
# unquoted fields to float, which is why the ".2f" format works on row[1].
# newline="" is the way the csv module asks its input files to be opened.
with open("/tmp/prueba.csv", "rt", newline="") as fh:
    reader = csv.reader(fh, quoting=csv.QUOTE_NONNUMERIC)
    for row in reader:
        print(f"Cadena {row[3]!r} con valor {row[1]:.2f}")

Cadena 'con comilla (")' con valor 234.00
Cadena 'con newline (\n)' con valor 14.30


In [10]:
# Column names, given explicitly because they are passed to DictReader below
# instead of being read from the file.
fieldnames = ["column1", "value", "column3", "text"]

# Each row is now a dict keyed by column name, so the template can refer to
# the fields by name. newline="" is the way the csv module asks its input
# files to be opened.
with open("/tmp/prueba.csv", "rt", newline="") as fh:
    reader = csv.DictReader(fh, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
    for row in reader:
        print("Cadena {text!r} con valor {value:.2f}".format_map(row))

Cadena 'con comilla (")' con valor 234.00
Cadena 'con newline (\n)' con valor 14.30


## Trabajando con JSON

In [11]:
import json

# One sample value for each kind of Python object shown below.
structures = [
    "una cadena unicode áÑ",  # a unicode string
    35,  # an integer
    2.3,  # a floating point number
    [1, "hola", [3, 2], 'comilla "'],  # a list holding mixed items (even another list)
    (1, 2, 3, True, False, None),  # a tuple
    {"a": 3, "b": [], "c": ""},  # a dictionary; note every key here is a string
]

# Pair every structure with its JSON serialization and show both reprs.
for structure, encoded in zip(structures, map(json.dumps, structures)):
    print(f"Python: {structure!r} - JSON: {encoded!r}")

Python: 'una cadena unicode áÑ' - JSON: '"una cadena unicode \\u00e1\\u00d1"'
Python: 35 - JSON: '35'
Python: 2.3 - JSON: '2.3'
Python: [1, 'hola', [3, 2], 'comilla "'] - JSON: '[1, "hola", [3, 2], "comilla \\""]'
Python: (1, 2, 3, True, False, None) - JSON: '[1, 2, 3, true, false, null]'
Python: {'a': 3, 'b': [], 'c': ''} - JSON: '{"a": 3, "b": [], "c": ""}'


In [12]:
# Round trip: tuple -> JSON text -> Python object again.
the_tuple = (1, 2, 3, True, False, None)
encoded = json.dumps(the_tuple)
decoded = json.loads(encoded)

# Show the three stages, in order, using their reprs.
for label, value in (
    ("Original:", the_tuple),
    ("El JSON:", encoded),
    ("Volvimos:", decoded),
):
    print(label, repr(value))

Original: (1, 2, 3, True, False, None)
El JSON: '[1, 2, 3, true, false, null]'
Volvimos: [1, 2, 3, True, False, None]


In [13]:
# Small dictionary to be stored on disk as JSON.
somestuff = {
    "answer": 42,
    "message": "hola campeón",
    "sequence": [1, 2, 3.14],
}

# Serialize to text and write it to the file...
with open("/tmp/prueba.json", "wt", encoding="utf8") as fh:
    fh.write(json.dumps(somestuff))

# ...then read the text back and rebuild the Python object from it.
with open("/tmp/prueba.json", "rt", encoding="utf8") as fh:
    loaded = json.loads(fh.read())
print(loaded)

{'answer': 42, 'message': 'hola campeón', 'sequence': [1, 2, 3.14]}


In [14]:
import requests

forecast_url = "https://api.open-meteo.com/v1/forecast"

# Ask for the current weather at the coordinates of the museum.
lat, long = -34.9153435, -57.9331994
params = dict(latitude=lat, longitude=long, current_weather=True)

# Ask the server not to compress the body, because the raw (undecoded)
# stream is what gets printed below.
headers = {"Accept-Encoding": "identity"}

# stream=True leaves the body unread, available through resp.raw.
# The timeout keeps the call from hanging forever on an unresponsive server.
resp = requests.get(
    forecast_url, stream=True, headers=headers, params=params, timeout=30
)
# Fail right here on an HTTP error instead of on a confusing payload later.
resp.raise_for_status()
print(resp.headers["Content-Type"])
print(resp.raw)

application/json; charset=utf-8
<urllib3.response.HTTPResponse object at 0x7352b41ff580>


In [15]:
# Read the raw body of the response and parse it as JSON.
data = json.loads(resp.raw.read())
# Keep only the "current_weather" entry and display it as the cell output.
weather = data["current_weather"]
weather

{'time': '2024-09-24T12:00',
 'interval': 900,
 'temperature': 16.0,
 'windspeed': 23.3,
 'winddirection': 107,
 'is_day': 1,
 'weathercode': 0}

In [16]:
import datetime

def decoder(json_dict):
    """Turn the ISO-format date strings of a decoded JSON object into datetimes.

    Every value is offered to ``datetime.datetime.fromisoformat``; the ones it
    rejects (``ValueError`` for other strings, ``TypeError`` for non-strings)
    are left as they were.

    The dictionary is modified in place and that same dictionary is returned.
    """
    for key in json_dict:
        try:
            parsed = datetime.datetime.fromisoformat(json_dict[key])
        except (ValueError, TypeError):
            # Not an ISO date/time: keep the original value.
            continue
        # Only existing keys are reassigned, so the size of the dict never
        # changes while it is being iterated.
        json_dict[key] = parsed
    return json_dict
    
# Same request as before, this time decoding the JSON with a hook that is
# called for every decoded JSON object.
# The timeout keeps the call from hanging forever on an unresponsive server.
resp = requests.get(
    forecast_url, stream=True, headers=headers, params=params, timeout=30
)
# Fail right here on an HTTP error instead of on a confusing payload later.
resp.raise_for_status()
data = json.load(resp.raw, object_hook=decoder)
weather = data["current_weather"]
weather

{'time': datetime.datetime(2024, 9, 24, 12, 0),
 'interval': 900,
 'temperature': 16.0,
 'windspeed': 23.3,
 'winddirection': 107,
 'is_day': 1,
 'weathercode': 0}

## Trabajando con XML

In [17]:
# Download the XML export of the "Argentina" article.
# The timeout keeps the call from hanging forever on an unresponsive server.
resp = requests.get(
    "https://es.wikipedia.org/wiki/Especial:Exportar/Argentina", timeout=30
)
# Stop here on an HTTP error rather than trying to parse an error page as XML.
resp.raise_for_status()
fullxml = resp.text
len(fullxml)

118595

In [18]:
import textwrap

# Preview of the document: wrapped at 80 columns and cut after 10 lines.
wrapped = textwrap.wrap(fullxml, width=80, max_lines=10)
for chunk in wrapped:
    print(chunk)

<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.11/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.11/
http://www.mediawiki.org/xml/export-0.11.xsd" version="0.11" xml:lang="es">
<siteinfo>     <sitename>Wikipedia</sitename>     <dbname>eswiki</dbname>
<base>https://es.wikipedia.org/wiki/Wikipedia:Portada</base>
<generator>MediaWiki 1.43.0-wmf.23</generator>     <case>first-letter</case>
<namespaces>       <namespace key="-2" case="first-letter">Medio</namespace>
<namespace key="-1" case="first-letter">Especial</namespace>       <namespace
key="0" case="first-letter" />       <namespace key="1" case="first- [...]


In [19]:
from xml.etree import ElementTree

# Parse the whole document; fromstring() hands back its root element.
node = ElementTree.fromstring(fullxml)
print("El primer nodo:", node)
print("Su tag:", node.tag)

# Attributes of the root element, shown as key/value reprs.
print("Atributos:")
for key in node.attrib:
    value = node.attrib[key]
    print(f"    {key!r}: {value!r}")

# Text found right after the opening tag of the root.
print("Texto útil no tiene:", repr(node.text))

# Iterating over an element yields its direct children.
print("Pero sí nodos hijos:")
for child in list(node):
    print("    ", child)

El primer nodo: <Element '{http://www.mediawiki.org/xml/export-0.11/}mediawiki' at 0x7352af5dd210>
Su tag: {http://www.mediawiki.org/xml/export-0.11/}mediawiki
Atributos:
    '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://www.mediawiki.org/xml/export-0.11/ http://www.mediawiki.org/xml/export-0.11.xsd'
    'version': '0.11'
    '{http://www.w3.org/XML/1998/namespace}lang': 'es'
Texto útil no tiene: '\n  '
Pero sí nodos hijos:
     <Element '{http://www.mediawiki.org/xml/export-0.11/}siteinfo' at 0x7352af5df380>
     <Element '{http://www.mediawiki.org/xml/export-0.11/}page' at 0x7352af5b9850>


In [20]:
# Prefix map for the namespace used by every tag looked up below: passing it
# to find() allows the short "mw:tag" form instead of repeating the full
# "{uri}tag" on each call.
ns = {"mw": "http://www.mediawiki.org/xml/export-0.11/"}

page = node.find("mw:page", ns)
title = page.find("mw:title", ns)
revision = page.find("mw:revision", ns)
rev_id = revision.find("mw:id", ns)
rev_tstamp = revision.find("mw:timestamp", ns)
rev_text = revision.find("mw:text", ns)
# The length is taken from the "bytes" attribute of the text node.
textlen = rev_text.attrib["bytes"]
print(f"Página {title.text!r} r.{rev_id.text} ({rev_tstamp.text}) largo={textlen}")

Página 'Argentina' r.162365205 (2024-09-09T23:59:56Z) largo=115597


In [21]:
from xml.dom.minidom import parseString

# Build the DOM of the whole document and take the first child of the
# document node.
dom = parseString(fullxml)
node = dom.firstChild

print("El primer nodo:", node)
print("Su tag:", node.nodeName)

# Attributes of that node, shown as key/value reprs.
print("Atributos:")
for key, value in node.attributes.items():
    print(f"    {key!r}: {value!r}")

# Every direct child node is listed, whatever its type.
print('Los nodos "hijos":')
children = node.childNodes
for child in children:
    print("    ", child)

El primer nodo: <DOM Element: mediawiki at 0x7352b425fad0>
Su tag: mediawiki
Atributos:
    'xmlns': 'http://www.mediawiki.org/xml/export-0.11/'
    'xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance'
    'xsi:schemaLocation': 'http://www.mediawiki.org/xml/export-0.11/ http://www.mediawiki.org/xml/export-0.11.xsd'
    'version': '0.11'
    'xml:lang': 'es'
Los nodos "hijos":
     <DOM Text node "'\n  '">
     <DOM Element: siteinfo at 0x7352b425f770>
     <DOM Text node "'\n  '">
     <DOM Element: page at 0x7352af5d7020>
     <DOM Text node "'\n'">


In [22]:
# getElementsByTagName() returns every matching element; the first match is
# taken in each case.
page = dom.getElementsByTagName("page")[0]
revision = page.getElementsByTagName("revision")[0]

# Title of the page: the data of the first child node of <title>.
title = page.getElementsByTagName("title")[0]
title_text = title.firstChild.data

# Id and timestamp of the revision, obtained the same way.
rev_id = revision.getElementsByTagName("id")[0]
rev_id_text = rev_id.firstChild.data
rev_tstamp = revision.getElementsByTagName("timestamp")[0]
rev_tstamp_text = rev_tstamp.firstChild.data

# The length comes from the "bytes" attribute of <text>; the attribute is a
# node too, and its value is the data of its first child.
rev_text = revision.getElementsByTagName("text")[0]
textlen = rev_text.attributes["bytes"].firstChild.data

print(f"Página {title_text!r} r.{rev_id_text} ({rev_tstamp_text}) largo={textlen}")

Página 'Argentina' r.162365205 (2024-09-09T23:59:56Z) largo=115597


In [23]:
import xml.sax

class Handler(xml.sax.handler.ContentHandler):
    """SAX handler that collects a few selected pieces of the document.

    The current position in the tree is tracked as the list of open tag names
    (the "branch"). Two tables map specific branches to the key under which
    the information found there is stored in ``self.data``:

    - ``for_text``: the text content of the element is stored (as a string).
    - ``for_attribs``: the attributes of the element are stored (as a dict).
    """

    def __init__(self):
        # Let the base class set up its own state before adding ours.
        super().__init__()
        # branch -> key in self.data for elements whose text is wanted
        self.for_text = {
            ("mediawiki", "page", "title"): "title",
            ("mediawiki", "page", "revision", "id"): "rev_id",
            ("mediawiki", "page", "revision", "timestamp"): "rev_tstamp",
        }
        # branch -> key in self.data for elements whose attributes are wanted
        self.for_attribs = {
            ("mediawiki", "page", "revision", "text"): "text_attr",
        }
        # the collected information
        self.data = {}
        # names of the currently open elements, outermost first
        self.branch = []

    def startElement(self, name, attrs):
        """Enter an element; store its attributes if its branch is wanted."""
        self.branch.append(name)
        branch = tuple(self.branch)
        if branch in self.for_attribs:
            key = self.for_attribs[branch]
            self.data[key] = dict(attrs)

    def endElement(self, name):
        """Leave an element, which must be the innermost open one."""
        assert self.branch[-1] == name
        self.branch.pop()

    def characters(self, content):
        """Accumulate text for the wanted branches.

        The text received is appended to whatever is already stored under the
        key, so content delivered over several calls ends up joined.
        """
        branch = tuple(self.branch)
        if branch in self.for_text:
            key = self.for_text[branch]
            self.data[key] = self.data.get(key, "") + content
            
# Run the SAX parser over the document; the handler gathers the data as the
# parsing events arrive.
handler = Handler()
xml.sax.parseString(fullxml, handler)

# The template picks its values straight from the collected dictionary.
summary_template = "Página {title!r} r.{rev_id} ({rev_tstamp}) largo={text_attr[bytes]}"
print(summary_template.format_map(handler.data))

Página 'Argentina' r.162365205 (2024-09-09T23:59:56Z) largo=115597



### Copyright 2020-2024 Facundo Batista y Manuel Carlevaro

Licencia CC BY-NC-SA 4.0

Para más info visitar: https://github.com/facundobatista/libro-pyciencia/

