# Visualizando los datos de los reportes diarios
> Cómo podemos aprovechar los datos de los informes diarios

- toc: false 
- badges: true
- comments: false
- categories: [graficos]
- image: images/evolucion-por-provincia.png

En este post evaluamos rápidamente cómo se puede aprovechar la información que se encuentra disponible en el sitio https://www.argentina.gob.ar/coronavirus/informe-diario para tener una visión general de lo que pasa en el país.

In [61]:
#hide
# Request headers that mimic a desktop Chrome browser (see the user-agent).
headers = {
    'authority': 'www.argentina.gob.ar',
    'cache-control': 'max-age=0',
    'dnt': '1',
    'upgrade-insecure-requests': '1',
    'user-agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.149 Safari/537.36'
    ),
    'sec-fetch-dest': 'document',
    'accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'accept-language': 'en-US,en;q=0.9,es-AR;q=0.8,es;q=0.7',
}

In [62]:
#hide
import requests
# Download the daily-report index page. The timeout keeps the notebook from
# hanging forever on a stalled connection, and raise_for_status() stops an
# HTTP error page from being parsed as if it were the report listing.
response = requests.get(
    'https://www.argentina.gob.ar/coronavirus/informe-diario',
    headers=headers,
    timeout=30,
)
response.raise_for_status()
content = response.content

In [63]:
#hide
from bs4 import BeautifulSoup
# Parse the downloaded page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=content, features='html.parser')


In [104]:
#hide
# Keep the href of every anchor whose URL contains 'diario', dropping any URL
# that mentions one of the excluded words.
excluded_words = ('facebook', 'linkedin', 'whatsapp', 'matutino')
all_hrefs = (anchor.attrs.get('href', '') for anchor in soup.find_all('a'))
pdfs = [
    url
    for url in all_hrefs
    if 'diario' in url and not any(word in url for word in excluded_words)
]


In [110]:
#hide
from pathlib import Path

def get_pdf(link):
    """Return the local path of the file at *link*, downloading it if needed.

    Files are stored in the ``cache`` directory (created on demand) under the
    last path segment of *link*. When that file already exists no request is
    made.

    Raises:
        requests.HTTPError: if the download answers with an HTTP error status.
    """
    cache_path = Path('cache')
    cache_path.mkdir(exist_ok=True, parents=True)
    cache_fname = cache_path / link.split('/')[-1]
    if not cache_fname.exists():
        response = requests.get(link, headers=headers, timeout=60)
        # Fail loudly instead of storing an error page under the PDF's name:
        # a bad cache entry would be silently reused on every later run.
        response.raise_for_status()
        cache_fname.write_bytes(response.content)
    return cache_fname

In [154]:
#hide

# `re` is imported here so this cell does not rely on a later cell's import.
import re
from datetime import datetime

# Day-month-year groups separated by dashes, e.g. "25-03-20".
date_pat = re.compile(r'(\d+-\d+-\d+)')


def extract_date(link):
    """Parse the day-month-year date embedded in the file name of *link*.

    Only the last path segment is searched. Two-digit years are read as
    20xx (``20`` -> ``2020``); four-digit years are used unchanged.

    Returns:
        A ``datetime`` at midnight of the reported day.

    Raises:
        ValueError: if the file name contains no ``d-m-y`` pattern, or the
            numbers do not form a valid date.
    """
    last = link.split('/')[-1]
    match = date_pat.search(last)
    if match is None:
        raise ValueError('no day-month-year date found in {!r}'.format(last))
    day, month, year = map(int, match.group(0).split('-'))
    if year < 100:
        year += 2000
    return datetime(year, month, day)

In [462]:
#hide
from collections import Counter

# Canonical labels that report text is matched against.
provinces = [
    'Ciudad Autonoma de Buenos Aires',
    'Provincia de Buenos Aires',
    'Catamarca',
    'Chaco',
    'Chubut',
    'Córdoba',
    'Corrientes',
    'Entre Ríos',
    'Formosa',
    'Jujuy',
    'La Pampa',
    'La Rioja',
    'Mendoza',
    'Misiones',
    'Neuquén',
    'Río Negro',
    'Salta',
    'San Juan',
    'San Luis',
    'Santa Cruz',
    'Santa Fe',
    'Santiago del Estero',
    'Tierra del Fuego',
    'Tucumán',
]

def get_vec(s):
    """Return a Counter with the number of occurrences of each item of *s*."""
    counts = Counter()
    counts.update(s)
    return counts

def sim(query_s, target_s):
    """Return a case-insensitive character-overlap score for two strings.

    The score is the number of characters both strings have in common
    (repeats counted, order ignored, compared in lower case) divided by the
    length of the longer input. Two empty strings score 0.0 rather than
    raising ZeroDivisionError.
    """
    longest = max(len(target_s), len(query_s))
    if longest == 0:
        return 0.0
    # Counter intersection keeps, for each character, the smaller of the two
    # counts - i.e. the characters the strings actually share.
    shared = Counter(query_s.lower()) & Counter(target_s.lower())
    return sum(shared.values()) / longest
    
def infer_province(txt):
    """Map free text from a report to an entry of `provinces`.

    Text containing "buenos aires" (spaces ignored) is resolved directly:
    to the city when it also mentions "ciudad", otherwise to the province,
    both with a score of 1. Any other text is compared against every entry
    of `provinces` with `sim`, after lower-casing and transliterating both
    sides with `unidecode`, and the best-scoring entry wins (the first one
    in list order on ties).

    Returns:
        A ``(province, score)`` tuple.
    """
    lowered = txt.lower()
    if 'buenosaires' in lowered.replace(' ', ''):
        if 'ciudad' in lowered:
            # Same spelling as the `provinces` entry, so the shortcut and the
            # similarity search can never yield two labels for the city.
            return 'Ciudad Autonoma de Buenos Aires', 1
        return 'Provincia de Buenos Aires', 1

    normalized = unidecode(lowered)
    scores = {p: sim(normalized, unidecode(p.lower())) for p in provinces}
    return max(scores.items(), key=lambda item: item[1])
    

In [479]:
#hide
from unidecode import unidecode
import re
import PyPDF2 

# "<number> [up to two short lowercase words] <Capitalised place name>",
# e.g. "12 en Chaco". Raw strings avoid invalid-escape warnings for \d, \w, \s.
pat = re.compile(r'(?P<num>\d+)(?P<middle>( *[a-z]{,3}){,2} *)(?P<place>[A-Z]\w+(\s\w+)*)')
# Same idea with the number in parentheses and up to five short words between,
# e.g. "(5) en Salta".
pat2 = re.compile(r'\((?P<num>\d+)\)(?P<middle>( *[a-z]{,3}){,5} *)(?P<place>[A-Z]\w+(\s\w+)*)')

# One dict per "<number> ... <Place>" match found on the first page of each
# report, with keys: place, infered_place, infered_place_score, infected, date.
docs = []

# A match whose place text contains one of these words is discarded.
ignored_place_words = ('argentina', 'covid', 'informe')

for pdf in pdfs:
    cached_fname = get_pdf(pdf)

    # Read inside a `with` block so each PDF handle is closed as soon as the
    # text of the first page has been extracted.
    with cached_fname.open('rb') as pdf_file:
        pdfReader = PyPDF2.PdfFileReader(pdf_file)
        page = pdfReader.getPage(0)
        txt = page.extractText().replace('personas', '').replace('\n', ' ')

    matches = list(pat.finditer(txt)) + list(pat2.finditer(txt))

    date = extract_date(pdf)
    for e in matches:
        gd = e.groupdict()

        place_lower = gd['place'].lower()
        if any(word in place_lower for word in ignored_place_words):
            continue

        gd['infered_place'], gd['infered_place_score'] = infer_province(gd['place'])

        gd['infected'] = int(gd.pop('num'))
        gd.pop('middle')
        gd['date'] = date
        docs.append(gd)

In [480]:
#hide
import pandas as pd

# One row per extracted match, ordered by report date.
df = pd.DataFrame(docs)
df = df.sort_values(by='date')

In [497]:
#hide
# Total cases per inferred province, largest first.
cnt_by_place = df.groupby('infered_place')['infected'].sum().sort_values(ascending=False)

# For each province (in that order) add the days elapsed since its first
# report and the running total of cases, then stack the pieces back together.
dfs = []
for province in cnt_by_place.index:
    rows = df.loc[df['infered_place'] == province].copy()
    first_report = rows['date'].min()
    rows['days_from_first_infection'] = (rows['date'] - first_report).dt.days
    rows['cum_infected'] = rows['infected'].cumsum()
    dfs.append(rows)

df = pd.concat(dfs)

In [504]:
#hide
import altair as alt

def plot_evolution(selected_places=None, xaxis='date', xaxis_title='Fecha'):
    """Build the line chart of cumulative cases per province from `df`.

    Args:
        selected_places: province names used as the initial value of the
            legend-bound selection; ``None`` is treated as an empty list.
        xaxis: name of the `df` column plotted on the x axis.
        xaxis_title: title shown on the x axis.

    Returns:
        The configured Altair chart (log-scaled y axis, 650x400).
    """
    initial_selection = [{'infered_place': c} for c in (selected_places or [])]
    selection = alt.selection_multi(
        fields=['infered_place'], bind='legend', init=initial_selection
    )

    x_encoding = alt.X(xaxis, axis=alt.Axis(title=xaxis_title))
    y_encoding = alt.Y(
        'cum_infected',
        scale=alt.Scale(type='log'),
        axis=alt.Axis(title='Casos confirmados'),
    )
    # Legend entries follow the ranking by total cases.
    color_encoding = alt.Color('infered_place', sort=list(cnt_by_place.index))
    # Selected series stay visible, the rest are faded almost completely.
    opacity_encoding = alt.condition(selection, alt.value(.8), alt.value(.05))

    chart = alt.Chart(df).mark_line(point=True, radius=150)
    chart = chart.encode(
        x=x_encoding,
        y=y_encoding,
        color=color_encoding,
        tooltip=list(df.columns),
        opacity=opacity_encoding,
    )
    chart = chart.add_selection(selection)
    chart = chart.configure_point(size=200)
    chart = chart.properties(width=650, height=400)
    return chart.configure_title(align='center')
    

# Evolución en el tiempo de la cantidad de casos totales

Haciendo Shift+Click en las provincias que se ven a la derecha, podemos agregar o quitar provincias al gráfico

In [505]:
#hide_input
# Chart over the report date, preselecting provinces with 10 or more cases.
plot_evolution(list(cnt_by_place[cnt_by_place >= 10].index))

# Graficando las mismas curvas cambiando el eje X

En lugar de utilizar la fecha, utilizamos la cantidad de días desde el primer caso confirmado.

In [508]:
#hide_input
# Same chart, with the x axis counting days since each province's first report.
plot_evolution(
    selected_places=list(cnt_by_place[cnt_by_place >= 10].index),
    xaxis='days_from_first_infection',
    xaxis_title='Dias desde el primer caso confirmado',
)