## Configuration
_Initial steps to get the notebook ready to play nice with our repository. Do not delete this section._

Code formatting with [black](https://pypi.org/project/nb-black/).

In [1]:
%load_ext lab_black

Define the paths to this notebook's directory and its `data` subdirectory so that output files can be written relative to the notebook.

In [2]:
import os
import pathlib

In [3]:
# Absolute path of the current working directory.
this_dir = pathlib.Path(os.getcwd())

In [4]:
# Output files are written under the "data" folder next to this notebook.
data_dir = this_dir.joinpath("data")

In [5]:
import re
import json
import pytz
import glob
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup

## Download

Retrieve the page

In [6]:
# Infogram page that is scraped below for its embedded `window.infographicData` payload.
url = "https://infogram.com/1pe66wmyjnmvkrhm66x9362kp3al60r57ex"

In [7]:
# Fetch the Infogram page. The timeout keeps the notebook from hanging
# indefinitely on an unresponsive server, and raise_for_status() stops the run
# on an HTTP error instead of handing an error page to the parser.
page = requests.get(url, timeout=30)
page.raise_for_status()

## Parse

In [8]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=page.content, features="html.parser")

Find script tag with the Infogram data

In [9]:
# Collect every text node that mentions the Infogram payload. `string=` is the
# current name of the deprecated `text=` argument, and the dot is escaped so
# it only matches a literal ".".
matches = soup.find_all(string=re.compile(r"window\.infographicData"))
if not matches:
    # Fail here with a clear message rather than with a NameError further down.
    raise ValueError("Could not find window.infographicData in the Infogram page")
# If several nodes match, the last one wins.
script = matches[-1]

In [10]:
# Reduce the JavaScript assignment to its bare JSON text: drop the
# `window.infographicData=` prefix, then trim surrounding whitespace before
# removing the statement-ending semicolon. Without the strip(), a trailing
# newline would shield the ";" from rstrip() and json.loads would fail.
stripped_script = script.replace("window.infographicData=", "")
stripped_script = stripped_script.strip().rstrip(";")

In [11]:
# Decode the JSON text into plain Python objects.
data = json.loads(stripped_script)

In [12]:
# Display the parsed payload to explore its structure.
data

{'id': 154669172,
 'type': 1,
 'block_id': 'f3f1e605-d12e-4b78-b169-15ea73fb261a',
 'theme_id': 16021,
 'user_id': 57803522,
 'team_user_id': None,
 'path': '47dfbc55-bfe3-4883-88a6-db68d50848ad',
 'title': 'Butte County COVID-19',
 'description': '',
 'tags': '',
 'public': False,
 'publicAccess': False,
 'private_link_enabled': 1,
 'thumb': 'https://infogram-thumbs-200.s3-eu-west-1.amazonaws.com/f3f1e605-d12e-4b78-b169-15ea73fb261a.jpg',
 'embedImageUrl': 'https://infogram.io/p/10b3ab075a9851ccb46bbb86abd7cb54.png',
 'previewImageUrl': 'https://infogram.io/p/630924bc704d5dc6b9aefe7465f88f80.png',
 'width': 550,
 'copyright': '',
 'properties': {'tabs': True,
  'zeropaddingembed': True,
  'flexTemplateId': 'd1e4d09e-50c8-4c9f-ae0a-51a155a988e6',
  'type': 'dashboards',
  'footerSettings': {'logoType': 'custom_logo-infogram',
   'showFooter': True,
   'customLinkOption': 'text',
   'hasCustomLink': False},
  'publishType': 1,
  'transparent': False,
  'rtl': False,
  'language': 'en',


In [13]:
# Display the dashboard entities (keyed by entity ID) to locate the chart that holds the city table.
data["elements"]["content"]["content"]["entities"]

{'0709e3f0-07d2-4964-94c9-dd32e71c1c938166d4e1-9485-4fa8-ac4a-a9251a9a5d77': {'filters': {},
  'height': 183.3386134999372,
  'left': 2320,
  'lockAspectRatio': True,
  'locked': False,
  'maxHeight': 10000,
  'maxWidth': 10000,
  'minHeight': 1,
  'minWidth': 1,
  'props': {'assetId': '0a71f6d1-095e-441d-a6ff-30cfcfdda37c',
   'graphicType': 'uploaded_image',
   'id': 84306992,
   'opacity': 1,
   'subType': 'image'},
  'top': 0,
  'transform': {'flipHorizontal': False, 'flipVertical': False, 'rotate': 0},
  'type': 'IMAGE',
  'width': 183.3386134999372},
 '0b43bd58-411e-456d-8502-51a4ae1a330420e9c946-7b89-438b-a91f-291116777f2e': {'filters': {},
  'height': 183.3386134999372,
  'left': 2305,
  'lockAspectRatio': True,
  'locked': False,
  'maxHeight': 10000,
  'maxWidth': 10000,
  'minHeight': 1,
  'minWidth': 1,
  'props': {'assetId': '0a71f6d1-095e-441d-a6ff-30cfcfdda37c',
   'graphicType': 'uploaded_image',
   'id': 84306992,
   'opacity': 1,
   'subType': 'image'},
  'top': 11,
 

In [14]:
# Hard-coded ID of the dashboard entity expected to hold the city-level table.
cities_entity_id = "b26b9acd-b036-40bc-bbbe-68667dd338e4"
entities = data["elements"]["content"]["content"]["entities"]

if cities_entity_id not in entities:
    # The ID is not guaranteed to be present in the payload. Fail with a message
    # that lists the entities carrying chart data, so the ID above can be
    # updated without digging through the full dump.
    chart_ids = [
        entity_id
        for entity_id, entity in entities.items()
        if "chartData" in (entity.get("props") or {})
    ]
    raise KeyError(
        f"Entity {cities_entity_id!r} not found; entities with chartData: {chart_ids}"
    )

# The first element of the chart's `data` holds the rows used below.
cities = entities[cities_entity_id]["props"]["chartData"]["data"][0]

KeyError: 'b26b9acd-b036-40bc-bbbe-68667dd338e4'

In [None]:
# Drop placeholder rows that consist of a single empty string.
not_empty = list(filter(lambda row: row != [""], cities))

Convert to dataframe

In [None]:
# Each remaining row becomes one record with these two columns.
column_names = ["area", "confirmed_cases"]
df = pd.DataFrame(data=not_empty, columns=column_names)

Get timestamp

In [None]:
# The payload's `updatedAt` value is the source of the `county_date` column set below.
timestamp = data["updatedAt"]

In [None]:
# Parse the timestamp and keep only its calendar date.
parsed_timestamp = pd.to_datetime(timestamp)
latest_date = parsed_timestamp.date()

In [None]:
# Add a constant "county" column as the first column of the table.
df.insert(loc=0, column="county", value="Butte")

In [None]:
# Stamp every row with the date taken from the payload's timestamp.
df = df.assign(county_date=latest_date)

In [None]:
# Discard the row whose area is "Region".
is_region_row = df["area"].eq("Region")
df = df.loc[~is_region_row]

## Vet

In [None]:
# Fail loudly if the scrape returned more than the six expected rows. An
# assert with a message raises the same AssertionError directly, without
# catching and re-raising it.
assert len(df) <= 6, "Butte County's city scraper has additional rows"

In [None]:
# Fail loudly if the scrape returned fewer than the six expected rows. An
# assert with a message raises the same AssertionError directly, without
# catching and re-raising it.
assert len(df) >= 6, "Butte's city scraper is missing rows"

## Export

Mark the current date

In [None]:
# Time zone used to decide which calendar day counts as "today" for the export.
tz = pytz.timezone("America/Los_Angeles")

In [None]:
# Current calendar date in the `tz` time zone.
now_local = datetime.now(tz)
today = now_local.date()

In [None]:
# Name of this county's subdirectory under the data directory.
slug = "butte"

In [None]:
# Write today's snapshot to data/<slug>/<date>.csv. The county directory is
# created first so the export does not fail when the folder is missing.
output_dir = data_dir / slug
output_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(output_dir / f"{today}.csv", index=False)

## Combine

In [None]:
# Every CSV in the county's folder except the combined timeseries file.
csv_pattern = str(data_dir / slug / "*.csv")
csv_list = []
for csv_path in glob.glob(csv_pattern):
    if csv_path.endswith("timeseries.csv"):
        continue
    csv_list.append(csv_path)

In [None]:
# Load every snapshot into its own frame, each with a "date" column.
df_list = []
for csv in csv_list:
    csv_path = pathlib.Path(csv)
    # Test the file name only, so a parent directory containing "manual"
    # cannot misclassify a scraped file.
    if "manual" in csv_path.name:
        # Manual files are read with their own "date" column parsed as dates.
        frame = pd.read_csv(csv_path, parse_dates=["date"])
    else:
        frame = pd.read_csv(csv_path, parse_dates=["county_date"])
        # Scraped files take their date from the file name. Path.stem works
        # with any path separator, unlike splitting on "/". Parsing the stem
        # keeps this column the same type as the manual files' parsed dates
        # so the combined frame can be sorted; a non-date file name now
        # raises here instead of being carried along as text.
        frame["date"] = pd.to_datetime(csv_path.stem)
    df_list.append(frame)

In [None]:
# Stack all snapshots, then order the rows by date and area.
combined = pd.concat(df_list)
df = combined.sort_values(by=["date", "area"])

In [None]:
# Save the combined table next to the daily snapshots.
timeseries_path = data_dir / slug / "timeseries.csv"
df.to_csv(timeseries_path, index=False)