# Intake data-sets from scratch

In [1]:
%load_ext lab_black

In [2]:
import intake
import dask
import pandas as pd

## Open the data sources

`intake.open_csv` reads CSV files into dataframes

Parameters are:
```
urlpath : str or iterable, location of data
csv_kwargs : dict
storage_options : dict
path_as_pattern : bool or str, optional        
```

In [4]:
src = intake.open_csv(
    urlpath="https://www.data.gouv.fr/fr/datasets/r/4acad602-d8b1-4516-bc71-7d5574d5f33e",
    csv_kwargs={
        "sep": ",",
        "blocksize": None,
        "encoding": "iso-8859-1",
        "dtype": {
            "departement": str,
            "jour": pd.StringDtype,
            "pop": pd.StringDtype,
            "P": pd.StringDtype,
            "cl_age90": pd.StringDtype,
        },
    },
)

## Feed a `Catalog`

A `Catalog` instance is an object with one or more named entries.The entries might be read:
- from a static file (e.g., YAML),
- from an Intake server
- from any other data service that has a driver. Those are ordinary DataSource classes, except that they have the container type “catalog”, and do not return data products via the read() method.

In [5]:
cat = intake.open_catalog(name="mon catalogue")

In [6]:
cat

mon catalogue:
  args:
    args: null
    name: mon catalogue
  description: ''
  driver: intake.catalog.base.Catalog
  metadata: {}


`intake.Schema` holds details of data description for any type of data-source

In [34]:
from intake.catalog import Catalog
from intake.catalog.local import LocalCatalogEntry

mycat = Catalog.from_dict(
    {
        "source1": LocalCatalogEntry(
            "taux",
            description="Taux de positivité - quotidien - département",
            driver="csv",
            args={
                "urlpath": "https://www.data.gouv.fr/fr/datasets/r/406c6a23-e283-4300-9484-54e78c8ae675",
                "csv_kwargs": {
                    "sep": ";",
                    "blocksize": None,
                    "encoding": "iso-8859-1",
                    "dtype": {
                        "dep": str,
                    },
                },
            },
        )
    }
)

In [35]:
print(mycat["source1"].yaml())

sources:
  taux:
    args:
      csv_kwargs:
        blocksize: null
        dtype:
          dep: !!python/name:builtins.str ''
        encoding: iso-8859-1
        sep: ;
      urlpath: https://www.data.gouv.fr/fr/datasets/r/406c6a23-e283-4300-9484-54e78c8ae675
    description: "Taux de positivit\xE9 - quotidien - d\xE9partement"
    driver: intake.source.csv.CSVSource
    metadata:
      catalog_dir: ''



In [28]:
mycat.source1.read().head()

Unnamed: 0,dep,jour,P,T,cl_age90
0,1,2020-05-13,0,16,9
1,1,2020-05-13,1,17,19
2,1,2020-05-13,0,33,29
3,1,2020-05-13,1,72,39
4,1,2020-05-13,0,54,49


In [73]:
acat = intake.open_catalog(name=" catalogue")

In [123]:
print(src.yaml())

sources:
  csv:
    args:
      csv_kwargs:
        blocksize: null
        dtype:
          P: &id001 !!python/name:pandas.core.arrays.string_.StringDtype ''
          cl_age90: *id001
          departement: !!python/name:builtins.str ''
          jour: *id001
          pop: *id001
        encoding: iso-8859-1
        sep: ','
      urlpath: https://www.data.gouv.fr/fr/datasets/r/4acad602-d8b1-4516-bc71-7d5574d5f33e
    description: ''
    driver: intake.source.csv.CSVSource
    metadata: {}



In [159]:
from intake.catalog import Catalog
from intake.catalog.local import LocalCatalogEntry

mycat = Catalog.from_dict(
    {"cato": src},
    description="Données relatives aux résultats des tests virologiques COVID-19",
)

In [160]:
mycat

null:
  args:
    description: "Donn\xE9es relatives aux r\xE9sultats des tests virologiques COVID-19"
  description: "Donn\xE9es relatives aux r\xE9sultats des tests virologiques COVID-19"
  driver: intake.catalog.base.Catalog
  metadata: {}


In [138]:
type(mycat)

intake.catalog.base.Catalog

In [154]:
mycat.cato

csv:
  args:
    csv_kwargs:
      blocksize: null
      dtype:
        P: &id001 !!python/name:pandas.core.arrays.string_.StringDtype ''
        cl_age90: *id001
        departement: !!python/name:builtins.str ''
        jour: *id001
        pop: *id001
      encoding: iso-8859-1
      sep: ','
    urlpath: https://www.data.gouv.fr/fr/datasets/r/4acad602-d8b1-4516-bc71-7d5574d5f33e
  description: ''
  driver: intake.source.csv.CSVSource
  metadata: {}


In [150]:
type(src)

intake.source.csv.CSVSource

In [156]:
mycat["cato"].description = "blabla"

In [157]:
mycat.save("aze.yaml")

In [158]:
%cat aze.yaml

metadata: {}
name: null
sources:
  cato:
    csv_kwargs:
      blocksize: null
      dtype:
        P: &id001 !!python/name:pandas.core.arrays.string_.StringDtype ''
        cl_age90: *id001
        departement: !!python/name:builtins.str ''
        jour: *id001
        pop: *id001
      encoding: iso-8859-1
      sep: ','
    parameters: {}
    urlpath: https://www.data.gouv.fr/fr/datasets/r/4acad602-d8b1-4516-bc71-7d5574d5f33e


In [4]:
src.description

In [5]:
src.get()

csv:
  args:
    csv_kwargs:
      blocksize: null
      dtype:
        P: &id001 !!python/name:pandas.core.arrays.string_.StringDtype ''
        cl_age90: *id001
        departement: !!python/name:builtins.str ''
        jour: *id001
        pop: *id001
      encoding: iso-8859-1
      sep: ','
    urlpath: https://www.data.gouv.fr/fr/datasets/r/4acad602-d8b1-4516-bc71-7d5574d5f33e
  description: ''
  driver: intake.source.csv.CSVSource
  metadata: {}


In [6]:
src.discover()

{'datashape': None,
 'dtype': {'extract_date': 'object',
  'departement': 'object',
  'region': 'int64',
  'libelle_reg': 'object',
  'libelle_dep': 'object',
  'tx_incid': 'float64',
  'R': 'float64',
  'taux_occupation_sae': 'float64',
  'tx_pos': 'float64',
  'tx_incid_couleur': 'object',
  'R_couleur': 'object',
  'taux_occupation_sae_couleur': 'object',
  'tx_pos_couleur': 'object',
  'nb_orange': 'int64',
  'nb_rouge': 'int64'},
 'shape': (None, 15),
 'npartitions': 1,
 'metadata': {}}

In [10]:
src.catalog_object

In [73]:
src.description = "AZE"

In [12]:
from pathlib import Path

In [15]:
Path("./one.yml").write_text(src.yaml())

498

In [11]:
print(src.yaml())

sources:
  csv:
    args:
      csv_kwargs:
        blocksize: null
        dtype:
          P: &id001 !!python/name:pandas.core.arrays.string_.StringDtype ''
          cl_age90: *id001
          departement: !!python/name:builtins.str ''
          jour: *id001
          pop: *id001
        encoding: iso-8859-1
        sep: ','
      urlpath: https://www.data.gouv.fr/fr/datasets/r/4acad602-d8b1-4516-bc71-7d5574d5f33e
    description: ''
    driver: intake.source.csv.CSVSource
    metadata: {}



In [69]:
src.shape

(None, 15)

In [70]:
df = src.read()

In [None]:
print(src.yaml())

In [None]:
src.

In [16]:
urls = dict(
    indicateurs=dict(
        url_web="https://www.data.gouv.fr/fr/datasets/indicateurs-de-suivi-de-lepidemie-de-covid-19/",
        url_stable="https://www.data.gouv.fr/fr/datasets/r/4acad602-d8b1-4516-bc71-7d5574d5f33e",
        url_api="https://www.data.gouv.fr/api/1/datasets/5ee9df5003284f565d561278/",
        titre="Indicateurs de suivi de l'épidémie de COVID-19",
        file_pattern="indicateurs-covid19-dep",
        delim=",",
    ),
    tests_positivite=dict(
        url_web="https://www.data.gouv.fr/fr/datasets/donnees-relatives-aux-resultats-des-tests-virologiques-covid-19/",
        url_stable="https://www.data.gouv.fr/fr/datasets/r/406c6a23-e283-4300-9484-54e78c8ae675",
        url_api="https://www.data.gouv.fr/api/1/datasets/5ed117db6c161bd5baf070be",
        titre="Données relatives aux résultats des tests virologiques COVID-19 SI-DEP",
        file_pattern="sp-pos-quot-dep",
        delim=";",
    ),
    tests_capacites=dict(
        url_web="https://www.data.gouv.fr/fr/datasets/capacite-analytique-de-tests-virologiques-dans-le-cadre-de-lepidemie-covid-19/",
        url_stable="https://www.data.gouv.fr/fr/datasets/r/0c230dc3-2d51-4f17-be97-aa9938564b39",
        url_api="https://www.data.gouv.fr/api/1/datasets/5ed11705afd28672e40fbc2f/",
        titre="Capacité analytique de tests virologiques dans le cadre de l'épidémie COVID-19 SI-DEP",
        file_pattern="sp-capa-quot-dep",
        delim=";",
    ),
    incidence=dict(
        url_web="https://www.data.gouv.fr/fr/datasets/taux-dincidence-de-lepidemie-de-covid-19/",
        url_stable="https://www.data.gouv.fr/fr/datasets/r/19a91d64-3cd3-42fc-9943-d635491a4d76",
        url_api="https://www.data.gouv.fr/api/1/datasets/5ed1175ca00bbe1e4941a46a",
        titre="Taux d'incidence de l'épidémie de COVID-19 SI-DEP",
        file_pattern="sp-pe-tb-quot-dep",
        delim=";",
    ),
    sursaud=dict(
        url_web="https://www.data.gouv.fr/fr/datasets/donnees-des-urgences-hospitalieres-et-de-sos-medecins-relatives-a-lepidemie-de-covid-19/",
        url_stable="https://www.data.gouv.fr/fr/datasets/r/eceb9fb4-3ebc-4da3-828d-f5939712600a",
        url_api="https://www.data.gouv.fr/api/1/datasets/5e74ecf52eb7514f2d3b8845",
        titre="Données des urgences hospitalières et de SOS médecins relatives à l'épidémie de COVID-19",
        file_pattern="sursaud-corona-quot-dep",
        delim=";",
    ),
)

In [19]:
import requests

In [20]:
req = requests.get(urls["tests_positivite"]["url_api"])

In [22]:
resp = req.json()

In [None]:
resources = [r for r in resp["resources"] if "wordprocessingml" not in r["mime"]]

In [96]:
import re

In [106]:
resources[0]["title"].rstrip(r"\w*")

'sp-pos-quot-dep-2020-09-25-19h15.csv'

In [93]:
out = [
    {
        "args": {
            "csv_kwargs": {"blocksize": "null", "encoding": "iso-8850-1", "sep": ","}
        },
        "description": resource.get("description", None),
        "metadata": {"title": resp.get("title", None), "uri": resp.get("uri", None)},
    }
    for resource in resources
]

load(out)

  load(out)


AttributeError: 'list' object has no attribute 'read'

In [94]:
out

[{'args': {'csv_kwargs': {'blocksize': 'null',
    'encoding': 'iso-8850-1',
    'sep': ','}},
  'description': 'Taux de positivité - quotidien - département .',
  'metadata': {'title': 'Données relatives aux résultats des tests virologiques COVID-19',
   'uri': 'https://www.data.gouv.fr/api/1/datasets/donnees-relatives-aux-resultats-des-tests-virologiques-covid-19/'}},
 {'args': {'csv_kwargs': {'blocksize': 'null',
    'encoding': 'iso-8850-1',
    'sep': ','}},
  'description': 'Taux de positivité - quotidien - région.',
  'metadata': {'title': 'Données relatives aux résultats des tests virologiques COVID-19',
   'uri': 'https://www.data.gouv.fr/api/1/datasets/donnees-relatives-aux-resultats-des-tests-virologiques-covid-19/'}},
 {'args': {'csv_kwargs': {'blocksize': 'null',
    'encoding': 'iso-8850-1',
    'sep': ','}},
  'description': 'Taux de positivité - quotidien - france.',
  'metadata': {'title': 'Données relatives aux résultats des tests virologiques COVID-19',
   'uri': 'ht

In [75]:
for k in resp:
    print("*" * 80)
    print(k)
    print("-" * 80)
    print(resp.get(k))

********************************************************************************
acronym
--------------------------------------------------------------------------------
SI-DEP
********************************************************************************
archived
--------------------------------------------------------------------------------
None
********************************************************************************
badges
--------------------------------------------------------------------------------
[]
********************************************************************************
created_at
--------------------------------------------------------------------------------
2020-05-29T16:10:35.407000
********************************************************************************
deleted
--------------------------------------------------------------------------------
None
********************************************************************************
description
-------

In [32]:
resources = [r for r in resp["resources"] if "wordprocessingml" not in r["mime"]]

In [78]:
from yaml import load, dump

In [77]:
resources[0]

{'checksum': {'type': 'sha1',
  'value': 'eb395230851890fe96cbc0618000ae70e302decd'},
 'created_at': '2020-05-29T18:02:26.703000',
 'description': 'Taux de positivité - quotidien - département .',
 'extras': {},
 'filesize': 3361733,
 'filetype': 'file',
 'format': 'csv',
 'id': '406c6a23-e283-4300-9484-54e78c8ae675',
 'last_modified': '2020-09-25T19:15:06.758000',
 'latest': 'https://www.data.gouv.fr/fr/datasets/r/406c6a23-e283-4300-9484-54e78c8ae675',
 'metrics': {'views': 8155},
 'mime': 'text/csv',
 'preview_url': '/tabular/preview/?url=https%3A%2F%2Fstatic.data.gouv.fr%2Fresources%2Fdonnees-relatives-aux-resultats-des-tests-virologiques-covid-19%2F20200925-191506%2Fsp-pos-quot-dep-2020-09-25-19h15.csv',
 'published': '2020-05-29T18:02:26',
 'schema': None,
 'title': 'sp-pos-quot-dep-2020-09-25-19h15.csv',
 'type': 'main',
 'url': 'https://static.data.gouv.fr/resources/donnees-relatives-aux-resultats-des-tests-virologiques-covid-19/20200925-191506/sp-pos-quot-dep-2020-09-25-19h15.c

In [80]:
print(dump(resources[0]))

checksum:
  type: sha1
  value: eb395230851890fe96cbc0618000ae70e302decd
created_at: '2020-05-29T18:02:26.703000'
description: "Taux de positivit\xE9 - quotidien - d\xE9partement ."
extras: {}
filesize: 3361733
filetype: file
format: csv
id: 406c6a23-e283-4300-9484-54e78c8ae675
last_modified: '2020-09-25T19:15:06.758000'
latest: https://www.data.gouv.fr/fr/datasets/r/406c6a23-e283-4300-9484-54e78c8ae675
metrics:
  views: 8155
mime: text/csv
preview_url: /tabular/preview/?url=https%3A%2F%2Fstatic.data.gouv.fr%2Fresources%2Fdonnees-relatives-aux-resultats-des-tests-virologiques-covid-19%2F20200925-191506%2Fsp-pos-quot-dep-2020-09-25-19h15.csv
published: '2020-05-29T18:02:26'
schema: null
title: sp-pos-quot-dep-2020-09-25-19h15.csv
type: main
url: https://static.data.gouv.fr/resources/donnees-relatives-aux-resultats-des-tests-virologiques-covid-19/20200925-191506/sp-pos-quot-dep-2020-09-25-19h15.csv



In [None]:
'description', 'latest', 'title'

In [76]:
for k in resources:
    print(k["title"])

sp-pos-quot-dep-2020-09-25-19h15.csv
sp-pos-quot-reg-2020-09-25-19h15.csv
sp-pos-quot-fra-2020-09-25-19h15.csv
sp-pos-heb-dep-2020-09-25-19h15.csv
sp-pos-heb-reg-2020-09-25-19h15.csv
sp-pos-heb-fra-2020-09-25-19h15.csv
sp-ti-tp-7j-dep-2020-07-03-19h15.csv
sp-ti-tp-7j-reg-2020-07-03-19h15.csv
sp-ti-tp-7j-fra-2020-07-03-19h15.csv
sp-pos-quot-2020-07-03-19h15.xlsx
sp-pos-heb-2020-07-01-19h15.xlsx
sp-ti-tp-7j-2020-07-03-19h15.xlsx
metadonnees-positivite.xlsx


In [None]:
for k in resources[-1]:
    print("*" * 80)
    print(k)
    print("-" * 80)
    print(resources[-1].get(k))

In [91]:
src["indicateurs"]

{'url_web': 'https://www.data.gouv.fr/fr/datasets/indicateurs-de-suivi-de-lepidemie-de-covid-19/',
 'url_stable': 'https://www.data.gouv.fr/fr/datasets/r/4acad602-d8b1-4516-bc71-7d5574d5f33e',
 'url_api': 'https://www.data.gouv.fr/api/1/datasets/5ee9df5003284f565d561278/',
 'titre': "Indicateurs de suivi de l'épidémie de COVID-19",
 'file_pattern': 'indicateurs-covid19-dep',
 'delim': ','}