## Taux de remplissage

In [1]:
import pandas as pd
from utils.utils import wrapper_engine

# Build the database engine from the settings stored in config.ini.
engine = wrapper_engine('config.ini')

# Read both tables of the "public" schema over a single connection; the
# connection is closed as soon as the frames are in memory.
with engine.connect() as connection:
    package, package_extra = (
        pd.read_sql_table(table_name=table, con=connection, schema="public")
        for table in ("package", "package_extra")
    )

In [2]:
# Fill rate: for every extra-metadata key, the percentage of datasets that
# carry a non-empty value for that key.

# Serialized values that are present in the table but carry no information;
# they are treated exactly like missing values.
empty_metadata = [
    '{"fr": [], "en": []}',
    '{}'
]

# The rate is expressed relative to datasets only, so both the numerator and
# the denominator must be restricted to packages of type 'dataset'. Counting
# extras of other package types would inflate the rate (possibly above 100 %).
dataset_ids = package.loc[package["type"] == "dataset", "id"]
n_datasets = len(dataset_ids)

fill_rate = package_extra.loc[
    package_extra["package_id"].isin(dataset_ids),
    ["key", "value", "package_id"],
].copy(deep=True)

# Blank out the placeholder values so that dropna() removes them together
# with the genuinely missing ones.
fill_rate["value"] = fill_rate["value"].mask(fill_rate["value"].isin(empty_metadata))
fill_rate = fill_rate.dropna()

# "value" keeps the number of filled rows per key; "package_id" counts each
# dataset at most once per key so the percentage below cannot exceed 100.
fill_rate = fill_rate.groupby("key", as_index=False).agg(
    {"value": "count", "package_id": "nunique"}
)

# Convert the dataset count into a percentage of all datasets.
fill_rate["package_id"] = fill_rate["package_id"] / n_datasets * 100

In [3]:
import plotly
import plotly.express as px

plotly.offline.init_notebook_mode(connected=True)

# Bars are ordered by decreasing fill rate; ties are broken by key in
# descending order as well.
ranked_fill_rate = fill_rate.sort_values(by=["package_id", "key"], ascending=False)
fill_rate_labels = {"key": "Métadonnée", "package_id": "Remplissage [%]"}

fig = px.bar(
    ranked_fill_rate,
    x="key",
    y="package_id",
    color_discrete_sequence=["#000091"],
    labels=fill_rate_labels,
)

# Dashed red reference line at the median fill rate. The median is truncated
# to an integer before being drawn.
median_fill_rate = int(fill_rate["package_id"].median())
fig.add_hline(
    y=median_fill_rate,
    line_width=3,
    line_dash="dash",
    line_color="red",
    annotation_text="médiane",
    annotation_textangle=0,
)

# Tight 20 px margins on every side.
fig.update_layout(margin={"l": 20, "r": 20, "t": 20, "b": 20})
plotly.offline.iplot(fig)

In [None]:
# Load the "group" table from the "public" schema.
with engine.connect() as connection:
    group = pd.read_sql_table(table_name="group", con=connection, schema="public")

# Rename the key so it stays distinguishable from the package "id" column
# used in the merge below.
group = group.rename(columns={"id": "id_group"})

# One row per (group, owned dataset). The left join keeps groups that own no
# dataset; for those rows "owner_org" is null.
dataset_owners = package.query("type == 'dataset'")[["id", "owner_org"]]
group_datasets = group[["id_group", "title"]].merge(
    dataset_owners,
    how="left",
    left_on="id_group",
    right_on="owner_org",
)

# count() tallies non-null cells per title, so "owner_org" ends up holding the
# number of datasets per title (0 for groups without any dataset).
datasets_per_group = group_datasets.groupby(by=["title"], as_index=False).count()

# Largest groups first; ties are ordered by title, descending.
datasets_per_group = datasets_per_group.sort_values(
    by=["owner_org", "title"],
    ascending=False,
)

In [None]:
plotly.offline.init_notebook_mode(connected=True)

# datasets_per_group is plotted in its current row order.
organisation_labels = {"title": "Organisation", "owner_org": "Nombre de datasets"}

fig = px.bar(
    datasets_per_group,
    x="title",
    y="owner_org",
    color_discrete_sequence=["#000091"],
    labels=organisation_labels,
)

# Dashed red reference line at the median number of datasets. The median is
# truncated to an integer before being drawn.
median_dataset_count = int(datasets_per_group["owner_org"].median())
fig.add_hline(
    y=median_dataset_count,
    line_width=3,
    line_dash="dash",
    line_color="red",
    annotation_text="médiane",
    annotation_textangle=0,
)

# Tight 20 px margins on every side.
fig.update_layout(margin={"l": 20, "r": 20, "t": 20, "b": 20})
plotly.offline.iplot(fig)