## Analyse des Métadonnées

In [1]:
import pandas as pd
from datetime import datetime
from IPython.display import Markdown as md
from utils import wrapper_engine

# Engine built by the project helper from 'config.ini'
# (presumably a SQLAlchemy engine: it exposes .connect() and .url.host).
engine = wrapper_engine('config.ini')

# Load both tables from the "public" schema over a single connection;
# the generator is fully consumed by the unpacking, inside the `with` block.
with engine.connect() as connection:
    package, package_extra = (
        pd.read_sql_table(table_name=table, con=connection, schema="public")
        for table in ("package", "package_extra")
    )

# Timestamp of this run, formatted day/month/year hour:minute:second.
date = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

# Number of rows of `package` whose type is 'dataset'.
dataset_count = len(package[package["type"] == 'dataset'])

# Render the summary banner (creation date, database host, dataset count) as Markdown.
summary_template = "**Création du document** : {} \n\n **Environnement** : {}\n\n**Nombre de datasets** : {}"
md(summary_template.format(date, engine.url.host, dataset_count))

**Création du document** : 21/07/2023 16:54:36 

 **Environnement** : ed4too-preprod-ecosql-pg.cgdd-pre3.eco4.cloud.e2.rie.gouv.fr

**Nombre de datasets** : 21716

**Taux de remplissage**

In [12]:
# Fill rate per metadata key: number of non-empty `package_extra` rows for the
# key, expressed as a percentage of the number of packages of type 'dataset'.
# Result: `fill_rate` with columns "key", "value" (row count) and
# "package_id" (percentage).

# Serialized values that are treated as "not filled in".
empty_metadata = [
    '{"fr": [], "en": []}',
    '{}'
]

# Keys excluded from the statistics.
keys_to_drop = [
    'harvest_object_id',
    'harvest_source_id',
    'harvest_source_title'
]

# Denominator, computed once instead of once per row of the result.
# With zero datasets the division below yields inf/NaN.
n_datasets = len(package[package["type"] == 'dataset'])

extras = package_extra[["key", "value", "package_id"]]

# Vectorized row filters: value is not one of the "empty" markers,
# and key is not one of the excluded keys.
is_filled = ~extras["value"].isin(empty_metadata)
is_kept = ~extras["key"].isin(keys_to_drop)

# NOTE(review): rows are not restricted to packages of type 'dataset' and a
# key is counted once per row, while the denominator counts datasets only —
# confirm the rate cannot exceed 100 %.
fill_rate = (
    extras[is_filled & is_kept]
    .dropna()                        # drop rows with a missing key, value or package_id
    .groupby("key", as_index=False)
    .count()
)

# Turn the per-key row count into a percentage of the datasets.
fill_rate["package_id"] = fill_rate["package_id"] / n_datasets * 100

In [15]:
import plotly
import plotly.express as px

# Bar chart of the fill rate per metadata key, highest rate first,
# with a dashed red horizontal line marking the median fill rate.
plotly.offline.init_notebook_mode(connected=True)
fig = px.bar(fill_rate.sort_values(by=["package_id", "key"], ascending=False),
             x='key',
             y='package_id',
             color_discrete_sequence=['#000091'],
             labels={"key": "Métadonnée",
                     "package_id": "Remplissage [%]"})

# Draw the line at the exact median: truncating it with int() misplaced the
# line by up to one percentage point and raised ValueError on a NaN median
# (empty `fill_rate`). The line is skipped when the median is undefined.
median_fill_rate = fill_rate["package_id"].median()
if pd.notna(median_fill_rate):
    fig.add_hline(
        y=float(median_fill_rate),
        line_width=3,
        line_dash="dash",
        line_color="red",
        annotation_text="médiane",
        annotation_textangle=0)
plotly.offline.iplot(fig)