In [None]:
!pip install owid-catalog

In [None]:
from owid.catalog import RemoteCatalog
# Connect to the default OWID catalog; its index is fetched over HTTPS.
# Only the garden, meadow and open_numbers channels are requested.
catalog = RemoteCatalog(
    channels=("garden", "meadow", "open_numbers"),
)

In [None]:
# All namespaces = main sources
# Shows the distinct values found in the "namespace" column of the catalog index.
catalog.datasets["namespace"].unique()

In [None]:
# GET LATEST VERSIONS of datasets
#
# Builds `df`, one row per (dataset, source) pair, from the dataset-level
# metadata of every (namespace, dataset) found in the remote catalog, and
# writes it to ../../processed/owid_catalog.csv.
import pandas as pd
import numpy as np
from dataclasses import asdict

# Get datasets from owid remote catalog
datasets = catalog.datasets

# Group the data by 'namespace, dataset' and keep the highest version value.
# Only namespace and dataset are passed to find_latest() below; the grouping
# mainly serves to get one row per (namespace, dataset) pair.
datasets = datasets.groupby(['namespace', 'dataset'])['version'].max()
datasets = datasets.reset_index()

# Filter for performance/tests
# datasets = datasets[datasets["dataset"].str.contains("energy|consumption")]

print(datasets)

# Collect one metadata dict per dataset and build the frame once at the end;
# concatenating inside the loop re-copies the whole frame on every iteration.
records = []

for row in datasets.itertuples():
    try:
        print(f'Getting metadata: index={row.Index} namespace={row.namespace}, dataset={row.dataset}')
        table = catalog.find_latest(namespace=row.namespace, dataset=row.dataset)
        records.append(asdict(table.metadata.dataset))
    except Exception as exc:
        # Best effort: report the failing dataset (with the reason) and carry on.
        # `Exception` rather than a bare except so Ctrl-C still stops the loop.
        print(f'ERROR: namespace={row.namespace}, dataset={row.dataset}: {exc!r}')

if not records:
    # Without any record there is no "sources" column to explode below.
    raise RuntimeError('No dataset metadata could be retrieved from the catalog')

# Dataframe result including dataset metadata
df = pd.DataFrame(records)

# Explode sources: one row per entry of each dataset's "sources" list
df = df.explode(["sources"])
# df = df.explode(["licenses"])
# reset_index() without drop=True keeps the pre-explode row number as an
# "index" column, so rows coming from the same dataset share that value.
df = df.reset_index()

# Expand each source dict into its own columns, replacing the "sources" column.
# NOTE(review): source keys that match an existing dataset column name would
# produce duplicate column labels — confirm against the actual metadata schema.
df = pd.concat([df.drop(columns='sources'), df['sources'].apply(pd.Series)], axis=1)
# df = pd.concat([df.drop(columns='licenses'), df['licenses'].apply(pd.Series)], axis=1)

df.to_csv('../../processed/owid_catalog.csv', index=False)
df


In [None]:
# Population: metadata rows whose dataset name contains "population",
# followed by the latest Gapminder population table itself.
# NOTE(review): assumes the metadata frame `df` has a "dataset" column — confirm.
# na=False treats missing names as non-matching instead of breaking the mask.
df_pop = df[df["dataset"].str.contains("population", na=False)]
# Only the last expression of a cell is rendered, so show this one explicitly.
display(df_pop)

data_pop = catalog.find_latest(namespace='gapminder', dataset='population')
data_pop


In [None]:

# Greenhouse-gas / CO2 emissions filter: keep metadata rows whose dataset
# name matches any of the keywords below (regex alternation, case-sensitive).
# na=False treats missing names as non-matching instead of breaking the mask.
df_gh = df[df["dataset"].str.contains("gh|green|house|carbon|co2|emission", na=False)]
df_gh


In [None]:
# Energy (NRJ) filter: keep metadata rows whose dataset name matches any of
# the keywords below (regex alternation, case-sensitive).
# na=False treats missing names as non-matching instead of breaking the mask.
df_nrj = df[df["dataset"].str.contains("energy|consumption|final", na=False)]
df_nrj


In [None]:
# Shift Data Portal — caution (original note): the data only goes up to 2016
sdp = catalog.find_latest(namespace="shift")
print(sdp.metadata.dataset)
# Most recent years first, then preview the leading rows
sdp.sort_values(by="year", ascending=False).head()


In [None]:
# Latest EIA energy-consumption table, with its dataset-level metadata.
# NOTE(review): this rebinds `df`, the name the earlier cells use for the
# metadata frame; re-running those filter cells afterwards will see this table.
df = catalog.find_latest(
    namespace="eia",
    dataset="energy_consumption",
)
print(df.metadata.dataset)
df

In [None]:
# Latest BP energy-mix table, with its dataset-level metadata.
# NOTE(review): this rebinds `df`, the name the earlier cells use for the
# metadata frame; re-running those filter cells afterwards will see this table.
df = catalog.find_latest(
    namespace="bp",
    dataset="energy_mix",
)
print(df.metadata.dataset)
df