In [10]:
# -*- coding: utf-8 -*-
import concurrent.futures
import glob
import logging
import os
import sys
import urllib.parse
from pathlib import Path

import pandas as pd
import requests

In [34]:
# Download climate datasets from the World Bank Climate Knowledge Portal:
# https://climateknowledgeportal.worldbank.org/api/data/get-download-data

# Origin folder: holds the already-downloaded inputs (e.g. the countries list).
# `~` is expanded up front because `open()` and `os.chdir()` do not expand it,
# so an unexpanded path would make every write to these folders fail.
DATASET_FOLDER = os.path.expanduser("~/dataset/")

# Destination folders, one sub-folder per climate variable.
DATASET_FOLDER_DEST = os.path.expanduser("~/dataset_dest/")
PATH_temperature = os.path.join(DATASET_FOLDER_DEST, "temperature/")
PATH_rainfall = os.path.join(DATASET_FOLDER_DEST, "precipitation/")


# Kinds of series requested for every country.
nature_of_data = ["projection", "historical"]

In [37]:
# Load the World Bank countries list and split it into two parallel lists:
# the country codes and the matching country names.
countries_csv = DATASET_FOLDER + "worldbank_countries.csv"
df = pd.read_csv(countries_csv)
countries_code = df["code"].to_list()
countries_name = df["name"].to_list()

In [38]:
# Preview the first rows to check that the `name` and `code` columns were loaded.
df.head(3)

Unnamed: 0,name,code
0,Belgium,BEL
1,China,CHN
2,Germany Fed Rep,DFR


#### Extraction of rainfall data

In [39]:
# Variable code for this section; `pr` is the segment used in the rainfall URLs.
variables = ["pr"]

# Time windows requested from the API: one historical window and four
# projection windows.
past_time_series = ["1901-2016"]
futu_time_series = ["2020_2039", "2040_2059", "2060_2079", "2080_2099"]

# File logger used by the download helper and the download loop.
logger = logging.getLogger("download")
logger.setLevel(logging.DEBUG)
# `logging.getLogger` returns the same logger object on every call, so running
# this setup more than once would stack another FileHandler and write every
# record several times. Attach a file handler only if none is present yet.
if not any(isinstance(h, logging.FileHandler) for h in logger.handlers):
    formatter = logging.Formatter("%(asctime)s -  %(name)-12s %(levelname)-8s %(message)s")
    fh = logging.FileHandler("download.log")
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
logger.info("Starting...")

In [40]:
def get_url(url, destination, timeout=60):
    """Download `url` into the file `destination` unless it already exists.

    Args:
        url: Address fetched with an HTTP GET.
        destination: Path of the file to write. A leading `~` is expanded and
            missing parent folders are created.
        timeout: Seconds `requests` waits on the server before giving up, so a
            stalled connection cannot block a worker thread forever.

    Returns:
        True when the content was downloaded and written. False when the file
        already exists, the HTTP status is not 200, the payload is shorter
        than 1000 bytes, or any error occurred (errors are logged, not raised).
    """
    target = Path(destination).expanduser()
    if target.is_file():
        logger.info(f"{destination} already exist ! No download.")
        return False
    logger.debug(f"{url} -> {destination}")
    # Retrieve the content
    try:
        r = requests.get(url, timeout=timeout)
        content = r.content
        if r.status_code != 200:
            logger.error(f"ERROR HTTP : {r.status_code} for {url}")
            return False
        # Very small payloads are treated as an error response, not as data.
        if len(content) < 1_000:
            logger.error(f"ERROR HTTP content too small : {content} for {url}")
            return False

        target.parent.mkdir(parents=True, exist_ok=True)
        # Write to a sidecar file and rename it afterwards: the "already exist"
        # shortcut above would otherwise accept a file truncated by an
        # interrupted write as a finished download.
        partial = target.with_name(target.name + ".part")
        with open(partial, "wb") as f:
            f.write(content)
        os.replace(partial, target)
        return True
    except Exception:
        # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate; `logger.exception` also records the traceback.
        logger.exception(f"Unexpected ERROR for {url}: {sys.exc_info()[0]}")
        return False

In [41]:
# Sanity check: display the rainfall destination folder as a Path object.
Path(PATH_rainfall)

WindowsPath('C:/Users/Square 967/Documents/Jyda/DataForGood/data/precipitation')

In [42]:
# Sanity check: display the historical time window(s) that will be requested.
past_time_series

['1901-2016']

In [43]:
# Queue one rainfall download per (country, nature, period) on a thread pool,
# then log the outcome of each download as it completes.
nb_iter = 0
base_url = "https://climateknowledgeportal.worldbank.org/api/data/get-download-data/"
with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
    futures = []
    for country_code, country_name in zip(countries_code, countries_name):
        for nature in nature_of_data:
            is_historical = nature == "historical"
            time_series = past_time_series if is_historical else futu_time_series
            # Projection URLs carry two extra path segments that historical
            # URLs do not have: "/mavg" and "/rcp85".
            data_type = "" if is_historical else "/mavg"
            projection = "" if is_historical else "/rcp85"
            for period in time_series:
                nb_iter += 1
                # Build URL, e.g.
                # https://climateknowledgeportal.worldbank.org/api/data/get-download-data/projection/mavg/pr/rcp85/2060_2079/BDI/Burundi
                url = (
                    base_url
                    + f"{nature}{data_type}/pr{projection}/{period}/{country_code}/{urllib.parse.quote_plus(country_name)}"
                )
                # Build destination name
                filename = "_".join([nature, period, country_code]) + ".csv"
                destination = os.path.join(PATH_rainfall, filename)
                futures.append(
                    executor.submit(get_url, url=url, destination=destination)
                )
    # Each result is logged exactly once here. Nothing reads `future` after
    # this loop, because the name is undefined when no download was queued.
    for future in concurrent.futures.as_completed(futures):
        logger.debug(f"Done {future.result()}")

logger.info(f"Done after {nb_iter} iterations.")

In [36]:
# Sanity check: display the root destination folder configured above.
DATASET_FOLDER_DEST

'C:/Users/Square 967/Documents/Jyda/DataForGood/data/'

#### Merge the rainfall files into a single CSV

In [44]:
# Merge every downloaded rainfall CSV into one file.
# Paths are built explicitly instead of calling `os.chdir`, so the notebook's
# working directory (and any relative path used elsewhere) is left untouched.
rain_dir = os.path.expanduser(os.path.join(DATASET_FOLDER_DEST, "precipitation/"))
combined_rain_path = os.path.join(rain_dir, "combined_rainfall.csv")

extension = "csv"
# Skip the merged output itself: it lives in the same folder and matches the
# pattern, so a second run would otherwise concatenate it back into the data.
all_filenames = sorted(
    f
    for f in glob.glob(os.path.join(rain_dir, "*.{}".format(extension)))
    if os.path.basename(f) != "combined_rainfall.csv"
)
if not all_filenames:
    raise FileNotFoundError(f"No rainfall CSV files found in {rain_dir}")

# combine all files in the list
combined_rain = pd.concat([pd.read_csv(f) for f in all_filenames])
# export to csv
combined_rain.to_csv(combined_rain_path, index=False, encoding="utf-8-sig")

In [45]:
# Inspect the column names of the merged rainfall frame.
combined_rain.columns

Index(['Rainfall - (MM)', ' Year', ' Statistics', ' Country', ' ISO3',
       'Monthly Precipitation - (MM)', ' Model'],
      dtype='object')

#### Extraction of temperature data

In [46]:
# Variable code for this section; `tas` is the segment used in the temperature URLs.
variables = ["tas"]

# Time windows requested from the API: one historical window and four
# projection windows.
past_time_series = ["1901-2016"]
futu_time_series = ["2020_2039", "2040_2059", "2060_2079", "2080_2099"]

# File logger used by the download helper and the download loop.
logger = logging.getLogger("download")
logger.setLevel(logging.DEBUG)
# `logging.getLogger("download")` returns the logger already configured in the
# rainfall section; adding a second FileHandler would write every record
# twice. Attach a file handler only if none is present yet.
if not any(isinstance(h, logging.FileHandler) for h in logger.handlers):
    formatter = logging.Formatter("%(asctime)s -  %(name)-12s %(levelname)-8s %(message)s")
    fh = logging.FileHandler("download.log")
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
logger.info("Starting...")

In [47]:
def get_url(url, destination, timeout=60):
    """Download `url` into the file `destination` unless it already exists.

    Note: this redefines the `get_url` declared in the rainfall section, with
    the same contract.

    Args:
        url: Address fetched with an HTTP GET.
        destination: Path of the file to write. A leading `~` is expanded and
            missing parent folders are created.
        timeout: Seconds `requests` waits on the server before giving up, so a
            stalled connection cannot block a worker thread forever.

    Returns:
        True when the content was downloaded and written. False when the file
        already exists, the HTTP status is not 200, the payload is shorter
        than 1000 bytes, or any error occurred (errors are logged, not raised).
    """
    target = Path(destination).expanduser()
    if target.is_file():
        logger.info(f"{destination} already exist ! No download.")
        return False
    logger.debug(f"{url} -> {destination}")
    # Retrieve the content
    try:
        r = requests.get(url, timeout=timeout)
        content = r.content
        if r.status_code != 200:
            logger.error(f"ERROR HTTP : {r.status_code} for {url}")
            return False
        # Very small payloads are treated as an error response, not as data.
        if len(content) < 1_000:
            logger.error(f"ERROR HTTP content too small : {content} for {url}")
            return False

        target.parent.mkdir(parents=True, exist_ok=True)
        # Write to a sidecar file and rename it afterwards: the "already exist"
        # shortcut above would otherwise accept a file truncated by an
        # interrupted write as a finished download.
        partial = target.with_name(target.name + ".part")
        with open(partial, "wb") as f:
            f.write(content)
        os.replace(partial, target)
        return True
    except Exception:
        # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate; `logger.exception` also records the traceback.
        logger.exception(f"Unexpected ERROR for {url}: {sys.exc_info()[0]}")
        return False

In [48]:
# Queue one temperature download per (country, nature, period) on a thread
# pool, then log the outcome of each download as it completes.
nb_iter = 0
base_url = "https://climateknowledgeportal.worldbank.org/api/data/get-download-data/"
with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
    futures = []
    for country_code, country_name in zip(countries_code, countries_name):
        for nature in nature_of_data:
            is_historical = nature == "historical"
            time_series = past_time_series if is_historical else futu_time_series
            # Projection URLs carry two extra path segments that historical
            # URLs do not have: "/mavg" and "/rcp85".
            data_type = "" if is_historical else "/mavg"
            projection = "" if is_historical else "/rcp85"
            for period in time_series:
                nb_iter += 1
                # Build URL, e.g.
                # https://climateknowledgeportal.worldbank.org/api/data/get-download-data/projection/mavg/tas/rcp85/2060_2079/BDI/Burundi
                url = (
                    base_url
                    + f"{nature}{data_type}/tas{projection}/{period}/{country_code}/{urllib.parse.quote_plus(country_name)}"
                )
                # Build destination name
                filename = "_".join([nature, period, country_code]) + ".csv"
                destination = os.path.join(PATH_temperature, filename)
                futures.append(
                    executor.submit(get_url, url=url, destination=destination)
                )
    # Each result is logged exactly once here. Nothing reads `future` after
    # this loop, because the name is undefined when no download was queued.
    for future in concurrent.futures.as_completed(futures):
        logger.debug(f"Done {future.result()}")

logger.info(f"Done after {nb_iter} iterations.")

#### Merge the temperature files into a single CSV

In [49]:
# Merge every downloaded temperature CSV into one file.
# Paths are built explicitly instead of calling `os.chdir`, so the notebook's
# working directory (and any relative path used elsewhere) is left untouched.
temp_dir = os.path.expanduser(os.path.join(DATASET_FOLDER_DEST, "temperature/"))
combined_temp_path = os.path.join(temp_dir, "combined_temperature.csv")

extension = "csv"
# Skip the merged output itself: it lives in the same folder and matches the
# pattern, so a second run would otherwise concatenate it back into the data.
all_filenames = sorted(
    f
    for f in glob.glob(os.path.join(temp_dir, "*.{}".format(extension)))
    if os.path.basename(f) != "combined_temperature.csv"
)
if not all_filenames:
    raise FileNotFoundError(f"No temperature CSV files found in {temp_dir}")

# combine all files in the list
combined_temp = pd.concat([pd.read_csv(f) for f in all_filenames])
# export to csv
combined_temp.to_csv(combined_temp_path, index=False, encoding="utf-8-sig")