# Cleaning Global Knowledge Portal Data

In [2]:
# Display the value of every top-level expression in a cell, not only the last
# one; later check cells show both a `.head()` preview and `.columns` this way.
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Root folder of the datasets. Paths in this notebook are built by plain string
# concatenation onto this value, so it must keep its trailing slash.
# DATASET_FOLDER = '/media/data-nvme/dev/datasets/WorldBank/'
DATASET_FOLDER = "../../datasets/"

In [4]:
import os
import pandas as pd
from tqdm import tqdm
import concurrent.futures
import glob
import traceback
import sys
import numpy as np

In [4]:
# Scenario tags searched for (as substrings) in projection file names; the
# matching tag is stored in the "RCP" column by read_onefile.
rcp_projection = ["rcp26", "rcp45", "rcp60", "rcp85"]


def abreviation2nombre(abr):
    """Return the 1-based month number of an English three-letter abbreviation.

    `abr` must be exactly one of "Jan", "Feb", ..., "Dec" (case-sensitive).
    Any other value raises ValueError.
    """
    months = "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec".split()
    position = months.index(abr)
    return position + 1


def _has_split_country(df, suffixes):
    """Tell whether the "Country" column only holds the tail of a split name."""
    return (
        "Country" in df.columns
        and len(df) > 0
        and bool(df["Country"].isin(suffixes).all())
    )


def _realign_split_country(df, shifted_names):
    """Undo the one-column shift caused by a comma inside the country name.

    The extra field makes every row one field longer than the header, so pandas
    uses the first field (the metric values) as the index and every header label
    ends up one position too far to the left. `shifted_names` maps each
    mislabelled column between "Year" and the country fragments to its real name.
    """
    metric = df.columns[0]
    repaired = df.reset_index()
    # "Statistics" and "Country" hold the two halves of the country name
    # (full name == repaired["Statistics"] + ", " + repaired["Country"]).
    # NOTE(review): the rebuilt name is not kept, so repaired files expose the
    # same columns as before (no "Country"); confirm whether it should be kept.
    repaired = repaired.drop(columns=["Statistics", "Country"])
    return repaired.rename(
        columns={"index": metric, metric: "Year", **shifted_names}
    )


def read_onefile(filename: str) -> pd.DataFrame:
    """Load one raw CSV file (fields separated by ", ") into a DataFrame.

    Behaviour depends on substrings of `filename`:

    * "historical" in the name: the file is read as-is.
    * otherwise (projection file): an "RCP" column is added with the tag from
      `rcp_projection` found in the name, and for "rx5dayreturnlevel25" files
      the mislabelled metric column is renamed.

    In both cases, rows whose country name contains a comma (the "Country"
    column then only contains a fragment such as "The" or "Republic of") are
    realigned so that every value sits under its real column name.

    Returns the resulting DataFrame.
    """
    country_error_search_string = [
        "The",
        "State of",
        "United Republic of",
        "Democratic People’s Republic of",
        "Republic of",
    ]
    df = pd.read_csv(filename, sep=r", ", engine="python")
    is_historical = "historical" in filename

    if not is_historical:
        # Tag the rows with the scenario found in the file name.
        for rcp in rcp_projection:
            if rcp in filename:
                df["RCP"] = rcp

    # Comma problem: test every row explicitly instead of relying on what
    # Series.all() happens to return for a column of strings.
    if _has_split_country(df, country_error_search_string):
        if is_historical:
            shifted_names = {"Year": "Statistics"}
        else:
            shifted_names = {"Year": "Model", "Model": "Statistics"}
        df = _realign_split_country(df, shifted_names)

    if not is_historical and "rx5dayreturnlevel25" in filename:
        # There is a bug in the file: the header names the daily metric.
        df.rename(
            columns={
                "Expected Daily Rainfall Maximum in 25 Years (25-yr Return Level) - (MM)": "Expected 5-day Cumulative Rainfall Maximum in 25 Years (25-yr Return Level) - (MM)"
            },
            inplace=True,
        )
    return df

In [5]:
# Smoke test: load the FRA rcp26 rx5day projection file and preview two rows.
# DATASET_FOLDER already ends with "/", so the resulting path contains "//".
read_onefile(
    DATASET_FOLDER + "/precipitation/projection_2040_2059_FRA_rcp26_rx5day.csv"
).head(2)

Unnamed: 0,Largest 5-day Cumulative Rainfall - (MM),Year,Model,Statistics,ISO3,RCP
0,4.3227,2040-2059,bcc_csm1_1_m,Jan Anomaly,FRA,rcp26
1,-3.6732,2040-2059,bcc_csm1_1_m,Feb Anomaly,FRA,rcp26


In [6]:
# Smoke test: load the GMB rcp26 rx5day projection file and preview two rows.
read_onefile(
    DATASET_FOLDER + "/precipitation/projection_2040_2059_GMB_rcp26_rx5day.csv"
).head(2)

Unnamed: 0,Largest 5-day Cumulative Rainfall - (MM),Year,Model,Statistics,ISO3,RCP
0,-0.0949,2040-2059,bcc_csm1_1_m,Jan Anomaly,GMB,rcp26
1,-0.0774,2040-2059,bcc_csm1_1_m,Feb Anomaly,GMB,rcp26


In [7]:
# Smoke test: load the FRA historical file and preview two rows.
read_onefile(DATASET_FOLDER + "/precipitation/historical_1901-2016_FRA__mavg.csv").head(
    2
)

Unnamed: 0,Rainfall - (MM),Year,Statistics,Country,ISO3
0,40.929,1901,Jan Average,France,FRA
1,34.7865,1901,Feb Average,France,FRA


In [8]:
# Smoke test: load the GMB historical file and preview two rows. The result
# shown below has no "Country" column, unlike the FRA historical file.
read_onefile(DATASET_FOLDER + "/precipitation/historical_1901-2016_GMB__mavg.csv").head(
    2
)

Unnamed: 0,Rainfall - (MM),Year,Statistics,ISO3
0,0.0,1901,Jan Average,GMB
1,0.0,1901,Feb Average,GMB


In [9]:
def gen_dataset_country(country: str) -> str:
    """Build the two cleaned CSV files (historical and projection) of a country.

    Every file matching ``precipitation/*<country>*.csv`` under DATASET_FOLDER
    is read with `read_onefile`; files whose path contains "historical" go to
    the historical table, all others to the projection table. A file that
    fails to load is reported and skipped.

    A "Month" column (1-12) is derived from the first word of "Statistics";
    in the projection table, rows whose statistic starts with "Annu" get NaN.
    Each table is sorted and written to
    ``historical_precipitation/historical_precipitation_clean_<country>.csv``
    and ``projection_precipitation/projection_precipitation_clean_<country>.csv``.
    A table with fewer than 10 rows is reported as missing and not written.

    Returns `country`, so the caller can tell which task has finished.
    """
    hist_frames = []
    pred_frames = []
    for filename in glob.glob(DATASET_FOLDER + "precipitation/*" + country + "*.csv"):
        try:
            df = read_onefile(filename)
        except Exception as err:
            print("ERROR reading", filename)
            traceback.print_tb(err.__traceback__)
            continue
        if "historical" in filename:
            hist_frames.append(df)
        else:
            pred_frames.append(df)

    # Concatenate once: DataFrame.append no longer exists in pandas >= 2.0 and
    # re-copied the whole table on every iteration.
    df_hist = pd.concat(hist_frames) if hist_frames else pd.DataFrame()
    df_pred = pd.concat(pred_frames) if pred_frames else pd.DataFrame()

    if len(df_hist) < 10:
        print("ERROR : no History data for", country)
    else:
        # Extract Month number for History ("Jan Average" -> "Jan " -> 1)
        df_hist["Month"] = (
            df_hist["Statistics"].str[:4].apply(lambda x: abreviation2nombre(x.strip()))
        )
        df_hist = df_hist.sort_values(["ISO3", "Year", "Month"])
        hist_path = (
            f"{DATASET_FOLDER}historical_precipitation/"
            f"historical_precipitation_clean_{country}.csv"
        )
        # The output folder may not exist yet on a fresh checkout.
        os.makedirs(os.path.dirname(hist_path), exist_ok=True)
        df_hist.to_csv(hist_path, index=False)

    if len(df_pred) < 10:
        print("ERROR : no projection data for", country)
    else:
        # Extract Month number for projection; annual statistics have no month.
        df_pred["Month"] = (
            df_pred["Statistics"]
            .str[:4]
            .apply(lambda x: abreviation2nombre(x.strip()) if x != "Annu" else np.nan)
        )
        df_pred = df_pred.sort_values(["ISO3", "Year", "Model", "Month"])
        pred_path = (
            f"{DATASET_FOLDER}projection_precipitation/"
            f"projection_precipitation_clean_{country}.csv"
        )
        os.makedirs(os.path.dirname(pred_path), exist_ok=True)
        df_pred.to_csv(pred_path, index=False)
    return country

In [10]:
# Trial run on a single country (GMB); the function returns the code it was given.
gen_dataset_country("GMB")

'GMB'

In [11]:
# Trial run on XRK: the output below reports that no historical data was found.
gen_dataset_country("XRK")

ERROR : no History data for XRK


'XRK'

In [12]:
# Clean every country listed in worldbank_countries.csv, one task per ISO3 code.
df = pd.read_csv(f"{DATASET_FOLDER}worldbank_countries.csv")
countries_code = df["code"].tolist()
with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
    futures = [
        executor.submit(gen_dataset_country, country=iso3) for iso3 in countries_code
    ]
    # Report each country as soon as its task completes (not submission order).
    for future in concurrent.futures.as_completed(futures):
        print(f"Done {future.result()}")

ERROR : no History data for HKG
ERROR : no projection data for HKG
Done HKG
ERROR : no History data for DFR
ERROR : no projection data for DFR
Done DFR
Done CHL
Done CRI
Done HUN
Done GHA
Done GBR
Done DZA
Done AUS
Done ECU
Done IDN
Done BEN
Done EGY
Done IND
Done GUY
Done CHN
Done BEL
Done ESP
Done BGD
Done BRA
Done COL
Done CAN
Done BWA
Done GTM
Done FRA
Done ETH
Done FJI
Done CAF
Done HND
Done BOL
Done AFG
Done BRB
Done HTI
Done ARG
Done ROU
Done LBN
Done NZL
Done TUR
Done JPN
Done PAK
Done CHE
Done CUB
Done GRD
Done IRQ
Done JOR
Done TZA
Done USA
Done KOR
Done JAM
Done DJI
Done GRC
Done IRN
Done BFA
Done NLD
Done TUN
ERROR : no History data for TWN
ERROR : no projection data for TWN
Done TWN
Done MEX
Done MMR
ERROR : no History data for YUG
ERROR : no projection data for YUG
Done YUG
Done ITA
Done NIC
ERROR : no History data for YMD
ERROR : no projection data for YMD
Done YMD
ERROR : no History data for YMN
ERROR : no projection data for YMN
Done YMN
Done MAR
Done NPL
Done SOM
Done

### Check

In [13]:
# Spot-check the cleaned historical file written for GMB; the temporary frame
# is removed again once it has been previewed.
gmb_hist_check = pd.read_csv(
    f"{DATASET_FOLDER}historical_precipitation/historical_precipitation_clean_GMB.csv"
)
gmb_hist_check.head(3)
del gmb_hist_check

Unnamed: 0,Rainfall - (MM),Year,Statistics,ISO3,Month
0,0.0,1901,Jan Average,GMB,1
1,0.0,1901,Feb Average,GMB,2
2,0.0,1901,Mar Average,GMB,3


In [14]:
# Spot-check the cleaned projection file written for XRK: preview the first
# rows, list the columns, then drop the temporary frame.
xrk_pred_check = pd.read_csv(
    f"{DATASET_FOLDER}projection_precipitation/projection_precipitation_clean_XRK.csv"
)
xrk_pred_check.head(3)
xrk_pred_check.columns
del xrk_pred_check

Unnamed: 0,Monthly Precipitation - (MM),Year,Model,Statistics,Country,ISO3,RCP,Expected 5-day Cumulative Rainfall Maximum in 10 Years (10-yr Return Level) - (MM),Month
0,-24.050205,2020-2039,Ensemble (10th Percentile),Jan Average,Kosovo,XRK,rcp45,,1.0
1,-21.30951,2020-2039,Ensemble (10th Percentile),Jan Average,Kosovo,XRK,rcp85,,1.0
2,-23.793648,2020-2039,Ensemble (10th Percentile),Jan Average,Kosovo,XRK,rcp26,,1.0


Index(['Monthly Precipitation - (MM)', 'Year', 'Model', 'Statistics',
       'Country', 'ISO3', 'RCP',
       'Expected 5-day Cumulative Rainfall Maximum in 10 Years (10-yr Return Level) - (MM)',
       'Month'],
      dtype='object')

## Merge all files

### Historical

In [5]:
# Stack every per-country historical file into one table and export it.
# The frames are collected first and concatenated once: DataFrame.append no
# longer exists in pandas >= 2.0 and copied the whole table on each iteration.
hist_frames = []
for filename in glob.glob(DATASET_FOLDER + "historical_precipitation/*.csv"):
    df = pd.read_csv(filename)
    hist_frames.append(df)
# Without any input file, fall back to an empty table (pd.concat([]) raises).
df_hist = pd.concat(hist_frames) if hist_frames else pd.DataFrame()
df_hist.to_csv(
    f"{DATASET_FOLDER}historical_precipitation_clean_2020-12-02.csv", index=False
)

In [6]:
# Preview the first rows of the merged historical table.
df_hist.head(3)

Unnamed: 0,Rainfall - (MM),Year,Statistics,Country,ISO3,Month
0,73.9679,1901,Jan Average,Liechtenstein,LIE,1
1,64.055,1901,Feb Average,Liechtenstein,LIE,2
2,208.607,1901,Mar Average,Liechtenstein,LIE,3


In [None]:
# dict(df_hist.ISO3.value_counts())

### Projection

In [None]:
# Stack every per-country projection file into one table, switch to short
# snake_case column names and export the result.
# The frames are collected first and concatenated once: DataFrame.append no
# longer exists in pandas >= 2.0 and copied the whole table on each iteration.
pred_frames = []
for filename in tqdm(glob.glob(DATASET_FOLDER + "projection_precipitation/*.csv")):
    df = pd.read_csv(filename, low_memory=False)
    pred_frames.append(df)
# Without any input file, fall back to an empty table (pd.concat([]) raises).
df_pred = pd.concat(pred_frames) if pred_frames else pd.DataFrame()
# Raw column label -> name used in the merged export.
rename = {
    "Monthly Precipitation - (MM)": "monthly_prcp_mm",
    "Year": "year",
    "Model": "model",
    "Statistics": "statistics",
    "ISO3": "ISO3",
    "RCP": "projection_rcp",
    "Largest Single Day Rainfall - (MM)": "largest_single_day_rain_mm",
    "Largest 5-day Cumulative Rainfall - (MM)": "largest_5-day_rain_sum_mm",
    "Expected Daily Rainfall Maximum in 10 Years (10-yr Return Level) - (MM)": "daily_rain_max_10_years_mm",
    "Expected Daily Rainfall Maximum in 25 Years (25-yr Return Level) - (MM)": "daily_rain_max_25_years_mm",
    "Expected 5-day Cumulative Rainfall Maximum in 10 Years (10-yr Return Level) - (MM)": "5-day_rain_sum_max_10_years_mm",
    "Expected 5-day Cumulative Rainfall Maximum in 25 Years (25-yr Return Level) - (MM)": "5-day_rain_sum_max_25_years_mm",
    "Expected Largest Monthly Rainfall Amount in 25 Years (25-yr Return Level) - (MM)": "largest_month_rain_25_years_mm",
    "Expected Largest Monthly Rainfall Amount in 10 Years (10-yr Return Level) - (MM)": "largest_month_rain_10_years_mm",
    "Number of Days with Rainfall > 20mm - (Days)": "nb_days_with_rain_>_20mm",
    "Number of Days with Rainfall > 50mm - (Days)": "nb_days_with_rain_>_50mm",
    "Rainfall Amount from Very Wet Days - (Percentage)": "rain_from_very_wet_days_percent",
    "Month": "month",
}
df_pred = df_pred.rename(columns=rename)
df_pred.to_csv(
    f"{DATASET_FOLDER}projection_precipitation_clean_2020-12-01.csv", index=False
)
df_pred.head(3)

In [None]:
# Re-export of the merged projection table under the 2020-12-02 file name.
# The rename is repeated here; if the previous cell has already run, the
# columns carry their new names and this rename leaves the table unchanged.
df_pred = df_pred.rename(columns=rename)
df_pred.to_csv(
    f"{DATASET_FOLDER}projection_precipitation_clean_2020-12-02.csv", index=False
)
df_pred.head(3)

In [None]:
# Number of rows in the merged projection table.
len(df_pred)

In [None]:
# Display the merged projection table.
df_pred

In [None]:
# dict(df_pred['ISO3'].value_counts())

In [None]:
# Reload the merged projection table from the 2020-12-02 CSV export.
df_pred = pd.read_csv(
    f"{DATASET_FOLDER}projection_precipitation_clean_2020-12-02.csv", low_memory=False
)

In [None]:
# Example lookup: rows for FRA under rcp26, period 2020-2039, model bcc_csm1_1.
df_pred.query(
    "ISO3 == 'FRA' and projection_rcp=='rcp26' and year=='2020-2039' and model=='bcc_csm1_1'"
)