# Cleaning Global Knowledge Portal Data

In [1]:
# Display the value of every top-level expression in a cell, not only the last
# one; the "Check" cells further down rely on this to show more than one result.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Root folder of the raw and cleaned CSV files. Paths in this notebook are built
# by appending to this string, so it must end with a slash.
# Alternative absolute location (disabled):
#DATASET_FOLDER = '/media/data-nvme/dev/datasets/WorldBank/'
DATASET_FOLDER = '../../datasets/'

In [3]:
import os
import pandas as pd
from tqdm import tqdm
import concurrent.futures
import glob
import traceback
import sys
import numpy as np

In [4]:
# Scenario tags looked for in projection file names to fill the RCP column.
rcp_projection = [
    'rcp26',
    'rcp45',
    'rcp60',
    'rcp85',
]

def abreviation2nombre(abr):
    """Return the 1-based month number of an English three-letter month abbreviation.

    Parameters
    ----------
    abr : str
        One of 'Jan', 'Feb', ..., 'Dec' (exact spelling and case).

    Returns
    -------
    int
        1 for 'Jan' up to 12 for 'Dec'.

    Raises
    ------
    ValueError
        If ``abr`` is not one of the twelve abbreviations.
    """
    months = 'Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split()
    position = months.index(abr)
    return position + 1

def _repair_split_country(df, label_columns):
    """Realign a frame whose rows were shifted by a comma inside the country name.

    When a country name such as ``'X, The'`` is split on ``', '``, every row has
    one field more than the header. pandas then uses the metric values as the
    index, and the values up to the country name land under the header to the
    left of their own. This helper moves them back under their own headers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame as returned by ``pd.read_csv`` for such a file: its first column
        carries the metric header and its index holds the metric values.
    label_columns : list of str
        Headers that follow the metric and precede ``Country``, in file order.

    Returns
    -------
    pandas.DataFrame
        New frame with realigned columns. Both halves of the split country
        name are discarded, so the result has no ``Country`` column.

    Raises
    ------
    KeyError
        If ``Statistics`` or ``Country`` is missing from ``df``.
    """
    metric = df.columns[0]
    true_names = [metric] + list(label_columns)
    # After reset_index the metric values live in a column called 'index'.
    # 'Statistics' and 'Country' hold the two halves of the country name.
    realigned = df.reset_index().drop(columns=['Statistics', 'Country'])
    # Each remaining header moves one step right: index -> metric -> Year -> ...
    shift = dict(zip(['index'] + true_names[:-1], true_names))
    return realigned.rename(columns=shift)


def read_onefile(filename):
    """Read one raw precipitation CSV and return it as a cleaned DataFrame.

    The file is parsed with ``', '`` as separator. For projection files (any
    file whose name does not contain ``'historical'``) an ``RCP`` column is
    added, holding the scenario tag from ``rcp_projection`` found in the file
    name. If the ``Country`` column only contains the trailing part of a
    comma-separated country name, the columns are realigned and ``Country`` is
    removed from the result.

    Parameters
    ----------
    filename : str
        Path of the CSV file. The name decides between historical and
        projection handling.

    Returns
    -------
    pandas.DataFrame
        The parsed (and, when needed, realigned) data.
    """
    country_error_search_string = ['The', 'State of', 'United Republic of', 'Democratic People’s Republic of', 'Republic of']
    df = pd.read_csv(filename, sep=r', ', engine='python')
    is_historical = 'historical' in filename

    if not is_historical:
        # Create a column for the RCP scenario named in the file name.
        for rcp in rcp_projection:
            if rcp in filename:
                df['RCP'] = rcp

    # Comma problem: every Country value is only the tail of the real name.
    # isin(...).all() is an explicit element-wise test; the guard on len(df)
    # keeps an empty file from being treated as broken.
    country_was_split = (
        'Country' in df.columns
        and len(df) > 0
        and bool(df['Country'].isin(country_error_search_string).all())
    )
    if country_was_split:
        if is_historical:
            label_columns = ['Year', 'Statistics']
        else:
            label_columns = ['Year', 'Model', 'Statistics']
        df = _repair_split_country(df, label_columns)

    if not is_historical and 'rx5dayreturnlevel25' in filename:
        # These files carry the daily-maximum header although they hold the
        # 5-day cumulative indicator.
        df = df.rename(columns={'Expected Daily Rainfall Maximum in 25 Years (25-yr Return Level) - (MM)': 'Expected 5-day Cumulative Rainfall Maximum in 25 Years (25-yr Return Level) - (MM)'})
    return df

In [5]:
# Sanity check on a projection file: the RCP column is filled from the file name.
read_onefile(DATASET_FOLDER+'/precipitation/projection_2040_2059_FRA_rcp26_rx5day.csv').head(2)

Unnamed: 0,Largest 5-day Cumulative Rainfall - (MM),Year,Model,Statistics,ISO3,RCP
0,4.3227,2040-2059,bcc_csm1_1_m,Jan Anomaly,FRA,rcp26
1,-3.6732,2040-2059,bcc_csm1_1_m,Feb Anomaly,FRA,rcp26


In [6]:
# Same projection indicator and scenario, this time for GMB.
read_onefile(DATASET_FOLDER+'/precipitation/projection_2040_2059_GMB_rcp26_rx5day.csv').head(2)

Unnamed: 0,Largest 5-day Cumulative Rainfall - (MM),Year,Model,Statistics,ISO3,RCP
0,-0.0949,2040-2059,bcc_csm1_1_m,Jan Anomaly,GMB,rcp26
1,-0.0774,2040-2059,bcc_csm1_1_m,Feb Anomaly,GMB,rcp26


In [7]:
# Sanity check on a historical file (FRA).
read_onefile(DATASET_FOLDER+'/precipitation/historical_1901-2016_FRA__mavg.csv').head(2)

Unnamed: 0,Rainfall - (MM),Year,Statistics,Country,ISO3
0,40.929,1901,Jan Average,France,FRA
1,34.7865,1901,Feb Average,France,FRA


In [8]:
# Historical file for GMB. In the recorded output the Country column is absent,
# which is what read_onefile's comma-repair branch produces.
read_onefile(DATASET_FOLDER+'/precipitation/historical_1901-2016_GMB__mavg.csv').head(2)

Unnamed: 0,Rainfall - (MM),Year,Statistics,ISO3
0,0.0,1901,Jan Average,GMB
1,0.0,1901,Feb Average,GMB


In [9]:
def gen_dataset_country(country):
    """Build the cleaned historical and projection CSV files for one country.

    Every file matching ``precipitation/*<country>*.csv`` under
    ``DATASET_FOLDER`` is read with ``read_onefile``. Files whose path contains
    ``'historical'`` go to the historical table, all others to the projection
    table. A ``Month`` column is derived from ``Statistics`` and each table is
    sorted and written to
    ``historical_precipitation/historical_precipitation_clean_<country>.csv``
    and ``projection_precipitation/projection_precipitation_clean_<country>.csv``.
    A table with fewer than 10 rows is reported as missing and not written.

    Parameters
    ----------
    country : str
        Country code used both in the glob pattern and in the output names.

    Returns
    -------
    str
        ``country``, so callers can report which task finished.

    Raises
    ------
    ValueError
        If a ``Statistics`` value does not start with a known month
        abbreviation (``'Annu'`` is accepted for projections only).
    """
    hist_frames = []
    pred_frames = []
    pattern = os.path.join(DATASET_FOLDER, 'precipitation', '*' + country + '*.csv')
    for filename in glob.glob(pattern):
        try:
            df = read_onefile(filename)
        except Exception as err:
            # Best effort: one unreadable file must not stop the country.
            print('ERROR reading', filename)
            # print_exception also shows the exception type and message,
            # which print_tb alone leaves out.
            traceback.print_exception(type(err), err, err.__traceback__)
            continue
        if 'historical' in filename:
            hist_frames.append(df)
        else:
            pred_frames.append(df)

    # Concatenate once: DataFrame.append was removed in pandas 2.0 and copied
    # the accumulated frame on every call. pd.concat([]) raises, hence the guard.
    df_hist = pd.concat(hist_frames) if hist_frames else pd.DataFrame()
    df_pred = pd.concat(pred_frames) if pred_frames else pd.DataFrame()

    def month_number(prefix):
        # prefix is the first four characters of Statistics, e.g. 'Jan '.
        return abreviation2nombre(prefix.strip())

    if len(df_hist) < 10:
        print('ERROR : no History data for', country)
    else:
        # Extract the month number for the historical table.
        df_hist['Month'] = df_hist['Statistics'].str[:4].apply(month_number)
        df_hist = df_hist.sort_values(['ISO3', 'Year', 'Month'])
        hist_dir = os.path.join(DATASET_FOLDER, 'historical_precipitation')
        os.makedirs(hist_dir, exist_ok=True)
        df_hist.to_csv(os.path.join(hist_dir, 'historical_precipitation_clean_' + country + '.csv'), index=False)

    if len(df_pred) < 10:
        print('ERROR : no projection data for', country)
    else:
        # Extract the month number for the projection table; annual rows
        # ('Annu...') have no month and get NaN.
        df_pred['Month'] = df_pred['Statistics'].str[:4].apply(
            lambda prefix: np.nan if prefix == 'Annu' else month_number(prefix)
        )
        df_pred = df_pred.sort_values(['ISO3', 'Year', 'Model', 'Month'])
        pred_dir = os.path.join(DATASET_FOLDER, 'projection_precipitation')
        os.makedirs(pred_dir, exist_ok=True)
        df_pred.to_csv(os.path.join(pred_dir, 'projection_precipitation_clean_' + country + '.csv'), index=False)
    return country

In [10]:
# Trial run on a single country; the function returns the code it was given.
gen_dataset_country('GMB')

'GMB'

In [11]:
# Trial run on XRK; in the recorded run the historical table was reported missing.
gen_dataset_country('XRK')

ERROR : no History data for XRK


'XRK'

In [12]:
# Clean every country listed in worldbank_countries.csv, in parallel threads.
countries_code = pd.read_csv(DATASET_FOLDER + 'worldbank_countries.csv')['code'].to_list()
failed_countries = []
with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
    # Remember which country each future belongs to, so that a failure can be
    # attributed instead of surfacing as an anonymous exception.
    future_to_iso3 = {
        executor.submit(gen_dataset_country, country=iso3): iso3
        for iso3 in countries_code
    }
    for future in concurrent.futures.as_completed(future_to_iso3):
        iso3 = future_to_iso3[future]
        try:
            print(f'Done {future.result()}')
        except Exception as err:
            # Keep reporting the remaining countries; fail loudly afterwards.
            failed_countries.append(iso3)
            print(f'FAILED {iso3}: {err!r}')
if failed_countries:
    raise RuntimeError(f'Cleaning failed for: {failed_countries}')

Done BHR
Done AND
Done BWA
Done BFA
Done BHS
Done AFG
Done BLR
Done BRB
Done AGO
Done AZE
Done BGR
Done BLZ
Done AUT
Done BRA
Done BEN
Done BDI
Done ARG
Done CMR
Done CPV
Done ALB
Done CAN
Done BRN
Done DZA
Done ATG
Done KHM
Done BEL
Done BIH
Done BTN
Done BGD
Done AUS
Done BOL
Done ARM
Done CAF
Done TCD
Done CHL
Done COM
Done COD
Done CHN
Done COL
Done CRI
Done COG
Done CIV
Done ERI
Done DOM
Done DJI
Done DNK
Done EST
Done ECU
Done GMB
Done CYP
Done FRA
Done CUB
Done DMA
Done HRV
Done GAB
Done FIN
Done CZE
Done SLV
Done EGY
Done ETH
Done FJI
ERROR : no History data for GNQ
Done GNQ
Done FSM
Done GEO
Done FRO
Done DEU
Done GRC
Done GHA
Done GRD
Done GTM
Done GRL
ERROR : no History data for XRK
Done XRK
Done GIN
Done GNB
Done HTI
Done GUY
Done HND
Done HUN
Done JOR
Done IRQ
Done KOR
Done ISR
Done IRL
Done ISL
Done IND
Done LAO
Done PRK
Done KGZ
Done JAM
Done KIR
Done IDN
Done ITA
Done IRN
Done KAZ
Done KWT
Done JPN
Done KEN
Done LVA
Done LBY
Done LBR
Done LSO
Done LBN
Done LIE
Done LTU


### Check

In [13]:
# Spot-check the cleaned historical file written for GMB: show its first three
# rows without keeping the frame around.
pd.read_csv(
    DATASET_FOLDER + 'historical_precipitation/historical_precipitation_clean_GMB.csv'
).head(3)

Unnamed: 0,Rainfall - (MM),Year,Statistics,ISO3,Month
0,0.0,1901,Jan Average,GMB,1
1,0.0,1901,Feb Average,GMB,2
2,0.0,1901,Mar Average,GMB,3


In [14]:
# Spot-check the cleaned projection file written for XRK: first three rows,
# then the column index.
xrk_projection = pd.read_csv(
    DATASET_FOLDER + 'projection_precipitation/projection_precipitation_clean_XRK.csv'
)
xrk_projection.head(3)
xrk_projection.columns
# Release the temporary frame so it does not linger in the kernel.
del xrk_projection

Unnamed: 0,Monthly Precipitation - (MM),Year,Model,Statistics,Country,ISO3,RCP,Expected 5-day Cumulative Rainfall Maximum in 10 Years (10-yr Return Level) - (MM),Month
0,-24.050205,2020-2039,Ensemble (10th Percentile),Jan Average,Kosovo,XRK,rcp45,,1.0
1,-21.30951,2020-2039,Ensemble (10th Percentile),Jan Average,Kosovo,XRK,rcp85,,1.0
2,-23.793648,2020-2039,Ensemble (10th Percentile),Jan Average,Kosovo,XRK,rcp26,,1.0


Index(['Monthly Precipitation - (MM)', 'Year', 'Model', 'Statistics',
       'Country', 'ISO3', 'RCP',
       'Expected 5-day Cumulative Rainfall Maximum in 10 Years (10-yr Return Level) - (MM)',
       'Month'],
      dtype='object')

## Merge all files

### Historical

In [15]:
# Merge every per-country historical file into a single dated CSV.
# The frames are collected first and concatenated once: DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated frame on each iteration.
hist_frames = [
    pd.read_csv(filename)
    for filename in glob.glob(DATASET_FOLDER + 'historical_precipitation/*.csv')
]
# pd.concat raises on an empty list; fall back to an empty frame instead.
df_hist = pd.concat(hist_frames) if hist_frames else pd.DataFrame()
df_hist.to_csv(f'{DATASET_FOLDER}historical_precipitation_clean_2020-12-01.csv', index=False)

In [16]:
# Preview the merged historical table.
df_hist.head(3)

Unnamed: 0,Rainfall - (MM),Year,Statistics,Country,ISO3,Month
0,73.9679,1901,Jan Average,Liechtenstein,LIE,1
1,64.055,1901,Feb Average,Liechtenstein,LIE,2
2,208.607,1901,Mar Average,Liechtenstein,LIE,3


In [17]:
#dict(df_hist.ISO3.value_counts())

### Projection

In [18]:
# Merge every per-country projection file, shorten the column names and write
# the result to a single dated CSV.
# The frames are collected first and concatenated once: DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated frame on each iteration.
projection_frames = [
    pd.read_csv(filename, low_memory=False)
    for filename in tqdm(glob.glob(DATASET_FOLDER + 'projection_precipitation/*.csv'))
]
# pd.concat raises on an empty list; fall back to an empty frame instead.
df_pred = pd.concat(projection_frames) if projection_frames else pd.DataFrame()

# Raw header -> short column name. Columns not listed here keep their name.
rename = {
    'Monthly Precipitation - (MM)': 'monthly_prcp_mm',
    'Year': 'year',
    'Model': 'model',
    'Statistics': 'statistics',
    'ISO3': 'ISO3',
    'RCP': 'projection_rcp',
    'Largest Single Day Rainfall - (MM)': 'largest_single_day_rain_mm',
    'Largest 5-day Cumulative Rainfall - (MM)': 'largest_5-day_rain_sum_mm',
    'Expected Daily Rainfall Maximum in 10 Years (10-yr Return Level) - (MM)': 'daily_rain_max_10_years_mm',
    'Expected Daily Rainfall Maximum in 25 Years (25-yr Return Level) - (MM)': 'daily_rain_max_25_years_mm',
    'Expected 5-day Cumulative Rainfall Maximum in 10 Years (10-yr Return Level) - (MM)': '5-day_rain_sum_max_10_years_mm',
    'Expected 5-day Cumulative Rainfall Maximum in 25 Years (25-yr Return Level) - (MM)': '5-day_rain_sum_max_25_years_mm',
    'Expected Largest Monthly Rainfall Amount in 25 Years (25-yr Return Level) - (MM)': 'largest_month_rain_25_years_mm',
    'Expected Largest Monthly Rainfall Amount in 10 Years (10-yr Return Level) - (MM)': 'largest_month_rain_10_years_mm',
    'Number of Days with Rainfall > 20mm - (Days)': 'nb_days_with_rain_>_20mm',
    'Number of Days with Rainfall > 50mm - (Days)': 'nb_days_with_rain_>_50mm',
    'Rainfall Amount from Very Wet Days - (Percentage)': 'rain_from_very_wet_days_percent',
    'Month': 'month',
}
df_pred = df_pred.rename(columns=rename)
df_pred.to_csv(f'{DATASET_FOLDER}projection_precipitation_clean_2020-12-01.csv', index=False)
df_pred.head(3)

100%|██████████| 197/197 [03:41<00:00,  1.12s/it]


Unnamed: 0,5-day_rain_sum_max_25_years_mm,year,model,statistics,ISO3,projection_rcp,daily_rain_max_25_years_mm,nb_days_with_rain_>_50mm,monthly_prcp_mm,Country,5-day_rain_sum_max_10_years_mm,largest_5-day_rain_sum_mm,nb_days_with_rain_>_20mm,rain_from_very_wet_days_percent,daily_rain_max_10_years_mm,largest_month_rain_25_years_mm,largest_single_day_rain_mm,largest_month_rain_10_years_mm,month
0,,2020-2039,Ensemble (10th Percentile),Jan Anomaly,DZA,rcp45,,0.0,,,,,,,,,,,1.0
1,,2020-2039,Ensemble (10th Percentile),Jan Anomaly,DZA,rcp45,,,,,,,,-0.3558,,,,,1.0
2,,2020-2039,Ensemble (10th Percentile),Jan Anomaly,DZA,rcp26,,,,,,,,-0.3257,,,,,1.0


In [19]:
# The previous cell already renamed the columns and wrote the merged CSV.
# Repeating both here changed nothing in the frame and only rewrote the same
# large file, so this cell just previews the result.
df_pred.head(3)

Unnamed: 0,5-day_rain_sum_max_25_years_mm,year,model,statistics,ISO3,projection_rcp,daily_rain_max_25_years_mm,nb_days_with_rain_>_50mm,monthly_prcp_mm,Country,5-day_rain_sum_max_10_years_mm,largest_5-day_rain_sum_mm,nb_days_with_rain_>_20mm,rain_from_very_wet_days_percent,daily_rain_max_10_years_mm,largest_month_rain_25_years_mm,largest_single_day_rain_mm,largest_month_rain_10_years_mm,month
0,,2020-2039,Ensemble (10th Percentile),Jan Anomaly,DZA,rcp45,,0.0,,,,,,,,,,,1.0
1,,2020-2039,Ensemble (10th Percentile),Jan Anomaly,DZA,rcp45,,,,,,,,-0.3558,,,,,1.0
2,,2020-2039,Ensemble (10th Percentile),Jan Anomaly,DZA,rcp26,,,,,,,,-0.3257,,,,,1.0


In [20]:
# Number of rows in the merged projection table.
len(df_pred)

6039651

In [21]:
# Overview of the merged projection table; the recorded output is truncated to
# the first and last rows.
df_pred

Unnamed: 0,5-day_rain_sum_max_25_years_mm,year,model,statistics,ISO3,projection_rcp,daily_rain_max_25_years_mm,nb_days_with_rain_>_50mm,monthly_prcp_mm,Country,5-day_rain_sum_max_10_years_mm,largest_5-day_rain_sum_mm,nb_days_with_rain_>_20mm,rain_from_very_wet_days_percent,daily_rain_max_10_years_mm,largest_month_rain_25_years_mm,largest_single_day_rain_mm,largest_month_rain_10_years_mm,month
0,,2020-2039,Ensemble (10th Percentile),Jan Anomaly,DZA,rcp45,,0.0,,,,,,,,,,,1.0
1,,2020-2039,Ensemble (10th Percentile),Jan Anomaly,DZA,rcp45,,,,,,,,-0.3558,,,,,1.0
2,,2020-2039,Ensemble (10th Percentile),Jan Anomaly,DZA,rcp26,,,,,,,,-0.3257,,,,,1.0
3,,2020-2039,Ensemble (10th Percentile),Jan Average,DZA,rcp26,,,-3.275049,Algeria,,,,,,,,,1.0
4,,2020-2039,Ensemble (10th Percentile),Jan Anomaly,DZA,rcp60,,,,,,-1.964,,,,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33143,,2080-2099,noresm1_m,Annual Anomaly,RUS,rcp85,,,,,9.55063,,,,,,,,
33144,,2080-2099,noresm1_m,Annual Anomaly,RUS,rcp60,,,,,,,,,2.5989,,,,
33145,,2080-2099,noresm1_m,Annual Anomaly,RUS,rcp45,,,,,,,,13.9125,,,,,
33146,,2080-2099,noresm1_m,Annual Anomaly,RUS,rcp45,,,,,,,,,,,1.767,,


In [22]:
#dict(df_pred['ISO3'].value_counts())

In [23]:
# Reload the merged projection table from the CSV written above, replacing the
# in-memory frame.
df_pred = pd.read_csv(f'{DATASET_FOLDER}projection_precipitation_clean_2020-12-01.csv', low_memory=False)


In [24]:
# Example slice: every row for one country, scenario, period and model.
df_pred.query("ISO3 == 'FRA' and projection_rcp=='rcp26' and year=='2020-2039' and model=='bcc_csm1_1'")

Unnamed: 0,5-day_rain_sum_max_25_years_mm,year,model,statistics,ISO3,projection_rcp,daily_rain_max_25_years_mm,nb_days_with_rain_>_50mm,monthly_prcp_mm,Country,5-day_rain_sum_max_10_years_mm,largest_5-day_rain_sum_mm,nb_days_with_rain_>_20mm,rain_from_very_wet_days_percent,daily_rain_max_10_years_mm,largest_month_rain_25_years_mm,largest_single_day_rain_mm,largest_month_rain_10_years_mm,month
4230361,,2020-2039,bcc_csm1_1,Jan Anomaly,FRA,rcp26,,,,,,,,3.83886,,,,,1.0
4230368,,2020-2039,bcc_csm1_1,Jan Anomaly,FRA,rcp26,,,,,,,,,,,1.19066,,1.0
4230372,,2020-2039,bcc_csm1_1,Jan Average,FRA,rcp26,,,103.5502,France,,,,,,,,,1.0
4230374,,2020-2039,bcc_csm1_1,Jan Anomaly,FRA,rcp26,,,,,,1.29332,,,,,,,1.0
4230378,,2020-2039,bcc_csm1_1,Jan Anomaly,FRA,rcp26,,,,,,,-0.01540,,,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4230622,,2020-2039,bcc_csm1_1,Annual Anomaly,FRA,rcp26,,,,,2.44999,,,,,,,,
4230630,,2020-2039,bcc_csm1_1,Annual Anomaly,FRA,rcp26,,,,,,1.72298,,,,,,,
4230635,,2020-2039,bcc_csm1_1,Annual Anomaly,FRA,rcp26,,,,,,,,,,-5.5609,,,
4230636,,2020-2039,bcc_csm1_1,Annual Anomaly,FRA,rcp26,,,,,,,0.12619,,,,,,
