# Cleaning Global Knowledge Portal Data

In [50]:
# For multiple output per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [51]:
#DATASET_FOLDER = '/media/data-nvme/dev/datasets/WorldBank/'
DATASET_FOLDER = '../../datasets/'

In [52]:
import os
import pandas as pd
from tqdm import tqdm
import concurrent.futures
import glob
import traceback
import sys
import numpy as np

In [53]:
rcp_projection = ['rcp26', 'rcp45','rcp60', 'rcp85']

def abreviation2nombre(abr):
        lst_abr = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        return(lst_abr.index(abr)+1)

def read_onefile(filename):
    country_error_search_string = ['The', 'State of', 'United Republic of', 'Democratic People’s Republic of', 'Republic of']
    df = pd.read_csv(filename, sep=r', ', engine='python')
    if 'historical' in filename:
        # S'il y a un problème de vigule on nettoit le dataframe
        if 'Country' in df.columns and df.Country.all() in country_error_search_string:
            df.reset_index(inplace=True)
            # On récuppère le nom réel
            df['new_Country'] = df[['Statistics', 'Country']].apply(lambda x: x[0]+', '+x[1], axis=1)
            df.drop('Country', axis=1, inplace=True)
            df.rename(columns={'index': 'Rainfall - (MM)',
                               'Rainfall - (MM)': 'Year',
                               'Year': 'Statistics', 
                               'Statistics': 'tmp',
                               'new_Country': 'Country'}, inplace=True)
            df.drop('tmp', axis=1, inplace=True)
    else:
        for rcp in rcp_projection:
            if rcp in filename:
                df['RCP'] = rcp
        #print(entry.name, ':', df.columns)
        # S'il y a un problème de vigule on nettoit le dataframe
        if 'Country' in df.columns and df.Country.all() in country_error_search_string:
            df.reset_index(inplace=True)
            # On récuppère le nom réel
            df['new_Country'] = df[['Statistics', 'Country']].apply(lambda x: x[0]+', '+x[1], axis=1)
            df.drop('Country', axis=1, inplace=True)

            df.rename(columns={'index': 'Monthly Precipitation - (MM)',
                               'Monthly Precipitation - (MM)': 'Year',
                               'Year': 'Model', 
                               'Model': 'Statistics',
                               'Statistics': 'tmp',
                               'new_Country': 'Country'}, inplace=True)
            df.drop('tmp', axis=1, inplace=True)    
    return df

In [54]:
def gen_dataset_country(country):
    '''Créer les deux fichiers : historique_precipitation_clean.csv et projection_precipitation_clean.csv
    '''
    df_hist = pd.DataFrame()
    df_pred = pd.DataFrame()
    for filename in glob.glob(DATASET_FOLDER + 'precipitation/*'+country+'*.csv'): 
        try:
            df = read_onefile(filename)
            if 'historical' in filename:
                df_hist = df_hist.append(df)
            else:
                df_pred = df_pred.append(df)
        except Exception as err:
            print('ERROR reading', filename)
            traceback.print_tb(err.__traceback__)
    # Extract Month number for History
    df_hist['Month'] = df_hist['Statistics'].str[:4].apply(lambda x: abreviation2nombre(x.strip()))
    df_hist.sort_values(['Country','Year','Month'], inplace=True)
    df_hist.to_csv(f'{DATASET_FOLDER}historical_precipitation/historical_precipitation_clean_' + country + '.csv', index=False)
    # Extract Month number for projection
    df_pred['Month'] = df_pred['Statistics'].str[:4].apply(lambda x: abreviation2nombre(x.strip()) if x != 'Annu' else np.NaN)
    df_pred.sort_values(['Country','Year','Model','Month'], inplace=True)
    df_pred.to_csv(f'{DATASET_FOLDER}projection_precipitation/projection_precipitation_clean_' + country + '.csv', index=False)
    return country

In [55]:
#gen_dataset_country('FRA')

'FRA'

In [None]:
df = pd.read_csv(DATASET_FOLDER + 'worldbank_countries.csv')
countries_code = df.code.to_list()
with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
    futures = []
    for iso3 in countries_code:
        futures.append(executor.submit(gen_dataset_country, country=iso3))
    for future in concurrent.futures.as_completed(futures):
        print(f'Done {future.result()}')

In [58]:
### Check

In [59]:
fra = pd.read_csv(DATASET_FOLDER + 'historical_precipitation/historical_precipitation_clean_FRA.csv')
fra.head(3)
del fra

Unnamed: 0,Rainfall - (MM),Year,Statistics,Country,ISO3,Month
0,40.929,1901,Jan Average,France,FRA,1
1,34.7865,1901,Feb Average,France,FRA,2
2,90.9714,1901,Mar Average,France,FRA,3


In [60]:
fra = pd.read_csv(DATASET_FOLDER + 'projection_precipitation/projection_precipitation_clean_FRA.csv')
fra.head(3)
del fra

Unnamed: 0,Expected 5-day Cumulative Rainfall Maximum in 10 Years (10-yr Return Level) - (MM),Year,Model,Statistics,ISO3,RCP,Rainfall Amount from Very Wet Days - (Percentage),Largest 5-day Cumulative Rainfall - (MM),Expected Daily Rainfall Maximum in 25 Years (25-yr Return Level) - (MM),Expected Largest Monthly Rainfall Amount in 10 Years (10-yr Return Level) - (MM),Number of Days with Rainfall > 50mm - (Days),Largest Single Day Rainfall - (MM),Expected Daily Rainfall Maximum in 10 Years (10-yr Return Level) - (MM),Expected Largest Monthly Rainfall Amount in 25 Years (25-yr Return Level) - (MM),Number of Days with Rainfall > 20mm - (Days),Monthly Precipitation - (MM),Country,Month
0,,2020-2039,Ensemble (10th Percentile),Jan Average,FRA,rcp85,,,,,,,,,,-13.939564,France,1.0
1,,2020-2039,Ensemble (10th Percentile),Jan Average,FRA,rcp45,,,,,,,,,,-16.419842,France,1.0
2,,2020-2039,Ensemble (10th Percentile),Jan Average,FRA,rcp60,,,,,,,,,,-12.64255,France,1.0


## Merge all files

### Historical

In [64]:
df_hist = pd.DataFrame()
for filename in glob.glob(DATASET_FOLDER + 'historical_precipitation/*.csv'):
    df = pd.read_csv(filename)
    df_hist = df_hist.append(df)
df_hist.to_csv(f'{DATASET_FOLDER}../historical_precipitation_clean_2020-12-01.csv', index=False)

In [63]:
df_pred.head(3)

Unnamed: 0,Rainfall - (MM),Year,Statistics,Country,ISO3,Month
0,73.9679,1901,Jan Average,Liechtenstein,LIE,1
1,64.055,1901,Feb Average,Liechtenstein,LIE,2
2,208.607,1901,Mar Average,Liechtenstein,LIE,3


### Projection

In [67]:
df_pred = pd.DataFrame()
for filename in tqdm(glob.glob(DATASET_FOLDER + 'projection_precipitation/*.csv')):
    df = pd.read_csv(filename, low_memory=False)
    df_pred = df_pred.append(df)
rename = ['monthly_prcp_mm', 'year', 'model', 'statistics',
       'country', 'ISO3', 'projection_rcp',
       'daily_rain_max_25_years_mm',
       'largest_single_day_rain_mm',
       'daily_rain_max_10_years_mm',
       '5-day_rain_sum_max_10_years_mm',
       'rain_from_very_wet_days_percent',
       'largest_5-day_rain_sum_mm',
       'largest_month_rain_25_years_mm',
       'nb_days_with_rain_>_50mm',
       'nb_days_with_rain_>_20mm',
       'largest_month_rain_10_years_mm',
       'month']
df_pred.columns = rename
df_pred.to_csv(f'{DATASET_FOLDER}../projection_precipitation_clean_2020-12-01.csv', index=False)
df_pred.head(3)

100%|██████████| 192/192 [02:47<00:00,  1.15it/s]


Unnamed: 0,monthly_prcp_mm,year,model,statistics,country,ISO3,projection_rcp,daily_rain_max_25_years_mm,largest_single_day_rain_mm,daily_rain_max_10_years_mm,5-day_rain_sum_max_10_years_mm,rain_from_very_wet_days_percent,largest_5-day_rain_sum_mm,largest_month_rain_25_years_mm,nb_days_with_rain_>_50mm,nb_days_with_rain_>_20mm,largest_month_rain_10_years_mm,month
0,,2020-2039,Ensemble (10th Percentile),Jan Average,DZA,rcp26,,-3.275049,Algeria,,,,,,,,,1.0
1,,2020-2039,Ensemble (10th Percentile),Jan Average,DZA,rcp85,,-3.724133,Algeria,,,,,,,,,1.0
2,,2020-2039,Ensemble (10th Percentile),Jan Average,DZA,rcp45,,-3.196988,Algeria,,,,,,,,,1.0


In [68]:
len(df_pred)

5920646