# Build features for projection

In [1]:
# For multiple output per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
#DATASET_FOLDER = '/media/data-nvme/dev/datasets/WorldBank/'
DATASET_FOLDER = '../../datasets/'

SPARK_MASTER = 'spark://192.168.0.9:7077'
APP_NAME = 'Build features for projection'
input_folder = DATASET_FOLDER
output = DATASET_FOLDER + '../wb_gkp_precipitation'

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv(f'{DATASET_FOLDER}projection_preciptation_yearly_merged-2020-12-02.csv.gz')
df.head(1)

Unnamed: 0,year,month,model,statistics,ISO3,projection_rcp,daily_rain_max_25_years_mm,daily_rain_max_10_years_mm,5-day_rain_sum_max_10_years_mm,5-day_rain_sum_max_25_years_mm,largest_month_rain_10_years_mm,largest_month_rain_25_years_mm
0,2020-2039,,Ensemble (10th Percentile),Annual Anomaly,ARG,rcp60,-19.541,-16.471,-31.162,-36.832,-84.802,-103.96


### Keep ony one climate model

In [5]:
#df.model.unique()

In [6]:
df = df[df['model'] == 'ipsl_cm5a_mr']

### Keep only the minimum features

In [7]:
#df.columns

In [8]:
df = df[['year', 'ISO3', 'projection_rcp', '5-day_rain_sum_max_10_years_mm', '5-day_rain_sum_max_25_years_mm']]

In [9]:
#df.head(3)

### Compute rare precipitation events 

We will compute an occurence of rare precipitation events.

- Every random 10 years we will consider that there will be a flood due to rainfall corresponding to the projected Maximum 5-day Rainfall (10-yr RL)
- Every random 20 years we will consider that there will be a flood due to rainfall corresponding to the projected Maximum 5-day Rainfall (25-yr RL) 20 and not 25 because we have period of 20 years so it is easier. TODO : compute 25 yr RL

In [10]:
df_10y = df[['year', 'ISO3', 'projection_rcp', '5-day_rain_sum_max_10_years_mm']]
df_25y = df[['year', 'ISO3', 'projection_rcp', '5-day_rain_sum_max_25_years_mm']]

In [11]:
#df_10y.sort_values(by=['ISO3', 'projection_rcp', 'year']).head(5)

In [12]:
import random

df_10y_new = pd.DataFrame()
for i, row in df_10y.iterrows():
    # Create an event on the first decade
    # Create a random year of the event
    year = row['year'][:3] + str(random.randint(0, 9))
    # Build the new entry
    new_row = {'year' : year,
               'ISO3' : row['ISO3'],
               'projection_rcp' : row.projection_rcp,
               'rain': row['5-day_rain_sum_max_10_years_mm']}
    # Add the entry
    df_10y_new = df_10y_new.append(new_row, ignore_index=True)
    # Create a second event on the next decade
    year = str(int(row['year'][:3])+1) + str(random.randint(0, 9))
    new_row = {'year' : year,
           'ISO3' : row['ISO3'],
           'projection_rcp' : row.projection_rcp,
           'rain': row['5-day_rain_sum_max_10_years_mm']}
    df_10y_new = df_10y_new.append(new_row, ignore_index=True)
#df_10y_new.head(10)

In [13]:
df_25y[(df_25y.projection_rcp == 'rcp26') & (df_25y.ISO3 == 'AFG')].sort_values(by=['ISO3', 'projection_rcp', 'year']).head(15)

Unnamed: 0,year,ISO3,projection_rcp,5-day_rain_sum_max_25_years_mm
61528,2020-2039,AFG,rcp26,13.5017
22146,2040-2059,AFG,rcp26,28.9298
28831,2060-2079,AFG,rcp26,16.2455
7898,2080-2099,AFG,rcp26,10.1975


In [14]:
int(str(int("2020-2039"[:3])+random.randint(0, 1)) + str(random.randint(0, 9)))

2033

In [15]:

#df_20y_new[(df_20y_new.projection_rcp == 'rcp26') & (df_20y_new.ISO3 == 'AFG')].sort_values(by=['ISO3', 'projection_rcp', 'year']).head(15)

In [16]:
def random_year(year):
    return int(str(int(year[:3])+random.randint(0, 1)) + str(random.randint(0, 9)))

df_20y_new = df_25y.copy()
df_20y_new['year'] = df_25y.year.apply(random_year)
df_20y_new = df_20y_new.rename(columns={'5-day_rain_sum_max_25_years_mm': 'rain'})

In [17]:
df_20y_new[(df_20y_new.projection_rcp == 'rcp26') & (df_20y_new.ISO3 == 'AFG')].sort_values(by=['ISO3', 'projection_rcp', 'year']).head(3)

Unnamed: 0,year,ISO3,projection_rcp,rain
61528,2030,AFG,rcp26,13.5017
22146,2046,AFG,rcp26,28.9298
28831,2072,AFG,rcp26,16.2455


### Concatenate the 2 DF

In [18]:
df_projection = df_20y_new.append(df_10y_new)
print(len(df_20y_new), "+", len(df_10y_new), "=", len(df_projection), "predictions to make")

3117 + 6234 = 9351 predictions to make


In [19]:
df_projection.sort_values(by=['ISO3', 'projection_rcp', 'year']).head(3)

Unnamed: 0,year,ISO3,projection_rcp,rain
61528,2030,AFG,rcp26,13.5017
22146,2046,AFG,rcp26,28.9298
28831,2072,AFG,rcp26,16.2455


## Save

In [20]:
outfilename = f'{DATASET_FOLDER}projection_precipitation_for_prediction-2020-12-08.csv.gz'
df_projection.to_csv(outfilename, index=False, compression='gzip')

In [21]:
df_tmp = pd.read_csv(outfilename)
df_tmp.head(3)
del df_tmp

Unnamed: 0,year,ISO3,projection_rcp,rain
0,2029,JOR,rcp26,-9.7473
1,2034,MLI,rcp60,2.98533
2,2032,MLT,rcp60,0.63173
