# Build features for projection

In [1]:
# For multiple output per cell
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

In [2]:
# DATASET_FOLDER = '/media/data-nvme/dev/datasets/WorldBank/'
DATASET_FOLDER = "../../datasets/"

SPARK_MASTER = "spark://192.168.0.9:7077"
APP_NAME = "Build features for projection"
input_folder = DATASET_FOLDER
output = DATASET_FOLDER + "../wb_gkp_precipitation"

In [3]:
import pandas as pd

In [66]:
df = pd.read_csv(
    f"{DATASET_FOLDER}projection_preciptation_yearly_merged-2020-12-02.csv.gz"
)
df.head(1)

Unnamed: 0,year,month,model,statistics,ISO3,projection_rcp,daily_rain_max_25_years_mm,daily_rain_max_10_years_mm,5-day_rain_sum_max_10_years_mm,5-day_rain_sum_max_25_years_mm,largest_month_rain_10_years_mm,largest_month_rain_25_years_mm
0,2020-2039,,Ensemble (10th Percentile),Annual Anomaly,ARG,rcp60,-19.541,-16.471,-31.162,-36.832,-84.802,-103.96


### Keep ony one climate model

In [67]:
# df.model.unique()

In [68]:
df = df[df["model"] == "ipsl_cm5a_mr"]

### Keep only the minimum features

In [69]:
# df.columns

In [70]:
df = df[
    [
        "year",
        "ISO3",
        "projection_rcp",
        "5-day_rain_sum_max_10_years_mm",
        "5-day_rain_sum_max_25_years_mm",
    ]
]

In [71]:
# df.head(3)

### Compute rare precipitation events 

We will compute an occurence of rare precipitation events.

- Every random 10 years we will consider that there will be a flood due to rainfall corresponding to the projected Maximum 5-day Rainfall (10-yr RL)
- Every random 20 years we will consider that there will be a flood due to rainfall corresponding to the projected Maximum 5-day Rainfall (25-yr RL) 20 and not 25 because we have period of 20 years so it is easier. TODO : compute 25 yr RL

In [72]:
df_10y = df[["year", "ISO3", "projection_rcp", "5-day_rain_sum_max_10_years_mm"]]
df_25y = df[["year", "ISO3", "projection_rcp", "5-day_rain_sum_max_25_years_mm"]]

In [73]:
# df_10y.sort_values(by=['ISO3', 'projection_rcp', 'year']).head(5)

In [74]:
def random_year(year):
    return int(str(int(year[:3]) + random.randint(0, 1)) + str(random.randint(0, 9)))


df_20y_new = df_25y.copy()
df_20y_new["year"] = df_25y.year.apply(random_year)
df_20y_new = df_20y_new.rename(
    columns={"5-day_rain_sum_max_25_years_mm": "rain_anomaly"}
)

In [77]:
df_25y[(df_25y.projection_rcp == "rcp26") & (df_25y.ISO3 == "AFG")].sort_values(
    by=["ISO3", "projection_rcp", "year"]
).head(15)

Unnamed: 0,year,ISO3,projection_rcp,5-day_rain_sum_max_25_years_mm
61528,2020-2039,AFG,rcp26,13.5017
22146,2040-2059,AFG,rcp26,28.9298
28831,2060-2079,AFG,rcp26,16.2455
7898,2080-2099,AFG,rcp26,10.1975


In [75]:
df_20y_new[
    (df_20y_new.projection_rcp == "rcp26") & (df_20y_new.ISO3 == "AFG")
].sort_values(by=["ISO3", "projection_rcp", "year"]).head(3)

Unnamed: 0,year,ISO3,projection_rcp,rain_anomaly
61528,2038,AFG,rcp26,13.5017
22146,2053,AFG,rcp26,28.9298
28831,2075,AFG,rcp26,16.2455


In [76]:
import random

df_10y_new = pd.DataFrame()
for i, row in df_10y.iterrows():
    # Create an event on the first decade
    # Create a random year of the event
    year = row["year"][:3] + str(random.randint(0, 9))
    # Build the new entry
    new_row = {
        "year": year,
        "ISO3": row["ISO3"],
        "projection_rcp": row.projection_rcp,
        "rain_anomaly": row["5-day_rain_sum_max_10_years_mm"],
    }
    # Add the entry
    df_10y_new = df_10y_new.append(new_row, ignore_index=True)
    # Create a second event on the next decade
    year = str(int(row["year"][:3]) + 1) + str(random.randint(0, 9))
    new_row = {
        "year": year,
        "ISO3": row["ISO3"],
        "projection_rcp": row.projection_rcp,
        "rain_anomaly": row["5-day_rain_sum_max_10_years_mm"],
    }
    df_10y_new = df_10y_new.append(new_row, ignore_index=True)
# df_10y_new.head(10)

In [78]:
# df_20y_new[(df_20y_new.projection_rcp == 'rcp26') & (df_20y_new.ISO3 == 'AFG')].sort_values(by=['ISO3', 'projection_rcp', 'year']).head(15)

### Concatenate the 2 DF

In [138]:
df_projection = df_20y_new.append(df_10y_new)
print(
    len(df_20y_new),
    "+",
    len(df_10y_new),
    "=",
    len(df_projection),
    "predictions to make",
)

3117 + 6234 = 9351 predictions to make


In [139]:
df_projection.sort_values(by=["ISO3", "projection_rcp", "year"]).head(3)

Unnamed: 0,year,ISO3,projection_rcp,rain_anomaly
61528,2038,AFG,rcp26,13.5017
22146,2053,AFG,rcp26,28.9298
28831,2075,AFG,rcp26,16.2455


In [140]:
df_projection[df_projection.rain_anomaly.isna()]

Unnamed: 0,year,ISO3,projection_rcp,rain_anomaly
10978,2089,EST,rcp85,
29600,2077,ERI,rcp60,
50627,2044,EST,rcp45,
72,2024,COD,rcp85,
73,2039,COD,rcp85,
...,...,...,...,...
5777,2099,ROU,rcp85,
5894,2064,SSD,rcp45,
5895,2077,SSD,rcp45,
6214,2064,COD,rcp45,


In [145]:
df_projection.dropna(inplace=True)

In [146]:
df_projection[df_projection.rain_anomaly.isna()].ISO3.unique()

array([], dtype=object)

# Compute the normal rainfall value

In [147]:
df_hist = pd.read_csv(f"{DATASET_FOLDER}historical_precipitation_clean_2020-12-02.csv")
df_hist.head(1)

Unnamed: 0,Rainfall - (MM),Year,Statistics,Country,ISO3,Month
0,73.9679,1901,Jan Average,Liechtenstein,LIE,1


In [148]:
df_tmp = df_hist.query("(Year >= 1986) and (Year <= 2005)")

In [149]:
df_tmp = df_tmp[["ISO3", "Rainfall - (MM)"]]
# Group by country, compute mean
# len(df_tmp)
df_hist_normal = df_tmp.groupby(["ISO3"]).mean()
# df_hist_normal.columns = ['ISO3', 'avg_rain']
df_hist_normal = df_hist_normal.rename(
    columns={"Rainfall - (MM)": "avg_hist_rain"}
).reset_index()

In [150]:
df_hist_normal.head(2)

Unnamed: 0,ISO3,avg_hist_rain
0,AFG,26.572291
1,AGO,79.961934


In [151]:
df_hist_normal[df_hist_normal.isna().any(axis=1)]

Unnamed: 0,ISO3,avg_hist_rain


## Merge with projection

In [152]:
df_projection_merged = df_projection.join(
    df_hist_normal.set_index("ISO3"), on="ISO3", how="inner"
)
df_projection_merged["est_max_rain"] = (
    df_projection_merged["rain_anomaly"] + df_projection_merged["avg_hist_rain"]
)
df_projection_merged

Unnamed: 0,year,ISO3,projection_rcp,rain_anomaly,avg_hist_rain,est_max_rain
69,2031,JOR,rcp26,-9.74730,9.007923,-0.739377
5235,2081,JOR,rcp26,-10.25700,9.007923,-1.249077
12682,2059,JOR,rcp60,-3.76900,9.007923,5.238923
17343,2072,JOR,rcp26,0.57577,9.007923,9.583693
22328,2096,JOR,rcp45,-3.55980,9.007923,5.448123
...,...,...,...,...,...,...
5947,2053,CZE,rcp60,10.40800,55.081412,65.489412
6056,2064,CZE,rcp85,2.48093,55.081412,57.562342
6057,2073,CZE,rcp85,2.48093,55.081412,57.562342
6216,2063,CZE,rcp60,10.00710,55.081412,65.088512


In [153]:
df_projection_merged[df_projection_merged.isna().any(axis=1)].ISO3.unique()

array([], dtype=object)

In [154]:
df_projection_merged[df_projection_merged.rain_anomaly.isna()]

Unnamed: 0,year,ISO3,projection_rcp,rain_anomaly,avg_hist_rain,est_max_rain


In [157]:
df_projection_merged.sort_values(
    by=["est_max_rain", "projection_rcp", "ISO3", "year"], ascending=False
).head(5)

Unnamed: 0,year,ISO3,projection_rcp,rain_anomaly,avg_hist_rain,est_max_rain
55069,2091,BDI,rcp85,571.248,100.686849,671.934849
52565,2063,MDV,rcp85,467.331,197.083333,664.414333
25677,2070,MDV,rcp60,406.364,197.083333,603.447333
3315,2096,MDV,rcp85,348.766,197.083333,545.849333
70656,2093,BRN,rcp45,282.993,258.913422,541.906422


In [158]:
df_projection_merged.sort_values(
    by=["est_max_rain", "projection_rcp", "ISO3", "year"], ascending=True
).head(5)

Unnamed: 0,year,ISO3,projection_rcp,rain_anomaly,avg_hist_rain,est_max_rain
36380,2020,DJI,rcp85,-32.713,21.455127,-11.257873
65985,2097,EGY,rcp85,-7.3951,2.681223,-4.713877
48485,2098,JOR,rcp60,-13.23,9.007923,-4.222077
26729,2033,EGY,rcp60,-6.2883,2.681223,-3.607077
6804,2035,EGY,rcp26,-5.7466,2.681223,-3.065377


## Save

In [159]:
outfilename = (
    f"{DATASET_FOLDER}projection_precipitation_for_prediction-2020-12-10.csv.gz"
)
df_projection_merged.to_csv(outfilename, index=False, compression="gzip")

In [160]:
df_tmp = pd.read_csv(outfilename)
df_tmp.head(3)

Unnamed: 0,year,ISO3,projection_rcp,rain_anomaly,avg_hist_rain,est_max_rain
0,2031,JOR,rcp26,-9.7473,9.007923,-0.739377
1,2081,JOR,rcp26,-10.257,9.007923,-1.249077
2,2059,JOR,rcp60,-3.769,9.007923,5.238923


In [161]:
df_tmp

Unnamed: 0,year,ISO3,projection_rcp,rain_anomaly,avg_hist_rain,est_max_rain
0,2031,JOR,rcp26,-9.74730,9.007923,-0.739377
1,2081,JOR,rcp26,-10.25700,9.007923,-1.249077
2,2059,JOR,rcp60,-3.76900,9.007923,5.238923
3,2072,JOR,rcp26,0.57577,9.007923,9.583693
4,2096,JOR,rcp45,-3.55980,9.007923,5.448123
...,...,...,...,...,...,...
9197,2053,CZE,rcp60,10.40800,55.081412,65.489412
9198,2064,CZE,rcp85,2.48093,55.081412,57.562342
9199,2073,CZE,rcp85,2.48093,55.081412,57.562342
9200,2063,CZE,rcp60,10.00710,55.081412,65.088512


In [162]:
df_tmp[df_tmp.isna().any(axis=1)]

Unnamed: 0,year,ISO3,projection_rcp,rain_anomaly,avg_hist_rain,est_max_rain
