# Export for Dashboard

In [30]:
# For multiple output per cell
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

In [31]:
# DATASET_FOLDER = '/media/data-nvme/dev/datasets/WorldBank/'
DATASET_FOLDER = "../../datasets/precipitation/"

SPARK_MASTER = "spark://192.168.0.9:7077"
APP_NAME = "Merge PRCP GKP"
input_folder = DATASET_FOLDER
output = DATASET_FOLDER + "../wb_gkp_precipitation"

In [32]:
import os
import pandas as pd

In [33]:
import pandas as pd
from pyspark import SparkContext

# from pyspark import SparkConf
from pyspark.sql import SparkSession

# from pyspark.sql.window import Window
from pyspark.sql.types import FloatType
import pyspark.sql.functions as F

import shutil

In [72]:
### Connect to Spark
print("Create Spark session")
spark = SparkSession.builder.master(SPARK_MASTER).appName(APP_NAME).getOrCreate()
sc = spark.sparkContext

Create Spark session


# History

### Load EMDAT

In [35]:
#!ls /media/data-nvme/dev/datasets/WorldBank/

In [36]:
# Load EMDAT
emdat = (
    spark.read.format("csv")
    .option("header", True)
    .option("multiLine", True)
    .load(
        "/media/data-nvme/dev/datasets/WorldBank/"
        + "emdat_public_2020_09_12_query_uid-tAnKEX-floods_only.csv"
    )
)

In [49]:
print(emdat.columns)

['Dis No', 'Year', 'Seq', 'Disaster Subtype', 'Disaster Subsubtype', 'Event Name', 'Entry Criteria', 'Country', 'ISO', 'Region', 'Continent', 'Location', 'Origin', 'Associated Dis', 'Associated Dis2', 'OFDA Response', 'Appeal', 'Declaration', 'Aid Contribution', 'Dis Mag Value', 'Dis Mag Scale', 'Latitude', 'Longitude', 'Local Time', 'River Basin', 'Total Deaths', 'No Injured', 'No Affected', 'No Homeless', 'Total Affected', "Reconstruction Costs ('000 US$)", "Insured Damages ('000 US$)", "Total Damages ('000 US$)", 'CPI', 'date', 'decade']


In [None]:
# emdat = emdat.withColumn('decade',  F.concat(F.col('Year').substr(0, 3) , F.lit('0-') , F.col('Year').substr(0, 3), F.lit('9')))
emdat = emdat.withColumn("decade", F.concat(F.col("Year").substr(0, 3), F.lit("0")))
# emdat.take(3)

In [40]:
###### Cast Type
print("Cast string to float")
emdat = emdat.withColumn(
    "Total Deaths", emdat["Total Deaths"].cast(FloatType())
).withColumn(
    "Total Damages ('000 US$)", emdat["Total Damages ('000 US$)"].cast(FloatType())
)
emdat.createOrReplaceTempView("emdat")

Cast string to float


In [41]:
emdat.count()

5226

In [57]:
# Agregate by decade and country
emdat_agregate = spark.sql(
    """
SELECT decade as Decade, ISO, region as UN_Geosheme_Subregion, 'Flood' as Disaster_Type, 'past' as RCP,
    sum(`Total Damages ('000 US$)`) as Financial_Impact, sum(`Total Deaths`) as Human_Impact,
    count(`Dis No`) as DO
FROM emdat
GROUP BY decade, region, ISO
ORDER BY decade, region, ISO
"""
)

dfp = emdat_agregate.toPandas()
dfp.dropna().head(3)

Unnamed: 0,Decade,ISO,UN_Geosheme_Subregion,Disaster_Type,RCP,Financial_Impact,Human_Impact,DO
5,1920,USA,Northern America,Flood,past,230.0,246.0,1
10,1930,CHN,Eastern Asia,Flood,past,1400000.0,4200000.0,2
12,1930,USA,Northern America,Flood,past,438000.0,337.0,2


### Load Rainfall

In [16]:
# Load Rainfall
DATASET_FOLDER = "../../datasets/WorldBank"
rain = (
    spark.read.format("csv")
    .option("header", True)
    .option("multiLine", True)
    .load(f"{DATASET_FOLDER}/daily_rain_by_country_feature.csv.gz")
)

In [59]:
# Create an agregate of rain by decade and country
rain.createOrReplaceTempView("noaa")
noaa = spark.sql(
    """
    SELECT decade, country_ISO3, count(avg_rain) as `nb_days_with_rain_>_50mm` FROM noaa
    WHERE avg_rain > 50
    GROUP BY decade, country_ISO3
    ORDER BY decade, country_ISO3
"""
)
noaa.createOrReplaceTempView("noaa_agregate")

## Join EMDAT and Rainfall

In [60]:
# Join EMDAT agregate and rain agregate
emdat_agregate.createOrReplaceTempView("emdat_agregate")
emdat_rain_agregate = spark.sql(
    """
    SELECT emdat_agregate.*, noaa_agregate.`nb_days_with_rain_>_50mm`
    FROM emdat_agregate LEFT JOIN noaa_agregate ON emdat_agregate.decade = noaa_agregate.decade
        AND emdat_agregate.ISO = noaa_agregate.country_ISO3
"""
)
emdat_rain_agregate.show(3)

+------+---+---------------------+-------------+----+----------------+------------+---+------------------------+
|Decade|ISO|UN_Geosheme_Subregion|Disaster_Type| RCP|Financial_Impact|Human_Impact| DO|nb_days_with_rain_>_50mm|
+------+---+---------------------+-------------+----+----------------+------------+---+------------------------+
|  1960|VEN|        South America|        Flood|past|          4126.0|        null|  1|                    1438|
|  1980|PER|        South America|        Flood|past|          7200.0|       783.0|  9|                    1110|
|  1990|ZWE|       Eastern Africa|        Flood|past|            null|        36.0|  1|                     876|
+------+---+---------------------+-------------+----+----------------+------------+---+------------------------+
only showing top 3 rows



In [66]:
# Agregate by region
emdat_rain_agregate.createOrReplaceTempView("emdat_rain_gregate")
emdat_rain_agregate_by_country = spark.sql(
    """
SELECT Decade, UN_Geosheme_Subregion, Disaster_Type, RCP, sum(Financial_Impact) as Financial_Impact, sum(Human_Impact) as Human_Impact, sum(DO) as DO, sum(`nb_days_with_rain_>_50mm`) as `nb_days_with_rain_>_50mm`
FROM emdat_rain_gregate
GROUP BY Decade, UN_Geosheme_Subregion, Disaster_Type, RCP
ORDER BY Decade, UN_Geosheme_Subregion
"""
)

+------+---------------------+-------------+----+----------------+------------+---+------------------------+
|Decade|UN_Geosheme_Subregion|Disaster_Type| RCP|Financial_Impact|Human_Impact| DO|nb_days_with_rain_>_50mm|
+------+---------------------+-------------+----+----------------+------------+---+------------------------+
|  1900|            Caribbean|        Flood|past|             0.0|       300.0|  1|                    null|
|  1900|     Northern America|        Flood|past|             0.0|        72.0|  1|                     626|
|  1900|       Western Europe|        Flood|past|             0.0|         6.0|  2|                     952|
+------+---------------------+-------------+----+----------------+------------+---+------------------------+
only showing top 3 rows



In [67]:
# Filling NA/null with 0
emdat_rain_agregate_by_country = emdat_rain_agregate_by_country.fillna(
    {"Financial_Impact": 0, "Human_Impact": 0, "nb_days_with_rain_>_50mm": 0}
)
emdat_rain_agregate_by_country.show(3)

+------+---------------------+-------------+----+----------------+------------+---+------------------------+
|Decade|UN_Geosheme_Subregion|Disaster_Type| RCP|Financial_Impact|Human_Impact| DO|nb_days_with_rain_>_50mm|
+------+---------------------+-------------+----+----------------+------------+---+------------------------+
|  1900|            Caribbean|        Flood|past|             0.0|       300.0|  1|                       0|
|  1900|     Northern America|        Flood|past|             0.0|        72.0|  1|                     626|
|  1900|       Western Europe|        Flood|past|             0.0|         6.0|  2|                     952|
+------+---------------------+-------------+----+----------------+------------+---+------------------------+
only showing top 3 rows



In [194]:
# dfp = emdat.toPandas()

In [195]:
# dfp[['Dis No', "Total Deaths"]]

In [196]:
# dfp[dfp['Dis No'] == '1906-0023-BEL'] #") #1906-0023-BEL 	1950-0007-CHN

In [69]:
# Check that wr have good data
emdat_rain_agregate_by_country.createOrReplaceTempView("emdat_rain_agregate_by_country")

spark.sql(
    """
SELECT *
FROM emdat_rain_agregate_by_country
WHERE Financial_Impact > 0
ORDER BY Financial_Impact
"""
).show(3)

+------+---------------------+-------------+----+----------------+------------+---+------------------------+
|Decade|UN_Geosheme_Subregion|Disaster_Type| RCP|Financial_Impact|Human_Impact| DO|nb_days_with_rain_>_50mm|
+------+---------------------+-------------+----+----------------+------------+---+------------------------+
|  1970|      Southern Africa|        Flood|past|            50.0|        56.0|  4|                    1172|
|  1990|        Middle Africa|        Flood|past|            59.0|       146.0| 19|                    6410|
|  1920|     Northern America|        Flood|past|           230.0|       246.0|  1|                     418|
+------+---------------------+-------------+----+----------------+------------+---+------------------------+
only showing top 3 rows



In [70]:
# Saving in one file with Pandas
dfp = emdat_rain_agregate_by_country.toPandas()
dfp
dfp.to_csv(
    f"{DATASET_FOLDER}../flood_history_agregates.csv.gz",
    index=False,
    compression="gzip",
)

Unnamed: 0,Decade,UN_Geosheme_Subregion,Disaster_Type,RCP,Financial_Impact,Human_Impact,DO,nb_days_with_rain_>_50mm
0,1900,Caribbean,Flood,past,0.0,300.0,1,0
1,1900,Northern America,Flood,past,0.0,72.0,1,626
2,1900,Western Europe,Flood,past,0.0,6.0,2,952
3,1910,Eastern Asia,Flood,past,0.0,1379.0,1,0
4,1920,Northern Africa,Flood,past,0.0,3000.0,1,686
...,...,...,...,...,...,...,...,...
161,2020,Southern Asia,Flood,past,1300000.0,1870.0,20,978
162,2020,Southern Europe,Flood,past,28000.0,0.0,2,170
163,2020,Western Africa,Flood,past,0.0,63.0,3,400
164,2020,Western Asia,Flood,past,0.0,201.0,5,138


# Projection

In [6]:
# rain = spark.read.format('csv').option('header',True).option('multiLine', True) \
#     .load(f'{DATASET_FOLDER}../projection_preciptation_monthly_merged-2020-12-02')

In [73]:
sc.stop()