# Data Processing for UI

 1. grid_id (Grid ID)
 2. central_lat (Central Latitude)
 3. central_lon (Central Longitude)
 4. total_event_count (Total Event Count)
 5. total_flood_event (Total Flood Events)
 6. earliest_event_year (Earliest Event Year)
 7. latest_event_year (Latest Event Year)
 8. total_damages (Total Damages)
 9. primary_admin1 (Primary Administrative Region)
 10. event_summary (Event Summary)

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

%cd drive/My Drive/mml_flood/

Mounted at /content/drive/
/content/drive/My Drive/mml_flood


### Read data

In [31]:
# GDIS: geocoded disaster locations.
gdis_path = 'data/disaster/pend-gdis-1960-2018-disasterlocations.csv'
gdis = pd.read_csv(gdis_path)

# EM-DAT: international disasters. Lines 0-5 of the export are skipped so that the
# next line is read as the header (presumably a metadata preamble - confirm).
# NOTE(review): pandas echoes this read as a warning in the recorded output; it looks
# like a mixed-dtype warning - confirm and pin dtypes if so.
emdat_path = 'data/disaster/emdat_public_2022_09_21_query_uid-47Yzpr.csv'
emdat = pd.read_csv(emdat_path, skiprows=list(range(6)))

  emdat = pd.read_csv('data/disaster/emdat_public_2022_09_21_query_uid-47Yzpr.csv', skiprows=[0,1,2,3,4,5])


  ## Filter emdat columns

In [32]:
# Build the join key shared with GDIS by dropping the trailing 4-character "-XYZ"
# area code from 'Dis No'.
# NOTE(review): after the strip, several EM-DAT rows may share one disasterno
# (one per area code) - confirm before relying on a one-to-one join.
dis_no = emdat['Dis No']
emdat['disasterno'] = dis_no.str[:-4]

In [33]:
# EM-DAT columns carried into the merge and the per-grid aggregation.
emdat_cols = ['disasterno', 'Year', 'Event Name',
              'Start Year', 'Start Month', 'Start Day',
              'End Year', 'End Month', 'End Day',
              "Total Damages, Adjusted ('000 US$)",
              'Total Deaths', 'Total Affected',
              "Reconstruction Costs, Adjusted ('000 US$)",
              'Disaster Subtype', 'OFDA Response', 'River Basin']

# Subset EM-DAT to the columns that are actually present, and report any that are
# not so a schema change in the export is visible instead of silently ignored.
available_cols = [col for col in emdat_cols if col in emdat.columns]
missing_cols = [col for col in emdat_cols if col not in emdat.columns]
if missing_cols:
    print("EM-DAT columns not found (skipped):", missing_cols)

# .copy() makes the subset an independent frame rather than a slice of the raw load.
emdat = emdat[available_cols].copy()

# Check how much data the sparsely populated columns hold. Guarded, because the
# subset above may have dropped them.
for col in ('OFDA Response', 'River Basin'):
    if col in emdat.columns:
        print(emdat[col].count())

1716
1312


In [34]:
# Confirm which EM-DAT columns remain after the subset.
print(emdat.columns)

Index(['disasterno', 'Year', 'Event Name', 'Start Year', 'Start Month',
       'Start Day', 'End Year', 'End Month', 'End Day',
       'Total Damages, Adjusted ('000 US$)', 'Total Deaths', 'Total Affected',
       'Reconstruction Costs, Adjusted ('000 US$)', 'Disaster Subtype',
       'OFDA Response', 'River Basin'],
      dtype='object')


In [35]:
# List the GDIS columns available for the merge.
print(gdis.columns)

Index(['id', 'country', 'iso3', 'gwno', 'year', 'geo_id', 'geolocation',
       'level', 'adm1', 'adm2', 'adm3', 'location', 'historical',
       'hist_country', 'disastertype', 'disasterno', 'latitude', 'longitude'],
      dtype='object')


# Merge cleaned EM-DAT with GDIS on disasterno

In [38]:
# Right join: every GDIS row is kept and EM-DAT attributes are attached where the
# disasterno matches (unmatched GDIS rows get NaN in the EM-DAT columns).
# NOTE(review): if disasterno repeats in emdat, each matching GDIS row is duplicated
# once per EM-DAT row - confirm this is intended.
merged_df = pd.merge(left=emdat, right=gdis, how='right', on='disasterno')
merged_df.shape

(82885, 33)

In [46]:
# Drop duplicate records based on GDIS 'id'
merged_df = merged_df.drop_duplicates(subset=['id'])
print("Merged Data Shape:", merged_df.shape)

Merged Data Shape: (9924, 39)


In [54]:
# Select rows whose disaster type is "flood" (case-insensitive). The .copy() makes
# flood_df independent of merged_df, so the columns added later stay local to it.
is_flood = merged_df['disastertype'].str.lower() == 'flood'
flood_df = merged_df.loc[is_flood].copy()
print("Flood Events Shape:", flood_df.shape)

Flood Events Shape: (4274, 39)


# Create Grid IDs and Compute Duration


In [55]:
# Create grid_id from GDIS coordinates
flood_df['lat_grid'] = flood_df['latitude'].round().astype(int)
flood_df['lon_grid'] = flood_df['longitude'].round().astype(int)
flood_df['grid_id'] = flood_df['lat_grid'].astype(str) + "_" + flood_df['lon_grid'].astype(str)

# Central coordinates for each grid cell
flood_df['central_lat'] = flood_df['lat_grid']
flood_df['central_lon'] = flood_df['lon_grid']


In [56]:
def compute_duration(row):
    """Return the event duration in whole days, or NaN when it cannot be computed.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'Start Year', 'Start Month', 'Start Day', 'End Year',
        'End Month' and 'End Day'.

    Returns
    -------
    int or float
        ``(end_date - start_date).days``; ``np.nan`` if any date component is
        missing, non-numeric, or does not form a valid date.

    Raises
    ------
    KeyError
        If one of the six date columns is absent. That is a schema problem, not a
        data-quality problem, so it is no longer silently turned into NaN.
    """
    try:
        start_date = pd.Timestamp(
            year=int(row['Start Year']),
            month=int(row['Start Month']),
            day=int(row['Start Day']),
        )
        end_date = pd.Timestamp(
            year=int(row['End Year']),
            month=int(row['End Month']),
            day=int(row['End Day']),
        )
    except (ValueError, TypeError, OverflowError):
        # int(NaN) / int(None) / int(inf), or an out-of-range month/day/year.
        return np.nan
    return (end_date - start_date).days

# Row-wise apply: one duration (in days, or NaN when it cannot be computed) per flood record.
flood_df['duration'] = flood_df.apply(compute_duration, axis=1)


In [57]:
# Inspect the computed durations; NaN marks rows where compute_duration gave up.
flood_df['duration']

Unnamed: 0,duration
0,12.0
2,7.0
3,46.0
18,16.0
27,0.0
...,...
82550,4.0
82732,
82734,
82735,


# Aggregate Data by Grid Cell

---
Aggregate Data by Grid Cell (Flood Events Only):

Group the data by grid_id and compute the following parameters:

*total_event_count*: Count of flood events in the grid.

*earliest_event_year*: Minimum of the 'year' column.

*latest_event_year*: Maximum of the 'year' column.

*total_damages*: Sum of "Total Damages, Adjusted ('000 US$)".

*primary_admin1*: Most common administrative region (adm1).

*unique_locations*: Comma-separated list of unique location names.

*avg_duration*: Average duration (in days) of flood events.

**Additional parameters (if available):**


*avg_damage_per_flood*: total_damages divided by total_event_count.

*total_deaths*: Sum of "Total Deaths".

*total_affected*: Sum of "Total Affected".

*reconstruction_costs*: Sum of "Reconstruction Costs, Adjusted ('000 US$)".

*predominant_subtype*: Mode of "Disaster Subtype".

*ofda_response_count*: Count of non-null "OFDA Response".

*predominant_river_basin*: Mode of "River Basin".

*flood_recurrence_interval*: (latest_event_year - earliest_event_year) divided by total_event_count.


In [64]:
def safe_divide(numerator, denominator):
    """Divide numerator by denominator, returning NaN instead of dividing by nothing.

    A falsy denominator (0, 0.0, None) or one equal to zero yields ``np.nan``;
    otherwise the plain quotient is returned.
    """
    if not denominator or denominator == 0:
        return np.nan
    return numerator / denominator

In [63]:
# Aggregate all params
def _most_common(values):
    """Return the most frequent value of a Series, or NaN when mode() is empty."""
    modes = values.mode()  # computed once; ties come back sorted, so take the first
    return modes.iloc[0] if not modes.empty else np.nan


def _join_unique(values):
    """Return the distinct non-null values as a sorted, comma-separated string."""
    # dropna + astype(str): a missing location must not break sorted()/join().
    return ', '.join(sorted(values.dropna().astype(str).unique()))


# Aggregations over columns that are always expected to be present.
# output column -> (source column, aggregation)
agg_spec = {
    'total_event_count': ('disasterno', 'count'),
    'earliest_event_year': ('year', 'min'),
    'latest_event_year': ('year', 'max'),
    # 'sum' skips NaN, so a cell with no reported damages sums to 0 rather than NaN.
    'total_damages': ("Total Damages, Adjusted ('000 US$)", 'sum'),
    'central_lat': ('central_lat', 'first'),
    'central_lon': ('central_lon', 'first'),
    'primary_admin1': ('adm1', _most_common),
    'unique_locations': ('location', _join_unique),
    'avg_duration': ('duration', 'mean'),
}

# Aggregations over EM-DAT columns that may be absent from the export.
optional_aggs = {
    'total_deaths': ('Total Deaths', 'sum'),
    'total_affected': ('Total Affected', 'sum'),
    'reconstruction_costs': ("Reconstruction Costs, Adjusted ('000 US$)", 'sum'),
    'predominant_subtype': ('Disaster Subtype', _most_common),
    'ofda_response_count': ('OFDA Response', 'count'),  # count of non-null values
    'predominant_river_basin': ('River Basin', _most_common),
}

# Named aggregation only accepts (column, aggfunc) pairs, so a missing source column
# cannot be expressed inside .agg(); it is left out here and filled with NaN below.
missing_outputs = []
for out_col, (src_col, func) in optional_aggs.items():
    if src_col in flood_df.columns:
        agg_spec[out_col] = (src_col, func)
    else:
        missing_outputs.append(out_col)

# Aggregate all params: one row per grid cell.
aggregated = flood_df.groupby('grid_id').agg(**agg_spec).reset_index()

for out_col in missing_outputs:
    aggregated[out_col] = np.nan

In [66]:
# The aggregation is built from flood records only, so the flood count mirrors the
# event count.
aggregated['total_flood_event'] = aggregated['total_event_count']


def _damage_per_event(cell):
    """Total damages of a grid cell divided by its number of events."""
    return safe_divide(cell['total_damages'], cell['total_event_count'])


def _years_per_event(cell):
    """Span between first and last event year divided by the number of events."""
    span = cell['latest_event_year'] - cell['earliest_event_year']
    return safe_divide(span, cell['total_event_count'])


# Derived parameters, computed row by row.
aggregated['avg_damage_per_flood'] = aggregated.apply(_damage_per_event, axis=1)
# A cell with a single event has a span of 0 and therefore an interval of 0.
aggregated['flood_recurrence_interval'] = aggregated.apply(_years_per_event, axis=1)


# Text summary for events in each grid cell

In [74]:
def _summarize_grid_cell(row):
    """Build the one-line text summary shown for a grid cell.

    Damages are reported as 'N/A' when they are NaN or 0. The grouped sum is 0 for
    cells without any reported damages, and this notebook treats a 0 cost as missing
    data, so printing "0.0" would wrongly read as "no damage".
    """
    damages = row['total_damages']
    damages_text = 'N/A' if pd.isna(damages) or damages == 0 else damages
    return (
        f"{row['total_event_count']} events ({row['total_flood_event']} floods) "
        f"from {int(row['earliest_event_year'])} to {int(row['latest_event_year'])}, "
        f"total damages: {damages_text} ('000 US$)"
    )


aggregated['event_summary'] = aggregated.apply(_summarize_grid_cell, axis=1)


## Cost data (US\$) has many missing values; replace 0s with NaN

In [75]:
# Replace zeros with NaN for cost-related columns in the aggregated DataFrame.
# This assumes that a 0 value for damages or reconstruction costs should be treated as missing data.
# Treat a 0 in the cost columns as "not reported": swap it for NaN in one pass.
# NOTE(review): total_deaths and total_affected are summed the same way but are left
# untouched here - confirm that a 0 there really means zero.
cost_columns = ['total_damages', 'reconstruction_costs']
aggregated[cost_columns] = aggregated[cost_columns].replace(0, np.nan)


def _mean_damage(cell):
    """Damages per event for one grid cell (NaN when damages are missing)."""
    return safe_divide(cell['total_damages'], cell['total_event_count'])


# avg_damage_per_flood depends on total_damages, so it is refreshed after the swap.
aggregated['avg_damage_per_flood'] = aggregated.apply(_mean_damage, axis=1)

# Select and Order Final Columns

## We now keep only the desired columns:

 1. grid_id  
 2. central_lat  
 3. central_lon  
 4. total_event_count  
 5. earliest_event_year  
 6. latest_event_year  
 7. total_damages  
 8. primary_admin1  
 9. unique_locations  
 10. avg_duration  
 11. avg_damage_per_flood  
 12. total_deaths  
 13. total_affected  
 14. reconstruction_costs  
 15. predominant_subtype  
 16. ofda_response_count  
 17. predominant_river_basin  
 18. flood_recurrence_interval
 19. event_summary

In [76]:
# Columns exported for the UI, in output order.
# NOTE(review): 'total_flood_event' is computed and listed in the notebook intro but
# is not exported here - confirm whether the UI needs it.
final_columns = [
    # grid cell identity and position
    'grid_id', 'central_lat', 'central_lon',
    # event counts and time span
    'total_event_count', 'earliest_event_year', 'latest_event_year',
    # impact and location details
    'total_damages', 'primary_admin1', 'unique_locations', 'avg_duration',
    'avg_damage_per_flood', 'total_deaths', 'total_affected',
    'reconstruction_costs', 'predominant_subtype', 'ofda_response_count',
    'predominant_river_basin', 'flood_recurrence_interval', 'event_summary',
]

# Independent copy so later edits to final_df never write back into `aggregated`.
final_df = aggregated.loc[:, final_columns].copy()


In [77]:
# Preview the first rows of the export table as plain text.
print(final_df.head())

   grid_id  central_lat  central_lon  total_event_count  earliest_event_year  \
0  -10_-36          -10          -36                  1                 2004   
1  -10_-37          -10          -37                  5                 1967   
2  -10_-66          -10          -66                  1                 1987   
3  -10_124          -10          124                  1                 2010   
4   -10_14          -10           14                  2                 2004   

   latest_event_year  total_damages       primary_admin1  \
0               2004          531.0              Alagoas   
1               2009       777910.0              Alagoas   
2               1987            NaN                Pando   
3               2010            NaN  Nusa Tenggara Timur   
4               2005            NaN         Cuanza Norte   

             unique_locations  avg_duration  avg_damage_per_flood  \
0                      Maceio           1.0                 531.0   
1                   

# Save data to csv

In [83]:
# Single source of truth for the export location, used for both the write and the message.
output_path = '/content/drive/MyDrive/mml_flood/UI/flood_grid_summary.csv'

# index=False: the row index carries no information, grid_id is already a column.
final_df.to_csv(output_path, index=False)
print(f"Aggregated flood grid summary data saved to '{output_path}'.")

Aggregated flood grid summary data saved to '/content/drive/MyDrive/mml_flood/UI/flood_grid_summary.csv'.
