# Data Processing for UI

 1. grid_id (Grid ID)
 2. central_lat (Central Latitude)
 3. central_lon (Central Longitude)
 4. total_event_count (Total Event Count)
 5. total_flood_event (Total Flood Events)
 6. earliest_event_year (Earliest Event Year)
 7. latest_event_year (Latest Event Year)
 8. total_damages (Total Damages)
 9. primary_admin1 (Primary Administrative Region)
 10. event_summary (Event Summary)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Mount Google Drive inside the Colab runtime so the project folder is reachable.
from google.colab import drive
drive.mount('/content/drive/')

# Make the project folder the working directory; the path is relative to the current directory.
%cd drive/My Drive/mml_flood/

Mounted at /content/drive/
/content/drive/My Drive/mml_flood


### Read data

In [4]:
# Input files, given relative to the current working directory.
gdis_path = 'data/disaster/pend-gdis-1960-2018-disasterlocations.csv'
emdat_path = 'data/disaster/emdat_public_2022_09_21_query_uid-47Yzpr.csv'

# GDIS: geocoded disaster locations.
gdis = pd.read_csv(gdis_path)
# EM-DAT: international disaster records; the first six lines of the export are skipped.
emdat = pd.read_csv(emdat_path, skiprows=list(range(6)))

  emdat = pd.read_csv('data/disaster/emdat_public_2022_09_21_query_uid-47Yzpr.csv', skiprows=[0,1,2,3,4,5])


  ### Filter emdat columns

In [5]:
# Join key: 'Dis No' without its trailing 4-character "-XYZ" area code.
emdat['disasterno'] = emdat['Dis No'].str.slice(stop=-4)

In [6]:
# EM-DAT columns of interest for the grid summaries.
emdat_cols = [
    'disasterno',
    'Year',
    'Event Name',
    'Start Year',
    'Start Month',
    'Start Day',
    'End Year',
    'End Month',
    'End Day',
    "Total Damages, Adjusted ('000 US$)",
    'Total Deaths',
    'Total Affected',
    "Reconstruction Costs, Adjusted ('000 US$)",
    'Disaster Subtype',
    'OFDA Response',
    'River Basin',
]

# Keep whichever of those columns the export actually contains, in the order listed above.
available_cols = set(emdat.columns)
emdat = emdat[[name for name in emdat_cols if name in available_cols]]

# Non-null counts of two sparsely filled columns, to check how much data they hold.
for sparse_col in ('OFDA Response', 'River Basin'):
    print(emdat[sparse_col].count())

1716
1312


In [7]:
# Show which columns the EM-DAT frame currently has.
print(emdat.columns)

Index(['disasterno', 'Year', 'Event Name', 'Start Year', 'Start Month',
       'Start Day', 'End Year', 'End Month', 'End Day',
       'Total Damages, Adjusted ('000 US$)', 'Total Deaths', 'Total Affected',
       'Reconstruction Costs, Adjusted ('000 US$)', 'Disaster Subtype',
       'OFDA Response', 'River Basin'],
      dtype='object')


In [8]:
# Show which columns the GDIS frame has.
print(gdis.columns)

Index(['id', 'country', 'iso3', 'gwno', 'year', 'geo_id', 'geolocation',
       'level', 'adm1', 'adm2', 'adm3', 'location', 'historical',
       'hist_country', 'disastertype', 'disasterno', 'latitude', 'longitude'],
      dtype='object')


### Merge the cleaned EM-DAT data with GDIS on disasterno

In [13]:
# Right join on 'disasterno': every GDIS row is kept, with EM-DAT attributes attached where the key matches.
merged_df = emdat.merge(gdis, how='right', on='disasterno')
merged_df.shape

(82885, 33)

In [14]:
# Keep only the first row seen for each GDIS 'id'.
# NOTE(review): confirm that 'id' is unique per location row; if it identifies a disaster
# rather than a location, this step also discards distinct locations of the same disaster.
is_repeat = merged_df.duplicated(subset=['id'])
merged_df = merged_df[~is_repeat]
print("Merged Data Shape:", merged_df.shape)

Merged Data Shape: (9924, 33)


In [35]:
# Snap each GDIS coordinate to the nearest whole degree and label the cell "<lat>_<lon>".
for cell_col, coord_col in (('lat', 'latitude'), ('lon', 'longitude')):
    merged_df[cell_col] = merged_df[coord_col].round().astype(int)

lat_text = merged_df['lat'].astype(str)
lon_text = merged_df['lon'].astype(str)
merged_df['grid_id'] = lat_text.str.cat(lon_text, sep="_")


### Create a lookup table covering all disasters at all locations: lat and lon, location names

In [36]:
# Build the all-disaster lookup table: one row per grid cell, across every disaster type.
df_all = merged_df.copy()


def _mode_or_nan(values):
    """Return the most frequent non-null value of a Series, or NaN when there is none."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


def _join_unique(values):
    """Return the sorted unique non-null values of a Series as one comma-separated string."""
    return ', '.join(sorted(values.dropna().unique()))


# Named aggregations as {output column: (source column, function)}, listed in output order.
# NOTE(review): 'sum' skips missing values, so a cell whose events all lack a figure gets 0.0, not NaN.
all_agg_spec = {
    'all_count': ('disasterno', 'count'),  # count of all disasters
    'all_damages': ("Total Damages, Adjusted ('000 US$)", 'sum'),
    'lat': ('lat', 'first'),
    'lon': ('lon', 'first'),
    'primary_admin1': ('adm1', _mode_or_nan),
    'unique_locations': ('location', _join_unique),
}
# 'Total Deaths' is optional. Named aggregation only accepts (column, function) tuples, so a
# missing column is left out of the spec here and filled in as NaN after the aggregation.
if 'Total Deaths' in df_all.columns:
    all_agg_spec['all_deaths'] = ('Total Deaths', 'sum')
all_agg_spec['predominant_disaster'] = ('disastertype', _mode_or_nan)
all_agg_spec['all_disastertypes'] = ('disastertype', _join_unique)
# flood_count is not computed yet.

agg_all = df_all.groupby('grid_id').agg(**all_agg_spec).reset_index()

if 'all_deaths' not in agg_all.columns:
    agg_all['all_deaths'] = np.nan

In [138]:
agg_all  # not rendered: only the final expression of a cell is displayed
# Build a human-readable summary string for every grid cell; NaN values are shown as 'N/A'.
# NOTE(review): the triple-quoted f-string keeps the line breaks and leading spaces of its
# continuation lines, so they end up inside the summary text (the "\n" visible in the output).
agg_all['event_summary'] = agg_all.apply(
    lambda row: f"""In total {row['all_count']} events across all disasters;
    predominant disaster: {row['predominant_disaster'] if not pd.isna(row['predominant_disaster']) else 'N/A'};
    types of disasters: {row['all_disastertypes'] if not pd.isna(row['all_disastertypes']) else 'N/A'};
    total damages: {row['all_damages'] if not pd.isna(row['all_damages']) else 'N/A'} ('000 US$);
    total deaths: {row['all_deaths'] if not pd.isna(row['all_deaths']) else 'N/A'}.""",
    axis=1
)
# Display the table including the new 'event_summary' column.
agg_all
# Disabled: column selection and CSV export of the lookup table.
# select_cols = ['grid_id','lat','lon','primary_admin1','unique_locations','predominant_disaster','all_disaster','total_damages','total_deaths','total_disaster_count']
# agg_all = agg_all[select_cols]
# save to CSV
# agg_all.to_csv('lookup.csv')

Unnamed: 0,grid_id,all_count,all_damages,lat,lon,primary_admin1,unique_locations,all_deaths,predominant_disaster,all_disastertypes,event_summary
0,-10_-36,1,531.0,-10,-36,Alagoas,Maceio,28.0,flood,flood,In total 1 events across all disasters;\n p...
1,-10_-37,6,895928.0,-10,-37,Alagoas,Alagoas,511.0,flood,"flood, landslide",In total 6 events across all disasters;\n p...
2,-10_-48,1,0.0,-10,-48,Tocantins,Tocantins,0.0,drought,drought,In total 1 events across all disasters;\n p...
3,-10_-66,1,0.0,-10,-66,Pando,Nueva Esperanza,25.0,flood,flood,In total 1 events across all disasters;\n p...
4,-10_-75,2,0.0,-10,-75,Pasco,Pasco,418.0,extreme temperature,extreme temperature,In total 2 events across all disasters;\n p...
...,...,...,...,...,...,...,...,...,...,...,...
2822,9_78,1,0.0,9,78,Tamil Nadu,Virudhunagar district,8.0,flood,flood,In total 1 events across all disasters;\n p...
2823,9_79,2,0.0,9,79,Tamil Nadu,"Ramanathapurum, Rameswaram",507.0,storm,storm,In total 2 events across all disasters;\n p...
2824,9_80,3,1354808.0,9,80,Kilinochchi,"Vavuniya, Kilinochchi district , Northern",212.0,flood,flood,In total 3 events across all disasters;\n p...
2825,9_81,4,325863.0,9,81,Trincomalee,"Mullaitivu district, Trincomalee",249.0,flood,"flood, storm",In total 4 events across all disasters;\n p...


### Next we focus on one disaster type  

In [139]:
# Disaster type to analyse in detail; compared against the lower-cased 'disastertype' values.
disaster = 'flood' #changed the code here so we could generate detailed analysis for all disasters

# Keep only the rows of the selected disaster type (rows with a missing type never match).
is_selected = merged_df['disastertype'].str.lower().eq(disaster)
df = merged_df.loc[is_selected].copy()
print(f"{disaster} Events Shape:", df.shape)

flood Events Shape: (4274, 36)


In [140]:
def compute_duration(row):
    """Return the number of days between an event's start and end dates.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'Start Year', 'Start Month', 'Start Day',
        'End Year', 'End Month' and 'End Day'.

    Returns
    -------
    int or float
        End date minus start date in whole days, or NaN when either date
        cannot be built (missing field, non-numeric value, invalid date).
    """
    date_fields = (
        ('Start Year', 'Start Month', 'Start Day'),
        ('End Year', 'End Month', 'End Day'),
    )
    try:
        began, finished = (
            pd.Timestamp(year=int(row[y]), month=int(row[m]), day=int(row[d]))
            for y, m, d in date_fields
        )
        return (finished - began).days
    except Exception:
        # Best effort: any failure while building the dates yields a missing duration.
        return np.nan

# Row-wise event duration in days; NaN where compute_duration cannot build both dates.
df['duration'] = df.apply(compute_duration, axis=1)


# Aggregate Data by Grid Cell

---
Aggregate Data by Grid Cell (Flood Events Only):

Group the data by grid_id and compute the following parameters:

*total_event_count*: Count of flood events in the grid.

*earliest_event_year*: Minimum of the 'year' column.

*latest_event_year*: Maximum of the 'year' column.

*total_damages*: Sum of "Total Damages, Adjusted ('000 US$)".

*primary_admin1*: Most common administrative region (adm1).

*unique_locations*: Comma-separated list of unique location names.

*avg_duration*: Average duration (in days) of flood events.

**Additional parameters (if available):**


*avg_damage_per_flood*: Total_damages divided by total_event_count.

*total_deaths*: Sum of "Total Deaths".

*total_affected*: Sum of "Total Affected".

*reconstruction_costs*: Sum of "Reconstruction Costs, Adjusted ('000 US$)".

*predominant_subtype*: Mode of "Disaster Subtype".

*ofda_response_count*: Count of non-null "OFDA Response".

*predominant_river_basin*: Mode of "River Basin".

*flood_recurrence_interval*: (latest_event_year - earliest_event_year) divided by total_event_count.


In [141]:
def safe_divide(numerator, denominator):
    """Divide numerator by denominator, returning NaN when the denominator is zero or otherwise falsy (e.g. None)."""
    if denominator and denominator != 0:
        return numerator / denominator
    return np.nan

In [142]:
# Aggregate the selected disaster's events into one row per grid cell.


def _first_mode(values):
    """Return the most frequent non-null value of a Series, or NaN when there is none."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Named aggregations as {output column: (source column, function)}, listed in output order.
# NOTE(review): 'sum' skips missing values, so a cell whose events all lack a figure gets 0.0, not NaN.
flood_agg_spec = {
    'count': ('disasterno', 'count'),
    'first_year': ('year', 'min'),
    'last_year': ('year', 'max'),
    'total_damages': ("Total Damages, Adjusted ('000 US$)", 'sum'),
    'avg_duration': ('duration', 'mean'),
}
# Aggregations whose source column may be absent from the data.
optional_aggs = {
    'total_deaths': ('Total Deaths', 'sum'),
    'total_affected': ('Total Affected', 'sum'),
    'predominant_subtype': ('Disaster Subtype', _first_mode),
    # 'ofda_response_count': ('OFDA Response', lambda x: x.notnull().sum()),
    'predominant_river_basin': ('River Basin', _first_mode),
}
# Named aggregation only accepts (column, function) tuples, so an unavailable source column
# is left out of the spec and its output column is added as NaN after the aggregation.
flood_agg_spec.update(
    {name: spec for name, spec in optional_aggs.items() if spec[0] in df.columns}
)

aggregated = df.groupby('grid_id').agg(**flood_agg_spec).reset_index()

for name in optional_aggs:
    if name not in aggregated.columns:
        aggregated[name] = np.nan


In [143]:
# Derived parameters, computed per grid cell with safe_divide (NaN when the event count is zero).

# Average damage per event: total damages divided by the number of events.
aggregated['avg_damage_per_event'] = [
    safe_divide(damages, n_events)
    for damages, n_events in zip(aggregated['total_damages'], aggregated['count'])
]

# Recurrence interval: years between the first and last event, divided by the number of events.
aggregated['recurrence_interval'] = [
    safe_divide(last - first, n_events)
    for first, last, n_events in zip(
        aggregated['first_year'], aggregated['last_year'], aggregated['count']
    )
]

In [144]:
def _summarize_cell(row):
    """Return the one-line text summary of a grid cell: event count, year range and total damages."""
    damages = row['total_damages'] if not pd.isna(row['total_damages']) else 'N/A'
    return f"{row['count']} events from {int(row['first_year'])} to {int(row['last_year'])}, total damages: {damages} ('000 US$)"


aggregated['event_summary'] = aggregated.apply(_summarize_cell, axis=1)

In [145]:
# Display the per-grid-cell table with the derived columns and the text summary.
aggregated

Unnamed: 0,grid_id,count,first_year,last_year,total_damages,avg_duration,total_deaths,total_affected,predominant_subtype,predominant_river_basin,avg_damage_per_event,recurrence_interval,event_summary
0,-10_-36,1,2004,2004,531.0,1.000000,28.0,2254.0,Riverine flood,,531.000000,0.000000,"1 events from 2004 to 2004, total damages: 531..."
1,-10_-37,5,1967,2009,777910.0,4.400000,451.0,2110149.0,Riverine flood,,155582.000000,8.400000,"5 events from 1967 to 2009, total damages: 777..."
2,-10_-66,1,1987,1987,0.0,,25.0,20000.0,Riverine flood,,0.000000,0.000000,"1 events from 1987 to 1987, total damages: 0.0..."
3,-10_124,1,2010,2010,0.0,3.000000,16.0,200.0,Flash flood,,0.000000,0.000000,"1 events from 2010 to 2010, total damages: 0.0..."
4,-10_14,2,2004,2005,0.0,30.500000,1.0,12000.0,Riverine flood,"Kapacala, Cuanza, Okavango rivers",0.000000,0.500000,"2 events from 2004 to 2005, total damages: 0.0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1831,9_77,1,1992,1992,351659.0,0.000000,51.0,500.0,Riverine flood,,351659.000000,0.000000,"1 events from 1992 to 1992, total damages: 351..."
1832,9_78,1,2015,2015,0.0,4.000000,8.0,200.0,Flash flood,,0.000000,0.000000,"1 events from 2015 to 2015, total damages: 0.0..."
1833,9_80,3,1981,2018,1354808.0,1.333333,212.0,396602.0,Flash flood,,451602.666667,12.333333,"3 events from 1981 to 2018, total damages: 135..."
1834,9_81,3,1994,2014,0.0,18.000000,43.0,1807655.0,Riverine flood,,0.000000,6.666667,"3 events from 1994 to 2014, total damages: 0.0..."


In [146]:
# Share of missing values per column, as a fraction between 0 and 1.
aggregated.isna().mean()

Unnamed: 0,0
grid_id,0.0
count,0.0
first_year,0.0
last_year,0.0
total_damages,0.0
avg_duration,0.084423
total_deaths,0.0
total_affected,0.0
predominant_subtype,0.168301
predominant_river_basin,0.581155


In [147]:
# Overall 'hazard score' per grid cell, combining event frequency, deaths and damages.
df = aggregated.copy()

# Score component -> source column.
score_inputs = {'count': 'count', 'deaths': 'total_deaths', 'damages': 'total_damages'}

# Steps 1 and 2: min-max normalize each component (NaN stays NaN) and flag where it is present.
for part, source_col in score_inputs.items():
    values = df[source_col]
    df[f'{part}_norm'] = (values - values.min()) / (values.max() - values.min())
    df[f'{part}_present'] = values.notna()

# Step 3: split the weight equally among the components that are present in each row.
df['num_present'] = df[[f'{part}_present' for part in score_inputs]].sum(axis=1)
for part in score_inputs:
    df[f'{part}_weight'] = df[f'{part}_present'] / df['num_present']
    # A missing normalized value contributes nothing to the score.
    df[f'{part}_norm'] = df[f'{part}_norm'].fillna(0)

# Step 4: severity score = weighted sum of the normalized components.
df['severity_score'] = sum(
    df[f'{part}_norm'] * df[f'{part}_weight'] for part in score_inputs
)

# Remove the intermediate working columns.
working_cols = df.filter(regex='norm|weight|present').columns
df = df.drop(columns=working_cols)


df.sort_values(by='severity_score', ascending=False).head(50)
## TO DO: is min-max normalization the best?? -> @Luca can you think about this, using deaths, loss, count, to compute a severity score


Unnamed: 0,grid_id,count,first_year,last_year,total_damages,avg_duration,total_deaths,total_affected,predominant_subtype,predominant_river_basin,avg_damage_per_event,recurrence_interval,event_summary,severity_score
1038,32_117,21,1980,2015,121621935.0,24.833333,18549.0,1033919000.0,Riverine flood,43710,5791521.0,1.666667,"21 events from 1980 to 2015, total damages: 12...",0.760331
1017,31_112,21,1980,2017,53587610.0,12.578947,4172.0,255533600.0,Riverine flood,Dalongtan Reservoir on Qingjiang River,2551791.0,1.761905,"21 events from 1980 to 2017, total damages: 53...",0.415149
446,10_-68,7,1994,2011,5440315.0,5.714286,30194.0,610977.0,Riverine flood,"Guaire, Carmen de Uria, Caugaguita, Upire, Toc...",777187.9,2.428571,"7 events from 1994 to 2011, total damages: 544...",0.41491
906,27_107,31,1988,2018,22364846.0,10.62069,1707.0,163476700.0,Riverine flood,Duliujiang,721446.6,0.967742,"31 events from 1988 to 2018, total damages: 22...",0.413474
931,28_112,26,1985,2016,32389279.0,9.73913,4048.0,215774100.0,Riverine flood,Xiangijiang River,1245742.0,1.192308,"26 events from 1985 to 2016, total damages: 32...",0.411237
900,26_93,19,1968,2015,13193169.0,21.5625,3993.0,160608600.0,Riverine flood,Brahmaputra,694377.3,2.473684,"19 events from 1968 to 2015, total damages: 13...",0.280241
894,26_86,12,1975,2013,9817307.0,12.333333,11172.0,162108500.0,Riverine flood,"Gandak, Kosi, Sone, Bagmati, Andhawara",818108.9,3.166667,"12 events from 1975 to 2013, total damages: 98...",0.272465
1110,34_72,17,1976,2015,14340690.0,8.294118,3249.0,27785290.0,Flash flood,"Khatayan, Alingar",843570.0,2.294118,"17 events from 1976 to 2015, total damages: 14...",0.25295
1771,7_80,22,1984,2018,625656.0,5.05,612.0,3469461.0,Riverine flood,Kelani,28438.91,1.545455,"22 events from 1984 to 2018, total damages: 62...",0.241804
1788,8_100,20,1975,2016,4183351.0,7.055556,1194.0,8149182.0,Riverine flood,"Kelantan, Lebir, Golok, Semerak, Tambatan Dira...",209167.5,2.05,"20 events from 1975 to 2016, total damages: 41...",0.235758


In [None]:
######################## NORMALIZATION  ################################

# Min-max normalization is too sensitive to skewed values, and z-score normalization is
# not appropriate either because the data is not normally distributed.
# A log transform for the large values AND robust scaling are used instead.

from sklearn.preprocessing import RobustScaler

df = aggregated.copy()

# Step 1: Log transformation.
# log1p(x) = log(1 + x) handles zeros naturally, so no epsilon offset is needed.
# Missing deaths/damages are treated as 0 for the transform.
df['count_log'] = np.log1p(df['count'])
df['deaths_log'] = np.log1p(df['total_deaths'].fillna(0))
df['damages_log'] = np.log1p(df['total_damages'].fillna(0))

# Step 2: Apply robust scaling (uses median and quantiles instead of mean/std),
# which limits the influence of outliers.
scaler = RobustScaler()

# Temporary frame holding only the columns to scale; column order fixes the indices used below.
scaling_df = df[['count_log', 'deaths_log', 'damages_log']]

scaled_values = scaler.fit_transform(scaling_df)
df['count_scaled'] = scaled_values[:, 0]
df['deaths_scaled'] = scaled_values[:, 1]
df['damages_scaled'] = scaled_values[:, 2]

# Step 3: Handle missing values with dynamic weighting.
df['count_present'] = df['count'].notna()
df['deaths_present'] = df['total_deaths'].notna()
df['damages_present'] = df['total_damages'].notna()

# Count how many metrics are present for each location.
df['num_present'] = df[['count_present', 'deaths_present', 'damages_present']].sum(axis=1)

# Assign weights dynamically based on the available metrics.
df['count_weight'] = df['count_present'] / df['num_present']
df['deaths_weight'] = df['deaths_present'] / df['num_present']
df['damages_weight'] = df['damages_present'] / df['num_present']

# Step 4: Compute the severity score with the chosen base weights.
base_weights = {
    'count': 0.2,
    'deaths': 0.5,   # Deaths carry the highest weight  // ADJUSTABLE
    'damages': 0.3
}

# Apply both the base weights and the data-availability weights.
df['severity_score'] = (
    df['count_scaled'] * df['count_weight'] * base_weights['count'] +
    df['deaths_scaled'] * df['deaths_weight'] * base_weights['deaths'] +
    df['damages_scaled'] * df['damages_weight'] * base_weights['damages']
)

# Normalize the severity score to the 0-100 range.
min_score = df['severity_score'].min()
max_score = df['severity_score'].max()
score_range = max_score - min_score
if score_range == 0:
    # All locations share one score: avoid a 0/0 division and report 0 for every row.
    df['severity_score_normalized'] = 0.0
else:
    df['severity_score_normalized'] = (df['severity_score'] - min_score) / score_range * 100

# Step 5: Clean up the working columns (the cleaned frame is stored separately as severity_df).
severity_df = df.drop(columns=df.filter(regex='log|scaled|present|weight').columns)


top_severity = severity_df.sort_values(by='severity_score_normalized', ascending=False).head(20)
print(top_severity[['grid_id', 'count', 'total_deaths', 'total_damages', 'severity_score_normalized']])

# I checked some of the grid ids and they align with devastating flooding events

In [None]:

# Optional: histogram of the normalized severity scores.
fig_hist, ax_hist = plt.subplots(figsize=(8, 4))
ax_hist.hist(df['severity_score_normalized'], bins=20, edgecolor='black')
ax_hist.set_title('Distribution of Flood Severity Scores')
ax_hist.set_xlabel('Severity Score (0-100)')
ax_hist.set_ylabel('Number of Locations')
ax_hist.grid(axis='y', alpha=0.75)
plt.show()

# Optional: grouped bars comparing the score components of the highest-scoring locations.
top_n = 10
top_locs = severity_df.nlargest(top_n, 'severity_score_normalized')

# Components are log-transformed so they fit on one axis.
plot_data = pd.DataFrame({
    'Location': top_locs['grid_id'],
    'Event Count': np.log1p(top_locs['count']),
    'Deaths': np.log1p(top_locs['total_deaths']),
    'Damages': np.log1p(top_locs['total_damages']),
    'Severity Score': top_locs['severity_score_normalized']
})

fig_bar, ax_bar = plt.subplots(figsize=(8, 4))
bar_width = 0.2

# One bar group per location; each series is shifted right by one bar width.
bar_series = [
    (plot_data['Event Count'], 'Event Count (log)'),
    (plot_data['Deaths'], 'Deaths (log)'),
    (plot_data['Damages'], 'Damages (log)'),
    (plot_data['Severity Score']/10, 'Severity Score/10'),
]
positions = np.arange(top_n)
for heights, series_label in bar_series:
    ax_bar.bar(positions, heights, width=bar_width, label=series_label)
    positions = positions + bar_width

ax_bar.set_xlabel('Location')
ax_bar.set_ylabel('Value')
ax_bar.set_title('Components of Severity Score for Top 10 Locations')
# Centre each tick under its group of four bars.
ax_bar.set_xticks([r + bar_width*1.5 for r in range(top_n)])
ax_bar.set_xticklabels(plot_data['Location'], rotation=45)
ax_bar.legend()
fig_bar.tight_layout()
plt.show()

In [155]:
# Drop any intermediate scoring columns still present in df (*_log, *_scaled, *_norm,
# *_present, *_weight) so they are not exported; the score columns themselves are kept.
working_cols = df.filter(regex=r'_(log|scaled|norm|present|weight)$').columns
agg_flood = df.drop(columns=working_cols)

# Prefix every column with the disaster name (e.g. 'flood_'), except the 'grid_id' join key.
agg_flood = agg_flood.rename(columns=lambda col: f"{disaster}_{col}" if col != "grid_id" else col)

# Left join onto the all-disaster table: every grid cell is kept, and cells without
# events of the selected disaster type get NaN in the prefixed columns.
agg_flood = pd.merge(agg_all, agg_flood, on='grid_id', how='left')

# Output: the file is named after the selected disaster ("ui/flood_grid_summary.csv" for floods).
# NOTE(review): the DataFrame index is written as an extra unnamed first column.
agg_flood.to_csv(f"ui/{disaster}_grid_summary.csv")
# print("Aggregated flood grid summary data saved to 'flood_grid_summary.csv'")

In [None]:
#@Luca: what does the 'Total Affected' feature represent?

## TO DO: is min-max normalization the best?? -> @Luca can you think about this, using deaths, loss, count, to compute a severity score
# TO DO: replicate the process for other disasters.
# TO DO: merge other data sets with this, e.g. NLP raw data.
# TO DO: add live predictions for 1-5 year

In [None]:
#### ---- old codes ------

## There is a lot of missing data for $ costs; Replacing 0s with NaNs

In [None]:
# Legacy cell, kept under "old codes".
# Replace zeros with NaN for cost-related columns in the aggregated DataFrame.
# This assumes that a 0 value for damages or reconstruction costs should be treated as missing data.
# NOTE(review): the aggregation cell in this notebook does not create 'reconstruction_costs'
# or 'total_event_count' (the event count column is named 'count'), so this cell looks stale
# against the current column names — confirm before re-running.
cost_columns = ['total_damages', 'reconstruction_costs']
for col in cost_columns:
    aggregated[col] = aggregated[col].replace(0, np.nan)

# Recompute the derived parameter 'avg_damage_per_flood' after the replacement.
aggregated['avg_damage_per_flood'] = aggregated.apply(
    lambda row: safe_divide(row['total_damages'], row['total_event_count']),
    axis=1
)

# Select and Order Final Columns

## We now keep only the desired columns:

 1. grid_id  
 2. central_lat  
 3. central_lon  
 4. total_event_count  
 5. earliest_event_year  
 6. latest_event_year  
 7. total_damages  
 8. primary_admin1  
 9. unique_locations  
 10. avg_duration  
 11. avg_damage_per_flood  
 12. total_deaths  
 13. total_affected  
 14. reconstruction_costs  
 15. predominant_subtype  
 16. ofda_response_count  
 17. predominant_river_basin  
 18. flood_recurrence_interval
 19. event_summary

In [None]:
# Legacy cell: select and order the columns of the final table.
# NOTE(review): several names below (e.g. 'total_event_count', 'earliest_event_year',
# 'latest_event_year', 'flood_recurrence_interval') differ from the columns the aggregation
# cells create ('count', 'first_year', 'last_year', 'recurrence_interval') — confirm before re-running.
final_columns = ['grid_id', 'lat', 'lon', 'total_event_count',
                 'earliest_event_year', 'latest_event_year',
                 'total_damages', 'primary_admin1', 'unique_locations', 'avg_duration',
                 'avg_damage_per_flood', 'total_deaths', 'total_affected',
                 'reconstruction_costs', 'predominant_subtype', 'ofda_response_count',
                 'predominant_river_basin', 'flood_recurrence_interval', 'event_summary']

# Copy so that later edits to final_df do not touch 'aggregated'.
final_df = aggregated[final_columns].copy()


grid_id                      0.000000
central_lat                  0.000000
central_lon                  0.000000
total_event_count            0.000000
earliest_event_year          0.000000
latest_event_year            0.000000
total_damages                0.507625
primary_admin1               0.000000
country                      0.984205
unique_locations             0.000000
avg_duration                 0.084423
avg_damage_per_flood         0.507625
total_deaths                 0.000000
total_affected               0.000000
reconstruction_costs         0.994009
predominant_subtype          0.168301
ofda_response_count          0.000000
predominant_river_basin      0.581155
flood_recurrence_interval    0.000000
event_summary                0.000000
dtype: float64

Unnamed: 0,grid_id,count,first_year,last_year,total_damages,avg_duration,total_deaths,total_affected,predominant_subtype,predominant_river_basin,avg_damage_per_event,recurrence_interval,event_summary,severity_score
1038,32_117,21,1980,2015,121621935.0,24.833333,18549.0,1033919000.0,Riverine flood,43710,5791521.0,1.666667,"21 events from 1980 to 2015, total damages: 12...",0.760331
1017,31_112,21,1980,2017,53587610.0,12.578947,4172.0,255533600.0,Riverine flood,Dalongtan Reservoir on Qingjiang River,2551791.0,1.761905,"21 events from 1980 to 2017, total damages: 53...",0.415149
446,10_-68,7,1994,2011,5440315.0,5.714286,30194.0,610977.0,Riverine flood,"Guaire, Carmen de Uria, Caugaguita, Upire, Toc...",777187.9,2.428571,"7 events from 1994 to 2011, total damages: 544...",0.41491
906,27_107,31,1988,2018,22364846.0,10.62069,1707.0,163476700.0,Riverine flood,Duliujiang,721446.6,0.967742,"31 events from 1988 to 2018, total damages: 22...",0.413474
931,28_112,26,1985,2016,32389279.0,9.73913,4048.0,215774100.0,Riverine flood,Xiangijiang River,1245742.0,1.192308,"26 events from 1985 to 2016, total damages: 32...",0.411237
900,26_93,19,1968,2015,13193169.0,21.5625,3993.0,160608600.0,Riverine flood,Brahmaputra,694377.3,2.473684,"19 events from 1968 to 2015, total damages: 13...",0.280241
894,26_86,12,1975,2013,9817307.0,12.333333,11172.0,162108500.0,Riverine flood,"Gandak, Kosi, Sone, Bagmati, Andhawara",818108.9,3.166667,"12 events from 1975 to 2013, total damages: 98...",0.272465
1110,34_72,17,1976,2015,14340690.0,8.294118,3249.0,27785290.0,Flash flood,"Khatayan, Alingar",843570.0,2.294118,"17 events from 1976 to 2015, total damages: 14...",0.25295
1771,7_80,22,1984,2018,625656.0,5.05,612.0,3469461.0,Riverine flood,Kelani,28438.91,1.545455,"22 events from 1984 to 2018, total damages: 62...",0.241804
1788,8_100,20,1975,2016,4183351.0,7.055556,1194.0,8149182.0,Riverine flood,"Kelantan, Lebir, Golok, Semerak, Tambatan Dira...",209167.5,2.05,"20 events from 1975 to 2016, total damages: 41...",0.235758


# Save data to csv

In [None]:
# Legacy cell: save the per-grid-cell table to Google Drive.
# NOTE(review): final_columns is defined here but not applied; every column of df is written.
final_columns = ['grid_id', 'lat', 'lon', 'total_event_count',
                 'earliest_event_year', 'latest_event_year',
                 'total_damages', 'primary_admin1', 'unique_locations', 'avg_duration',
                 'avg_damage_per_flood', 'total_deaths', 'total_affected',
                 'reconstruction_costs', 'predominant_subtype', 'ofda_response_count',
                 'predominant_river_basin', 'flood_recurrence_interval', 'event_summary']

final_df = df.copy()

# Written without the index column, to an absolute Drive path.
# NOTE(review): this folder is spelled 'UI', while the earlier export cell writes to 'ui/' — confirm which one the UI reads.
final_df.to_csv('/content/drive/MyDrive/mml_flood/UI/flood_grid_summary.csv', index=False)
print("Aggregated flood grid summary data saved to '/content/drive/MyDrive/mml_flood/UI/flood_grid_summary.csv'.")

Aggregated flood grid summary data saved to '/content/drive/MyDrive/mml_flood/UI/flood_grid_summary.csv'.
