# NASA EONET Events - Exploratory Analysis (Pandas)

This notebook uses pandas to analyze natural events data from NASA EONET.


In [19]:
import pandas as pd
import numpy as np
from pathlib import Path
from pprint import pprint

# Load the processed data
events_df = pd.read_csv("../data/processed/events.csv")
occurrences_df = pd.read_csv("../data/processed/occurrences.csv")

# Convert date columns to datetime
events_df['date'] = pd.to_datetime(events_df['date'])




In [20]:
occurrences_df.tail()

Unnamed: 0,event_id,event_title,event_description,event_link,category_ids,category_titles,occurrence_date,occurrence_type,longitude,latitude,year,month,day,region,country,continent
12835,EONET_2878,Iceberg B09B,,https://eonet.gsfc.nasa.gov/api/v3/events/EONE...,seaLakeIce,Sea and Lake Ice,2011-08-30 00:00:00+00:00,Point,143.3638,-66.366,2011,8,30,Tasmania,Australia,Antarctica
12836,EONET_2878,Iceberg B09B,,https://eonet.gsfc.nasa.gov/api/v3/events/EONE...,seaLakeIce,Sea and Lake Ice,2013-04-08 00:00:00+00:00,Point,142.6649,-66.7354,2013,4,8,Tasmania,Australia,Antarctica
12837,EONET_2878,Iceberg B09B,,https://eonet.gsfc.nasa.gov/api/v3/events/EONE...,seaLakeIce,Sea and Lake Ice,2018-01-12 00:00:00+00:00,Point,143.39,-66.37,2018,1,12,Tasmania,Australia,Antarctica
12838,EONET_2997,Iceberg C21B,,https://eonet.gsfc.nasa.gov/api/v3/events/EONE...,seaLakeIce,Sea and Lake Ice,2011-08-30 00:00:00+00:00,Point,96.18,-64.9955,2011,8,30,Kerguelen,French Southern Territories,Antarctica
12839,EONET_980,"Fuego Volcano, Guatemala",,https://eonet.gsfc.nasa.gov/api/v3/events/EONE...,volcanoes,Volcanoes,2002-01-04 00:00:00+00:00,Point,-90.88,14.473,2002,1,4,Sacatepequez,Guatemala,North America


In [21]:
events_df["category_titles"].value_counts()
pprint(events_df["category_titles"])

0              Wildfires
1              Wildfires
2              Wildfires
3              Wildfires
4              Wildfires
              ...       
5674    Sea and Lake Ice
5675    Sea and Lake Ice
5676    Sea and Lake Ice
5677    Sea and Lake Ice
5678           Volcanoes
Name: category_titles, Length: 5679, dtype: object


In [24]:
events_df.groupby(events_df['date'].dt.year).size()

date
2002       1
2011       5
2016       2
2019      13
2020       2
2021       4
2022       7
2023      27
2024    4126
2025    1492
dtype: int64

## Question 1: What was the most frequent event category in 2024?


In [13]:
# Filter data for 2024
import pprint


events_df = events_df[events_df['year'] == 2024].copy()

pprint(events_df["category_titles"]).value_counts()

# Split category_titles by comma and explode to create one row per category
df_categories = df_2024['category_titles'].str.split(',').explode().str.strip()

# Count occurrences of each category
category_counts = df_categories.value_counts()

# Get the most frequent category
most_frequent = category_counts.head(3)

print("Most frequent event category in 2024:")
print(most_frequent.to_frame('event_count'))

# Save category counts to CSV
category_counts_df = category_counts.reset_index()
category_counts_df.columns = ['category_title', 'event_count']
output_path = Path('../data/processed/events_2025_category_counts.csv')
category_counts_df.to_csv(output_path, index=False)
print(f"\nCategory counts saved to {output_path}")


KeyError: 'year'

## Question 2: What was the distribution of events by category in 2024?


In [4]:
# Split category_titles and explode
df_categories = df_2024['category_titles'].str.split(',').explode().str.strip()

# Count occurrences
category_counts = df_categories.value_counts().reset_index()
category_counts.columns = ['category_title', 'event_count']

# Calculate percentage
total_events = category_counts['event_count'].sum()
category_counts['percentage'] = (category_counts['event_count'] / total_events * 100).round(2)

# Sort by event count
category_counts = category_counts.sort_values('event_count', ascending=False)

print("Distribution of events by category in 2024:")
print(category_counts.to_string(index=False))


Distribution of events by category in 2024:
  category_title  event_count  percentage
       Wildfires         4114       97.56
Sea and Lake Ice           96        2.28
       Volcanoes            7        0.17


## Question 3: How many occurrences were there per month in 2024?


In [5]:
# Filter for 2024 and non-null occurrence dates
df_2024_with_dates = df_2024[df_2024['occurrence_date'].notna()].copy()

# Group by month and count occurrences
month_counts = df_2024_with_dates.groupby('month').size().reset_index(name='occurrence_count')

# Add month names
month_names = {
    1: 'January', 2: 'February', 3: 'March', 4: 'April',
    5: 'May', 6: 'June', 7: 'July', 8: 'August',
    9: 'September', 10: 'October', 11: 'November', 12: 'December'
}
month_counts['month_name'] = month_counts['month'].map(month_names)

# Reorder columns and sort by month
month_counts = month_counts[['month', 'month_name', 'occurrence_count']].sort_values('month')

print("Occurrences per month in 2024:")
print(month_counts.to_string(index=False))


Occurrences per month in 2024:
 month month_name  occurrence_count
     1    January                12
     2   February                22
     3      March                46
     4      April                19
     5        May                87
     6       June               558
     7       July               931
     8     August              1115
     9  September               679
    10    October               332
    11   November               169
    12   December               247


## Question 4: Which week had the most occurrences in 2024?


In [6]:
# Filter for 2024 and non-null occurrence dates
df_2024_with_dates = df_2024[df_2024['occurrence_date'].notna()].copy()

# Extract year and week number
df_2024_with_dates['year_num'] = df_2024_with_dates['occurrence_date'].dt.isocalendar().year
df_2024_with_dates['week_number'] = df_2024_with_dates['occurrence_date'].dt.isocalendar().week

# Group by year and week, count occurrences
week_counts = df_2024_with_dates.groupby(['year_num', 'week_number']).agg({
    'occurrence_date': ['count', 'min']
}).reset_index()

week_counts.columns = ['year', 'week_number', 'occurrence_count', 'week_start']

# Sort by occurrence count and get the top one
top_week = week_counts.sort_values('occurrence_count', ascending=False).head(1)

print("Week with the most occurrences in 2024:")
print(top_week[['year', 'week_number', 'week_start', 'occurrence_count']].to_string(index=False))


Week with the most occurrences in 2024:
 year  week_number                week_start  occurrence_count
 2024           32 2024-08-05 00:05:00+00:00               362


## Question 5: For each category, which month had the most events?


In [15]:
# Create a dataframe with exploded categories
df_cat_month = df_2024.copy()
df_cat_month['category_title'] = df_cat_month['category_titles'].str.split(',').apply(lambda x: [cat.strip() for cat in x])
df_cat_month = df_cat_month.explode('category_title')

# Group by category and month, count events
category_month_counts = df_cat_month.groupby(['category_title', 'month']).size().reset_index(name='event_count')

# Add month names
category_month_counts['month_name'] = category_month_counts['month'].map(month_names)

# For each category, find the month with the most events
result = category_month_counts.loc[
    category_month_counts.groupby('category_title')['event_count'].idxmax()
]

# Select and order columns
result = result[['category_title', 'month_name', 'event_count']].sort_values('category_title')

print("Month with most events for each category in 2024:")
print(result.to_string(index=False))


Month with most events for each category in 2024:
  category_title month_name  event_count
Sea and Lake Ice      March           72
       Volcanoes   December            3
       Wildfires     August         1111


## Question 6: In which countries or continents were the largest number of events concentrated in 2024?

Note: This requires reverse geocoding. For now, we'll analyze by geographic regions based on coordinates.


In [16]:
# Filter for 2024 with valid coordinates
df_geo = df_2024[(df_2024['latitude'].notna()) & (df_2024['longitude'].notna())].copy()

# Define regions based on latitude
def get_latitude_region(lat):
    if pd.isna(lat):
        return 'Unknown'
    elif -90 <= lat <= -23.5:
        return 'Southern Hemisphere (Temperate)'
    elif -23.5 < lat <= 0:
        return 'Southern Hemisphere (Tropical)'
    elif 0 < lat <= 23.5:
        return 'Northern Hemisphere (Tropical)'
    elif 23.5 < lat <= 90:
        return 'Northern Hemisphere (Temperate)'
    else:
        return 'Unknown'

# Define longitude regions
def get_longitude_region(lon):
    if pd.isna(lon):
        return 'Unknown'
    elif -180 <= lon <= -30:
        return 'Americas'
    elif -30 < lon <= 60:
        return 'Europe/Africa'
    elif 60 < lon <= 180:
        return 'Asia/Pacific'
    else:
        return 'Unknown'

# Apply region functions
df_geo['region'] = df_geo['latitude'].apply(get_latitude_region)
df_geo['longitude_region'] = df_geo['longitude'].apply(get_longitude_region)

# Group by regions and count events
region_counts = df_geo.groupby(['region', 'longitude_region']).size().reset_index(name='event_count')

# Sort by event count
region_counts = region_counts.sort_values('event_count', ascending=False)

print("Event concentration by geographic regions in 2024:")
print(region_counts.to_string(index=False))


Event concentration by geographic regions in 2024:
                         region longitude_region  event_count
 Southern Hemisphere (Tropical)    Europe/Africa         1102
Northern Hemisphere (Temperate)         Americas          986
 Southern Hemisphere (Tropical)         Americas          813
 Southern Hemisphere (Tropical)     Asia/Pacific          567
Northern Hemisphere (Temperate)     Asia/Pacific          332
 Northern Hemisphere (Tropical)    Europe/Africa          206
Southern Hemisphere (Temperate)    Europe/Africa          135
Southern Hemisphere (Temperate)         Americas           88
Southern Hemisphere (Temperate)     Asia/Pacific           52
Northern Hemisphere (Temperate)    Europe/Africa           27
 Northern Hemisphere (Tropical)         Americas            5


## Summary: All Answers

Printing all answers together for easy reference:


In [17]:
# Define month names mapping (if not already defined)
month_names = {
    1: 'January', 2: 'February', 3: 'March', 4: 'April',
    5: 'May', 6: 'June', 7: 'July', 8: 'August',
    9: 'September', 10: 'October', 11: 'November', 12: 'December'
}

# Define region functions (if not already defined)
def get_latitude_region(lat):
    if pd.isna(lat):
        return 'Unknown'
    elif -90 <= lat <= -23.5:
        return 'Southern Hemisphere (Temperate)'
    elif -23.5 < lat <= 0:
        return 'Southern Hemisphere (Tropical)'
    elif 0 < lat <= 23.5:
        return 'Northern Hemisphere (Tropical)'
    elif 23.5 < lat <= 90:
        return 'Northern Hemisphere (Temperate)'
    else:
        return 'Unknown'

def get_longitude_region(lon):
    if pd.isna(lon):
        return 'Unknown'
    elif -180 <= lon <= -30:
        return 'Americas'
    elif -30 < lon <= 60:
        return 'Europe/Africa'
    elif 60 < lon <= 180:
        return 'Asia/Pacific'
    else:
        return 'Unknown'

print("="*80)
print("SUMMARY OF ALL ANSWERS - NASA EONET EVENTS 2024")
print("="*80)

print("\n1. MOST FREQUENT EVENT CATEGORY IN 2024:")
print("-" * 80)
most_frequent = df_2024['category_titles'].str.split(',').explode().str.strip().value_counts().head(1)
print(most_frequent.to_frame('event_count'))

print("\n2. DISTRIBUTION OF EVENTS BY CATEGORY IN 2024:")
print("-" * 80)
category_dist = df_2024['category_titles'].str.split(',').explode().str.strip().value_counts().reset_index()
category_dist.columns = ['category_title', 'event_count']
category_dist['percentage'] = (category_dist['event_count'] / category_dist['event_count'].sum() * 100).round(2)
print(category_dist.to_string(index=False))

print("\n3. OCCURRENCES PER MONTH IN 2024:")
print("-" * 80)
df_2024_with_dates = df_2024[df_2024['occurrence_date'].notna()].copy()
month_counts = df_2024_with_dates.groupby('month').size().reset_index(name='occurrence_count')
month_counts['month_name'] = month_counts['month'].map(month_names)
month_counts = month_counts[['month', 'month_name', 'occurrence_count']].sort_values('month')
print(month_counts.to_string(index=False))

print("\n4. WEEK WITH THE MOST OCCURRENCES IN 2024:")
print("-" * 80)
df_2024_with_dates['year_num'] = df_2024_with_dates['occurrence_date'].dt.isocalendar().year
df_2024_with_dates['week_number'] = df_2024_with_dates['occurrence_date'].dt.isocalendar().week
week_counts = df_2024_with_dates.groupby(['year_num', 'week_number']).agg({
    'occurrence_date': ['count', 'min']
}).reset_index()
week_counts.columns = ['year', 'week_number', 'occurrence_count', 'week_start']
top_week = week_counts.sort_values('occurrence_count', ascending=False).head(1)
print(top_week[['year', 'week_number', 'week_start', 'occurrence_count']].to_string(index=False))

print("\n5. MONTH WITH MOST EVENTS FOR EACH CATEGORY IN 2024:")
print("-" * 80)
df_cat_month = df_2024.copy()
df_cat_month['category_title'] = df_cat_month['category_titles'].str.split(',').apply(lambda x: [cat.strip() for cat in x])
df_cat_month = df_cat_month.explode('category_title')
category_month_counts = df_cat_month.groupby(['category_title', 'month']).size().reset_index(name='event_count')
category_month_counts['month_name'] = category_month_counts['month'].map(month_names)
result = category_month_counts.loc[category_month_counts.groupby('category_title')['event_count'].idxmax()]
result = result[['category_title', 'month_name', 'event_count']].sort_values('category_title')
print(result.to_string(index=False))

print("\n6. EVENT CONCENTRATION BY GEOGRAPHIC REGIONS IN 2024:")
print("-" * 80)
df_geo = df_2024[(df_2024['latitude'].notna()) & (df_2024['longitude'].notna())].copy()
df_geo['region'] = df_geo['latitude'].apply(get_latitude_region)
df_geo['longitude_region'] = df_geo['longitude'].apply(get_longitude_region)
region_counts = df_geo.groupby(['region', 'longitude_region']).size().reset_index(name='event_count')
region_counts = region_counts.sort_values('event_count', ascending=False)
print(region_counts.to_string(index=False))

print("\n" + "="*80)
print("END OF SUMMARY")
print("="*80)


SUMMARY OF ALL ANSWERS - NASA EONET EVENTS 2024

1. MOST FREQUENT EVENT CATEGORY IN 2024:
--------------------------------------------------------------------------------
                 event_count
category_titles             
Wildfires               4114

2. DISTRIBUTION OF EVENTS BY CATEGORY IN 2024:
--------------------------------------------------------------------------------
  category_title  event_count  percentage
       Wildfires         4114       95.39
Sea and Lake Ice          192        4.45
       Volcanoes            7        0.16

3. OCCURRENCES PER MONTH IN 2024:
--------------------------------------------------------------------------------
 month month_name  occurrence_count
     1    January                24
     2   February                39
     3      March                82
     4      April                26
     5        May                88
     6       June               558
     7       July               931
     8     August              1117
     