# Phase 2: Exploratory Data Analysis (EDA)
# Phân tích Khám phá Dữ liệu

## Mục tiêu / Objectives:
1. Analyze all 12 features thoroughly
2. Create 50+ visualizations
3. Compare disaster patterns across continents (6 continents)
4. Analyze temporal trends (2018-2024)
5. Analyze correlations between numeric features
6. Map the geographic distribution of disasters
7. Generate insights for modeling

---

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import folium
import warnings
import os

# Silence library warnings so they do not clutter the notebook output.
# NOTE(review): this blanket filter also hides deprecation warnings from
# pandas/seaborn; consider narrowing it while developing.
warnings.filterwarnings('ignore')

# Set style
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*';
# older releases only ship 'seaborn-darkgrid'. Apply whichever name this
# installation provides instead of failing with OSError on the missing one.
# If neither is available the default Matplotlib style stays in effect.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette('husl')

# Create figures directory (no error if it already exists)
os.makedirs('../reports/figures/eda', exist_ok=True)

# Report library versions so the run can be reproduced later.
print('Libraries imported successfully!')
print(f'Pandas version: {pd.__version__}')
print(f'Matplotlib version: {plt.matplotlib.__version__}')
print(f'Seaborn version: {sns.__version__}')

## 1. Load Processed Data

In [None]:
# Read the processed/engineered dataset into the notebook-wide frame `df`.
df = pd.read_csv('../data/processed/full_engineered_data.csv')

# read_csv was not asked to parse dates, so restore the datetime dtype
# here when the column is present.
if 'date' in df:
    df = df.assign(date=pd.to_datetime(df['date']))

# Quick sanity check: dimensions, column names and the first rows.
print(f'Dataset shape: {df.shape}')
print(f'\nColumns: {list(df.columns)}')
df.head()

## 2. Overall Statistics

In [None]:
# Print a five-part summary of the whole dataset held in `df`.
banner = '=' * 80
print(banner)
print('OVERALL DATASET STATISTICS')
print(banner)

# 1. Observation window: earliest date, latest date, and the days between.
print('\n1. Time Period:')
first_date = df["date"].min()
print(f'   Start: {first_date}')
last_date = df["date"].max()
print(f'   End: {last_date}')
print(f'   Duration: {(last_date - first_date).days} days')

# 2. Number of distinct countries and continents.
print('\n2. Geographic Coverage:')
country_count = df["country"].nunique()
print(f'   Countries: {country_count}')
continent_count = df["continent"].nunique()
print(f'   Continents: {continent_count}')

# 3. Distinct disaster types; every row counts as one disaster.
print('\n3. Disaster Types:')
type_count = df["disaster_type"].nunique()
print(f'   Types: {type_count}')
print(f'   Total disasters: {len(df)}')

# 4. Column totals, shown with thousands separators and no decimals.
print('\n4. Impact:')
casualties_total = df["casualties"].sum()
print(f'   Total casualties: {casualties_total:,.0f}')
loss_total = df["economic_loss_usd"].sum()
print(f'   Total economic loss: ${loss_total:,.0f} USD')
aid_total = df["aid_amount_usd"].sum()
print(f'   Total aid amount: ${aid_total:,.0f} USD')

# 5. Column means for the response metrics, rounded to two decimals.
print('\n5. Response:')
mean_response_hours = df["response_time_hours"].mean()
print(f'   Avg response time: {mean_response_hours:.2f} hours')
mean_efficiency = df["response_efficiency_score"].mean()
print(f'   Avg response efficiency: {mean_efficiency:.2f}/100')
mean_recovery_days = df["recovery_days"].mean()
print(f'   Avg recovery days: {mean_recovery_days:.2f} days')

## 3. Feature Analysis (All 12 Features)

### 3.1 Date Analysis

In [None]:
# Placeholder cell: no date analysis is performed yet; it only prints a notice.
# TODO: Implement date analysis
# - Time series of disaster frequency
# - Yearly trends
# - Monthly patterns
# - Seasonal distribution
print('Date analysis to be implemented...')

### 3.2 Country Analysis

In [None]:
# Placeholder cell: no country analysis is performed yet; it only prints a notice.
# TODO: Implement country analysis
# - Distribution of disasters by country
# - Top countries by casualties
# - Top countries by economic loss
print('Country analysis to be implemented...')

### 3.3 Disaster Type Analysis

In [None]:
# Placeholder cell: no disaster-type analysis is performed yet; it only prints
# a notice.
# TODO: Implement disaster type analysis
print('Disaster type analysis to be implemented...')

## 4. Continental Analysis (CRITICAL)

### 4.1 Disasters by Continent

In [None]:
# Placeholder cell: no continental analysis is performed yet; it only prints
# a notice.
# TODO: Implement continental analysis
# - Frequency by continent
# - Economic loss by continent
# - Casualties by continent
# - Response efficiency by continent
# - Aid distribution by continent
# - Recovery time by continent
print('Continental analysis to be implemented...')

## 5. Correlation Analysis

In [None]:
# Placeholder cell: no correlations are computed yet; it only prints a notice.
# TODO: Create correlation heatmap for all numeric features
print('Correlation analysis to be implemented...')

## 6. Geographic Visualization

In [None]:
# Placeholder cell: no map is drawn yet; it only prints a notice.
# TODO: Create world map with disaster locations
# Using folium or plotly
print('Geographic visualization to be implemented...')

## 7. Summary of Key Findings

In [None]:
# Closing summary: three headline findings computed from `df`.
rule = '=' * 80
print(rule)
print('KEY FINDINGS FROM EDA')
print(rule)

# mode() returns the most frequent value(s); the first entry is reported.
top_disaster_type = df['disaster_type'].mode()[0]
print('\n1. Most common disaster type:', top_disaster_type)

# "Most affected" means the group with the highest summed casualties.
casualties_by_country = df.groupby('country')['casualties'].sum()
print('2. Most affected country:', casualties_by_country.idxmax())

casualties_by_continent = df.groupby('continent')['casualties'].sum()
print('3. Most affected continent:', casualties_by_continent.idxmax())

# NOTE(review): the completion banner is printed even though several
# analysis cells in this notebook are still TODO placeholders.
print('\n✓ EDA COMPLETED!')
print('Next: Proceed to Phase 3 - Model Building')