# Phase 2: Data Preprocessing
# Tiền xử lý Dữ liệu

## Mục tiêu / Objectives:
1. Load raw data from CSV
2. Handle missing values
3. Remove duplicates
4. Fix data types
5. Standardize categorical values
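6. Apply feature engineering
7. Analyze outliers and log-transform highly skewed features
8. Split the data into train and test sets
9. Save processed data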

---

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import sys
import os

# Add src to path
sys.path.append('../src')

from feature_engineering import engineer_all_features
from outlier_detection import analyze_outliers, apply_log_transformation
from data_split import create_train_test_split

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation/FutureWarnings.
warnings.filterwarnings(action='ignore')

# Pandas display options: show all columns, cap row output at 100
display_options = {
    'display.max_columns': None,
    'display.max_rows': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Report the library versions in use
print("Libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

## 1. Load Data / Tải dữ liệu

In [None]:
# Load the raw dataset into `df`.
data_path = '../data/raw/global_disaster_response_2018_2024.csv'

# Fail fast when the file is missing: print the download guidance, then raise.
# Without the raise, `df` would stay undefined and every later cell would
# fail with an unrelated NameError.
if not os.path.exists(data_path):
    print(f"ERROR: Data file not found at {data_path}")
    print("Please download the dataset from Kaggle and place it in data/raw/ directory")
    print("Dataset URL: https://www.kaggle.com/datasets/mubeenshehzadi/global-disaster-2018-2024")
    raise FileNotFoundError(f"Data file not found at {data_path}")

df = pd.read_csv(data_path)
print("Dataset loaded successfully!")
print(f"\nDataset shape: {df.shape}")
print(f"Number of rows: {len(df)}")
print(f"Number of columns: {len(df.columns)}")

In [None]:
# Preview the top of the dataset.
# df.head() is the cell's trailing expression, so the notebook renders its
# result; it is called without arguments.
print("First 5 rows of the dataset:")
df.head()

In [None]:
# Show per-column information for the loaded DataFrame via df.info().
print("Column Information:")
df.info()

In [None]:
# Show summary statistics for the loaded DataFrame.
# df.describe() is the cell's trailing expression, so the notebook renders
# the resulting table.
print("Basic Statistics:")
df.describe()

## 2. Data Cleaning / Làm sạch dữ liệu

### 2.1 Convert date to datetime

In [None]:
# Parse the 'date' column into pandas datetimes and report the covered range
print("Converting 'date' column to datetime...")
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
print(f"Date column type: {df['date'].dtype}")
earliest_date, latest_date = df['date'].min(), df['date'].max()
print(f"Date range: {earliest_date} to {latest_date}")

### 2.2 Check for missing values

In [None]:
# Summarise missing data: absolute count and share of rows for each column
print("Missing values per column:")
missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Percentage': missing_percentage,
})

# Only list the columns that actually contain missing entries
has_missing = missing_df['Missing Count'] > 0
print(missing_df.loc[has_missing])

missing_total = missing_values.sum()
if missing_total != 0:
    print(f"\nTotal missing values: {missing_total}")
else:
    print("\n✓ No missing values found!")

In [None]:
# Impute missing values, if any.
# Strategy: numeric columns get their median, object (string) columns get
# their most frequent value. Columns of any other dtype (e.g. datetime) are
# not imputed here and are reported at the end if they still contain NaNs.
#
# The check is recomputed from `df` rather than reusing a count from an
# earlier cell, so the cell stays correct when re-run or run out of order.
if df.isnull().values.any():
    print("Handling missing values...")

    # For numeric columns: fill with median
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if df[col].isnull().any():
            median_val = df[col].median()
            # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
            # the chained in-place form operates on a temporary under pandas
            # Copy-on-Write and would leave `df` unchanged.
            df[col] = df[col].fillna(median_val)
            print(f"  {col}: filled with median ({median_val:.2f})")

    # For categorical columns: fill with mode
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if df[col].isnull().any():
            modes = df[col].mode()
            # mode() is empty when every value is missing; indexing it would raise
            if modes.empty:
                print(f"  {col}: no mode available (all values missing), left unchanged")
                continue
            mode_val = modes.iloc[0]
            df[col] = df[col].fillna(mode_val)
            print(f"  {col}: filled with mode ({mode_val})")

    # Only claim success when nothing is left to impute
    remaining_missing = int(df.isnull().sum().sum())
    if remaining_missing == 0:
        print("\n✓ Missing values handled!")
    else:
        unresolved_cols = df.columns[df.isnull().any()].tolist()
        print(f"\n⚠ {remaining_missing} missing values remain in columns: {unresolved_cols}")
else:
    print("✓ No missing values to handle!")

### 2.3 Remove duplicates

In [None]:
# Count fully duplicated rows and drop them, keeping the first occurrence
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

if duplicates == 0:
    print("✓ No duplicates found!")
else:
    print(f"Removing {duplicates} duplicate rows...")
    df = df.drop_duplicates(keep='first')
    print(f"✓ Duplicates removed! New shape: {df.shape}")

### 2.4 Fix data types

In [None]:
# Verify and fix data types
print("Current data types:")
print(df.dtypes)

# Columns that must be numeric; names absent from the DataFrame are skipped
numeric_columns = ['severity_index', 'casualties', 'economic_loss_usd',
                   'response_time_hours', 'aid_amount_usd',
                   'response_efficiency_score', 'recovery_days',
                   'latitude', 'longitude']

for col in numeric_columns:
    if col in df.columns:
        missing_before = df[col].isnull().sum()
        # errors='coerce' turns unparseable entries into NaN instead of raising
        df[col] = pd.to_numeric(df[col], errors='coerce')
        # This runs after missing-value handling, so any NaN created here
        # would otherwise go unnoticed; report it.
        newly_missing = df[col].isnull().sum() - missing_before
        if newly_missing > 0:
            print(f"  ⚠ {col}: {newly_missing} non-numeric value(s) coerced to NaN")

print("\n✓ Data types verified and fixed!")

### 2.5 Standardize categorical values

In [None]:
# Show the distinct values of the categorical columns (as they are before cleaning)
print("Unique values in categorical columns:")
country_values = sorted(df['country'].unique())
print(f"\nCountries ({df['country'].nunique()}): {country_values}")
disaster_type_values = sorted(df['disaster_type'].unique())
print(f"\nDisaster Types ({df['disaster_type'].nunique()}): {disaster_type_values}")

# Standardize by trimming leading/trailing whitespace only (capitalization is
# left untouched).
# NOTE(review): this runs after duplicate removal, so rows that differed only
# by surrounding whitespace are not de-duplicated.
for column_name in ('country', 'disaster_type'):
    df[column_name] = df[column_name].str.strip()

print("\n✓ Categorical values standardized!")

## 3. Feature Engineering / Kỹ thuật đặc trưng

Apply comprehensive feature engineering using our custom module.

In [None]:
# Run the project's feature-engineering pipeline on the cleaned data and
# report how many columns it added.
# NOTE(review): fit=True is applied to the full dataset before the train/test
# split — confirm the fitted encoders do not leak test-set information.
separator = "=" * 60
print("Applying feature engineering...")
print(separator)

df_engineered, encoders = engineer_all_features(df, fit=True)

original_feature_count = df.shape[1]
engineered_feature_count = df_engineered.shape[1]

print("\n" + separator)
print(f"Original features: {original_feature_count}")
print(f"Engineered features: {engineered_feature_count}")
print(f"New features added: {engineered_feature_count - original_feature_count}")
print(separator)

In [None]:
# List, in column order, every engineered column that is not present in `df`
print("New engineered features:")
is_new_column = ~df_engineered.columns.isin(df.columns)
new_features = df_engineered.columns[is_new_column].tolist()
for position, feature_name in enumerate(new_features, start=1):
    print(f"{position:2d}. {feature_name}")

print(f"\nTotal new features: {len(new_features)}")

In [None]:
# Preview the engineered dataset.
# df_engineered.head() is the cell's trailing expression, so the notebook
# renders its result.
print("Sample of engineered data:")
df_engineered.head()

## 4. Outlier Detection / Phát hiện outliers

Analyze outliers using IQR and Z-score methods.

In [None]:
# Run the project's outlier analysis on the engineered data and keep its
# return value in `outlier_results`.
# NOTE(review): the structure of the result is defined in the
# outlier_detection module, not here — confirm there before relying on it.
outlier_results = analyze_outliers(df_engineered)

## 5. Handle Skewness / Xử lý độ lệch

Apply log transformation to highly skewed features.

In [None]:
# Apply the project's log transformation to the three columns treated as
# highly skewed, rebinding `df_engineered` to the helper's return value.
# NOTE(review): whether the helper overwrites these columns or adds new ones
# is decided in the outlier_detection module — confirm there.
skewed_columns = ['casualties', 'economic_loss_usd', 'aid_amount_usd']
df_engineered = apply_log_transformation(df_engineered, skewed_columns)

print("\n✓ Log transformations applied!")

## 6. Train-Test Split / Chia dữ liệu

Split data into train and test sets with stratification.

In [None]:
# Create the train/test split with the project's helper.
# - target column: 'disaster_type'
# - test_size=0.2 and random_state=42 are passed through to the helper
# - the encoders returned by feature engineering are passed in, and
#   `encoders` is rebound to the helper's third return value
# NOTE(review): save=True with output_dir presumably writes the split to
# ../data/processed, and the surrounding text mentions stratification —
# confirm both in the data_split module.
train_df, test_df, encoders = create_train_test_split(
    df_engineered,
    target_column='disaster_type',
    test_size=0.2,
    random_state=42,
    output_dir='../data/processed',
    encoders=encoders,
    save=True
)

## 7. Final Verification / Kiểm tra cuối cùng

In [None]:
# Final report: shapes of the original, engineered, train and test frames,
# plus missing-value and duplicate counts for the engineered data.
# NOTE(review): the closing success banner is printed unconditionally,
# whatever the data-quality counts are.
summary_rule = "=" * 80
print(summary_rule)
print("PREPROCESSING SUMMARY")
print(summary_rule)

print("\n1. Original Data:")
original_shape = df.shape
print(f"   - Shape: {original_shape}")
print(f"   - Features: {original_shape[1]}")

print("\n2. Engineered Data:")
engineered_shape = df_engineered.shape
print(f"   - Shape: {engineered_shape}")
print(f"   - Features: {engineered_shape[1]}")
print(f"   - New features: {engineered_shape[1] - original_shape[1]}")

print("\n3. Train Set:")
print(f"   - Shape: {train_df.shape}")
train_share = len(train_df) / len(df_engineered) * 100
print(f"   - Percentage: {train_share:.1f}%")

print("\n4. Test Set:")
print(f"   - Shape: {test_df.shape}")
test_share = len(test_df) / len(df_engineered) * 100
print(f"   - Percentage: {test_share:.1f}%")

print("\n5. Data Quality:")
engineered_missing = df_engineered.isnull().sum().sum()
print(f"   - Missing values: {engineered_missing}")
engineered_duplicates = df_engineered.duplicated().sum()
print(f"   - Duplicates: {engineered_duplicates}")

print("\n" + summary_rule)
print("✓ PREPROCESSING COMPLETED SUCCESSFULLY!")
print(summary_rule)

## 8. Save Complete Engineered Dataset

In [None]:
# Save the complete engineered dataset (without the index column)
output_path = '../data/processed/full_engineered_data.csv'

# to_csv does not create missing parent directories; make sure the target
# directory exists so the save does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

df_engineered.to_csv(output_path, index=False)
print(f"✓ Complete engineered dataset saved to: {output_path}")

---

## Next Steps

1. Proceed to `phase2_eda.ipynb` for Exploratory Data Analysis
2. Create visualizations and insights
3. Move to model building in Phase 3

---