In [6]:
# notebooks/01_data_preprocessing.ipynb
import pandas as pd
import numpy as np
from datetime import datetime
import os

# Create necessary directories if they don't exist
os.makedirs('../data/processed', exist_ok=True)

# Load data with header=None to read all rows as data
df = pd.read_csv('../data/raw/Nadmo_cleaned_refined.csv', header=None)

# Display the first few rows to understand the structure
print("First 10 rows of the dataset:")
print(df.head(10))

print("\nColumn names:", list(df.columns))

# Check if the first row contains file info
if df.iloc[0, 0] == 'Nadmo_cleaned_refined.csv':
    # The first row is not data but contains file info, skip it
    df = df.iloc[1:].reset_index(drop=True)
    print("\nRemoved first row with file info")

# Now assign column names based on the actual data structure
# From your output, we can see:
# Column 0: date (16/7/2021, 2019-11-02 00:00:00, etc.)
# Column 1: disaster_type (DOMESTIC FIRE, WINDSTORM, etc.)
# Column 2: id (1, 1, 1, etc.)
# Column 3: location (ESIAMA, AGONA NKWANTA, etc.)
# Column 4: severity_index (1, 1, 1, etc.)
df.columns = ['date', 'disaster_type', 'id', 'location', 'severity_index']

# Display the first few rows to verify
print("\nFirst 5 rows after column assignment:")
print(df.head())

# Check data types
print("\nData types:")
print(df.dtypes)

# Convert date to datetime
def parse_date(date_str):
    try:
        return pd.to_datetime(date_str, errors='coerce')
    except:
        return pd.NaT

df['date'] = df['date'].apply(parse_date)

# Extract temporal features
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['day_of_week'] = df['date'].dt.dayofweek

# Check for any date parsing issues
print(f"\nNumber of invalid dates: {df['date'].isna().sum()}")

# Standardize disaster types
df['disaster_type'] = df['disaster_type'].str.upper().str.strip()

# Create a comprehensive mapping dictionary
disaster_mapping = {
    'RAINSTORM': 'RAIN_STORM',
    'RAINSTROM': 'RAIN_STORM',
    'FLOODING': 'FLOOD',
    'WINDSTORM': 'WIND_STORM',
    'DOMESTIC FIRE': 'FIRE',
    'COMMERCIAL FIRE': 'FIRE',
    'BUSH FIRE': 'FIRE',
    'MAN MADE': 'MAN_MADE',
    'MAN-MADE': 'MAN_MADE',
    'TIDAL WAVES': 'TIDAL_WAVE',
    'TIDAL WAVE': 'TIDAL_WAVE',
    'PEST&INSECT INFESTATION': 'PEST_INFESTATION',
    'INSECT/PESTICIDE': 'PEST_INFESTATION',
    'GAS EXPLOSION': 'EXPLOSION',
    'CHEEMICAL EXPLOSION': 'EXPLOSION',
    'ROAD ACCIDENT': 'ACCIDENT',
    'BUILDING COLLAPSE': 'COLLAPSE',
    'LANDSLIDE': 'LANDSLIDE',
    'EPIDEMICS (AVIAN FLU)': 'EPIDEMIC',
    'BIRD FLU': 'EPIDEMIC',
    'DROWNING': 'DROWNING',
    'GALAMSEY PIT COLLAPSE': 'COLLAPSE',
    'LIGHTENING': 'LIGHTNING',
    'RAINSTROM': 'RAIN_STORM',
    'FLOODING': 'FLOOD',
    'MAN MADE ': 'MAN_MADE',
    'DOMESTIC FIRE  ': 'FIRE',
    'RAINSTORM  ': 'RAIN_STORM',
    'FLOOD  ': 'FLOOD',
    'WINDSTORM  ': 'WIND_STORM',
    'TIDAL WAVES': 'TIDAL_WAVE',
    'PEST&INSECT INFESTATION': 'PEST_INFESTATION',
    'INSECT/PESTICIDE': 'PEST_INFESTATION',
    'GAS EXPLOSION': 'EXPLOSION',
    'CHEEMICAL EXPLOSION': 'EXPLOSION',
    'ROAD ACCIDENT': 'ACCIDENT',
    'BUILDING COLLAPSE': 'COLLAPSE',
    'LANDSLIDE': 'LANDSLIDE',
    'EPIDEMICS (AVIAN FLU)': 'EPIDEMIC',
    'BIRD FLU': 'EPIDEMIC',
    'DROWNING': 'DROWNING',
    'GALAMSEY PIT COLLAPSE': 'COLLAPSE',
    'drown': 'DROWNING',
    'domestic fire': 'FIRE',
    'Rainstorm': 'RAIN_STORM',
    'Rainstorm  ': 'RAIN_STORM',
    'Flood': 'FLOOD',
    'Flood  ': 'FLOOD',
    'domestic fire': 'FIRE',
    'MAN MADE(building collapse)': 'COLLAPSE',
    'MAN MADE(DROWNING)': 'DROWNING',
    'flood': 'FLOOD',
    'flooding': 'FLOOD',
    'domestic fire': 'FIRE',
    'rainstorm': 'RAIN_STORM',
    'windstorm': 'WIND_STORM',
    'tidal wave': 'TIDAL_WAVE',
    'man made': 'MAN_MADE',
    'building collapse': 'COLLAPSE',
    'landslide': 'LANDSLIDE',
    'epidemics (avian flu)': 'EPIDEMIC',
    'bird flu': 'EPIDEMIC',
    'drowning': 'DROWNING',
    'galamsey pit collapse': 'COLLAPSE',
    'road accident': 'ACCIDENT',
    'chemical explosion': 'EXPLOSION',
    'gas explosion': 'EXPLOSION',
    'pest&insect infestation': 'PEST_INFESTATION',
    'insect/pesticide': 'PEST_INFESTATION',
    'pest infestation': 'PEST_INFESTATION',
    'lightening': 'LIGHTNING',
    'bush fire': 'FIRE',
    'commercial fire': 'FIRE',
    'tidal waves': 'TIDAL_WAVE',
    'man-made': 'MAN_MADE',
    'man made': 'MAN_MADE'
}

# Apply the mapping
df['disaster_type'] = df['disaster_type'].replace(disaster_mapping)

# Handle missing locations
df['location'] = df['location'].fillna('UNKNOWN')

# Check for any remaining issues
print(f"\nNumber of rows: {len(df)}")
print(f"Number of unique disaster types: {df['disaster_type'].nunique()}")
print(f"Number of unique locations: {df['location'].nunique()}")

# Display unique disaster types
print("\nUnique disaster types:")
print(df['disaster_type'].value_counts())

# Display sample of processed data
print("\nSample of processed data:")
print(df.head())

# Save cleaned data
df.to_csv('../data/processed/cleaned_disaster_data.csv', index=False)
print("\nData cleaning complete and saved to '../data/processed/cleaned_disaster_data.csv'")

First 10 rows of the dataset:
                     0              1               2                 3  \
0                 Date  disaster_type  Incident_Count          Location   
1            16/7/2021  DOMESTIC FIRE               1            ESIAMA   
2  2019-11-02 00:00:00      WINDSTORM               1     AGONA NKWANTA   
3             21/03/19      WINDSTORM               1           MPATASE   
4             22/03/19      WINDSTORM               1             ABURA   
5             23/03/19      WINDSTORM               1           AKWIDAA   
6              25/4/19      WINDSTORM               1  BONWIRE JUNCTION   
7  2019-01-05 00:00:00      WINDSTORM               1            AHOBRE   
8  2019-01-05 00:00:00      WINDSTORM               1            ESIAMA   
9              21/5/19      WINDSTORM               1            ESIAMA   

                4  
0  Severity_Index  
1               1  
2               1  
3               1  
4               1  
5               1  
6   

  return pd.to_datetime(date_str, errors='coerce')



Number of invalid dates: 10

Number of rows: 633
Number of unique disaster types: 21
Number of unique locations: 475

Unique disaster types:
disaster_type
FIRE                            201
FLOOD                           164
RAIN_STORM                      116
WIND_STORM                       88
MAN_MADE                         22
TIDAL_WAVE                        8
COLLAPSE                          5
ACCIDENT                          5
LANDSLIDE                         4
DROWNING                          3
EPIDEMIC                          3
MAN MADE(BUILDING COLLAPSE)       2
PEST_INFESTATION                  2
EXPLOSION                         2
COMERCIAL FIRE                    2
DISASTER_TYPE                     1
PEST INFESTATION                  1
LIGHTNING                         1
INDUSTRIAL FIRE                   1
DROWN                             1
MAN MADE (BUILDING COLLAPSE)      1
Name: count, dtype: int64

Sample of processed data:
        date  disaster_type        