# Geospatial Flood Prediction Data Analysis

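This notebook demonstrates a geospatial flood-risk prediction workflow in Python. Using a small hand-written sample and synthetic training data, it builds a GeoDataFrame, visualizes the features, trains a Random Forest regressor, and predicts flood risk for new locations.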


In [None]:
# Import required libraries for geospatial flood prediction analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from shapely.geometry import Point, Polygon
import folium
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

In [None]:
# Build the small demo dataset used by the following cells
# Note: Replace with actual data sources for real analysis

# Five hand-written sample locations; every key becomes one DataFrame column
# and all lists must therefore have the same length.
sample_data = dict(
    latitude=[40.7128, 40.7589, 40.7614, 40.7505, 40.7482],
    longitude=[-74.0060, -73.9851, -73.9776, -73.9934, -73.9857],
    elevation=[10.5, 15.2, 8.7, 12.3, 9.8],
    rainfall_mm=[45.2, 38.7, 52.1, 41.3, 48.9],
    soil_type=['clay', 'sandy', 'loam', 'clay', 'sandy'],
    distance_to_water=[0.5, 1.2, 0.3, 0.8, 0.6],
    flood_risk=[0.8, 0.4, 0.9, 0.6, 0.7],
)

# Column order follows the insertion order of the dict above.
df = pd.DataFrame.from_dict(sample_data)
print("Sample flood prediction data:")
print(df.head())

In [None]:
# Create a GeoDataFrame for spatial analysis
# Shapely points are (x, y), i.e. (longitude, latitude) - not (lat, lon).
geometry = list(map(Point, df['longitude'], df['latitude']))

# WGS84 coordinate system, declared at construction time
gdf = gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')

print("GeoDataFrame created:")
print(gdf.head())

In [None]:
# Visualize the geospatial data: three feature-vs-risk scatter plots plus a
# soil-type pie chart on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# (axes, feature column, x-axis label, title, marker colour) for plots 1-3
scatter_panels = [
    (axes[0, 0], 'elevation', 'Elevation (m)', 'Elevation vs Flood Risk', 'blue'),
    (axes[0, 1], 'rainfall_mm', 'Rainfall (mm)', 'Rainfall vs Flood Risk', 'green'),
    (axes[1, 0], 'distance_to_water', 'Distance to Water (km)',
     'Distance to Water vs Flood Risk', 'red'),
]
for panel_ax, feature, x_label, panel_title, marker_colour in scatter_panels:
    panel_ax.scatter(df[feature], df['flood_risk'], c=marker_colour, alpha=0.6)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Flood Risk')
    panel_ax.set_title(panel_title)

# Plot 4: Soil Type Distribution
soil_counts = df['soil_type'].value_counts()
pie_ax = axes[1, 1]
pie_ax.pie(soil_counts.values, labels=soil_counts.index, autopct='%1.1f%%')
pie_ax.set_title('Soil Type Distribution')

plt.tight_layout()
plt.show()

In [None]:
# Create an interactive map for flood risk visualization
def create_flood_risk_map(gdf, zoom_start=12, high_threshold=0.7, medium_threshold=0.5):
    """
    Build a folium map with one colour-coded circle marker per row of `gdf`.

    Parameters:
    - gdf: (Geo)DataFrame with 'latitude', 'longitude', 'flood_risk',
      'elevation' and 'rainfall_mm' columns
    - zoom_start: initial zoom level of the map
    - high_threshold: risk strictly above this value is drawn in red
    - medium_threshold: risk strictly above this value (and not above
      high_threshold) is drawn in orange; everything else is green

    Returns:
    - folium.Map centred on the mean latitude/longitude of the rows

    Raises:
    - ValueError: if `gdf` has no rows (there is no centre to compute)
    """
    if len(gdf) == 0:
        raise ValueError("Cannot create a flood risk map from an empty GeoDataFrame")

    # Center the map on the mean coordinates
    center = [gdf['latitude'].mean(), gdf['longitude'].mean()]

    # Create base map
    m = folium.Map(location=center, zoom_start=zoom_start)

    # Iterate over just the columns that are needed instead of materialising
    # a full Series per row with iterrows().
    needed = ('latitude', 'longitude', 'flood_risk', 'elevation', 'rainfall_mm')
    for lat, lon, risk, elevation, rainfall in zip(*(gdf[col] for col in needed)):
        # Color based on flood risk level
        if risk > high_threshold:
            color = 'red'
        elif risk > medium_threshold:
            color = 'orange'
        else:
            color = 'green'

        folium.CircleMarker(
            location=[lat, lon],
            radius=10,
            popup=f"Flood Risk: {risk:.2f}<br>Elevation: {elevation}m<br>Rainfall: {rainfall}mm",
            color=color,
            fill=True,
            fillColor=color
        ).add_to(m)

    return m

# Create the map (display is opt-in, see the last line)
flood_map = create_flood_risk_map(gdf)
print("Interactive flood risk map created!")
# flood_map  # Uncomment to display in Jupyter notebook

In [None]:
# Prepare data for machine learning model
# Encode categorical variables.
# The soil type is cast to a Categorical with a fixed category list so that
# get_dummies always emits soil_clay / soil_loam / soil_sandy, even when one
# of the soil types does not occur in the data. Without this, selecting
# feature_columns below raises KeyError for the missing dummy column.
# get_dummies returns a new frame, so `df` itself is not modified.
soil_categories = ['clay', 'loam', 'sandy']
df_encoded = pd.get_dummies(
    df.assign(soil_type=pd.Categorical(df['soil_type'], categories=soil_categories)),
    columns=['soil_type'],
    prefix='soil',
)

# Select features for prediction
feature_columns = ['latitude', 'longitude', 'elevation', 'rainfall_mm',
                   'distance_to_water', 'soil_clay', 'soil_loam', 'soil_sandy']
X = df_encoded[feature_columns]
y = df_encoded['flood_risk']

print("Features for ML model:")
print(X.head())
print(f"\nTarget variable (flood_risk): {y.head().tolist()}")

In [None]:
# Train a Random Forest model for flood prediction
# Note: This is a simple example with limited data
# In practice, you would need much more training data

# For demonstration purposes, let's create more synthetic data
np.random.seed(42)
n_samples = 100

# Generate synthetic training data (continuous features first)
synthetic_data = {
    'latitude': np.random.uniform(40.7, 40.8, n_samples),
    'longitude': np.random.uniform(-74.1, -73.9, n_samples),
    'elevation': np.random.uniform(5, 20, n_samples),
    'rainfall_mm': np.random.uniform(30, 60, n_samples),
    'distance_to_water': np.random.uniform(0.1, 2.0, n_samples),
}

# Draw exactly ONE soil type per sample and one-hot encode it. Sampling the
# three indicator columns independently would produce rows with several soil
# types at once (or none), which is not a valid one-hot encoding and does not
# match the inputs built by predict_flood_risk.
soil_types = np.random.choice(['clay', 'loam', 'sandy'], n_samples)
for soil in ('clay', 'loam', 'sandy'):
    synthetic_data[f'soil_{soil}'] = (soil_types == soil).astype(int)

# Calculate synthetic flood risk based on features
# Higher rainfall, lower elevation, closer to water = higher risk.
# Each term is scaled by the sampling range above; latitude and longitude do
# not enter the formula, so they carry no signal for the model.
flood_risk_synthetic = (
    0.4 * (synthetic_data['rainfall_mm'] - 30) / 30 +
    0.3 * (20 - synthetic_data['elevation']) / 15 +
    0.2 * (2.0 - synthetic_data['distance_to_water']) / 2.0 +
    0.1 * synthetic_data['soil_clay'] +
    np.random.normal(0, 0.1, n_samples)  # Add some noise
)
flood_risk_synthetic = np.clip(flood_risk_synthetic, 0, 1)  # Ensure values are between 0 and 1

X_synthetic = pd.DataFrame(synthetic_data)
y_synthetic = flood_risk_synthetic

# Split data (80% train / 20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X_synthetic, y_synthetic, test_size=0.2, random_state=42
)

# Train Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate model on the held-out split
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")

In [None]:
# Feature importance analysis: rank the training columns by the importance
# scores exposed by the fitted Random Forest.
feature_importance = (
    pd.DataFrame({
        'feature': X_train.columns,
        'importance': rf_model.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
)

# Horizontal bar chart, most important feature on top
_, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', ax=importance_ax)
importance_ax.set_title('Feature Importance in Flood Risk Prediction')
importance_ax.set_xlabel('Importance')
plt.tight_layout()
plt.show()

print("Feature Importance:")
print(feature_importance)

In [None]:
# Function to predict flood risk for new locations
def predict_flood_risk(latitude, longitude, elevation, rainfall_mm,
                      distance_to_water, soil_type, model=None):
    """
    Predict flood risk for a given location and conditions.

    Parameters:
    - latitude: float
    - longitude: float
    - elevation: float (meters)
    - rainfall_mm: float (millimeters)
    - distance_to_water: float (kilometers)
    - soil_type: str ('clay', 'loam', 'sandy'); matched case-insensitively
      and with surrounding whitespace ignored
    - model: optional fitted regressor exposing predict(DataFrame); defaults
      to the notebook-level `rf_model` trained in an earlier cell

    Returns:
    - flood_risk: float (0-1 scale), always a built-in Python float

    Raises:
    - ValueError: if soil_type is not one of the supported soil types
    """
    known_soils = ('clay', 'loam', 'sandy')
    soil = str(soil_type).strip().lower()
    if soil not in known_soils:
        # An unrecognised soil type would otherwise be encoded as all zeros
        # and silently produce a prediction for a soil that does not exist.
        raise ValueError(
            f"Unknown soil_type {soil_type!r}; expected one of {known_soils}"
        )

    if model is None:
        model = rf_model

    # Create input data; column names and order must match the training frame
    input_data = {
        'latitude': [latitude],
        'longitude': [longitude],
        'elevation': [elevation],
        'rainfall_mm': [rainfall_mm],
        'distance_to_water': [distance_to_water],
    }
    for name in known_soils:
        input_data[f'soil_{name}'] = [1 if soil == name else 0]

    input_df = pd.DataFrame(input_data)
    prediction = model.predict(input_df)[0]

    # Clamp to [0, 1] and normalise the numpy scalar to a plain float
    return float(max(0.0, min(1.0, prediction)))

# Example prediction for a low-lying, wet, near-water clay site
example_inputs = dict(
    latitude=40.7128,
    longitude=-74.0060,
    elevation=8.0,
    rainfall_mm=50.0,
    distance_to_water=0.3,
    soil_type='clay',
)
example_risk = predict_flood_risk(**example_inputs)

# Bucket the score: > 0.7 is High, > 0.4 is Medium, anything else Low.
# NOTE(review): the map cell colours "medium" from 0.5 rather than 0.4 -
# confirm which cut-off is intended.
if example_risk > 0.7:
    risk_level = 'High'
elif example_risk > 0.4:
    risk_level = 'Medium'
else:
    risk_level = 'Low'

print(f"Example flood risk prediction: {example_risk:.3f}")
print(f"Risk level: {risk_level}")

## Summary

This notebook demonstrates a basic workflow for geospatial flood prediction analysis:

1. **Data Loading**: Build a small sample dataset (a placeholder for real geospatial and environmental data)
2. **Visualization**: Create maps and charts to understand flood risk patterns
3. **Feature Engineering**: Prepare data for machine learning
4. **Model Training**: Use Random Forest to predict flood risk
5. **Evaluation**: Assess model performance and feature importance
6. **Prediction**: Provide a function that estimates flood risk for a new location on demand

### Next Steps for Real-World Application:

- Integrate with real weather APIs and elevation data
- Add more sophisticated geospatial features (slope, drainage patterns)
- Include historical flood data for training
- Implement real-time monitoring and alerts
- Use more advanced models (e.g., neural networks, ensemble methods)
- Add temporal analysis for seasonal flood patterns
