<a href="https://colab.research.google.com/github/emmanuelokellootieno-afk/nairobi-urban-expansion-geoai/blob/main/EDA_fo_nairobi_urban_areas_per_subcounty.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Exploratory Data Analysis (EDA) for Nairobi Urban Areas per Subcounty
# This notebook analyzes the nairobi_urban_areas_per_subcounty.csv file
# Run this in Google Colab

# Install required packages if needed (seaborn for better plots)
!pip install seaborn

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set the global plot appearance used by every figure drawn later in the notebook:
# apply matplotlib's 'seaborn-v0_8' stylesheet, then select seaborn's "husl" palette.
# NOTE(review): the palette call is kept after the stylesheet call, presumably so the
# stylesheet cannot override the colour cycle — confirm before reordering.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [None]:
# Load data from Google Drive in Google Colab
# This assumes the CSV file 'nairobi_urban_areas_per_subcounty.csv' is saved in the 'Data'
# folder of your Google Drive (i.e. 'My Drive/Data/'). Adjust file_path below if it lives
# somewhere else (e.g., '/content/drive/My Drive/MyFolder/filename.csv').

# Step 1: Mount Google Drive
# (google.colab only exists inside Colab, so this import stays next to its single use)
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Load the CSV from Drive (pandas is already imported as pd in the imports cell)
file_path = '/content/drive/My Drive/Data/nairobi_urban_areas_per_subcounty.csv'  # Update path if needed
df = pd.read_csv(file_path)

# Fail fast with a clear message if the columns used by the later cells are absent,
# instead of surfacing as a bare KeyError several cells further down.
required_columns = {'Subcounty', 'Year', 'Urban_Area_km2'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(f"{file_path} is missing required column(s): {sorted(missing_columns)}")

# Verify loading
print("Data loaded successfully!")
print(f"Shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())

# Optional: Save a copy to Colab's local session for faster access
df.to_csv('nairobi_urban_areas_per_subcounty_local.csv', index=False)
print("\nLocal copy saved as 'nairobi_urban_areas_per_subcounty_local.csv'")

In [None]:
# Basic EDA: structure, completeness and summary statistics of the raw (long-format) table
section_rule = "=" * 50

print(f"\n{section_rule}")
print("BASIC INFORMATION")
print(section_rule)
print(f"Columns: {df.columns.tolist()}")
print(f"Data types:\n{df.dtypes}")
print(f"Missing values:\n{df.isnull().sum()}")
print(f"Unique subcounties: {df['Subcounty'].nunique()}")
# .tolist() converts NumPy scalars to plain Python values so the list prints as
# [2017, 2018, ...] rather than [np.int64(2017), ...] under NumPy >= 2.
print(f"Unique years: {sorted(df['Year'].unique().tolist())}")

print(f"\n{section_rule}")
print("DESCRIPTIVE STATISTICS")
print(section_rule)
print(df.describe())

In [None]:
# Pivot the data for easier analysis (wide format: years as columns, subcounties as rows).
# (Subcounty, Year) combinations absent from the CSV are filled with 0.
pivot_df = df.pivot(index='Subcounty', columns='Year', values='Urban_Area_km2').fillna(0)
print("\nPivot table shape:", pivot_df.shape)
print("\nPivot head:")
print(pivot_df.head())

# Calculate growth metrics between the first and last study years
start_year, end_year = 2017, 2024
n_years = end_year - start_year  # number of compounding periods for the CAGR (7)

# Relative growth is undefined for a baseline of 0: dividing by it would produce inf/NaN,
# which then dominates nlargest(), turns the mean into inf and makes plt.hist() raise.
# Mask zero baselines to NaN so those subcounties are simply skipped by the ratio metrics.
baseline = pivot_df[start_year].replace(0, np.nan)

pivot_df['Total_Growth_km2'] = pivot_df[end_year] - pivot_df[start_year]
pivot_df['Total_Growth_Pct'] = (pivot_df['Total_Growth_km2'] / baseline) * 100
pivot_df['Avg_Annual_Growth_Rate'] = ((pivot_df[end_year] / baseline) ** (1 / n_years) - 1) * 100  # CAGR over n_years

print("\nGrowth metrics head:")
print(pivot_df[['Total_Growth_km2', 'Total_Growth_Pct', 'Avg_Annual_Growth_Rate']].head())


In [None]:
# Overall trends: sum the urban area of all subcounties for each year
banner = "=" * 50
print(f"\n{banner}")
print("OVERALL TRENDS")
print(banner)

area_per_year = df.groupby('Year')['Urban_Area_km2'].sum()
annual_totals = area_per_year.reset_index()  # back to a two-column frame: Year, Urban_Area_km2

print("Annual total urban area (km²):")
print(annual_totals)

# Plot 1: Total urban area over time (one point per year, summed over all subcounties)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    annual_totals['Year'],
    annual_totals['Urban_Area_km2'],
    marker='o',
    linewidth=2,
    markersize=8,
)
ax.set(
    title='Total Urban Area in Nairobi Metropolitan Region (2017-2024)',
    xlabel='Year',
    ylabel='Total Urban Area (km²)',
)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Plot 2: Urban area trends for top 10 fastest-growing subcounties
top_growing = pivot_df.nlargest(10, 'Total_Growth_Pct')
# Keep only the per-year area columns; the derived growth metrics are not part of the time series
top_growing_years = top_growing.drop(columns=['Total_Growth_km2', 'Total_Growth_Pct', 'Avg_Annual_Growth_Rate'])

# Create the Axes explicitly and hand it to pandas: DataFrame.plot() opens its own figure
# unless given `ax`, which previously left the (12, 8) figure empty and ignored its size.
fig, ax = plt.subplots(figsize=(12, 8))
top_growing_years.T.plot(ax=ax, linewidth=2, marker='o')  # transpose: years on x, one line per subcounty
ax.set_title('Urban Area Growth: Top 10 Subcounties (2017-2024)')
ax.set_xlabel('Year')
ax.set_ylabel('Urban Area (km²)')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')  # legend outside the plotting area
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Plot 3: Distribution of total growth percentages
# A subcounty with a 2017 baseline of 0 has an infinite (or undefined) growth percentage;
# plt.hist() cannot derive a bin range from non-finite values, so drop them first.
finite_growth_pct = pivot_df['Total_Growth_Pct'].replace([np.inf, -np.inf], np.nan).dropna()

plt.figure(figsize=(10, 6))
plt.hist(finite_growth_pct, bins=10, edgecolor='black', alpha=0.7)
plt.title('Distribution of Total Urban Growth % Across Subcounties (2017-2024)')
plt.xlabel('Total Growth (%)')
plt.ylabel('Number of Subcounties')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Plot 4: Heatmap of urban areas by subcounty and year
# Strip the derived growth columns, then transpose so rows are years and columns are subcounties
growth_metric_cols = ['Total_Growth_km2', 'Total_Growth_Pct', 'Avg_Annual_Growth_Rate']
area_by_year = pivot_df.drop(columns=growth_metric_cols).T

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    area_by_year,
    annot=False,
    cmap='YlOrRd',
    cbar_kws={'label': 'Urban Area (km²)'},
    ax=ax,
)
ax.set_title('Heatmap: Urban Areas by Subcounty and Year')
ax.set_xlabel('Subcounty')
ax.set_ylabel('Year')
fig.tight_layout()
plt.show()

# Plot 5: Bar chart of average annual growth rates
top_rates = pivot_df.nlargest(15, 'Avg_Annual_Growth_Rate')[['Avg_Annual_Growth_Rate']]

# Create the Axes explicitly and hand it to pandas: DataFrame.plot() opens its own figure
# unless given `ax`, which previously left the (12, 8) figure empty and ignored its size.
fig, ax = plt.subplots(figsize=(12, 8))
top_rates.plot(kind='bar', ax=ax, color='skyblue', edgecolor='black')
ax.set_title('Average Annual Growth Rates: Top 15 Subcounties')
ax.set_xlabel('Subcounty')
ax.set_ylabel('Avg Annual Growth Rate (%)')
# Slant the subcounty names so the long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Correlation matrix (between years): correlate the per-year area columns across subcounties
derived_cols = ['Total_Growth_km2', 'Total_Growth_Pct', 'Avg_Annual_Growth_Rate']
yearly_areas = pivot_df.drop(columns=derived_cols)
year_corr = yearly_areas.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(year_corr, annot=True, cmap='coolwarm', center=0, square=True, ax=ax)
ax.set_title('Correlation Matrix: Urban Areas Across Years')
fig.tight_layout()
plt.show()

# Summary statistics for growth: centre, spread and the extreme subcounties
growth_pct = pivot_df['Total_Growth_Pct']
divider = "=" * 50

print(f"\n{divider}")
print("GROWTH SUMMARY")
print(divider)
print(f"Average total growth across subcounties: {growth_pct.mean():.2f}%")
print(f"Median total growth: {growth_pct.median():.2f}%")
print(f"Std dev of total growth: {growth_pct.std():.2f}%")

# Look up each extreme's index label once and reuse it for both the value and the name
fastest_name = growth_pct.idxmax()
print(f"Fastest growing subcounty: {growth_pct.loc[fastest_name]:.2f}% ({fastest_name})")
slowest_name = growth_pct.idxmin()
print(f"Slowest growing subcounty: {growth_pct.loc[slowest_name]:.2f}% ({slowest_name})")

In [None]:
# Save enhanced dataset with growth metrics (Subcounty restored as a regular column)
enhanced_csv_name = 'nairobi_urban_areas_enhanced.csv'
enhanced_table = pivot_df.reset_index()
enhanced_table.to_csv(enhanced_csv_name, index=False)
print(f"\nEnhanced dataset saved as '{enhanced_csv_name}'")