<a href="https://colab.research.google.com/github/floranuta/Data_Circle/blob/Tetiana/notebooks/Task13_GeospatialAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Task 1.3: Geospatial Analysis
- **Ticket 1.3.1**: Create maps of water pump locations colored by functionality status
  - Use geopandas/folium to visualize pump locations on Tanzania map
  - Analyze geographic clusters of functional/non-functional pumps
  
- **Ticket 1.3.2**: Analyze regional patterns in water pump functionality
  - Create visualizations showing functionality rates by region/district
  - Identify areas with unusually high failure rates
  
- **Ticket 1.3.3**: Investigate relationships between geography and other features
  - Analyze how water source types vary by region
  - Explore relationships between elevation (gps_height) and functionality
  
- **Ticket 1.3.4**: Create geospatial features
  - Calculate distances to nearest city/population center if data available
  - Generate region-level aggregated statistics


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd

In [3]:
# Mount Google Drive so the CSV files stored there are reachable under /content/drive.
# NOTE: google.colab is only importable inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')
# Path of the file to read
# (alternative local Windows path, kept for reference)
#csv_file_path = "D:/REDI/Data_Circle/data/training_set_values.csv"
csv_file_path = "/content/drive/MyDrive/Colab Notebooks/training_set_values.csv"
# Read the training-set feature values into the DataFrame `pump_data`
pump_data = pd.read_csv(csv_file_path)

Mounted at /content/drive


In [4]:
# Load the training-set labels; they are joined to the feature rows on "id" in a later cell.
pump_data_labels=pd.read_csv("/content/drive/MyDrive/Colab Notebooks/training_set_labels.csv")

In [5]:
# --- Remove rows whose coordinates are missing or outside the bounding box ---
# Bounding box used by this notebook as "plausibly inside Tanzania"
# (degrees; both ends inclusive).
LAT_MIN, LAT_MAX = -12, 0
LON_MIN, LON_MAX = 29, 41

# `between` is inclusive on both ends and evaluates to False for NaN, so the
# negation flags both missing and out-of-range coordinates as invalid.
mask_invalid = ~(
    pump_data["latitude"].between(LAT_MIN, LAT_MAX)
    & pump_data["longitude"].between(LON_MIN, LON_MAX)
)

invalid_rows = pump_data[mask_invalid]
print(f"Number of invalid rows: {len(invalid_rows)}")
# Reassign instead of dropping in place so the cell can be re-run safely.
pump_data = pump_data.loc[~mask_invalid]
print(f"After deletion: {len(pump_data)} rows remain")

# --- Drop columns that are not used in the rest of the analysis ---
cols_to_drop = [
    "wpt_name", "waterpoint_type_group", "source_class", "source",
    "quantity_group", "quality_group", "payment_type", "management_group",
    "extraction_type_group", "extraction_type_class", "scheme_name",
    "recorded_by", "public_meeting", "num_private", "date_recorded",
]
# errors="ignore": on a re-run the columns are already gone; without it the
# second execution of this cell would raise KeyError.
pump_data = pump_data.drop(columns=cols_to_drop, errors="ignore")

# --- Attach the labels to the cleaned feature rows ---
# Left join keeps every cleaned pump row even if no label row matches its id.
pump_data_merged = pd.merge(pump_data, pump_data_labels, on="id", how="left")

Number of invalid rows: 1812
After deletion: 57588 rows remain


In [6]:
# Sanity check: count the rows that have a missing latitude or longitude.
mask_nan = pump_data[["latitude", "longitude"]].isna().any(axis=1)
nan_rows = pump_data.loc[mask_nan]

print("Rows with NaN latitude or longitude:", len(nan_rows))
display(nan_rows.head())

Rows with NaN latitude or longitude: 0


Unnamed: 0,id,amount_tsh,funder,gps_height,installer,longitude,latitude,basin,subvillage,region,...,scheme_management,permit,construction_year,extraction_type,management,payment,water_quality,quantity,source_type,waterpoint_type


In [7]:
# Sanity check: count the rows where latitude or longitude is exactly 0.
mask_zero = pump_data[["latitude", "longitude"]].eq(0).any(axis=1)
zero_rows = pump_data.loc[mask_zero]

print("Rows with 0 latitude or longitude:", len(zero_rows))
display(zero_rows.head())

Rows with 0 latitude or longitude: 0


Unnamed: 0,id,amount_tsh,funder,gps_height,installer,longitude,latitude,basin,subvillage,region,...,scheme_management,permit,construction_year,extraction_type,management,payment,water_quality,quantity,source_type,waterpoint_type


In [8]:
# Find pump ids that appear in pump_data but have no row in pump_data_labels.
has_label = pump_data["id"].isin(pump_data_labels["id"])
missing_ids = pump_data.loc[~has_label, "id"]

print("Number of pump_data IDs without label:", len(missing_ids))
print(missing_ids.head())

Number of pump_data IDs without label: 0
Series([], Name: id, dtype: int64)


In [9]:
pip install geopandas folium pandas



In [13]:
import pandas as pd
import geopandas as gpd
import folium

# Country polygons downloaded from datahub.io. The result is not used by the
# rest of this cell. NOTE(review): the file name suggests it holds all
# countries, not only Tanzania — confirm before relying on `tanzania`.
tanzania = gpd.read_file("https://datahub.io/core/geo-countries/r/countries.geojson")

# Marker colour per status label. The labels present in the data are
# 'functional', 'non functional' and 'functional needs repair'; without the
# exact "functional needs repair" key those pumps fell through to gray.
color_map = {
    "functional": "green",
    "non functional": "red",
    "functional needs repair": "yellow",
    "needs repair": "yellow",
    "need to be repaired": "yellow"
}

# Centre the map on the mean pump coordinate.
center_lat = pump_data_merged["latitude"].mean()
center_lon = pump_data_merged["longitude"].mean()
m = folium.Map(location=[center_lat, center_lon], zoom_start=6, tiles="OpenStreetMap")

# Add one circle marker per pump. Iterating over the three columns directly
# avoids building a Series per row, which is what iterrows() does.
for lat, lon, status in zip(
    pump_data_merged["latitude"],
    pump_data_merged["longitude"],
    pump_data_merged["status_group"],
):
    folium.CircleMarker(
        location=[lat, lon],
        radius=2.5,
        # Unknown or missing labels fall back to gray.
        color=color_map.get(str(status).lower(), "gray"),
        fill=True,
        fill_opacity=0.7,
        weight=0
    ).add_to(m)


In [14]:
# Write the point map to a standalone HTML file in the current working directory.
m.save("waterpumps_points.html")

In [None]:
# Render the folium map inline in the notebook output.
from IPython.display import display
display(m)

In [16]:
# Download the saved HTML map from the Colab runtime to the local machine.
# NOTE: google.colab is only importable inside Google Colab.
from google.colab import files
files.download("waterpumps_points.html")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [23]:
# List the distinct status labels; colour lookups keyed on status must use these exact strings.
pump_data_merged["status_group"].unique()

array(['functional', 'non functional', 'functional needs repair'],
      dtype=object)

In [None]:
# Choropleth of the share of functional pumps per region, with an optional
# (initially hidden) layer showing the individual pumps.

# 1) Work on a copy of the merged frame
#    (columns used here: region, status_group, latitude, longitude)
df = pump_data_merged.copy()

# 2) Aggregate % functional by region
stats = (
    df.groupby("region")["status_group"]
      .apply(lambda s: (s == "functional").mean() * 100.0)
      .reset_index(name="functional_pct")
)

# 3) Build choropleth
GEO_PATH = "https://raw.githubusercontent.com/thadk/GeoTZ/master/TZA_adm1_mkoaTZ.geojson"  # ADM1 regions
REGION_PROP = "NAME_1"  # property in that GeoJSON
# NOTE(review): the join relies on `region` values matching NAME_1 exactly
# (spelling and case) — TODO confirm no region is left unmatched.
m = folium.Map(location=[-6, 35], zoom_start=6, tiles="OpenStreetMap")
folium.Choropleth(
    geo_data=GEO_PATH,
    data=stats,
    columns=["region", "functional_pct"],
    key_on=f"feature.properties.{REGION_PROP}",
    fill_color="YlGn",
    fill_opacity=0.8,
    line_opacity=0.2,
    legend_name="Functional pumps (%)",
).add_to(m)

# 4) Optional: overlay pump points
# The data uses the label "functional needs repair"; without that exact key
# those pumps fell through to gray.
colors = {
    "functional": "green",
    "functional needs repair": "yellow",
    "needs repair": "yellow",
    "non functional": "red",
}
points = folium.FeatureGroup(name="Waterpumps (points)", show=False)
located = df.dropna(subset=["latitude", "longitude"])
# Iterate over the columns directly instead of iterrows(), which builds a
# Series for every row.
for lat, lon, status in zip(located["latitude"], located["longitude"], located["status_group"]):
    folium.CircleMarker([float(lat), float(lon)],
                        radius=2.2,
                        color=colors.get(str(status).lower(), "gray"),
                        fill=True, fill_opacity=0.7, weight=0).add_to(points)
points.add_to(m)
folium.LayerControl(collapsed=False).add_to(m)

# 5) Display / Save
display(m)
m.save("tanzania_choropleth.html")  # open this file in your browser

In [None]:
# Requires: folium, geopandas, branca, pandas
# (pip install folium geopandas branca pandas)


import branca.colormap as cm
import json

# --- your data ---
df = pump_data_merged.copy()  # has: region, status_group, latitude, longitude

# --- GeoJSON source (ADM1 regions of Tanzania) ---
GEO_PATH    = "https://raw.githubusercontent.com/thadk/GeoTZ/master/TZA_adm1_mkoaTZ.geojson"
REGION_PROP = "NAME_1"  # region name field in that GeoJSON

# 1) % functional by region
# Normalise the labels to lower-case strings (a missing label becomes "nan").
df["status_group"] = df["status_group"].astype(str).str.lower()
stats = (
    df.groupby("region")["status_group"]
      .apply(lambda s: (s == "functional").mean() * 100.0)
      .reset_index(name="functional_pct")
)

# 2) load polygons and MERGE stats into the GeoDataFrame
# Left merge: polygons without a matching `region` keep a missing functional_pct.
# NOTE(review): matching relies on `region` equalling NAME_1 exactly — TODO confirm.
regions = gpd.read_file(GEO_PATH).to_crs("EPSG:4326")
merged = regions.merge(stats, left_on=REGION_PROP, right_on="region", how="left")

# 3) color scale (0..100) red→yellow→green, quantised into 10 steps
cmap = cm.LinearColormap(colors=["#d7191c", "#fdae61", "#1a9641"], vmin=0, vmax=100).to_step(10)
cmap.caption = "Functional pumps (%)"

# 4) map
m = folium.Map(location=[-6, 35], zoom_start=6, tiles="OpenStreetMap")

# 5) style function reading 'functional_pct' directly from merged GeoDataFrame
def style_fn(feature):
    """Return the folium style dict for one GeoJSON region feature.

    The fill colour is looked up in `cmap` from the feature's
    `functional_pct` property; features where that property is absent or
    None are filled light gray (#cccccc).
    """
    v = feature["properties"].get("functional_pct", None)
    fill = cmap(v) if v is not None else "#cccccc"
    return {"fillColor": fill, "color": "#555555", "weight": 1, "fillOpacity": 0.8}

gj = folium.GeoJson(
    data=json.loads(merged.to_json()),  # safe: serialize GeoDataFrame -> GeoJSON dict
    style_function=style_fn,
    highlight_function=lambda f: {"weight": 2, "color": "#000", "fillOpacity": 0.9},
    name="Functional % (choropleth)",
)

# hover tooltip + click popup
folium.GeoJsonTooltip(
    fields=[REGION_PROP, "functional_pct"],
    aliases=["Region", "Functional %"],
    localize=True,
    sticky=False,
).add_to(gj)

folium.GeoJsonPopup(
    fields=[REGION_PROP, "functional_pct"],
    aliases=["Region", "Functional %"],
    localize=True,
    labels=True,
).add_to(gj)

gj.add_to(m)
cmap.add_to(m)

# 6) optional points layer
# The data uses the label "functional needs repair"; without that exact key
# those pumps fell through to gray.
colors = {
    "functional": "green",
    "functional needs repair": "yellow",
    "needs repair": "yellow",
    "non functional": "red",
}
points = folium.FeatureGroup(name="Waterpumps (points)", show=False)
located = df.dropna(subset=["latitude", "longitude"])
# Iterate over the columns directly instead of iterrows(), which builds a
# Series for every row. status_group is already lower-cased in step 1.
for lat, lon, status in zip(located["latitude"], located["longitude"], located["status_group"]):
    folium.CircleMarker(
        [float(lat), float(lon)],
        radius=2.2,
        color=colors.get(status, "gray"),
        fill=True, fill_opacity=0.7, weight=0
    ).add_to(points)
points.add_to(m)

folium.LayerControl(collapsed=False).add_to(m)

# display / save
display(m)
m.save("tanzania_choropleth_new.html")
