# Street View Analysis: Sex Ratios and Infrastructure Quality

This notebook analyzes Label Studio annotations from Google Street View images across Mumbai, Delhi, and Navi Mumbai.

In [None]:
import json
import re
from pathlib import Path
from typing import Optional
from urllib.parse import unquote

import folium
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from folium.plugins import HeatMap
from scipy import stats

# Notebook-wide display defaults: show every DataFrame column when rendering,
# and use seaborn's white-grid theme for all plots.
pd.options.display.max_columns = None
sns.set_theme(style="whitegrid")

## Section 1: Data Loading & Parsing

In [None]:
# Input/output locations, all relative to the directory the notebook runs in.
DATA_DIR = Path("..") / "data"
LABELSTUDIO_DIR = Path("..") / "labelstudio"
OUTPUT_DIR = DATA_DIR / "analysis"
# parents=True so a missing `data/` directory does not abort the run here.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# glob() yields matches in arbitrary, filesystem-dependent order; sort so the
# export that gets loaded is the same on every run and machine.
ls_files = sorted(LABELSTUDIO_DIR.glob("google_hires_export_228611_*.json"))
print(f"Found {len(ls_files)} Label Studio export file(s)")

if not ls_files:
    raise FileNotFoundError(
        f"No Label Studio export matching 'google_hires_export_228611_*.json' "
        f"in {LABELSTUDIO_DIR.resolve()}"
    )

# NOTE(review): with several exports present, the last one in name order is
# loaded, assuming the filename suffix sorts chronologically -- confirm.
export_path = ls_files[-1]
print(f"Loading {export_path.name}")

with open(export_path, encoding="utf-8") as f:
    ls_data = json.load(f)

print(f"Total annotated tasks: {len(ls_data)}")

In [None]:
def extract_location_id(image_path: str) -> Optional[str]:
    """Return the ``loc_<digits>`` identifier embedded in an image path.

    The image file name is expected to look like ``loc_02513_h000_p+00.jpg``
    and may be preceded by any directory or URL prefix.

    Args:
        image_path: Path or URL of the annotated image. Percent-encoded
            characters (e.g. ``%2B`` for ``+``) are decoded before matching.

    Returns:
        The location id (e.g. ``"loc_02513"``), or ``None`` when the input is
        empty, not a string, or does not contain a matching file name.
    """
    if not image_path or not isinstance(image_path, str):
        return None
    # A path taken from a URL can carry the pitch sign as "%2B"; decode it so
    # those images are not silently dropped from the analysis.
    match = re.search(r"(loc_\d+)_h\d+_p[+-]?\d+\.jpg", unquote(image_path))
    return match.group(1) if match else None


def parse_taxonomy_value(result_list: list, field_name: str) -> Optional[str]:
    """Return the selected taxonomy label for one annotation field.

    Args:
        result_list: The ``result`` entries of an annotation; each entry is a
            dict with a ``from_name`` and a ``value`` holding a ``taxonomy``
            list of label paths.
        field_name: The ``from_name`` of the field to look up.

    Returns:
        The first element of the first label path of the first matching entry
        that has a non-empty selection, or ``None`` when there is none.
    """
    for result in result_list or []:
        if not isinstance(result, dict) or result.get("from_name") != field_name:
            continue
        # `value` (or `taxonomy`) may be present but null; treat as no selection.
        taxonomy = (result.get("value") or {}).get("taxonomy") or []
        if taxonomy and taxonomy[0]:
            return taxonomy[0][0]
    return None


def convert_count_to_numeric(val: str, *, over_ten_value: float = 15.0) -> Optional[float]:
    """Convert an annotated count label to a number.

    Args:
        val: Count label such as ``"0"``, ``"3"`` or ``">10"``. Surrounding
            whitespace is ignored.
        over_ten_value: Number substituted for the open-ended ``">10"``
            category. The default of 15.0 is an imputed stand-in, not a true
            midpoint (the category has no upper bound).

    Returns:
        The count as a float, or ``None`` when ``val`` is ``None`` or cannot
        be interpreted as a number.
    """
    if val is None:
        return None
    if isinstance(val, str):
        val = val.strip()
        if val == ">10":
            return float(over_ten_value)
    try:
        return float(val)
    except (TypeError, ValueError):
        # TypeError covers non-string, non-numeric inputs such as lists.
        return None

In [None]:
# Taxonomy fields read from every annotation; each becomes one column.
FIELDS = [
    "women_count",
    "men_count",
    "women_twowheeler",
    "men_twowheeler",
    "potholes",
    "litter",
    "footpath",
    "lane_markings",
    "land_use",
    "bus_station",
    "railway_station",
    "street_vendor",
    "image_quality",
]

# Identifying columns that precede the taxonomy fields in every row.
ID_COLUMNS = [
    "task_id",
    "annotation_id",
    "location_id",
    "image_path",
    "annotator_email",
    "created_at",
]

# Flatten tasks -> annotations into one row per (non-cancelled) annotation.
rows = []
for task in ls_data:
    image_path = (task.get("data") or {}).get("image") or ""
    location_id = extract_location_id(image_path)

    # Skip tasks whose image name does not carry a location id.
    if not location_id:
        continue

    for ann in task.get("annotations") or []:
        if ann.get("was_cancelled"):
            continue

        # `completed_by` may be a user dict or a bare user id depending on how
        # the export was produced; only a dict can provide an email.
        completed_by = ann.get("completed_by")
        annotator_email = completed_by.get("email") if isinstance(completed_by, dict) else None

        result_list = ann.get("result") or []
        row = {
            "task_id": task["id"],
            "annotation_id": ann["id"],
            "location_id": location_id,
            "image_path": image_path,
            "annotator_email": annotator_email,
            "created_at": ann.get("created_at"),
        }
        row.update({field: parse_taxonomy_value(result_list, field) for field in FIELDS})

        rows.append(row)

# Passing the columns explicitly keeps the schema stable even when no
# annotation was parsed (otherwise the summary print below raises KeyError).
annotations_df = pd.DataFrame(rows, columns=ID_COLUMNS + FIELDS)
print(f"Parsed {len(annotations_df)} annotations from {annotations_df['location_id'].nunique()} locations")
annotations_df.head()

In [None]:
# Sampled locations for all cities (provides the `city` column used below).
locations_csv = DATA_DIR / "samples" / "all_cities.csv"
locations_df = pd.read_csv(locations_csv)

print(f"Locations: {len(locations_df)} rows")
print(f"Cities: {locations_df['city'].unique()}")
locations_df.head()

In [None]:
# Coverage lookup: keep only the records flagged as having coverage.
coverage_csv = DATA_DIR / "coverage" / "random_sample_coverage.csv"
all_coverage = pd.read_csv(coverage_csv)
covered = all_coverage["has_coverage"].eq(True)
coverage_df = all_coverage.loc[covered].copy()

print(f"Coverage records with coverage: {len(coverage_df)}")
coverage_df.head()

In [None]:
# Attach location attributes and capture metadata to every annotation.
df = annotations_df.merge(locations_df, on="location_id", how="left")
df = df.merge(
    coverage_df[["location_id", "capture_date", "pano_id"]],
    on="location_id",
    how="left",
)

# A left merge duplicates annotation rows when `location_id` is not unique in
# the right-hand table, which would silently inflate every count below.
if len(df) != len(annotations_df):
    print(
        f"WARNING: merging changed the row count from {len(annotations_df)} to {len(df)}; "
        "location_id is duplicated in the locations or coverage table"
    )

# Numeric versions of the count labels. to_numeric guarantees a float column
# (NaN for missing) even when every value in a column is missing.
for count_col in ["women_count", "men_count", "women_twowheeler", "men_twowheeler"]:
    df[f"{count_col}_num"] = pd.to_numeric(
        df[count_col].apply(convert_count_to_numeric), errors="coerce"
    )

# Missing counts are treated as zero for the total.
df["total_people"] = df["women_count_num"].fillna(0) + df["men_count_num"].fillna(0)

# Women per man. Undefined (NaN) when the men count is missing or zero, so
# images showing only women do not contribute to any ratio statistic.
men_present = df["men_count_num"].where(df["men_count_num"] > 0)
df["sex_ratio"] = df["women_count_num"] / men_present

# NOTE(review): assumes capture_date is a string starting with "YYYY-MM" -- confirm.
df["capture_year"] = df["capture_date"].str[:4]
df["capture_month"] = df["capture_date"].str[5:7]

print(f"Merged dataset: {len(df)} rows")
print(f"Missing city: {df['city'].isna().sum()}")
print(f"Missing capture_date: {df['capture_date'].isna().sum()}")
df.head()

In [None]:
# Persist the merged per-annotation table for reuse outside the notebook.
combined_csv = OUTPUT_DIR / "combined_annotations.csv"
df.to_csv(combined_csv, index=False)
print(f"Saved combined dataset to {OUTPUT_DIR / 'combined_annotations.csv'}")

## Section 2: Location Mapping

In [None]:
# One row per location that has coordinates (first annotation wins).
map_df = df.dropna(subset=["lat", "lon"]).drop_duplicates(subset=["location_id"])

if map_df.empty:
    raise ValueError("No annotated locations with coordinates to map")

center_lat = map_df["lat"].mean()
center_lon = map_df["lon"].mean()

city_colors = {"Mumbai": "blue", "Delhi": "red", "Navi Mumbai": "green"}
# Locations whose city is missing or not listed above are still drawn.
fallback_color = "gray"

m = folium.Map(location=[center_lat, center_lon], zoom_start=10)
# The mean coordinate of several distant cities lies between them, so at a
# fixed zoom the initial view can show no marker at all; fit to the data.
m.fit_bounds(
    [
        [float(map_df["lat"].min()), float(map_df["lon"].min())],
        [float(map_df["lat"].max()), float(map_df["lon"].max())],
    ]
)

for location_id, lat, lon, city in zip(
    map_df["location_id"], map_df["lat"], map_df["lon"], map_df["city"]
):
    folium.CircleMarker(
        location=[float(lat), float(lon)],
        radius=3,
        color=city_colors.get(city, fallback_color),
        fill=True,
        fill_opacity=0.6,
        popup=f"{location_id}: {city}",
    ).add_to(m)

m.save(str(OUTPUT_DIR / "annotated_locations_map.html"))
print(f"Saved map to {OUTPUT_DIR / 'annotated_locations_map.html'}")
m

In [None]:
# `map_df` keeps only the first annotation per location, so its `sex_ratio`
# is that single annotation's value. Weight each location by the mean ratio
# over all of its annotations instead.
location_ratio = (
    pd.to_numeric(df["sex_ratio"], errors="coerce").groupby(df["location_id"]).mean()
)
sr_df = map_df.assign(sex_ratio=map_df["location_id"].map(location_ratio)).dropna(
    subset=["sex_ratio"]
)

m2 = folium.Map(location=[center_lat, center_lon], zoom_start=10)

# Rows of [lat, lon, weight] as plain Python floats.
heat_data = sr_df[["lat", "lon", "sex_ratio"]].to_numpy(dtype=float).tolist()
HeatMap(heat_data, radius=15, blur=10, max_zoom=13).add_to(m2)

# Zoom to the plotted points; the shared centre lies between distant cities.
if not sr_df.empty:
    m2.fit_bounds(
        [
            [float(sr_df["lat"].min()), float(sr_df["lon"].min())],
            [float(sr_df["lat"].max()), float(sr_df["lon"].max())],
        ]
    )

m2.save(str(OUTPUT_DIR / "sex_ratio_heatmap.html"))
print(f"Saved heatmap to {OUTPUT_DIR / 'sex_ratio_heatmap.html'}")
m2

## Section 3: Sex Ratio Analysis

In [None]:
# Per-city sample sizes, mean counts and sex-ratio summary (3 decimals).
city_agg_spec = {
    "n_annotations": ("annotation_id", "count"),
    "n_locations": ("location_id", "nunique"),
    "women_mean": ("women_count_num", "mean"),
    "men_mean": ("men_count_num", "mean"),
    "sex_ratio_mean": ("sex_ratio", "mean"),
    "sex_ratio_median": ("sex_ratio", "median"),
}
by_city = df.groupby("city")
city_stats = by_city.agg(**city_agg_spec).round(3)
city_stats

In [None]:
# Two panels: mean sex ratio per city (bars) and its distribution (boxes).
city_order = ["Delhi", "Mumbai", "Navi Mumbai"]
plot_df = df.loc[df["city"].isin(city_order)].copy()
city_means = plot_df.groupby("city")["sex_ratio"].mean().reindex(city_order)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax1, ax2 = axes

# Left panel: city means.
ax1.bar(city_order, city_means.values, color=["#e74c3c", "#3498db", "#2ecc71"])
ax1.set_ylabel("Mean Sex Ratio (Women/Men)")
ax1.set_title("Sex Ratio by City")

# Right panel: per-annotation distribution.
sns.boxplot(data=plot_df, x="city", y="sex_ratio", order=city_order, ax=ax2)
ax2.set_ylabel("Sex Ratio (Women/Men)")
ax2.set_title("Sex Ratio Distribution by City")

# Parity reference line on both panels.
for panel in (ax1, ax2):
    panel.axhline(y=1.0, color="gray", linestyle="--", alpha=0.7)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / "sex_ratio_by_city.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# Same summary as the city table, grouped by OSM road type (3 decimals).
road_agg_spec = {
    "n_annotations": ("annotation_id", "count"),
    "women_mean": ("women_count_num", "mean"),
    "men_mean": ("men_count_num", "mean"),
    "sex_ratio_mean": ("sex_ratio", "mean"),
    "sex_ratio_median": ("sex_ratio", "median"),
}
by_road_type = df.groupby("osm_type")
road_stats = by_road_type.agg(**road_agg_spec).round(3)
road_stats

In [None]:
# Box plot of the sex ratio for the four road classes, in the order listed.
road_order = ["primary", "secondary", "tertiary", "residential"]
in_road_order = df["osm_type"].isin(road_order)
road_df = df.loc[in_road_order].copy()

fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=road_df, x="osm_type", y="sex_ratio", order=road_order, ax=ax)
ax.set_xlabel("Road Type")
ax.set_ylabel("Sex Ratio (Women/Men)")
ax.set_title("Sex Ratio by Road Type")
# Parity reference line.
ax.axhline(y=1.0, color="gray", linestyle="--", alpha=0.7)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / "sex_ratio_by_road_type.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# Restrict to annotations with a known capture date, then summarise by year.
has_capture_date = df["capture_date"].notna()
temporal_df = df.loc[has_capture_date].copy()

year_agg_spec = {
    "n_annotations": ("annotation_id", "count"),
    "sex_ratio_mean": ("sex_ratio", "mean"),
}
year_stats = temporal_df.groupby("capture_year").agg(**year_agg_spec).round(3)
print("Sex ratio by capture year:")
year_stats

In [None]:
# Annotation count and mean sex ratio per capture month, ordered by month.
month_agg_spec = {
    "n_annotations": ("annotation_id", "count"),
    "sex_ratio_mean": ("sex_ratio", "mean"),
}
by_month = temporal_df.groupby("capture_month")
month_stats = by_month.agg(**month_agg_spec).round(3).sort_index()
print("Sex ratio by capture month:")
month_stats

In [None]:
# One array of defined sex ratios per city; cities without any are left out.
valid_ratios = df.dropna(subset=["sex_ratio"])
groups = [grp["sex_ratio"].values for _, grp in valid_ratios.groupby("city")]

# NOTE(review): the test treats annotations as independent observations;
# several annotations can share a location -- confirm this is acceptable.
if len(groups) >= 2:
    try:
        stat, pvalue = stats.kruskal(*groups)
    except ValueError as exc:
        # Some SciPy versions raise when the data are degenerate
        # (e.g. every observation identical) instead of returning NaN.
        print(f"Kruskal-Wallis test could not be computed: {exc}")
    else:
        print("Kruskal-Wallis test for sex ratio across cities:")
        print(f"  H-statistic: {stat:.4f}")
        print(f"  p-value: {pvalue:.4e}")
else:
    print("Not enough groups for statistical test")

## Section 4: Infrastructure Analysis

In [None]:
# Inspect the raw labels (including missing) of each infrastructure field.
infra_fields = ["potholes", "litter", "footpath", "lane_markings"]

for field in infra_fields:
    label_counts = df[field].value_counts(dropna=False)
    print(f"\n{field} value counts:")
    print(label_counts)

In [None]:
def is_positive(val):
    """Tell whether a label denotes presence of a feature.

    Returns ``None`` for a missing value; otherwise ``True`` when the
    lower-cased label is one of the known "present" labels, else ``False``.
    """
    if not pd.isna(val):
        present_labels = ("yes", "visible", "good condition", "poor condition")
        return val.lower() in present_labels
    return None


def _presence_flag(labels, positive_labels):
    """Return 1.0 where the label is in `positive_labels`, 0.0 for any other
    label, and NaN where the label is missing."""
    # A float column (rather than Python bool/None objects) keeps the dtype
    # independent of whether missing values occur, so the later `mean` and
    # `corr` calls always operate on numeric data.
    return labels.isin(positive_labels).astype(float).where(labels.notna())


# Labels counted as "feature present" for each field (exact, case-sensitive).
df["has_potholes"] = _presence_flag(df["potholes"], ["Yes"])
df["has_litter"] = _presence_flag(df["litter"], ["Yes"])
# A footpath in any condition counts as present.
df["has_footpath"] = _presence_flag(df["footpath"], ["Good condition", "Poor condition"])
df["has_lane_markings"] = _presence_flag(df["lane_markings"], ["Yes", "Visible"])

In [None]:
# Share of annotations (in %) showing each feature, per city. Column names
# drop the "has_" prefix.
city_flag_columns = ["has_potholes", "has_litter", "has_footpath", "has_lane_markings"]
infra_city = pd.DataFrame(
    {
        flag.replace("has_", ""): df.groupby("city")[flag].mean() * 100
        for flag in city_flag_columns
    }
)

print("Infrastructure prevalence by city (% with feature):")
infra_city.round(1)

In [None]:
# Share of annotations (in %) showing each feature, per road type, displayed
# in the fixed `road_order`.
road_flag_columns = ["has_potholes", "has_litter", "has_footpath", "has_lane_markings"]
infra_road = pd.DataFrame(
    {
        flag.replace("has_", ""): df.groupby("osm_type")[flag].mean() * 100
        for flag in road_flag_columns
    }
)

print("Infrastructure prevalence by road type (% with feature):")
infra_road.reindex(road_order).round(1)

In [None]:
# Pairwise correlation between the four infrastructure flags, as a heatmap.
infra_cols = ["has_potholes", "has_litter", "has_footpath", "has_lane_markings"]
labels = ["Potholes", "Litter", "Footpath", "Lane Markings"]
infra_corr = df[infra_cols].corr()

heatmap_options = dict(
    annot=True,
    cmap="RdYlBu_r",
    center=0,
    xticklabels=labels,
    yticklabels=labels,
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(infra_corr, ax=ax, **heatmap_options)
ax.set_title("Infrastructure Features Correlation")
plt.tight_layout()
plt.savefig(OUTPUT_DIR / "infrastructure_correlation.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# Row-normalised land-use mix per city, expressed in percent.
print("\nLand use distribution by city:")
city_land_use_share = pd.crosstab(index=df["city"], columns=df["land_use"], normalize="index")
land_use_city = city_land_use_share * 100
land_use_city.round(1)

In [None]:
# Row-normalised land-use mix per road type, in percent, shown in `road_order`.
print("\nLand use distribution by road type:")
road_land_use_share = pd.crosstab(index=df["osm_type"], columns=df["land_use"], normalize="index")
land_use_road = road_land_use_share * 100
land_use_road.reindex(road_order).round(1)

In [None]:
# Per-city summary table written to disk.
summary = df.groupby("city").agg(
    n_annotations=("annotation_id", "count"),
    n_locations=("location_id", "nunique"),
    women_mean=("women_count_num", "mean"),
    women_std=("women_count_num", "std"),
    men_mean=("men_count_num", "mean"),
    men_std=("men_count_num", "std"),
    sex_ratio_mean=("sex_ratio", "mean"),
    sex_ratio_std=("sex_ratio", "std"),
    pct_with_footpath=("has_footpath", "mean"),
    pct_with_litter=("has_litter", "mean"),
    pct_with_potholes=("has_potholes", "mean"),
    pct_with_lane_markings=("has_lane_markings", "mean"),
)

# The mean of a presence flag is a 0-1 fraction; scale the `pct_` columns to
# real percentages so they match their names and the prevalence tables.
pct_columns = [col for col in summary.columns if col.startswith("pct_")]
summary[pct_columns] = summary[pct_columns] * 100
summary = summary.round(3)

summary.to_csv(OUTPUT_DIR / "city_summary.csv")
print(f"Saved city summary to {OUTPUT_DIR / 'city_summary.csv'}")
summary

In [None]:
# Per-road-type summary table written to disk.
road_summary = df.groupby("osm_type").agg(
    n_annotations=("annotation_id", "count"),
    n_locations=("location_id", "nunique"),
    women_mean=("women_count_num", "mean"),
    men_mean=("men_count_num", "mean"),
    sex_ratio_mean=("sex_ratio", "mean"),
    pct_with_footpath=("has_footpath", "mean"),
    pct_with_litter=("has_litter", "mean"),
    pct_with_potholes=("has_potholes", "mean"),
    pct_with_lane_markings=("has_lane_markings", "mean"),
)

# The mean of a presence flag is a 0-1 fraction; scale the `pct_` columns to
# real percentages so they match their names and the prevalence tables.
pct_columns = [col for col in road_summary.columns if col.startswith("pct_")]
road_summary[pct_columns] = road_summary[pct_columns] * 100
road_summary = road_summary.round(3)

road_summary.to_csv(OUTPUT_DIR / "road_type_summary.csv")
print(f"Saved road type summary to {OUTPUT_DIR / 'road_type_summary.csv'}")
road_summary

In [None]:
# Closing report: dataset size, then every file found in the output directory.
closing_lines = [
    "\n=== Analysis Complete ===",
    f"\nTotal annotations analyzed: {len(df)}",
    f"Unique locations: {df['location_id'].nunique()}",
    f"\nOutput files saved to: {OUTPUT_DIR.resolve()}",
]
for line in closing_lines:
    print(line)

for f in OUTPUT_DIR.glob("*"):
    print(f"  - {f.name}")