In [None]:
import os
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt

In [None]:
# Folder layout, given relative to the notebook's working directory.
DATA_RAW = "../data_set/raw_data"
DATA_PROC = "../data_set/processed_data"

# Make sure the output folder is there before any later cell writes into it.
if not os.path.exists(DATA_PROC):
    os.makedirs(DATA_PROC, exist_ok=True)

print("Ready. Raw:", DATA_RAW, "Processed:", DATA_PROC)


In [None]:
# Input: the raw POI export (GeoJSON) inside the raw-data folder.
geojson_path = os.path.join(DATA_RAW, "delhi_pois.geojson")
# Output / cache: the flattened POI table (CSV) inside the processed-data folder.
csv_path = os.path.join(DATA_PROC, "delhi_poi_clean.csv")


In [None]:
# Tag columns inspected, in priority order, to derive a POI's category.
TYPE_KEYS = ("amenity", "shop", "building")
# Column names tried, in order, when looking for a feature id.
ID_CANDIDATES = ('osmid', 'osm_id', 'osmId', 'id', '@id', 'OSM_ID')
# Projected CRS used only for centroid geometry: WGS 84 / UTM zone 43N (metres),
# the UTM zone that contains Delhi.
CENTROID_CRS = "EPSG:32643"


def extract_type(row):
    """Return the first usable category tag of one feature, or None.

    Parameters
    ----------
    row : mapping-like (dict or pandas Series)
        Feature attributes; only the keys listed in ``TYPE_KEYS`` are read.

    Returns
    -------
    The value of the first key in ``TYPE_KEYS`` that is present and is neither
    missing (None / NaN / pd.NA) nor an empty string; None if there is none.
    """
    for key in TYPE_KEYS:
        if key not in row:
            continue
        value = row[key]
        # NaN is what pandas stores for an absent attribute, so it must be
        # skipped just like None and '' or it would mask a lower-priority tag.
        if pd.isna(value) or value == '':
            continue
        return value
    return None


def _lonlat_points(gdf):
    """Return one WGS84 (lon/lat) point per feature of ``gdf``.

    Point features keep their coordinates untouched. Every other geometry is
    replaced by its centroid, computed in ``CENTROID_CRS`` because centroids
    taken directly on degrees are geometrically inaccurate.
    """
    geom = gdf.geometry
    if geom.crs is None:
        # NOTE(review): a file without CRS metadata is assumed to be lon/lat,
        # which is what the GeoJSON specification mandates - confirm for this data.
        geom = geom.set_crs(epsg=4326)
    else:
        geom = geom.to_crs(epsg=4326)

    not_point = (geom.geom_type != "Point").to_numpy()
    if not_point.any():
        geom = geom.copy()
        centroids = geom[not_point].to_crs(CENTROID_CRS).centroid.to_crs(epsg=4326)
        # Assign raw values so the write is positional, not index-aligned.
        geom[not_point] = centroids.values
    return geom


def _geojson_to_poi_table(geojson_path):
    """Read the POI GeoJSON and flatten it to columns id, type, name, lon, lat.

    Missing attributes are kept as missing values (not turned into ''), so
    lon/lat stay numeric and the frame matches what ``pd.read_csv`` returns
    when the cached CSV is loaded on a later run.
    """
    gdf = gpd.read_file(geojson_path)

    # Pick an id column if one is present, otherwise fall back to the row index.
    found_id = next((c for c in ID_CANDIDATES if c in gdf.columns), None)
    if found_id:
        print("Using id column:", found_id)
        ids = gdf[found_id].to_numpy()
    else:
        print("No id column found; using dataframe index as 'id'")
        ids = gdf.index.to_numpy()

    # Resolve the category from whichever tag columns this export contains.
    tag_cols = [key for key in TYPE_KEYS if key in gdf.columns]
    if tag_cols:
        tag_values = zip(*(gdf[col].tolist() for col in tag_cols))
        types = [extract_type(dict(zip(tag_cols, values))) for values in tag_values]
    else:
        types = [None] * len(gdf)

    # Not every export carries a 'name' attribute; emit an empty column then.
    names = gdf['name'].to_numpy() if 'name' in gdf.columns else [None] * len(gdf)

    points = _lonlat_points(gdf)
    return pd.DataFrame({
        'id': ids,
        'type': types,
        'name': names,
        'lon': points.x.to_numpy(),
        'lat': points.y.to_numpy(),
    })


def load_pois(csv_path, geojson_path):
    """Load the cleaned POI table, building the CSV cache on first use.

    Parameters
    ----------
    csv_path : str
        Location of the cleaned CSV. If it exists it is loaded as-is.
    geojson_path : str
        Location of the raw GeoJSON, converted (and written to ``csv_path``)
        only when the CSV does not exist yet.

    Returns
    -------
    pandas.DataFrame
        The content of the CSV, or the freshly converted table with columns
        id, type, name, lon, lat.

    Raises
    ------
    FileNotFoundError
        If neither file exists.
    """
    if os.path.exists(csv_path):
        poi = pd.read_csv(csv_path)
        print("Loaded CSV:", csv_path)
        return poi

    if not os.path.exists(geojson_path):
        raise FileNotFoundError("No delhi_poi_clean.csv or delhi_pois.geojson found in data folders.")

    poi = _geojson_to_poi_table(geojson_path)
    poi.to_csv(csv_path, index=False)
    print("Converted GeoJSON -> CSV:", csv_path)
    return poi


poi_df = load_pois(csv_path, geojson_path)
poi_df.head()

In [None]:
# Cell 3: counts by type
counts = poi_df['type'].value_counts().reset_index()
counts.columns = ['type','count']
print(counts.head(30).to_string(index=False))

# simple bar plot
plt.figure(figsize=(10,5))
top = counts.head(15)
plt.barh(top['type'][::-1], top['count'][::-1])
plt.title("Top POI types (Delhi)")
plt.xlabel("Count")
plt.tight_layout()
plt.show()


In [None]:
# Cell 4: load boundary and plot
boundary_path = os.path.join(DATA_RAW, "delhi_boundary.geojson")

if not os.path.exists(boundary_path):
    print("Boundary geojson not found at", boundary_path, "\nYou can provide one as data/raw/delhi_boundary.geojson")
else:
    boundary = gpd.read_file(boundary_path).to_crs(epsg=4326)
    # convert poi_df to GeoDataFrame
    poi_gdf = gpd.GeoDataFrame(poi_df, geometry=gpd.points_from_xy(poi_df.lon, poi_df.lat), crs="EPSG:4326")
    ax = boundary.plot(figsize=(10,10), color="none", edgecolor="black")
    poi_gdf.sample(200).plot(ax=ax, markersize=6, color='red', alpha=0.6)  # sample to keep plot light
    plt.title("Delhi boundary + sample POIs")
    plt.show()
