In [None]:
import os

import geopandas as gpd
import osmnx as ox
import pandas as pd

In [None]:
# Root folder holding every input dataset, relative to the notebook's working directory.
file_path = "../Data/"
# Every derived artefact (shapefile, graph, CSV) is written below this folder.
output_file_path = file_path + "Output/"
# Create the output folder up front so the first write on a fresh checkout does not
# fail on a missing directory; this is a no-op when the folder already exists.
os.makedirs(output_file_path, exist_ok=True)

# Import human settlement shapefile from GHS UCDB
Source: GHS Urban Centre Database 2015
https://ghsl.jrc.ec.europa.eu/download.php?ds=ucdb 

In [None]:
# Load the GHS Urban Centre Database GeoPackage into a GeoDataFrame.
UCDB_gdf = gpd.read_file(file_path + "GHS_STAT_UCDB2015MT_GLOBE_R2019A_V1_2.gpkg")

In [None]:
# Keep only the urban-centre record(s) whose main name is "Los Angeles", i.e. the
# human settlement area of LA, and plot it: the plot shows holes in the polygon.
LA_UCDB_gdf = UCDB_gdf.loc[UCDB_gdf["UC_NM_MN"].eq("Los Angeles")]
LA_UCDB_gdf.plot()

# Import TIGER/Line census tract shapefile

In [None]:
# Census tracts in California (2022 TIGER/Line shapefile).
CA_22tract = gpd.read_file(file_path + "tl_2022_06_tract/tl_2022_06_tract.shp")
# Cell output: whether the settlement layer and the tract layer already share a CRS.
LA_UCDB_gdf.crs == CA_22tract.crs

In [None]:
# Reproject the tract layer to EPSG:4326 so both layers use the same CRS,
# then show whether the two CRS definitions now match.
CA_22tract_crs = CA_22tract.to_crs(epsg=4326)
LA_UCDB_gdf.crs == CA_22tract_crs.crs

In [None]:
# Restrict the reprojected tracts to LA County (rows whose COUNTYFP is "037").
LA_22tract = CA_22tract_crs.loc[CA_22tract_crs["COUNTYFP"].eq("037")]

# Get the shapefile of the intersection of LA County and the LA human settlement area

In [None]:
# Overlay the two layers to see where they differ: LA County tracts in green,
# with the LA human-settlement polygon drawn on top in red.
ax = LA_22tract.plot(color="palegreen", edgecolor="green", figsize=(20, 10))
LA_UCDB_gdf.plot(ax=ax, color="red")

In [None]:
# Clip the LA County tracts to the settlement polygon (keeping only their
# intersection), then dissolve the clipped tracts into a single geometry.
LA_clip = gpd.clip(LA_22tract, LA_UCDB_gdf)
LA_clip_whole = LA_clip.dissolve()

In [None]:
# Plot the dissolved intersection of LA County and the settlement area.
LA_clip_whole.plot()

# Get the convex hull of it

In [None]:
# Convex hull of the dissolved intersection; this becomes the study area.
LA_clip_convex = LA_clip_whole.convex_hull

In [None]:
# Plot the convex hull of the intersected shapefile (our study area) in green,
# with the intersection itself drawn on top in red for comparison.
ax = LA_clip_convex.plot(color="palegreen", edgecolor="green", figsize=(20, 10))
LA_clip_whole.plot(ax=ax, color="red")

In [None]:
# Persist the convex-hull study area as a shapefile in the output folder.
LA_clip_convex.to_file(output_file_path + "LA_clip_convex.shp")

# get street network for analysis
### Using the above convex hull shapefile as our study region
### Get strongly connected graph

In [None]:
# Wrap the hull in a GeoDataFrame and take its first geometry: this is the
# polygon used as the boundary when downloading the street network.
LA_clip_convex_gdf = gpd.GeoDataFrame(geometry=gpd.GeoSeries(LA_clip_convex))
LA_clip_convex_polygon = LA_clip_convex_gdf["geometry"].iloc[0]

In [None]:
# Build the drivable ("drive") street network inside the study-area polygon
# and plot it as a quick visual check.
G = ox.graph_from_polygon(LA_clip_convex_polygon, network_type="drive")
ox.plot_graph(G)

In [None]:
# Keep only the largest strongly connected component of the street network.
# NOTE(review): `ox.utils_graph.get_largest_component` appears to be deprecated in
# newer OSMnx releases (in favour of `ox.truncate.largest_component`) — confirm
# against the OSMnx version this notebook is pinned to.
G = ox.utils_graph.get_largest_component(G, strongly=True)

In [None]:
# Save the strongly connected network in two formats.
# GeoPackage:
ox.save_graph_geopackage(G, filepath=output_file_path + "LA_clip_convex_strong_network.gpkg")
# GraphML:
ox.save_graphml(G, filepath=output_file_path + "LA_clip_convex_strong_network.graphml")

# Attach information to all the nodes in the street network

In [None]:
# Split the graph into node and edge GeoDataFrames, then turn the node index
# into an ordinary column so it can be selected by name later on.
gdf_nodes, gdf_edges = ox.graph_to_gdfs(G)
gdf = gdf_nodes.reset_index()

In [None]:
# Express the nodes in lat/long and refresh the x / y columns from the
# resulting point geometries so the coordinates agree with the geometry column.
gdf_proj = ox.project_gdf(gdf, to_latlong=True)
gdf_proj["x"] = gdf_proj.geometry.x
gdf_proj["y"] = gdf_proj.geometry.y

In [None]:
# Los Angeles census tract boundaries as used by Uber Movement.
uber_tract = gpd.read_file(file_path + "los_angeles_censustracts.json")

In [None]:
# Attach tract information to every node with two point-in-polygon joins:
#   1. the 2022 census tract layer, from which only GEOID is kept;
#   2. the Uber Movement tract layer, from which MOVEMENT_ID and TRACT are kept.
# Both are left joins, so nodes that fall inside no polygon are retained.
# Re-selecting `selected_cols` after each join discards all other joined columns.
selected_cols = ["osmid", "y", "x", "highway", "street_count", "ref", "geometry", "GEOID"]
gdf_proj_tract = gpd.sjoin(gdf_proj, CA_22tract_crs, how="left", predicate="within")[selected_cols]
gdf_proj_tract_uber = gpd.sjoin(gdf_proj_tract, uber_tract, how="left", predicate="within")[
    selected_cols + ["MOVEMENT_ID", "TRACT"]
]
gdf_proj_tract_uber

In [None]:
# A node lying on the border between Uber Movement tracts can match more than one
# tract and therefore appear several times after the join. Keep only the first
# matched row per node (identified by "osmid").
gdf_proj_tract_uber_dedup = gdf_proj_tract_uber.drop_duplicates(subset=["osmid"], keep="first")


# Sample 5,000,000 origin-destination (OD) pairs from the candidate nodes

In [None]:
# Save the candidate nodes twice: once as-is, and once with the census-tract and
# Uber Movement attributes attached (one row per node after de-duplication).
gdf_proj.to_csv(output_file_path + "nodes_candidate_convex_strongly.csv")
gdf_proj_tract_uber_dedup.to_csv(
    output_file_path + "nodes_candidate_convex_strongly_attributes.csv",
)

In [None]:
# The street network has fewer than 5 million nodes, so origins and destinations
# are drawn with replacement. Each draw uses its own fixed seed, and the index is
# reset so that row i of `origin` lines up with row i of `destin`.
origin = gdf_proj_tract_uber_dedup.sample(
    n=5_000_000, replace=True, random_state=123
).reset_index(drop=True)
destin = gdf_proj_tract_uber_dedup.sample(
    n=5_000_000, replace=True, random_state=321
).reset_index(drop=True)

In [None]:
# Build the OD table: take the id, coordinates and tract ids of each sampled node,
# prefix the origin columns with "o" and the destination columns with "d", and
# place the two halves side by side (rows are aligned on the shared index).
origin_od = origin[["osmid", "y", "x", "GEOID", "MOVEMENT_ID"]].rename(
    columns={
        "osmid": "oid",
        "y": "oy",
        "x": "ox",
        "GEOID": "oGEOID",
        "MOVEMENT_ID": "oMOVEMENT_ID",
    }
)
destin_od = destin[["osmid", "y", "x", "GEOID", "MOVEMENT_ID"]].rename(
    columns={
        "osmid": "did",
        "y": "dy",
        "x": "dx",
        "GEOID": "dGEOID",
        "MOVEMENT_ID": "dMOVEMENT_ID",
    }
)
temp_OD = pd.concat([origin_od, destin_od], axis=1, sort=False)

In [None]:
# Remove, from both samples, the rows of every OD pair whose origin and
# destination are the same node.
origin_dedup = origin.drop(index=temp_OD.loc[temp_OD["oid"].eq(temp_OD["did"])].index)
destin_dedup = destin.drop(index=temp_OD.loc[temp_OD["oid"].eq(temp_OD["did"])].index)

In [None]:
# Apply the same filter to the OD table itself (drop pairs with identical origin
# and destination node), then save the remaining pairs.
temp_OD = temp_OD.drop(index=temp_OD.loc[temp_OD["oid"].eq(temp_OD["did"])].index)
temp_OD.to_csv(output_file_path + "OD_5m_strong.csv")

In [None]:
# origin_dedup.to_file(output_file_path + 'origin_5m.shp')
# destin_dedup.to_file(output_file_path + 'destin_5m.shp')

# Subset OD pairs that have a reference in Uber Movement 2020

In [None]:
# Cast both Uber Movement ids to float so they are comparable with the float
# `sourceid` / `dstid` columns of the Uber Movement table.
temp_OD = temp_OD.astype({"oMOVEMENT_ID": float, "dMOVEMENT_ID": float})

In [None]:
# One (origin MOVEMENT_ID, destination MOVEMENT_ID) tuple per OD pair, used as the
# lookup key against the Uber Movement data.
temp_OD["uber_OD"] = list(
    zip(temp_OD["oMOVEMENT_ID"], temp_OD["dMOVEMENT_ID"], strict=False)
)

In [None]:
# Uber Movement hourly aggregate for Los Angeles census tracts. The source and
# destination ids are read as float so they compare equal to the float
# MOVEMENT_ID values on the OD table.
uber_2020 = pd.read_csv(
    file_path + "los_angeles-censustracts-2020-1-All-HourlyAggregate.csv",
    dtype={"sourceid": float, "dstid": float},
)

In [None]:
# Select the OD pairs that have a reference in Uber Movement 2020.
# The set is built straight from the zip iterator: the Uber table has one row per
# (source, destination, hour), so materialising a full-length list of tuples first
# only costs memory and is thrown away immediately.
unique_OD = set(zip(uber_2020["sourceid"], uber_2020["dstid"], strict=False))
OD_pairs_uber = temp_OD[temp_OD["uber_OD"].isin(unique_OD)]

In [None]:
# Inspect the OD pairs that have a matching Uber Movement 2020 record.
OD_pairs_uber

In [None]:
# Sample 1 million OD pairs (without replacement, fixed seed) out of the 1,197,651
# OD pairs that have a reference in the 2020 Uber Movement data.
# NOTE: pandas raises a ValueError here if fewer than 1,000,000 pairs are available.
OD_pairs_uber_1m = OD_pairs_uber.sample(1000000, random_state=123).copy()

In [None]:
# Save the 1 million sampled Uber-referenced OD pairs.
OD_pairs_uber_1m.to_csv(output_file_path + "OD_pairs_uber_1m_strongly.csv")

In [None]:
# Uber-referenced OD pairs that were NOT drawn into the 1 million sample.
uber_pairs_remains = OD_pairs_uber.drop(index=OD_pairs_uber_1m.index)

In [None]:
# Save all Uber-referenced OD pairs that were not sampled into the 1 million set.
# NOTE(review): "19w" in the file name presumably stands for ~190,000 rows — confirm.
uber_pairs_remains.to_csv(output_file_path + "OD_pairs_uber_remains_strongly_19w.csv")

In [None]:
# Save ALL OD pairs that have a reference in 2020 Uber Movement (both the sampled
# 1 million and the remainder) to csv.
# NOTE(review): "119w" in the file name presumably stands for ~1,190,000 rows — confirm.
OD_pairs_uber.to_csv(output_file_path + "OD_pairs_uber_all_strongly_119w.csv")

In [None]:
# Everything in the full OD table that is not part of the 1 million Uber sample
# (this includes pairs without any Uber reference), saved to csv.
OD_pairs_remains = temp_OD.drop(index=OD_pairs_uber_1m.index)
OD_pairs_remains.to_csv(output_file_path + "OD_pairs_uber_4m_remains_strongly.csv")