In [32]:
import pandas as pd
import geopandas as gpd

In [82]:
# --- Input file locations (edit here; every later cell reads these names) ---

# CSV inputs
path_census = "SMS_Input_-_Motor_Carrier_Census_Information_20250919.csv"
path_company_scores = "Latest_company_fit_scores.csv"
path_dqs = "dqs_output.csv"

# Parquet inputs
path_geocoded = "geocoded_addresses.parquet"
path_cargo = "cargo_with_categories.parquet"
path_insurance = "insurance_summary.parquet"
path_accident = "fars_crss_census.parquet"

In [84]:
# Load every input table into memory, one DataFrame per source file.

# CSV sources
census = pd.read_csv(path_census)
company_fit_scores = pd.read_csv(path_company_scores)
dqs = pd.read_csv(path_dqs)

# Parquet sources
geocoded = pd.read_parquet(path_geocoded)
cargo_carried_with_categories = pd.read_parquet(path_cargo)
insurance = pd.read_parquet(path_insurance)
accident = pd.read_parquet(path_accident)

In [85]:
# Cast every DOT identifier column to string so the merges below compare
# like with like.
# NOTE(review): astype(str) renders float-typed IDs as "123.0" and missing
# values as "nan"; confirm each source column is integer-typed with no gaps.

# Tables whose ID column keeps its name: cast in place.
for frame, id_col in (
    (census, "DOT_NUMBER"),
    (geocoded, "id"),
    (accident, "DOT_NUMBER"),
    (dqs, "dot_number"),
):
    frame[id_col] = frame[id_col].astype(str)

# Tables whose ID column is renamed so it stays distinguishable after merging.
company_fit_scores = (
    company_fit_scores
    .astype({"dot_number": str})
    .rename(columns={"dot_number": "dot_number_fit_score"})
)

cargo_carried_with_categories = (
    cargo_carried_with_categories
    .astype({"dot_number": str})
    .rename(columns={"dot_number": "dot_number_cargo_carried"})
)

In [86]:

# Each step below is a left join onto the census, so every census row is kept
# and unmatched rows get missing values in the newly attached columns.
# NOTE(review): none of these joins is validated; a right-hand table with
# repeated IDs would fan out census rows — confirm uniqueness upstream.

# Census + company fit scores
census_with_fit = pd.merge(
    census,
    company_fit_scores,
    left_on="DOT_NUMBER",
    right_on="dot_number_fit_score",
    how="left",
)

# + geocoded addresses (keyed by `id`)
with_geocoded = pd.merge(
    census_with_fit,
    geocoded,
    left_on="DOT_NUMBER",
    right_on="id",
    how="left",
)

# + cargo carried
with_cargo = pd.merge(
    with_geocoded,
    cargo_carried_with_categories,
    left_on="DOT_NUMBER",
    right_on="dot_number_cargo_carried",
    how="left",
)

#Merge insurance

def make_dot_key(s):
    """Normalize a column of DOT numbers into comparable string join keys.

    Each value is stringified, stripped of surrounding whitespace, stripped of
    an artificial trailing ".0" (what a float-typed ID column produces when
    cast to string) and stripped of leading zeros.

    Parameters
    ----------
    s : pandas.Series
        Raw identifier column of any dtype (int, float, object/str).

    Returns
    -------
    pandas.Series
        Same index as ``s``. Values are normalized strings, or ``pd.NA`` where
        the input was missing, blank, or consisted only of zeros.
    """
    # Remember genuine nulls before stringifying: astype(str) would turn them
    # into ordinary text such as "nan", which would then act as a real key.
    is_missing = s.isna()

    key = (
        s.astype(str)
         .str.strip()
         # 123.0 -> "123.0" when the column is float; drop the fake decimals.
         .str.replace(r"\.0+$", "", regex=True)
         .str.lstrip("0")
    )

    # Also treat blanks and already-stringified null tokens as missing, since
    # callers may pass a column that was cast to str earlier.
    is_missing = is_missing | key.isin(["", "nan", "NaN", "None", "<NA>"])

    return key.mask(is_missing, pd.NA)

# Build a normalized key on both sides (make_dot_key strips whitespace and
# leading zeros) so formatting differences do not prevent a match.
with_cargo_key  = make_dot_key(with_cargo["DOT_NUMBER"])
insurance_key = make_dot_key(insurance["usdot_number"])

census2  = with_cargo.assign(_DOT_KEY=with_cargo_key)
insurance2 = insurance.assign(_DOT_KEY=insurance_key)

# Left join: keep every census row; validate="m:1" raises if the insurance
# side has more than one row per key.
# pandas matches null merge keys against each other, so insurance rows with
# no usable key are excluded from the lookup; otherwise every census row with
# a null key would inherit that row's insurance data.
with_insurance = census2.merge(
    insurance2.dropna(subset=["_DOT_KEY"]),
    on="_DOT_KEY",
    how="left",
    suffixes=("_census", "_summary"),
    validate="m:1"
)

# Attach accident data (both tables name the key DOT_NUMBER)
with_accident = pd.merge(
    with_insurance,
    accident,
    left_on="DOT_NUMBER",
    right_on="DOT_NUMBER",
    how="left",
)

# Attach DQS output; the result is the fully merged table used downstream
merged = pd.merge(
    with_accident,
    dqs,
    left_on="DOT_NUMBER",
    right_on="dot_number",
    how="left",
)



In [87]:
#Attach county fips data

COUNTY_SHP_PATH = "tl_2024_us_county.shp"

# 1) Make sure lat/lon are numeric; values that cannot be parsed become NaN
merged["lat"] = pd.to_numeric(merged["lat"], errors="coerce")
merged["lon"] = pd.to_numeric(merged["lon"], errors="coerce")

# 2) Load county polygons, keeping only the attributes used below
counties = gpd.read_file(COUNTY_SHP_PATH)[["STATEFP", "COUNTYFP", "GEOID", "NAME", "geometry"]]

# Reproject counties to WGS84 (lat/lon) to match the point coordinates
counties = counties.to_crs("EPSG:4326")

# 3) Build a GeoDataFrame of points for rows with lat/lon.
# Only the coordinate columns are carried along: the join needs nothing else,
# copying every column of `merged` through the spatial join is expensive, and
# a `merged` column named like a county attribute (e.g. NAME) would be
# suffixed by sjoin and break the lookups in step 6. The original index is
# preserved so results can be written back by label.
mask = merged["lat"].notna() & merged["lon"].notna()

points = gpd.GeoDataFrame(
    merged.loc[mask, ["lat", "lon"]].copy(),
    geometry=gpd.points_from_xy(merged.loc[mask, "lon"], merged.loc[mask, "lat"]),
    crs="EPSG:4326",
)

# 4) Spatial join: point -> county
# predicate="within" says: which polygon (county) is this point inside?
joined = gpd.sjoin(points, counties, how="left", predicate="within")

# A point matching more than one polygon comes back as several rows sharing
# one index label, which would make the label-aligned assignment below fail.
# Keep the first match per point.
joined = joined[~joined.index.duplicated(keep="first")]

# 5) Create new columns in the full merged (start as missing)
merged["county_fips"] = pd.NA           # full 5-digit county FIPS (GEOID)
merged["county_name"] = pd.NA           # county name
merged["county_statefp"] = pd.NA        # 2-digit state FIPS code

# 6) Fill them for rows that had lat/lon (unmatched points stay missing)
merged.loc[joined.index, "county_fips"] = joined["GEOID"]
merged.loc[joined.index, "county_name"] = joined["NAME"]
merged.loc[joined.index, "county_statefp"] = joined["STATEFP"]

print("Done. Rows:", len(merged))
print("With county match:", merged["county_fips"].notna().sum())


Done. Rows: 2091643
With county match: 1884503


In [92]:
# Build the final table by dropping helper keys and unused metrics.
# drop() raises KeyError if any listed column is absent from `merged`.
columns_to_drop = [
    # join helpers / duplicate identifier columns introduced by the merges
    "_DOT_KEY", "id",
    "dot_number_cargo_carried", "dot_number_fit_score", "dot_number",
    # docket and gap-percentile fields
    "all_dockets", "docket_unique_count",
    "p25_gap_days", "p75_gap_days",
    "p25_gap_to_cancelled", "p25_gap_to_name_changed",
    "p25_gap_to_replaced", "p25_gap_to_transferred",
    "p75_gap_to_cancelled", "p75_gap_to_name_changed",
    "p75_gap_to_replaced", "p75_gap_to_transferred",
    # accident-rate and exposure metrics
    "accidents_per_truck", "accidents_per_driver", "accidents_per_million_miles",
    "crss_rate_per_100_trucks", "crss_rate_per_100_drivers", "crss_rate_per_1m_miles",
    "exposure_score", "accident_exposure_score",
]

master_file = merged.drop(columns=columns_to_drop)

In [None]:
# Write the final table to disk as Parquet
master_file.to_parquet(path="master_file.parquet")

In [None]:
# Write the county polygons loaded from the shapefile back out as GeoJSON
counties.to_file(filename="tl_2024_us_county.geojson", driver="GeoJSON")