In [61]:
# ===== FARS assign fault + US DOT filter + census merge + PARQUET export =====
import pandas as pd
import numpy as np
from typing import List, Tuple

#Get data ready
def _clean_usdot(s: pd.Series) -> pd.Series:
    """
    Normalize USDOT numbers to comparable strings for merging purposes.

    Values are cast to pandas' nullable string dtype and surrounding
    whitespace is stripped. A trailing ".0" (left behind when an integer
    column was parsed as float, e.g. because it contains missing values) is
    removed, then leading zeros are dropped while keeping at least one digit.

    Returns a string Series in which empty strings and the literal
    placeholders "nan" / "<NA>" are replaced by missing values.
    """
    cleaned = s.astype("string").str.strip()
    # 123456.0 becomes "123456.0" after the cast; without this step it would
    # never match the census key "123456".
    cleaned = cleaned.str.replace(r"\.0+$", "", regex=True)
    cleaned = cleaned.str.replace(r"^\s*0+(?=\d)", "", regex=True)
    return cleaned.mask(cleaned.isin(["", "nan", "<NA>"]))

def _num(d: pd.DataFrame, col: str) -> pd.Series:
    """
    Return column `col` of `d` coerced to numeric; unparsable values become NaN.
    If `d` has no such column, return an all-NaN Series on `d`'s index instead.
    """
    if col not in d.columns:
        return pd.Series(np.nan, index=d.index)
    return pd.to_numeric(d[col], errors="coerce")

def load_fars_vehicle(files_with_years: List[Tuple[str, int]]) -> pd.DataFrame:
    """
    Read every FARS vehicle CSV and stack the results into one DataFrame.

    Each file's column names are stripped and upper-cased, and a YEAR column
    holding the integer year paired with that file is set on its rows. The
    combined frame gets a fresh 0..n-1 index.
    """
    def _read_one(source, year) -> pd.DataFrame:
        # Load a single file, normalize its header, and tag it with its year.
        frame = pd.read_csv(source, low_memory=False)
        frame.columns = pd.Index([str(name).strip().upper() for name in frame.columns])
        frame["YEAR"] = int(year)
        return frame

    return pd.concat(
        [_read_one(source, year) for source, year in files_with_years],
        ignore_index=True,
    )

# Determining accident fault
def compute_fault(df: pd.DataFrame, source_label: str = "FARS") -> pd.DataFrame:
    """
    Compute a simple fault score and a likely-at-fault flag for each vehicle.

    Works on a copy of `df` whose column names are stripped and upper-cased,
    and adds three columns:

    * SOURCE          - `source_label`
    * FAULT_SCORE     - weighted sum of the evidence flags computed below
    * LIKELY_AT_FAULT - 1 when FAULT_SCORE >= 2, 0 when FAULT_SCORE <= -2,
                        NaN when the score falls in between (inconclusive)

    Numbered values are taken from the FARS codebook and represent factors
    that likely indicate (or argue against) fault in the accident. Input
    columns that are absent or unparsable are treated as missing and add no
    evidence in either direction.
    """
    d = df.copy()
    d.columns = pd.Index([str(c).strip().upper() for c in d.columns])
    d["SOURCE"] = source_label

    acc = _num(d, "ACC_TYPE")
    pc1 = _num(d, "P_CRASH1")
    pc2 = _num(d, "P_CRASH2")
    pc3 = _num(d, "P_CRASH3")
    pc4 = _num(d, "P_CRASH4")
    pc5 = _num(d, "P_CRASH5")
    spd = _num(d, "SPEEDREL")
    drink = _num(d, "DR_DRINK")

    # Vehicle coded as having struck something (likely at fault)
    striking_codes = (
        set(range(1, 11))
        | set(range(20, 35))
        | {36, 38, 40, 44, 45, 46, 47, 64, 65}
        | set(range(68, 86))
        | {92}
    )

    # Vehicle coded as being struck by something (less likely at fault)
    struck_codes = {35, 37, 39, 41, 55, 57, 59, 61, 87, 89}

    self_induced = {6, 10, 11, 12, 13, 14, 18}
    other_encroach = set(range(60, 66)) | {66, 67, 70, 71, 72, 73, 74, 78}
    risky_move = {6, 13, 14, 15, 16}
    loss_control = {2, 3, 4, 5, 7}
    off_road = {4, 5}
    no_avoid = {1}

    ev_striking = acc.isin(striking_codes)
    ev_struck = acc.isin(struck_codes)
    ev_self_haz = (
        pc2.isin(self_induced)
        | pc4.isin(loss_control)
        | pc5.isin(off_road)
        | pc1.isin(risky_move)
    )
    ev_other_haz = pc2.isin(other_encroach)
    ev_no_avoid = pc3.isin(no_avoid)
    fault_avoid = ev_struck & pc2.isin(self_induced)
    # Speed-related means a recorded code other than 0/8/9. The notna() guard
    # matters: isin() is False for NaN, so a bare negation would count every
    # vehicle with a missing SPEEDREL as speeding.
    ev_speed = spd.notna() & ~spd.isin([0, 8, 9])
    ev_drink = drink == 1

    # Weighted scores based on likelihood of direct cause of accident vs
    # incidental causes of accident
    fault_score = (
        3 * (ev_striking | ev_self_haz | fault_avoid).astype(int)
        + 2 * ev_no_avoid.astype(int)
        - 3 * (ev_struck | ev_other_haz).astype(int)
        + 2 * ev_speed.astype(int)
        + 1 * ev_drink.astype(int)
    )
    d["FAULT_SCORE"] = fault_score
    d["LIKELY_AT_FAULT"] = np.select(
        [fault_score >= 2, fault_score <= -2], [1, 0], default=np.nan
    )
    return d

def filter_trucking_with_usdot(df: pd.DataFrame,
                               mcarr_flag_col="MCARR_I1", usdot_col="MCARR_I2") -> pd.DataFrame:
    """
    Keep only the vehicles whose flag column equals 57 and whose USDOT
    number is present after cleaning.

    The input is not modified: the work happens on a copy whose column names
    are stripped and upper-cased, whose flag column is coerced to numeric and
    whose USDOT column is normalized with `_clean_usdot`.
    (57 is presumably the FARS code for a US DOT-issued identifier - confirm
    against the codebook.)
    """
    vehicles = df.copy()
    vehicles.columns = pd.Index([str(name).strip().upper() for name in vehicles.columns])
    vehicles[mcarr_flag_col] = pd.to_numeric(vehicles[mcarr_flag_col], errors="coerce")
    vehicles[usdot_col] = _clean_usdot(vehicles[usdot_col])

    flagged_usdot = vehicles[mcarr_flag_col].eq(57)
    has_number = vehicles[usdot_col].notna()
    return vehicles.loc[flagged_usdot & has_number].copy()

# Merge FARS data to larger trucking census data
def merge_census_with_fars(census_csv: str, vehicles_df: pd.DataFrame,
                           census_col="DOT_NUMBER", veh_usdot_col="MCARR_I2") -> pd.DataFrame:
    """
    Keep ALL census rows and attach any matching FARS rows (zero, one, or many).
    Census is the primary table.

    Both key columns are normalized with `_clean_usdot` before joining. FARS
    rows without a USDOT number are dropped first so they cannot pair up with
    a census row whose DOT number is also missing. Column names that exist in
    both tables keep the census name and get a "_FARS" suffix on the FARS side.

    Prints the census size and two match-rate diagnostics, then returns the
    merged frame. Raises pandas.errors.MergeError when the census key is not
    unique (enforced through validate="1:m").
    """
    # Prep FARS
    v = vehicles_df.copy()
    v.columns = pd.Index([str(c).strip().upper() for c in v.columns])
    v[veh_usdot_col] = _clean_usdot(v[veh_usdot_col])
    # pandas matches null join keys against each other, so a FARS row with no
    # USDOT would be glued onto a census row with no DOT number.
    v = v[v[veh_usdot_col].notna()]

    # Load Census
    census = pd.read_csv(census_csv, low_memory=False)
    census.columns = pd.Index([str(c).strip().upper() for c in census.columns])
    census[census_col] = _clean_usdot(census[census_col])

    # Left join FROM census TO FARS
    merged = census.merge(
        v,
        left_on=census_col,
        right_on=veh_usdot_col,
        how="left",
        validate="1:m",
        suffixes=("", "_FARS"),
        indicator=True,
    )

    # Diagnostics
    matched = merged["_merge"] == "both"
    match_rate = matched.mean()
    # Census keys are unique (validate="1:m"), so the number of distinct
    # matched keys equals the number of census rows with at least one match.
    n_census = len(census)
    matched_census_rows = merged.loc[matched, census_col].nunique()
    census_match_rate = matched_census_rows / n_census if n_census else float("nan")
    print(f"Census rows: {n_census:,}")
    print(f"Row-level match rate (after expansion by FARS vehicles): {match_rate:.1%}")
    print(f"Census rows with ≥1 FARS match: "
          f"{census_match_rate:.1%}")

    merged = merged.drop(columns=["_merge"])
    return merged

# Compare fault scores of vehicles numbered 1 in FARS (VEH_NO == 1) with all other vehicles

def vehicle1_fault_summary(scored_df: pd.DataFrame) -> None:
    """
    Print how vehicles with VEH_NO == 1 compare with all other vehicles.

    Shows the mean and non-missing count of FAULT_SCORE and LIKELY_AT_FAULT
    for each of the two groups, followed by the correlation between the
    "is vehicle 1" indicator and FAULT_SCORE. The input frame is left
    untouched; nothing is returned.
    """
    frame = scored_df.copy()
    for column in ("VEH_NO", "FAULT_SCORE", "LIKELY_AT_FAULT"):
        frame[column] = pd.to_numeric(frame.get(column), errors="coerce")

    is_vehicle_one = frame["VEH_NO"] == 1
    frame["IsVeh1"] = np.where(is_vehicle_one, "Vehicle 1", "Other Vehicles")

    by_group = frame.groupby("IsVeh1")[["FAULT_SCORE", "LIKELY_AT_FAULT"]]
    summary = by_group.agg(["mean", "count"])

    # Two-column frame so DataFrame.corr() yields the indicator/score correlation.
    pair = pd.DataFrame(
        {
            "IsVeh1_flag": is_vehicle_one.astype(int),
            "FAULT_SCORE": frame["FAULT_SCORE"],
        }
    )
    corr = pair.corr().loc["IsVeh1_flag", "FAULT_SCORE"]

    print("\n--- Vehicle 1 vs Others (FULL sample) ---")
    print(summary)
    print(f"\nCorrelation between Vehicle 1 and FAULT_SCORE: {corr:.3f}")

# Pipeline for everything
def run_fars_pipeline(fars_files: List[Tuple[str, int]], census_csv: str,
                      out_scored="fars_scored.parquet",
                      out_trucking="fars_trucking_usdot.parquet",
                      out_merged="census_with_fars.parquet"):
    """
    Run the whole workflow end to end and write the three results as Parquet.

    Steps: load and stack the FARS vehicle files, score fault, print the
    vehicle-1 comparison, keep the trucking rows that carry a USDOT number,
    left-join them onto the census, then export. Progress figures are printed
    along the way.

    Returns the tuple (scored vehicles, trucking subset, census merged with FARS).
    """
    raw_vehicles = load_fars_vehicle(fars_files)
    print(f"Combined FARS vehicle records: {len(raw_vehicles):,}")

    scored = compute_fault(raw_vehicles, source_label="FARS")
    vehicle1_fault_summary(scored)

    trucks = filter_trucking_with_usdot(scored, "MCARR_I1", "MCARR_I2")
    print(f"\nTrucking subset (MCARR_I1==57 & USDOT present): {len(trucks):,}")
    print(f"Unique DOTs in trucking subset: {trucks['MCARR_I2'].nunique():,}")

    census_merged = merge_census_with_fars(census_csv, trucks, "DOT_NUMBER", "MCARR_I2")
    print(f"\nMerged census (kept all rows): {len(census_merged):,} total rows; unique DOTs: {census_merged['DOT_NUMBER'].nunique():,}")

    # Export as Parquet, in the same order the paths are reported below
    exports = (
        (scored, out_scored),
        (trucks, out_trucking),
        (census_merged, out_merged),
    )
    for frame, target in exports:
        frame.to_parquet(target, index=False)

    print(f"\nSaved Parquet files:\n  {out_scored}\n  {out_trucking}\n  {out_merged}")
    return scored, trucks, census_merged


In [63]:
# Enter the specific datasets here.
# fars_files: one (CSV path, year) pair per FARS vehicle file; the year is
# written into the YEAR column of that file's rows by load_fars_vehicle.
fars_files = [
    ("fars_2020_vehicle.csv", 2020),
    ("fars_2021_vehicle.csv", 2021),
    ("fars_2022_vehicle.csv", 2022),
    ("fars_2023_vehicle.csv", 2023)
]

# Census CSV that is left-joined with the trucking subset of FARS.
census_csv = "SMS_Input_-_Motor_Carrier_Census_Information_20250919.csv"

# Run the full pipeline with the default Parquet output names and bind the
# three returned frames: scored vehicles, trucking subset, merged census.
fars_scored, fars_trucking, fars_merged = run_fars_pipeline(fars_files, census_csv)

Combined FARS vehicle records: 235,438

--- Vehicle 1 vs Others (FULL sample) ---
               FAULT_SCORE         LIKELY_AT_FAULT        
                      mean   count            mean   count
IsVeh1                                                    
Other Vehicles    0.471990   82862        0.604366   49652
Vehicle 1         2.893201  152576        0.955218  118687

Correlation between Vehicle 1 and FAULT_SCORE: 0.433

Trucking subset (MCARR_I1==57 & USDOT present): 13,712
Unique DOTs in trucking subset: 10,725
Census rows: 2,091,643
Row-level match rate (after expansion by FARS vehicles): 0.5%
Census rows with ≥1 FARS match: 0.4%

Merged census (kept all rows): 2,094,364 total rows; unique DOTs: 2,091,643

Saved Parquet files:
  fars_scored.parquet
  fars_trucking_usdot.parquet
  census_with_fars.parquet
