# Historical Census Analysis

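This notebook downloads archived versions of the census dataset from data.transportation.gov, converts each one to Parquet, and analyzes how the data changes from version to version.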

In [14]:
import polars as pl
import requests
import tempfile
import os
from pathlib import Path
import re

# Cap displayed tables at 15 rows.
pl.Config.set_tbl_rows(15)

# Directory that holds one Parquet file per archived dataset version.
path = "../data/historical"
parquet_dir = Path(path)

# Captures the numeric version from file names ending in "version_<N>.parquet".
version_re = re.compile(r"version_(\d+)\.parquet$")

# Snapshot of the Parquet files present when this cell runs, sorted by path.
files = sorted(parquet_dir.glob("*.parquet"))

## Download the Files

In [15]:
def download_and_convert(url, parquet_file, timeout=60):
    """Download a CSV from ``url`` and store it as a Parquet file.

    The CSV is streamed into a temporary file, read into a polars DataFrame
    and written to ``parquet_file``. The temporary CSV is removed even when
    the download, the CSV parse or the Parquet write fails.

    Args:
        url: Address of the CSV export to download.
        parquet_file: Destination path of the Parquet file.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            (without it a stalled server would block the notebook forever).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    csv_path = None
    try:
        # delete=False: the file must outlive the `with` so polars can reopen it by name.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
            csv_path = tmp.name
            print(f"Downloading CSV to temporary file: {csv_path}")
            # Context manager releases the streamed connection when done.
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                for chunk in response.iter_content(chunk_size=8192):
                    tmp.write(chunk)

        print("Reading CSV into DataFrame...")
        df = pl.read_csv(csv_path, infer_schema_length=100000)

        print(f"Saving to Parquet: {parquet_file}")
        df.write_parquet(parquet_file)
    finally:
        # Clean up on success and on failure so temp CSVs do not pile up.
        if csv_path is not None and os.path.exists(csv_path):
            os.remove(csv_path)
            print(f"Deleted temporary CSV: {csv_path}")

    print("Conversion complete!")

    

In [16]:

# Make sure the output directory exists before the first write.
os.makedirs(path, exist_ok=True)

for i in range(78, 90):
    parquet_file = f"{path}/transportation_data_version_{i}.parquet"

    # Skip versions already on disk so re-running the notebook does not
    # download them again; delete the file to force a refresh.
    if os.path.exists(parquet_file):
        print(f"Skipping version {i}: {parquet_file} already exists")
        continue

    url = f"https://data.transportation.gov/api/archival.csv?id=kjg3-diqy&version={i}&method=export"

    # Write to a ".part" name first and rename afterwards, so an interrupted
    # write is never mistaken for a finished file by the check above.
    partial_file = f"{parquet_file}.part"
    download_and_convert(url, partial_file)
    os.replace(partial_file, parquet_file)

Downloading CSV to temporary file: /var/folders/7b/ywb7l4_94rs64f3qcjrzgz0m0000gq/T/tmph7ye0lxd.csv
Reading CSV into DataFrame...
Saving to Parquet: ../data/historical/transportation_data_version_78.parquet
Deleted temporary CSV: /var/folders/7b/ywb7l4_94rs64f3qcjrzgz0m0000gq/T/tmph7ye0lxd.csv
Conversion complete!
Downloading CSV to temporary file: /var/folders/7b/ywb7l4_94rs64f3qcjrzgz0m0000gq/T/tmpfxlhz73p.csv
Reading CSV into DataFrame...
Saving to Parquet: ../data/historical/transportation_data_version_79.parquet
Deleted temporary CSV: /var/folders/7b/ywb7l4_94rs64f3qcjrzgz0m0000gq/T/tmpfxlhz73p.csv
Conversion complete!
Downloading CSV to temporary file: /var/folders/7b/ywb7l4_94rs64f3qcjrzgz0m0000gq/T/tmph2cn430k.csv
Reading CSV into DataFrame...
Saving to Parquet: ../data/historical/transportation_data_version_80.parquet
Deleted temporary CSV: /var/folders/7b/ywb7l4_94rs64f3qcjrzgz0m0000gq/T/tmph2cn430k.csv
Conversion complete!
Downloading CSV to temporary file: /var/folders/7b/y

In [17]:

# Re-scan the directory here: `files` from the setup cell was globbed before
# the download step ran, so it can be stale or empty on a fresh kernel.
parquet_files = sorted(parquet_dir.glob("*.parquet"))
if not parquet_files:
    raise FileNotFoundError(
        f"No Parquet files found in {parquet_dir}; run the download step first."
    )

# Only the schema is read from each file (lazy scan), not the data.
schemas = [(f.name, pl.scan_parquet(f).collect_schema()) for f in parquet_files]

# Every file is compared against the first one.
first_schema = schemas[0][1]
compatible = all(s == first_schema for _, s in schemas)

if compatible:
    print("All Parquet files have the same schema and are concatenable.")
else:
    print("Schema differences found:")
    for name, schema in schemas:
        if schema != first_schema:
            print(f"\n{name}:")
            print(schema)

All Parquet files have the same schema and are concatenable.


In [20]:
# Re-scan the directory so Parquet files written by the download step are included.
# Paths sort lexicographically, so version 100 comes before version 78.
files = list(parquet_dir.glob("*.parquet"))
files.sort()

In [21]:
# Rewrite every Parquet file with a literal `version` column taken from its file name.
# `version_re`, `files`, `os` and `pl` come from the setup/import cell.
updated = []
failed = []

for file in files:
    match = version_re.search(file.name)
    if not match:
        print(f"Skipping {file.name} (no version number found)")
        continue

    version = int(match.group(1))
    print(f"Adding version={version} to {file.name}")

    # The temp name must NOT end in ".parquet": a leftover temp file would
    # otherwise be picked up by the "*.parquet" glob and read as data.
    tmp_path = file.with_suffix(".parquet.tmp")
    try:
        df = (
            pl.scan_parquet(file)
            .with_columns(pl.lit(version).alias("version"))
            .collect(engine="streaming")
        )
        df.write_parquet(tmp_path, compression="zstd")
        # Swap the finished file in with a single rename so the original is
        # only replaced once the new file is fully written.
        os.replace(tmp_path, file)
    except Exception as e:
        print(f"Failed to update {file.name}: {e}")
        failed.append(file.name)
    else:
        updated.append(file.name)
    finally:
        # No-op after a successful replace; removes a partial file otherwise.
        tmp_path.unlink(missing_ok=True)

if failed:
    print(f"Updated {len(updated)} file(s); {len(failed)} failed: {failed}")
else:
    print("All files updated in place.")

Adding version=100 to transportation_data_version_100.parquet


  df = lf.collect(streaming=True)


Adding version=101 to transportation_data_version_101.parquet
Adding version=102 to transportation_data_version_102.parquet
Adding version=103 to transportation_data_version_103.parquet
Adding version=104 to transportation_data_version_104.parquet
Adding version=105 to transportation_data_version_105.parquet
Adding version=106 to transportation_data_version_106.parquet
Adding version=107 to transportation_data_version_107.parquet
Adding version=108 to transportation_data_version_108.parquet
Adding version=78 to transportation_data_version_78.parquet
Adding version=79 to transportation_data_version_79.parquet
Adding version=80 to transportation_data_version_80.parquet
Adding version=81 to transportation_data_version_81.parquet
Adding version=82 to transportation_data_version_82.parquet
Adding version=83 to transportation_data_version_83.parquet
Adding version=84 to transportation_data_version_84.parquet
Adding version=85 to transportation_data_version_85.parquet
Adding version=86 to tra

## Analyze the Historical Files

In [22]:
# Lazily stack every version file into one frame; nothing is read until collected.
lazy_frames = [pl.scan_parquet(parquet_path) for parquet_path in files]
all_lf = pl.concat(lazy_frames)

# Column names of the combined frame, resolved from the schema only.
columns = all_lf.collect_schema().names()


### Count Nulls

In [23]:
# The per-column null-count expressions are the same for every file, so build them once.
null_count_exprs = [
    pl.col(name).null_count().alias(f"{name}_nulls")
    for name in columns
    if name not in ("version", "dot_number")
]

records = []
for parquet_path in sorted(files):
    version_match = version_re.search(parquet_path.name)
    if version_match is None:
        # File name carries no version number; leave it out of the summary.
        continue
    version = int(version_match.group(1))

    # One summary row per file: row count, distinct dot_number values, nulls per column.
    agg = (
        pl.scan_parquet(parquet_path)
        .select(
            pl.len().alias("row_count"),
            pl.col("dot_number").n_unique().alias("unique_companies"),
            *null_count_exprs,
        )
        .collect()
    )

    df = agg.with_columns(pl.lit(version).alias("version"))
    records.append(df)

# One row per version, ordered by version number.
df_reliability = pl.concat(records).sort("version")

In [24]:
# Reshape the per-version null counts to long form: one row per (version, column).
null_cols = [name for name in df_reliability.columns if name.endswith("_nulls")]
id_cols = ["version", "row_count"]

# Share of rows in that version where the column is null.
null_fraction = (pl.col("null_count") / pl.col("row_count")).alias("null_fraction")

df_nulls_long = (
    df_reliability
    .select(id_cols + null_cols)
    .unpivot(
        on=null_cols,
        index=id_cols,
        variable_name="column",
        value_name="null_count",
    )
    .with_columns(null_fraction)
    .sort(["column", "version"])
)

In [25]:
# Per column: mean, standard deviation and range (max - min) of the null
# fraction across versions, most variable columns first.
null_frac = pl.col("null_fraction")

column_volatility = (
    df_nulls_long
    .group_by("column")
    .agg(
        mean_null_frac=null_frac.mean(),
        std_null_frac=null_frac.std(),
        range_null_frac=null_frac.max() - null_frac.min(),
    )
    .sort("std_null_frac", descending=True)
)

In [26]:
# Show the first 11 rows; the frame is sorted by std_null_frac descending,
# so these are the columns whose null fraction varies most across versions.
column_volatility.head(11)

column,mean_null_frac,std_null_frac,range_null_frac
str,f64,f64,f64
"""vmt_source_id_nulls""",0.610336,0.006427,0.022091
"""fax_nulls""",0.74461,0.002466,0.008815
"""email_address_nulls""",0.246611,0.002178,0.007821
"""dba_name_nulls""",0.730946,0.000972,0.003478
"""mcs150_date_nulls""",0.082795,0.00052,0.001871
"""mcs150_mileage_year_nulls""",0.377069,0.000356,0.001509
"""mcs150_mileage_nulls""",0.344789,0.000251,0.001234
"""nbr_power_unit_nulls""",0.038809,0.000247,0.000894
"""telephone_nulls""",0.005602,3.9e-05,0.000243
"""phy_zip_nulls""",0.000815,3.9e-05,0.00023


In [41]:
cols = ["driver_total", "nbr_power_unit"]

# (alias suffix, aggregation) pairs, in output-column order.
stats = [
    ("mean", lambda e: e.mean()),
    ("stddev", lambda e: e.std()),
    ("q05", lambda e: e.quantile(0.05)),
    ("median", lambda e: e.quantile(0.50)),
    ("q95", lambda e: e.quantile(0.95)),
]

# Aggregate from `all_lf` (defined above) rather than `df`: at this point in
# the notebook `df` is not yet the all-versions frame, which is only collected
# in a later cell, so the cell would fail or use the wrong data on a
# top-to-bottom run. The three columns used here are not modified by that
# later cell's transformations.
(
    all_lf
    .group_by("vmt_source_id")
    .agg(
        pl.len().alias("n_rows"),
        *[
            stat(pl.col(c)).alias(f"{c}_{suffix}")
            for suffix, stat in stats
            for c in cols
        ],
    )
    .sort("vmt_source_id")
    .collect()
)

vmt_source_id,n_rows,driver_total_mean,nbr_power_unit_mean,driver_total_stddev,nbr_power_unit_stddev,driver_total_q05,nbr_power_unit_q05,driver_total_median,nbr_power_unit_median,driver_total_q95,nbr_power_unit_q95
i64,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
,39148600,4.66817,65.459625,385.95425,3668.039869,1.0,1.0,1.0,1.0,6.0,5.0
1.0,24522437,8.064941,7.792971,297.970805,270.925598,1.0,1.0,2.0,2.0,18.0,18.0
2.0,46946,24.175968,23.389831,157.043892,146.48202,1.0,1.0,5.0,5.0,63.0,64.0
3.0,426772,16.705032,17.18417,118.802081,131.310415,1.0,1.0,4.0,4.0,50.0,51.0


### Company Volatility

In [27]:
# Columns kept for the per-company analysis.
target_columns = [
    "dot_number",
    "version",
    "add_date",
    "mcs150_date",
    "recent_mileage_year",
    "recent_mileage",
    "mcs150_mileage_year",
    "mcs150_mileage",
    "vmt_source_id",
    "driver_total",
    "nbr_power_unit",
]

In [27]:
# Load only version 108 to try out the transformations on a single file.
version_108_path = parquet_dir / "transportation_data_version_108.parquet"
df = pl.read_parquet(version_108_path).select(target_columns)

In [28]:
# Preview: parse both date columns from "%d-%b-%y" text into Date values.
# The result is only displayed, `df` itself is left unchanged.
date_format = "%d-%b-%y"

df.with_columns(
    pl.col("add_date", "mcs150_date")
    .cast(pl.String)
    .str.strptime(pl.Date, date_format)
).head()

dot_number,version,add_date,mcs150_date,recent_mileage_year,recent_mileage,mcs150_mileage_year,mcs150_mileage,vmt_source_id,driver_total,nbr_power_unit
i64,i32,date,date,i64,i64,i64,i64,i64,i64,i64
100065,108,1974-06-01,2010-05-26,0,0,2009,10000,,4,4
1001522,108,2002-01-29,2008-03-12,0,0,2000,20000,,3,4
1001288,108,2002-01-29,2025-06-02,2024,42,2024,42,1.0,2,2
1001691,108,2002-01-30,2025-07-30,2024,20000,2024,20000,1.0,3,2
1000610,108,2002-01-24,2025-04-03,2025,120000,2025,120000,1.0,3,6


In [28]:
date_cols = ["add_date", "mcs150_date"]
zero_as_null_cols = ["recent_mileage_year", "recent_mileage"]

# Parse the "%d-%b-%y" text dates into Date values.
parsed_dates = [
    pl.col(name).cast(pl.String).str.strptime(pl.Date, "%d-%b-%y").alias(name)
    for name in date_cols
]

# Replace 0 with null in the recent-mileage columns; other values pass through.
zeros_to_null = [
    pl.when(pl.col(name) == 0)
    .then(pl.lit(None))
    .otherwise(pl.col(name))
    .alias(name)
    for name in zero_as_null_cols
]

# Still lazy: nothing is read until the frame is collected.
lf = all_lf.select(target_columns).with_columns(*parsed_dates, *zeros_to_null)

In [29]:
# Materialize the lazy query into an in-memory DataFrame
# (the glimpse output below reports 64,144,755 rows).
df = lf.collect()

In [30]:
# Print row/column counts plus dtype and first values of every column.
df.glimpse()

Rows: 64144755
Columns: 11
$ dot_number           <i64> 1544857, 2606830, 2606831, 2606832, 2607121, 2607122, 2611179, 2611180, 2611181, 2611182
$ version              <i32> 100, 100, 100, 100, 100, 100, 100, 100, 100, 100
$ add_date            <date> 2006-08-21, 2015-04-13, 2015-04-13, 2015-04-13, 2015-04-13, 2015-04-13, 2015-04-17, 2015-04-17, 2015-04-17, 2015-04-17
$ mcs150_date         <date> 2023-08-18, None, None, None, None, None, None, None, None, None
$ recent_mileage_year  <i64> None, None, None, None, None, None, None, None, None, None
$ recent_mileage       <i64> None, None, None, None, None, None, None, None, None, None
$ mcs150_mileage_year  <i64> 2022, None, None, None, None, None, None, None, None, None
$ mcs150_mileage       <i64> 10000, None, None, None, None, None, None, None, None, None
$ vmt_source_id        <i64> None, None, None, None, None, None, None, None, None, None
$ driver_total         <i64> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
$ nbr_power_unit       <i64> 1, 1, 1

In [31]:
num_cols = [
    "recent_mileage_year",
    "recent_mileage",
    "mcs150_mileage_year",
    "mcs150_mileage",
    "vmt_source_id",
    "driver_total",
    "nbr_power_unit",
]

# Per dot_number: number of rows and, for each numeric column, how many of
# those rows hold a non-null value.
nonnull_count_exprs = [
    pl.col(name).is_not_null().sum().alias(f"{name}_nonnull_count")
    for name in num_cols
]
summary = df.group_by("dot_number").agg(pl.len().alias("n_rows"), *nonnull_count_exprs)

# For each column add the non-null fraction and a three-way status label:
# never filled, filled in every row, or filled in only some rows.
presence_exprs = []
for c in num_cols:
    nonnull = pl.col(f"{c}_nonnull_count")
    presence_exprs.append((nonnull / pl.col("n_rows")).alias(f"{c}_nonnull_frac"))
    presence_exprs.append(
        pl.when(nonnull == 0)
        .then(pl.lit("always_null"))
        .when(nonnull == pl.col("n_rows"))
        .then(pl.lit("always_present"))
        .otherwise(pl.lit("intermittent"))
        .alias(f"{c}_status")
    )
summary = summary.with_columns(presence_exprs)

status_view = ["dot_number", *(f"{c}_status" for c in num_cols)]
summary.select(status_view).head(10)

dot_number,recent_mileage_year_status,recent_mileage_status,mcs150_mileage_year_status,mcs150_mileage_status,vmt_source_id_status,driver_total_status,nbr_power_unit_status
i64,str,str,str,str,str,str,str
1741424,"""always_null""","""always_null""","""always_null""","""always_null""","""always_null""","""always_present""","""always_present"""
2804994,"""always_null""","""always_null""","""always_null""","""always_null""","""always_null""","""always_present""","""always_present"""
3427756,"""always_present""","""always_present""","""always_present""","""always_present""","""always_present""","""always_present""","""always_present"""
3650722,"""always_null""","""always_null""","""always_null""","""always_null""","""always_null""","""always_present""","""always_present"""
4356619,"""always_null""","""always_null""","""always_null""","""always_null""","""always_null""","""always_present""","""always_present"""
2605251,"""always_null""","""always_null""","""always_null""","""always_null""","""always_null""","""always_present""","""always_present"""
2777731,"""always_null""","""always_null""","""always_null""","""always_null""","""always_null""","""always_present""","""always_null"""
4447682,"""always_null""","""always_null""","""always_null""","""always_null""","""always_null""","""always_present""","""always_present"""
1005689,"""always_present""","""always_present""","""always_present""","""always_present""","""always_present""","""always_present""","""always_present"""
2648898,"""always_null""","""always_null""","""always_null""","""always_null""","""always_null""","""always_present""","""always_present"""


In [32]:
# Print, for every numeric column, the share of dot_numbers in each status.
for c in num_cols:
    status_col = f"{c}_status"
    proportions = summary[status_col].value_counts(normalize=True)
    print(f"\n{c}")
    print(proportions)


recent_mileage_year
shape: (3, 2)
┌────────────────────────────┬────────────┐
│ recent_mileage_year_status ┆ proportion │
│ ---                        ┆ ---        │
│ str                        ┆ f64        │
╞════════════════════════════╪════════════╡
│ always_null                ┆ 0.577386   │
│ intermittent               ┆ 0.07003    │
│ always_present             ┆ 0.352584   │
└────────────────────────────┴────────────┘

recent_mileage
shape: (3, 2)
┌───────────────────────┬────────────┐
│ recent_mileage_status ┆ proportion │
│ ---                   ┆ ---        │
│ str                   ┆ f64        │
╞═══════════════════════╪════════════╡
│ always_present        ┆ 0.352584   │
│ always_null           ┆ 0.577386   │
│ intermittent          ┆ 0.07003    │
└───────────────────────┴────────────┘

mcs150_mileage_year
shape: (3, 2)
┌────────────────────────────┬────────────┐
│ mcs150_mileage_year_status ┆ proportion │
│ ---                        ┆ ---        │
│ str                

In [35]:
# One row per column and one column per status, holding the share of
# dot_numbers in that status. Uses `unpivot` and `pivot(on=...)` instead of
# the deprecated `melt` and `pivot(columns=...)`.
status_cols = [f"{c}_status" for c in num_cols]

(
    summary
    .select(status_cols)
    .unpivot(on=status_cols, variable_name="column", value_name="status")
    .group_by(["column", "status"])
    .len()
    .with_columns(
        # Count of each status divided by the total count for that column.
        (pl.col("len") / pl.sum("len").over("column")).alias("proportion")
    )
    .pivot(on="status", index="column", values="proportion")
    # A status that never occurs for a column has no row to pivot; show it as 0.
    .fill_null(0.0)
    .sort("always_null", descending=True)
)

  .melt(variable_name="column", value_name="status")
  .pivot(index="column", columns="status", values="proportion")


column,intermittent,always_null,always_present
str,f64,f64,f64
"""vmt_source_id_status""",0.07003,0.577387,0.352583
"""recent_mileage_year_status""",0.07003,0.577386,0.352584
"""recent_mileage_status""",0.07003,0.577386,0.352584
"""mcs150_mileage_year_status""",0.024651,0.375511,0.599838
"""mcs150_mileage_status""",0.024612,0.344846,0.630543
"""nbr_power_unit_status""",0.001012,0.03657,0.962418
"""driver_total_status""",0.000104,0.00139,0.998505
