# Load Data

In [3]:
import os

import polars as pl

# Location of the dataset (.parquet or .csv).
# Set the TRANSPORTATION_DATA_PATH environment variable to point at your own
# copy; the fallback below is the original author's local path.
DATA_PATH = os.environ.get(
    "TRANSPORTATION_DATA_PATH",
    "/Users/jareddonohue/Desktop/Capstone/data/transportation_data_20250917_222245.parquet",
)

# Load Data
# Pick the reader from the file extension so a .csv path works as advertised
# instead of failing inside the parquet reader.
if DATA_PATH.lower().endswith(".csv"):
    df = pl.read_csv(DATA_PATH)
else:
    df = pl.read_parquet(DATA_PATH)

print(f"Loaded dataset with {df.shape[0]} rows and {df.shape[1]} columns.")
print("Columns:", df.columns)

Loaded dataset with 2091643 rows and 42 columns.
Columns: ['dot_number', 'legal_name', 'dba_name', 'carrier_operation', 'hm_flag', 'pc_flag', 'phy_street', 'phy_city', 'phy_state', 'phy_zip', 'phy_country', 'mailing_street', 'mailing_city', 'mailing_state', 'mailing_zip', 'mailing_country', 'telephone', 'fax', 'email_address', 'mcs150_date', 'mcs150_mileage', 'mcs150_mileage_year', 'add_date', 'oic_state', 'nbr_power_unit', 'driver_total', 'recent_mileage', 'recent_mileage_year', 'vmt_source_id', 'private_only', 'authorized_for_hire', 'exempt_for_hire', 'private_property', 'private_passenger_business', 'private_passenger_nonbusiness', 'migrant', 'us_mail', 'federal_government', 'state_government', 'local_government', 'indian_tribe', 'op_other']


# Summary Statistics

In [4]:
# Numeric + categorical summary + missing values (all-in-one)
print(df.describe())

# Nulls per column
print("Missing values per column:\n", df.null_count())

# (Optional) Most common values for the first 3 string columns (as an example).
# value_counts() is unsorted by default, so sort=True is required for head()
# to actually show the most frequent values rather than arbitrary ones.
for col in df.select(pl.col(pl.Utf8)).columns[:3]:
    print(f"{col} value counts:\n", df[col].value_counts(sort=True).head())

shape: (9, 43)
┌───────────┬───────────┬───────────┬──────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ statistic ┆ dot_numbe ┆ legal_nam ┆ dba_name ┆ … ┆ state_gov ┆ local_gov ┆ indian_tr ┆ op_other  │
│ ---       ┆ r         ┆ e         ┆ ---      ┆   ┆ ernment   ┆ ernment   ┆ ibe       ┆ ---       │
│ str       ┆ ---       ┆ ---       ┆ str      ┆   ┆ ---       ┆ ---       ┆ ---       ┆ str       │
│           ┆ f64       ┆ str       ┆          ┆   ┆ f64       ┆ f64       ┆ f64       ┆           │
╞═══════════╪═══════════╪═══════════╪══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ count     ┆ 2.091643e ┆ 2091499   ┆ 556551   ┆ … ┆ 2.091643e ┆ 2.091643e ┆ 2.091643e ┆ 2091504   │
│           ┆ 6         ┆           ┆          ┆   ┆ 6         ┆ 6         ┆ 6         ┆           │
│ null_coun ┆ 0.0       ┆ 144       ┆ 1535092  ┆ … ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ 139       │
│ t         ┆           ┆           ┆          ┆   ┆           ┆           ┆

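# Take a Random Sample (n=100)
This sample will be used to estimate the quality of the dataset. At 95% confidence, and assuming the worst case $\hat{p} = 0.5$, the following margins of error (MoE) are expected for each sample size (n); the calculation is derived below.
* n=10 --> MoE ≈ 31.0%
* n=100 --> MoE ≈ 9.8%
* n=1000 --> MoE ≈ 3.1%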

In [5]:

import random

def sample_rows(data, n=100, seed=None, columns=None):
    """Draw a random sample of rows restricted to the columns under review.

    Args:
        data: Frame to sample from. It must provide ``columns``, ``select``
            and ``sample`` the way a polars DataFrame does.
        n: Number of rows to draw; rows are drawn without replacement.
        seed: Optional seed forwarded to ``data.sample``. Pass an integer to
            get the same rows on every run; ``None`` leaves the choice of
            randomness to the frame's own sampler.
        columns: Optional iterable of column names to keep. When omitted,
            the eight key columns used for the data-quality annotation are
            used.

    Returns:
        The sampled frame, containing only the requested columns that exist
        in ``data``.
    """
    if columns is None:
        # Default: the key columns annotated for the data-quality estimate.
        columns = [
            "legal_name", "phy_state", "phy_city", "phy_zip", "email_address",
            "driver_total", "mcs150_mileage", "mcs150_mileage_year"
        ]
    # Requested columns that the frame does not have are skipped rather than
    # raising, so the helper keeps working on trimmed-down extracts.
    available_cols = [c for c in columns if c in data.columns]
    sample = data.select(available_cols).sample(
        n=n, with_replacement=False, seed=seed
    )
    return sample

# Fix the global polars seed before sampling so the same 100 rows are drawn
# after every Restart & Run All; manual annotations made against this sample
# would otherwise no longer match the rows shown.
SAMPLE_SEED = 42
pl.set_random_seed(SAMPLE_SEED)

sample_100 = sample_rows(df, n=100)
print(sample_100)

shape: (100, 8)
┌────────────┬───────────┬────────────┬────────────┬───────────┬───────────┬───────────┬───────────┐
│ legal_name ┆ phy_state ┆ phy_city   ┆ phy_zip    ┆ email_add ┆ driver_to ┆ mcs150_mi ┆ mcs150_mi │
│ ---        ┆ ---       ┆ ---        ┆ ---        ┆ ress      ┆ tal       ┆ leage     ┆ leage_yea │
│ str        ┆ str       ┆ str        ┆ str        ┆ ---       ┆ ---       ┆ ---       ┆ r         │
│            ┆           ┆            ┆            ┆ str       ┆ f64       ┆ f64       ┆ ---       │
│            ┆           ┆            ┆            ┆           ┆           ┆           ┆ f64       │
╞════════════╪═══════════╪════════════╪════════════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ AARON      ┆ TX        ┆ FORT WORTH ┆ 76123-2582 ┆ AARONMARI ┆ 1.0       ┆ 50000.0   ┆ 2024.0    │
│ HAULING    ┆           ┆            ┆            ┆ N2004@GMA ┆           ┆           ┆           │
│ LLC        ┆           ┆            ┆            ┆ IL.COM    ┆           

# Estimate `data_quality_rate` using Annotated Random Sample

We define the `data_quality_rate = (#good_annotated_entries) / (#total_annotated_entries)` for the following key columns in the dataset:
- legal_name
- phy_state
- phy_city
- phy_zip
- email_address
- driver_total
- mcs150_mileage
- mcs150_mileage_year

To calculate the margin of error for the `data_quality_rate` (a proportion) estimated from a sample of size $n$ drawn from a population of $N$ = 2,091,643 rows, we use the standard margin of error formula for proportions, incorporating a finite population correction.

**Margin of Error Formula for Proportions**

$$
\text{MOE} = z \cdot \sqrt{\frac{\hat{p}(1-\hat{p})}{n}} \cdot \sqrt{\frac{N-n}{N-1}}
$$

- $z$ = z-score for confidence level (use 1.96 for 95% confidence)
- $\hat{p}$ = observed sample proportion ("data_quality_rate")
- $n$ = sample size (10, 100, or 1000)
- $N$ = total population size (2,091,643)
- The last term is the finite population correction

We don't know $\hat{p}$ beforehand, so we use $\hat{p} = 0.5$ for a conservative (maximum) margin of error.

We find the following margins of error for each sample size $n$ (95% confidence, $\hat{p} = 0.5$).

- **n=10**: Margin of Error ≈ 31.0%
- **n=100**: Margin of Error ≈ 9.8%
- **n=1000**: Margin of Error ≈ 3.1%

In [8]:
import numpy as np

def margin_of_error(n, N, p=0.5, conf=0.95):
    """Margin of error for a sample proportion, with finite population correction.

    Computes ``z * sqrt(p * (1 - p) / n) * sqrt((N - n) / (N - 1))``.

    Args:
        n: Sample size; must satisfy ``0 < n <= N``.
        N: Population size.
        p: Proportion to evaluate at. The default 0.5 gives the largest
            (most conservative) margin; update it with the observed sample
            proportion after doing annotations.
        conf: Two-sided confidence level, strictly between 0 and 1.

    Returns:
        The margin of error as a fraction (multiply by 100 for percent).

    Raises:
        ValueError: If ``n``, ``p`` or ``conf`` is outside its valid range.
    """
    from statistics import NormalDist

    if not 0 < n <= N:
        raise ValueError(f"n must satisfy 0 < n <= N, got n={n}, N={N}")
    if not 0 <= p <= 1:
        raise ValueError(f"p must be between 0 and 1, got {p}")
    if not 0 < conf < 1:
        raise ValueError(f"conf must be strictly between 0 and 1, got {conf}")

    # Keep the conventional rounded z-scores for the two common levels so
    # previously reported numbers do not shift; every other level gets its
    # own quantile instead of silently falling back to the 90% value.
    conventional_z = {0.95: 1.96, 0.90: 1.645}
    z = conventional_z.get(round(conf, 10))
    if z is None:
        z = NormalDist().inv_cdf(0.5 + conf / 2)

    # With a population of one the whole population is sampled, so the
    # correction is zero (and the general formula would divide by zero).
    fpc = np.sqrt((N - n) / (N - 1)) if N > 1 else 0.0
    return z * np.sqrt(p * (1 - p) / n) * fpc

# Population size: the number of rows reported when the dataset was loaded.
N = 2_091_643

# Worst-case (p = 0.5) margin of error at the default confidence level for
# each candidate annotation sample size.
for n in (10, 100, 1000):
    moe = margin_of_error(n=n, N=N)
    print(f"Sample size {n}: Margin of Error ≈ {moe*100:.1f}%")

Sample size 10: Margin of Error ≈ 31.0%
Sample size 100: Margin of Error ≈ 9.8%
Sample size 1000: Margin of Error ≈ 3.1%
