In [1]:
import polars as pl
import numpy as np

## Clean Address Data

In [4]:
# Load the geocoded address table and display its (rows, columns) shape.
addrs_df = pl.read_parquet("../data/geocoded_addresses.parquet")

addrs_df.shape

(2089645, 9)

In [5]:
# Read every partition file of the matching output. File names carry a numeric
# prefix running from 0 to 2,089,000 in steps of 1,000 (presumably the offset
# of the first row in each chunk -- confirm against the job that wrote them).
daxle_matches = [
    pl.read_parquet(f"../data/data_axle_matched_addresses/{offset}_data.parquet")
    for offset in range(0, 2089001, 1000)
]

# Stack the partitions into one frame and replace the value -1 in the
# row-index column with null, so later cells can test it with is_not_null().
daxle_df = pl.concat(daxle_matches).with_columns(
    pl.col("data_axle_row_index").replace(-1, None)
)

In [6]:
# Summary statistics of the `confidence` column across all partitions.
daxle_df["confidence"].describe()

statistic,value
str,f64
"""count""",2089645.0
"""null_count""",0.0
"""mean""",45.222436
"""std""",46.28075
"""min""",0.0
"""25%""",0.0
"""50%""",0.0
"""75%""",96.0
"""max""",100.0


In [7]:
# Persist the concatenated partitions as a single parquet file.
daxle_df.write_parquet("../data/data_axle_matched_addresses.parquet")

## Join Matched Addresses back to Data Axle Data

In [3]:
# Re-open the consolidated match table as a LazyFrame; nothing is read from
# disk until .collect() is called. Note that this rebinds `daxle_df`.
daxle_df = pl.scan_parquet("../data/data_axle_matched_addresses.parquet")

In [4]:
# Lazily scan the full Data Axle file and attach a row-number column named
# `data_axle_row_index`, the same name used by the match table.
gold = pl.scan_parquet("../data/data-axle.parquet").with_row_index(
    name="data_axle_row_index"
)


In [5]:
# 1. Keep only the address rows that point at a Data Axle row (non-null index).
attempted_matches = daxle_df.filter(pl.col("data_axle_row_index").is_not_null())

# 2. Materialise those rows exactly once. The index vector and the frame the
#    Data Axle columns are attached to then come from the same execution, so
#    they are guaranteed to be row-aligned.
attempted_df = attempted_matches.collect()

indices = attempted_df.get_column("data_axle_row_index")
indices_np = np.array(indices, dtype=int)

if indices_np.size == 0:
    # np.max on an empty array raises an opaque ValueError; fail with a
    # message that names the actual problem instead.
    raise ValueError("no address rows reference a Data Axle row; nothing to join")

# 3. Only rows 0..max(index) of the Data Axle file can be referenced, so
#    truncate the lazy scan *before* collecting. This avoids loading the whole
#    file into memory just to throw the tail away.
daxle_subset = gold.head(int(indices_np.max()) + 1).collect()

# 4. Gather the referenced rows, in the same order as `attempted_df`
#    (indices may repeat; each occurrence yields its own row).
daxle_subset = daxle_subset[indices_np]

# 5. Attach the gathered Data Axle columns side by side. `daxle_subset` also
#    carries a `data_axle_row_index` column, which replaces the one already
#    present in `attempted_df`.
result = attempted_df.with_columns(daxle_subset)


In [6]:
# Preview the first rows of the combined address / Data Axle frame.
result.head()


id,input_address,match_status,match_type,matched_address,tiger_line_id,side,lat,lon,best_match,confidence,data_axle_row_index,parent_number,archive_version_year,abi,ticker,company,address_line_1,city,state,zipcode,zip4,county_code,area_code,idcode,location_employee_size_code,location_sales_volume_code,primary_sic_code,sic6_descriptions,primary_naics_code,naics8_descriptions,sic_code,sic6_descriptions_sic,sic_code_1,sic6_descriptions_sic1,sic_code_2,sic6_descriptions_sic2,sic_code_3,sic6_descriptions_sic3,sic_code_4,sic6_descriptions_sic4,yellow_page_code,business_status_code,industry_specific_first_byte,office_size_code,company_holding_status,subsidiary_number,parent_employee_size_code,parent_sales_volume_code,site_number,address_type_indicator,population_code,census_tract,census_block,match_code,cbsa_code,cbsa_level,csa_code,fips_code,year_established,employee_size_location,sales_volume_location,parent_actual_employee_size,parent_actual_sales_volume,latitude,longitude
str,str,cat,cat,str,str,cat,f64,f64,str,f64,u32,str,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64
"""1001874""","""450 E 800 N #4, HURRICANE, UT…","""Match""","""Exact""","""450 E 800 N, HURRICANE, UT, 84…","""166175747""","""R""",37.188072,-113.281306,"""766 N 325 E, HURRICANE, UT, 84…",81.818182,15998293,,2023.0,"""786670700""",,"""BRYAN D CHAMBERLAIN""","""766 N 325 E""","""HURRICANE""","""UT""","""84737""","""1777""","""053""","""435""","""2""","""B""",,"""651498""","""OPERATORS OF DWELLINGS OTHER T…","""53111007""","""LESSORS OF RESIDENTIAL BUILDIN…","""999966""","""FEDERAL GOVERNMENT CONTRACTORS""",,,,,,,,,,"""9""",,,,,,,,,"""6""","""270902""","""3""","""P""","""41100""","""2""","""000""","""49053""",,5.0,,,,37.187733,-113.281891
"""1001631""","""712 17TH AVE SOUTH, ST CLOUD, …","""Match""","""Exact""","""712 17TH AVE S, SAINT CLOUD, M…","""107415076""","""L""",45.552923,-94.171028,"""712 17TH AVE S, ST CLOUD, MN, …",95.890411,5399597,,2023.0,"""440425094""",,"""GOOD NEWS ASSEMBLY OF GOD""","""712 17TH AVE S""","""ST CLOUD""","""MN""","""56301""","""4125""","""145""","""320""","""2""","""A""",,"""866107""","""CHURCHES""","""81311008""","""RELIGIOUS ORGANIZATIONS""",,,,,,,,,,,,"""9""","""2""",,,,,,,,"""7""","""000301""","""2""","""P""","""41060""","""2""","""378""","""27145""",,2.0,0.0,,,45.552687,-94.170886
"""1001634""","""5875 GAUTHIER ROAD, NEW FRANKE…","""Match""","""Exact""","""5875 GAUTHIER RD, NEW FRANKEN,…","""148091239""","""L""",44.538088,-87.786913,"""5875 GAUTHIER RD, NEW FRANKEN,…",100.0,5642255,,2023.0,"""466186236""",,"""HAROLD TAUSCHEK & SONS EXCAVTG""","""5875 GAUTHIER RD""","""NEW FRANKEN""","""WI""","""54229""","""9304""","""009""","""920""","""2""","""C""","""C""","""179403""","""EXCAVATING CONTRACTORS""","""23891006""","""SITE PREPARATION CONTRACTORS""","""179407""","""LAND CLEARING & LEVELING""","""179502""","""DEMOLITION CONTRACTORS""","""179952""","""LAKE & POND CONSTRUCTION & MAI…",,,,,"""30002""","""9""",,,,,,,,,"""4""","""020100""","""2""","""P""","""24580""","""2""","""267""","""55009""",,10.0,1590.0,,,44.538971,-87.786863
"""1000309""","""2824 BETHEL ROAD, CHESTER, PA,…","""Match""","""Exact""","""2824 BETHEL RD, CHESTER, PA, 1…","""134889024""","""R""",39.843107,-75.402494,"""2757 BETHEL RD, CHESTER, PA, 1…",91.176471,48620,,2023.0,"""731602518""",,"""ATM""","""2757 BETHEL RD""","""CHESTER""","""PA""","""19013""","""1401""","""045""","""610""","""2""",,,"""602103""","""AUTOMATED TELLER MACHINES""","""52211001""","""COMMERCIAL BANKING""",,,,,,,,,,,,"""9""","""�""",,,,,,,,"""7""","""405300""","""1""","""P""","""37980""","""2""","""428""","""42045""",,0.0,0.0,,,39.842915,-75.401779
"""1000789""","""14516 NEEDHAM DR, JACKSONVILLE…","""Match""","""Exact""","""14516 NEEDHAM DR, JACKSONVILLE…","""655982815""","""R""",30.116927,-81.484954,"""12086 WILLIAMSTOWN DR, JACKSON…",75.862069,13672592,,2023.0,"""750283758""",,"""P1:3 APPAREL""","""12086 WILLIAMSTOWN DR""","""JACKSONVILLE""","""FL""","""32256""","""0869""","""031""","""904""","""2""","""A""","""A""","""569947""","""APPAREL & GARMENTS-RETAIL""","""44819002""","""OTHER CLOTHING STORES""",,,,,,,,,,,,"""9""",,,,,,,,,"""7""","""014412""","""1""","""P""","""27260""","""2""","""300""","""12031""",,4.0,472.0,,,30.116573,-81.485801


In [11]:
# Persist the combined frame so the next section can start from disk.
result.write_parquet("../data/filtered_data_axle_records_with_dot.parquet")

## Link Back to Census Data

In [50]:
# Reload the combined frame, exposing its `id` column under the census key
# name `DOT_NUMBER` and casting it to Int64 (the dtype of the census key).
result = (
    pl.read_parquet("../data/filtered_data_axle_records_with_dot.parquet")
    .rename({"id": "DOT_NUMBER"})
    .with_columns(pl.col("DOT_NUMBER").cast(pl.Int64))
)

# Motor carrier census extract that the records are linked back to.
df = pl.read_parquet(
    "../data/SMS_Input_-_Motor_Carrier_Census_Information_20250919.parquet"
)

In [51]:
# Inspect the column names and dtypes of the census table.
df.schema

Schema([('DOT_NUMBER', Int64),
        ('LEGAL_NAME', String),
        ('DBA_NAME', String),
        ('CARRIER_OPERATION', String),
        ('HM_FLAG', Boolean),
        ('PC_FLAG', Boolean),
        ('PHY_STREET', String),
        ('PHY_CITY', String),
        ('PHY_STATE', String),
        ('PHY_ZIP', String),
        ('PHY_COUNTRY', String),
        ('MAILING_STREET', String),
        ('MAILING_CITY', String),
        ('MAILING_STATE', String),
        ('MAILING_ZIP', String),
        ('MAILING_COUNTRY', String),
        ('TELEPHONE', String),
        ('FAX', String),
        ('EMAIL_ADDRESS', String),
        ('MCS150_DATE', String),
        ('MCS150_MILEAGE', Int64),
        ('MCS150_MILEAGE_YEAR', Int64),
        ('ADD_DATE', String),
        ('OIC_STATE', String),
        ('NBR_POWER_UNIT', Int64),
        ('DRIVER_TOTAL', Int64),
        ('RECENT_MILEAGE', Int64),
        ('RECENT_MILEAGE_YEAR', Int64),
        ('VMT_SOURCE_ID', Int64),
        ('PRIVATE_ONLY', Boolean),
      

In [52]:
# Left-join the Data Axle columns onto the census table by DOT_NUMBER, keeping
# every census row (rows without a counterpart get nulls).
# NOTE(review): this rebinds `df`, so re-running the cell joins a second time.
# NOTE(review): a left join multiplies census rows if `result` has repeated
# DOT_NUMBER values -- confirm the key is unique in `result`.
df = df.join(result, on="DOT_NUMBER", how="left")

In [53]:
from polars_strsim import jaro_winkler

# Positions inside the `similarities` list built below:
#   0 -> company vs LEGAL_NAME, 1 -> company vs DBA_NAME.
legal_score = pl.col("similarities").list.get(0)
dba_score = pl.col("similarities").list.get(1)

# Census name that goes with the larger score. LEGAL_NAME wins when the DBA
# score is missing and on ties; DBA_NAME is used in every other case.
best_name = (
    pl.when(dba_score.is_null())
    .then(pl.col("LEGAL_NAME"))
    .when(legal_score >= dba_score)
    .then(pl.col("LEGAL_NAME"))
    .otherwise(pl.col("DBA_NAME"))
)

df = (
    df.with_columns(
        # Scores are only computed for rows that have a Data Axle company
        # name; all other rows get a null list.
        pl.when(pl.col("company").is_not_null())
        .then(
            pl.concat_list(
                [
                    jaro_winkler("company", "LEGAL_NAME"),
                    jaro_winkler("company", "DBA_NAME"),
                ]
            )
        )
        .alias("similarities")
    )
    .with_columns(
        pl.when(pl.col("similarities").is_not_null())
        .then(best_name)
        .alias("best_name_match"),
        pl.col("similarities").list.max().alias("best_similarity_name_score"),
    )
    # Highest scores first; null scores are mapped to -1 for ordering only,
    # which places them at the end.
    .sort(pl.col("best_similarity_name_score").fill_null(-1), descending=True)
)
        
    

#df.glimpse()

# df = df.with_columns([
#     pl.col("similarities").list.max().alias("best_similarity"),
#     pl.when(
#         pl.col("company").str.similarity(pl.col("LEGAL_NAME"))
#         >= pl.col("company").str.similarity(pl.col("DBA_NAME"))
#     )
#     .then(pl.col("LEGAL_NAME"))
#     .otherwise(pl.col("DBA_NAME"))
#     .alias("best_name_match")
# ])

In [54]:
# Summary statistics of the best name-similarity score (null where no score
# was computed).
df["best_similarity_name_score"].describe()

statistic,value
str,f64
"""count""",1025741.0
"""null_count""",1065902.0
"""mean""",0.610684
"""std""",0.170549
"""min""",0.0
"""25%""",0.507218
"""50%""",0.562121
"""75%""",0.632714
"""max""",1.0


In [55]:
import altair as alt
alt.data_transformers.enable("vegafusion")

# The histogram uses a single column, so select it before converting to
# pandas instead of converting the whole wide frame (which includes a list
# column). The `>= 0` filter also drops rows whose score is null.
scores_pd = (
    df
    .filter(pl.col("best_similarity_name_score") >= 0)
    .select("best_similarity_name_score")
    .to_pandas()
)

chart = (
    alt.Chart(scores_pd)
    .mark_bar()
    .encode(
        alt.X("best_similarity_name_score:Q", bin=alt.Bin(maxbins=50), title="Best Name Similarity Score"),
        alt.Y("count()", title="Frequency")
    )
    .properties(
        title="Distribution of Company–Name Similarity Scores"
    )
)
chart

In [56]:
# Rows whose best name-similarity score is strictly above 0.8, ordered from
# the lowest to the highest score.
matches = (
    df
    .filter(pl.col("best_similarity_name_score") > 0.8)
    .sort("best_similarity_name_score")
)

In [57]:
# Per-column summary statistics of the retained matches.
matches.describe()

statistic,DOT_NUMBER,LEGAL_NAME,DBA_NAME,CARRIER_OPERATION,HM_FLAG,PC_FLAG,PHY_STREET,PHY_CITY,PHY_STATE,PHY_ZIP,PHY_COUNTRY,MAILING_STREET,MAILING_CITY,MAILING_STATE,MAILING_ZIP,MAILING_COUNTRY,TELEPHONE,FAX,EMAIL_ADDRESS,MCS150_DATE,MCS150_MILEAGE,MCS150_MILEAGE_YEAR,ADD_DATE,OIC_STATE,NBR_POWER_UNIT,DRIVER_TOTAL,RECENT_MILEAGE,RECENT_MILEAGE_YEAR,VMT_SOURCE_ID,PRIVATE_ONLY,AUTHORIZED_FOR_HIRE,EXEMPT_FOR_HIRE,PRIVATE_PROPERTY,PRIVATE_PASSENGER_BUSINESS,PRIVATE_PASSENGER_NONBUSINESS,MIGRANT,…,sic6_descriptions_sic,sic_code_1,sic6_descriptions_sic1,sic_code_2,sic6_descriptions_sic2,sic_code_3,sic6_descriptions_sic3,sic_code_4,sic6_descriptions_sic4,yellow_page_code,business_status_code,industry_specific_first_byte,office_size_code,company_holding_status,subsidiary_number,parent_employee_size_code,parent_sales_volume_code,site_number,address_type_indicator,population_code,census_tract,census_block,match_code,cbsa_code,cbsa_level,csa_code,fips_code,year_established,employee_size_location,sales_volume_location,parent_actual_employee_size,parent_actual_sales_volume,latitude,longitude,similarities,best_name_match,best_similarity_name_score
str,f64,str,str,str,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,str,f64
"""count""",173035.0,"""173035""","""50574""","""173035""",173035.0,173035.0,"""173035""","""173035""","""173035""","""173035""","""173035""","""173033""","""173033""","""173033""","""173033""","""173033""","""172757""","""89765""","""138393""","""162023""",134501.0,131640.0,"""173035""","""173035""",166664.0,172919.0,173035.0,173035.0,91389.0,173035.0,173035.0,173035.0,173035.0,173035.0,173035.0,173035.0,…,"""105434""","""67224""","""67109""","""43253""","""43170""","""28714""","""28677""","""19330""","""19299""","""72929""","""173035""","""5171""","""0""","""79""","""3277""","""5906""","""6602""","""15435""","""719""","""173035""","""173035""","""173035""","""173035""","""173035""","""160677""","""173035""","""173035""",70912.0,173035.0,170250.0,5906.0,6832.0,173035.0,173035.0,173035.0,"""173035""",173035.0
"""null_count""",0.0,"""0""","""122461""","""0""",0.0,0.0,"""0""","""0""","""0""","""0""","""0""","""2""","""2""","""2""","""2""","""2""","""278""","""83270""","""34642""","""11012""",38534.0,41395.0,"""0""","""0""",6371.0,116.0,0.0,0.0,81646.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,"""67601""","""105811""","""105926""","""129782""","""129865""","""144321""","""144358""","""153705""","""153736""","""100106""","""0""","""167864""","""173035""","""172956""","""169758""","""167129""","""166433""","""157600""","""172316""","""0""","""0""","""0""","""0""","""0""","""12358""","""0""","""0""",102123.0,0.0,2785.0,167129.0,166203.0,0.0,0.0,0.0,"""0""",0.0
"""mean""",2163100.0,,,,0.010616,0.02737,,,,,,,,,,,,,,,410452.008119,2023.150068,,,27.629818,8.626166,247909.270205,1068.986234,1.031929,0.592314,0.346849,0.042228,0.660173,0.027815,0.015598,0.00026,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,1983.172411,17.453873,5248.977122,368.069252,4084100.0,38.676906,-92.830763,,,0.951045
"""std""",1106000.0,,,,,,,,,,,,,,,,,,,,25485000.0,133.617337,,,1646.861831,285.839063,13186000.0,1012.489186,0.247251,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,32.308695,73.105108,29781.866437,1203.03973,9288100.0,5.109101,17.368025,,,0.051394
"""min""",44.0,"""'ROUND THE CLOCK TOWING LLC""","""+DAM HALL'S NURSERY PLANTS""","""A""",0.0,0.0,"""# 1 INDUSTRIAL PARK RD""","""1025 SANTIAGO ST""","""AK""","""00603-1405""","""CR""","""# 1 DOBSON LANE""","""10540 WINNETKA AVE N""","""AK""","""00603-1405""","""CA""","""( 20) 325-5150""","""( ) - 0""","""00""","""01-APR-02""",0.0,0.0,"""01-APR-02""","""AK""",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,"""ABDOMINAL SUPPORTS (WHLS)""","""011598""","""ABORTION INFORMATION & SERVICE…","""019101""","""ABORTION INFORMATION & SERVICE…","""075210""","""A T M DEALERS-SERVICE & REPAIR""","""075220""","""ABORTION ALTERNATIVES ORGANIZA…","""00002""","""1""","""!""",,"""1""","""000028704""","""A""","""A""","""000013177""","""N""","""0""","""000100""","""1""","""0""","""00000""","""1""","""000""","""01001""",1633.0,0.0,0.0,1.0,0.0,17.988632,-159.511108,,"""'ROUND THE CLOCK TOWING LLC""",0.8
"""25%""",1260546.0,,,,,,,,,,,,,,,,,,,,8000.0,2021.0,,,1.0,1.0,0.0,0.0,1.0,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,1966.0,3.0,303.0,25.0,117638.0,34.574598,-105.239485,,,0.920261
"""50%""",2186832.0,,,,,,,,,,,,,,,,,,,,25000.0,2023.0,,,2.0,2.0,30.0,2021.0,1.0,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,1987.0,5.0,823.0,100.0,633287.0,39.752763,-87.356346,,,0.962963
"""75%""",2991590.0,,,,,,,,,,,,,,,,,,,,97000.0,2024.0,,,5.0,5.0,42453.0,2024.0,1.0,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,2012.0,15.0,2824.0,350.0,3029097.0,42.177115,-79.366639,,,1.0
"""max""",4460637.0,"""ZZK AUTO SALES LLC""","""ZZ ENTERPRISES""","""C""",1.0,1.0,"""WEST 1427 DEAN""","""ZWOLLE""","""WY""","""99901-9755""","""US""","""WEST 1427 DEAN""","""ZWOLLE""","""WY""","""N0R 1L0""","""US""","""(999) 999-9999""","""(999) 999-9999""","""ZZUPIN@ZUPINCRANE.COM""","""31-OCT-24""",7500100000.0,9900.0,"""31-OCT-98""","""WY""",499995.0,99995.0,4000000000.0,9800.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,…,"""ZOOS""","""999966""","""ZOOS""","""999966""","""ZIPPERS (WHLS)""","""999966""","""YOUTH ORGANIZATIONS & CENTERS""","""999966""","""YARD SIGNS""","""95003""","""9""","""�""",,"""1""","""995070737""","""K""","""K""","""998974554""","""R""","""8""","""989100""","""9""","""X""","""49820""","""2""","""566""","""72131""",2024.0,10000.0,3590883.0,35000.0,99999999.0,64.920831,-65.96403,,"""ZZK AUTO SALES LLC""",1.0


In [58]:
# (rows, columns) of the census table after the join and scoring steps.
df.shape

(2091643, 110)

In [59]:
# Persist the joined and scored table for the analysis section.
df.write_parquet("../data/fully_joined_census_and_data_axle.parquet")

In [60]:
# Shape of the subset of rows that have a non-null Data Axle `company` value.
df.filter(pl.col("company").is_not_null()).shape

(1025741, 110)

In [61]:
# (rows, columns) of the matches above the similarity threshold.
matches.shape

(173035, 110)

In [62]:
# Share of all rows in `df` that ended up in `matches`. The numerator is taken
# from `matches` itself rather than a number copied from an earlier output, so
# the ratio stays correct when the data or the threshold changes.
len(matches) / len(df)


0.08272683244702848

In [65]:
# Eyeball the weakest retained matches (`matches` is sorted by ascending
# score): both census names next to the Data Axle company name.
(
    matches
    .select("LEGAL_NAME", "DBA_NAME", "company", "best_similarity_name_score")
    .head(9)
)

LEGAL_NAME,DBA_NAME,company,best_similarity_name_score
str,str,str,f64
"""MASON M LESTER""","""LESTER TRUCKING""","""MARTY LESTER TRUCKING INC""",0.8
"""FRASER CONSTRUCTION""",,"""FRONTIER PRODUCTIONS LLC""",0.800034
"""ELITE LANDSCAPE SERVICES""",,"""PREMIER LANDSCAPE SVC""",0.80007
"""HF JOHNSON TREE FARM LLC""",,"""H F JOHNSON TREE FARM LANDSCPG""",0.800109
"""TRI GROUP CONSTRUCTION & DEVEL…",,"""TRI-GROUP CONSTR & DEVMNT""",0.800111
"""RW HAGGERTY POOL SERVICE INC""",,"""HAGGERTY R W POOL SVC INC""",0.800159
"""BERNARD A SNELL JR""",,"""BERKSHIRE VALLEY DAIRY LLC""",0.800171
"""B&N OILFIELD EQUIPMENT CO INC""",,"""B & N OILFIELD EQPT CO INC""",0.800199
"""ROBERT WILDERMUTH""","""WILDERMUTH FARMS""","""ERIC WILDERMUTH""",0.800201


In [66]:
# Inspect the column names and dtypes available on the matched rows.
matches.schema

Schema([('DOT_NUMBER', Int64),
        ('LEGAL_NAME', String),
        ('DBA_NAME', String),
        ('CARRIER_OPERATION', String),
        ('HM_FLAG', Boolean),
        ('PC_FLAG', Boolean),
        ('PHY_STREET', String),
        ('PHY_CITY', String),
        ('PHY_STATE', String),
        ('PHY_ZIP', String),
        ('PHY_COUNTRY', String),
        ('MAILING_STREET', String),
        ('MAILING_CITY', String),
        ('MAILING_STATE', String),
        ('MAILING_ZIP', String),
        ('MAILING_COUNTRY', String),
        ('TELEPHONE', String),
        ('FAX', String),
        ('EMAIL_ADDRESS', String),
        ('MCS150_DATE', String),
        ('MCS150_MILEAGE', Int64),
        ('MCS150_MILEAGE_YEAR', Int64),
        ('ADD_DATE', String),
        ('OIC_STATE', String),
        ('NBR_POWER_UNIT', Int64),
        ('DRIVER_TOTAL', Int64),
        ('RECENT_MILEAGE', Int64),
        ('RECENT_MILEAGE_YEAR', Int64),
        ('VMT_SOURCE_ID', Int64),
        ('PRIVATE_ONLY', Boolean),
      

In [None]:
'location_employee_size_code', String),
        ('location_sales_volume_code', String),
        ('primary_sic_code', String),
        ('sic6_descriptions', String),
        ('primary_naics_code', String),
        ('naics8_descriptions', String),
        ('sic_code', String),
        ('sic6_descriptions_sic', String),
        ('sic_code_1', String),
        ('sic6_descriptions_sic1', String),
        ('sic_code_2', String),
        ('sic6_descriptions_sic2', String),
        ('sic_code_3', String),
        ('sic6_descriptions_sic3', String),
        ('sic_code_4', String),
        ('sic6_descriptions_sic4', String),

In [71]:
# Relative frequency of each NAICS description among the rows in `matches`,
# most frequent first.
(
    matches["naics8_descriptions"]
    .value_counts(normalize=True)
    .sort("proportion", descending=True)
)

shape: (924, 2)
┌─────────────────────────────────┬────────────┐
│ naics8_descriptions             ┆ proportion │
│ ---                             ┆ ---        │
│ str                             ┆ f64        │
╞═════════════════════════════════╪════════════╡
│ SPECIALIZED FREIGHT (EXC USED … ┆ 0.063675   │
│ LANDSCAPING SERVICES            ┆ 0.059167   │
│ NEW SINGLE-FAMILY HSNG CONSTR … ┆ 0.039767   │
│ SITE PREPARATION CONTRACTORS    ┆ 0.035993   │
│ UNCLASSIFIED ESTABLISHMENTS     ┆ 0.026937   │
│ …                               ┆ …          │
│ PHOSPHATIC FERTILIZER MANUFACT… ┆ 0.000006   │
│ ALL OTHER SUPPORT ACTIVITIES F… ┆ 0.000006   │
│ LESSORS OF NON-FINANCIAL INTAN… ┆ 0.000006   │
│ INTERNATIONAL AFFAIRS           ┆ 0.000006   │
│ COMMUNICATION EQUIPMENT REPAIR… ┆ 0.000006   │
└─────────────────────────────────┴────────────┘


## Analyze Complete Data

In [2]:
# Start the analysis from the persisted joined table.
df = pl.read_parquet("../data/fully_joined_census_and_data_axle.parquet")

In [4]:
# Proportion of rows per `match_type` value; null is counted as its own group.
df["match_type"].value_counts(normalize=True)

match_type,proportion
cat,f64
"""Exact""",0.30007
,0.503798
"""Non_Exact""",0.196132


In [6]:
# Fraction of rows with no `lat` value.
df["lat"].null_count() / df.height

0.5037982103064433

In [8]:
# Fraction of rows with no physical street address in the census data.
df["PHY_STREET"].null_count() / df.height

0.0007874192680108412

In [10]:
# Compare the row count of the geocoded address table with the joined table.
# A negative result means `df` has more rows than the geocoded file.
# NOTE(review): the cause of any difference is not established here -- it
# could be extra census rows or rows multiplied by the join; verify.
rdf = pl.read_parquet("../data/geocoded_addresses.parquet")
len(rdf) - len(df)

-1998

In [11]:
# Total number of rows in the joined table.
len(df)

2091643