# Predicting NAICS

This notebook uses our small number of companies with NAICS numbers as a training data set to predict the NAICS numbers of other companies.

In [37]:
import polars as pl
import pandas as pd

In [6]:
lf = pl.scan_parquet("../data/fully_joined_census_and_data_axle.parquet")
#lf.collect_schema()

In [31]:
lf = lf.with_columns([
    pl.col("primary_naics_code").cast(str).alias("naics"),
    pl.col("naics8_descriptions").alias("naics_desc")
])

naics = pl.col("naics")

In [32]:
# Null out NAICS for all unlikely matches.

lf = lf.with_columns(
    pl.when(
        pl.col("best_similarity_name_score") <= 0.8 # Rely on close name matches only.
    ).then(
        pl.lit(None)
    ).otherwise(
        naics
    ).alias("naics")
)

In [33]:
lf_census = lf.select([pl.col("^[A-Z].*$"), naics])

In [66]:
df = lf_census.collect()
# Create proxy columns

df = df.with_columns(
    pl.col("PHY_ZIP").str.slice(0,3).alias("ZIP3")
).with_columns(
    pl.col("ZIP3").cast(pl.Categorical)
).with_columns(
    (pl.col("OP_OTHER").is_not_null() & (pl.col("OP_OTHER") != "N"))
    .cast(pl.Int8)  # optional: 0/1
    .alias("has_op_other")
).with_columns([
    # Convert dates
    pl.col("MCS150_DATE").str.strptime(pl.Date, "%d-%b-%y", strict=False).alias("MCS150_DATE_parsed"),
    pl.col("ADD_DATE").str.strptime(pl.Date, "%d-%b-%y", strict=False).alias("ADD_DATE_parsed")
])

In [68]:
df.select("^.*DATE_parsed$").head()

MCS150_DATE_parsed,ADD_DATE_parsed
date,date
2025-01-02,2002-01-22
2024-02-09,2002-01-22
2023-05-02,2002-01-23
2001-08-14,2002-01-23
2024-12-06,2002-01-23


In [64]:
to_drop = [
    "^.*DATE$"
    "^MAILING.*$",
    "^PHY.*$",
    "TELEPHONE",
    "FAX",
    "EMAIL_ADDRESS",
    "OIC_STATE",
    "OP_OTHER"
]   

df = df.drop(to_drop)
df.schema

Schema([('DOT_NUMBER', Int64),
        ('LEGAL_NAME', String),
        ('DBA_NAME', String),
        ('CARRIER_OPERATION', String),
        ('HM_FLAG', Boolean),
        ('PC_FLAG', Boolean),
        ('MCS150_DATE', String),
        ('MCS150_MILEAGE', Int64),
        ('MCS150_MILEAGE_YEAR', Int64),
        ('ADD_DATE', String),
        ('NBR_POWER_UNIT', Int64),
        ('DRIVER_TOTAL', Int64),
        ('RECENT_MILEAGE', Int64),
        ('RECENT_MILEAGE_YEAR', Int64),
        ('VMT_SOURCE_ID', Int64),
        ('PRIVATE_ONLY', Boolean),
        ('AUTHORIZED_FOR_HIRE', Boolean),
        ('EXEMPT_FOR_HIRE', Boolean),
        ('PRIVATE_PROPERTY', Boolean),
        ('PRIVATE_PASSENGER_BUSINESS', Boolean),
        ('PRIVATE_PASSENGER_NONBUSINESS', Boolean),
        ('MIGRANT', Boolean),
        ('US_MAIL', Boolean),
        ('FEDERAL_GOVERNMENT', Boolean),
        ('STATE_GOVERNMENT', Boolean),
        ('LOCAL_GOVERNMENT', Boolean),
        ('INDIAN_TRIBE', Boolean),
        ('naics', String

In [58]:
df["OP_OTHER"].value_counts(sort=True)

OP_OTHER,count
str,u32
"""N""",2032211
"""APPLYING FOR MC""",16650
"""UNKNOWN""",3418
"""FARMER""",2183
"""FARM""",2029
…,…
"""MECHANIC REPAIR""",1
"""PUMP INSTALLER""",1
"""SERVICING PRIVATE CUSTOMER IN …",1
"""RAW FOREST PROD""",1


In [36]:
df_labeled = df.filter(naics.is_not_null())
df_unlabeled = df.filter(naics.is_null())

In [38]:
pdf = df_labeled.to_pandas()

In [39]:
pdf.describe()

Unnamed: 0,DOT_NUMBER,MCS150_MILEAGE,MCS150_MILEAGE_YEAR,NBR_POWER_UNIT,DRIVER_TOTAL,RECENT_MILEAGE,RECENT_MILEAGE_YEAR,VMT_SOURCE_ID
count,183398.0,140225.0,137112.0,176598.0,183260.0,183398.0,183398.0,94781.0
mean,2224060.0,418408.1,2023.266541,29.785541,8.367904,251357.6,1046.059123,1.031968
std,1124792.0,25542360.0,139.664865,1871.005656,278.01177,13658140.0,1013.710152,0.247442
min,44.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,1320160.0,7400.0,2021.0,1.0,1.0,0.0,0.0,1.0
50%,2298272.0,25000.0,2023.0,2.0,2.0,1.0,2019.0,1.0
75%,3063944.0,95037.0,2024.0,4.0,4.0,40000.0,2024.0,1.0
max,4460652.0,7500075000.0,9900.0,499995.0,99995.0,4000030000.0,9800.0,3.0


In [40]:
df_labeled.schema

Schema([('DOT_NUMBER', Int64),
        ('LEGAL_NAME', String),
        ('DBA_NAME', String),
        ('CARRIER_OPERATION', String),
        ('HM_FLAG', Boolean),
        ('PC_FLAG', Boolean),
        ('PHY_STREET', String),
        ('PHY_CITY', String),
        ('PHY_STATE', String),
        ('PHY_ZIP', String),
        ('PHY_COUNTRY', String),
        ('MAILING_STREET', String),
        ('MAILING_CITY', String),
        ('MAILING_STATE', String),
        ('MAILING_ZIP', String),
        ('MAILING_COUNTRY', String),
        ('TELEPHONE', String),
        ('FAX', String),
        ('EMAIL_ADDRESS', String),
        ('MCS150_DATE', String),
        ('MCS150_MILEAGE', Int64),
        ('MCS150_MILEAGE_YEAR', Int64),
        ('ADD_DATE', String),
        ('OIC_STATE', String),
        ('NBR_POWER_UNIT', Int64),
        ('DRIVER_TOTAL', Int64),
        ('RECENT_MILEAGE', Int64),
        ('RECENT_MILEAGE_YEAR', Int64),
        ('VMT_SOURCE_ID', Int64),
        ('PRIVATE_ONLY', Boolean),
      

In [41]:
pdf.head()

Unnamed: 0,DOT_NUMBER,LEGAL_NAME,DBA_NAME,CARRIER_OPERATION,HM_FLAG,PC_FLAG,PHY_STREET,PHY_CITY,PHY_STATE,PHY_ZIP,...,PRIVATE_PASSENGER_BUSINESS,PRIVATE_PASSENGER_NONBUSINESS,MIGRANT,US_MAIL,FEDERAL_GOVERNMENT,STATE_GOVERNMENT,LOCAL_GOVERNMENT,INDIAN_TRIBE,OP_OTHER,naics
0,1000021,LEITZA EXCAVATING,,C,False,False,230236 COLONIAL ROAD,WAUSAU,WI,54403,...,False,False,False,False,False,False,False,False,N,23891006
1,1000022,MACHINE TOOL & EQUIPMENT INC,,C,False,False,145536 COUNTY ROAD U,WAUSAU,WI,54401,...,False,False,False,False,False,False,False,False,N,42383045
2,1000096,TULL BROTHERS INC,,C,False,False,66 NEW BRITAIN AVE,ROCKY HILL,CT,6067,...,False,False,False,False,False,False,False,False,N,42371011
3,1000123,JENSEN POURED WALLS INC,,C,False,False,204 WATERLOO STREET,COLUMBUS,WI,53925,...,False,False,False,False,False,False,False,False,N,23811001
4,1000209,PARMAR CARRIERS INC,,A,False,False,4929 SAN PABLO DAM RD,EL SOBRANTE,CA,94803,...,False,False,False,False,False,False,False,False,N,99999004
