In [165]:
import re
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
import difflib
from functools import lru_cache

In [167]:
# ----------------------------------------
# 1. Load data
# ----------------------------------------
cargo_df = pd.read_csv("dot_cargo_carried.csv")

# Fail fast when the export lacks either column the rest of the notebook reads.
for required_col in ("dot_number", "cargo_carried"):
    assert required_col in cargo_df.columns, f"Expected column '{required_col}' not found."

# Force the cargo text to plain Python strings.
# Caveat: astype(str) also stringifies missing values, so a NaN becomes the
# literal text "nan" and is no longer recognised as missing by dropna()/isna().
cargo_df = cargo_df.assign(cargo_carried=cargo_df["cargo_carried"].astype(str))

In [169]:
# ----------------------------------------
# 2. Canonical FMCSA categories (29)
# ----------------------------------------
# Canonical display labels, one per category (29 entries).
# The spelling and punctuation here are what the category-assignment helpers
# return; the normalized form of each label (see normalize_text) is used as the
# lookup key in CATEGORY_NORM_MAP.
# List order matters: CATEGORY_NORM_MAP is built in this order and
# assign_basic_category returns the first category whose words overlap a token.
BASIC_CARGO_CATEGORIES = [
    "General Freight",
    "Household Goods",
    "Metal: sheets, coils, rolls",
    "Motor Vehicles",
    "Drive/Tow away",
    "Logs, Poles, Beams, Lumber",
    "Building Materials",
    "Mobile Homes",
    "Machinery, Large Objects",
    "Fresh Produce",
    "Liquids/Gases",
    "Intermodal Cont.",
    "Passengers",
    "Oilfield Equipment",
    "Livestock",
    "Grain, Feed, Hay",
    "Coal/Coke",
    "Meat",
    "Garbage/Refuse",
    "US Mail",
    "Chemicals",
    "Commodities Dry Bulk",
    "Refrigerated Food",
    "Beverages",
    "Paper Products",
    "Utilities",
    "Agricultural/Farm Supplies",
    "Construction",
    "Water Well",
]

In [171]:
# ----------------------------------------
# 3. Normalization helpers
# ----------------------------------------
def normalize_text(s: str) -> str:
    """
    Return a stable, comparison-friendly form of ``s``.

    The text is lowercased and every run of characters outside ``a-z`` / ``0-9``
    (punctuation, whitespace, accented letters, ...) acts as a separator. The
    surviving alphanumeric runs are joined with single spaces, so the result
    never has leading, trailing, or repeated whitespace.
    """
    alnum_runs = re.findall(r"[a-z0-9]+", s.lower())
    return " ".join(alnum_runs)


# Normalized label -> canonical label. This is the single source of truth used
# to translate a matched key back to the label shown in BASIC_CARGO_CATEGORIES.
CATEGORY_NORM_MAP = dict(
    zip(map(normalize_text, BASIC_CARGO_CATEGORIES), BASIC_CARGO_CATEGORIES)
)

# The normalized keys as a list, in the same order as the categories.
CATEGORY_NORM_LIST = [*CATEGORY_NORM_MAP]


In [173]:
# ----------------------------------------
# 4. Tokenize cargo_carried into one row per (DOT, token)
# ----------------------------------------
# Split on comma, semicolon, colon, ampersand, or slash
SPLIT_PATTERN = r"[;,:/&]+"
_split_cargo = re.compile(SPLIT_PATTERN).split

# The same raw token shows up for many carriers, so normalize each distinct
# string only once instead of once per occurrence.
_normalize_cached = lru_cache(maxsize=None)(normalize_text)

cargo_pairs = cargo_df[["dot_number", "cargo_carried"]].dropna()

# The load cell casts cargo_carried with astype(str), which turns a missing
# value into the literal text "nan" that dropna() above cannot detect.
# pd.read_csv (default settings) already parses a cell containing exactly "nan"
# as missing, so an exact "nan" at this point always means "no cargo recorded".
# Drop those rows so they do not produce a bogus "nan" token per carrier.
cargo_pairs = cargo_pairs[cargo_pairs["cargo_carried"].astype(str) != "nan"]

rows = []

# itertuples yields plain (dot_number, cargo_carried) tuples; iterrows would
# build a full Series object for every row, which is very slow at this scale.
for dot, raw in cargo_pairs.itertuples(index=False, name=None):
    for part in _split_cargo(str(raw)):
        part = part.strip()
        if not part:
            # Consecutive or trailing separators leave empty pieces behind.
            continue
        rows.append(
            {
                "dot_number": dot,
                "raw_token": part,
                "norm_token": _normalize_cached(part),
            }
        )

# Naming the columns keeps the schema intact even if no token was produced
# (a bare pd.DataFrame([]) has no columns and the summary below would fail).
tokens_df = pd.DataFrame(rows, columns=["dot_number", "raw_token", "norm_token"])

print("Tokenized rows:", len(tokens_df))
print("Unique normalized tokens:", tokens_df["norm_token"].nunique())

Tokenized rows: 6169287
Unique normalized tokens: 72354


In [475]:
# ==========================================================
# CATEGORY ASSIGNMENT BLOCK (FMCSA + MANUAL EXACT + KEYWORD + FUZZY)
# ==========================================================

# ------------------------------------------------------------------
# 0. FMCSA category normalization map (assumes BASIC_CARGO_CATEGORIES exists)
# ------------------------------------------------------------------
# Rebuilt here so this cell can be re-run on its own: normalized label ->
# canonical label, plus the normalized keys as a list for fuzzy matching.
CATEGORY_NORM_MAP = dict(
    zip(map(normalize_text, BASIC_CARGO_CATEGORIES), BASIC_CARGO_CATEGORIES)
)
CATEGORY_NORM_LIST = [*CATEGORY_NORM_MAP]

# ------------------------------------------------------------------
# 1. Basic FMCSA category normalization / assignment
# ------------------------------------------------------------------

# Words that are too generic to count as evidence during word-overlap
# matching. They are removed from both the token and the category name before
# the two word sets are compared.
CATEGORY_OVERLAP_STOPWORDS = {
    # generic nouns for "stuff being hauled"
    "supplies", "supply",
    "goods",
    "material", "materials",
    "equipment", "equip",
    "products", "product",
    "items", "item",
    "service", "services",
    # catch-all / ownership qualifiers
    "other", "general", "misc", "miscellaneous", "own",
    # words shared by otherwise unrelated labels
    "mobile",
    "home", "homes",
    "house", "houses",
    "water", "waters",
}

def fuzzy_basic_category(norm_tok: str, cutoff: float = 0.90):
    """
    Map a near-miss spelling of a category label to its canonical label.

    ``norm_tok`` is compared with the normalized category names in
    CATEGORY_NORM_LIST using difflib.get_close_matches, and only the single
    best candidate scoring at least ``cutoff`` is considered
    (e.g. 'refridgerated food' -> 'Refrigerated Food').

    Returns the canonical label from CATEGORY_NORM_MAP, or np.nan when the
    input is not a non-blank string or nothing is similar enough.
    """
    # Non-strings and blank strings cannot be matched.
    if not (isinstance(norm_tok, str) and norm_tok.strip()):
        return np.nan

    candidates = difflib.get_close_matches(
        norm_tok, CATEGORY_NORM_LIST, n=1, cutoff=cutoff
    )
    if not candidates:
        return np.nan

    return CATEGORY_NORM_MAP.get(candidates[0], np.nan)


def assign_basic_category(norm_tok: str):
    """
    Assign a basic FMCSA category to one normalized token.

    The checks run in this order and the first hit wins:
      1) the token is exactly a normalized category name;
      2) the token shares at least one word with a category name, after
         words in CATEGORY_OVERLAP_STOPWORDS are removed from both sides
         (categories are tried in CATEGORY_NORM_MAP order);
      3) the token is a near-exact spelling of a category name
         (delegated to fuzzy_basic_category).

    Returns the canonical category label, or np.nan when the input is not a
    string or none of the checks match.
    """
    if not (isinstance(norm_tok, str) and pd.notna(norm_tok)):
        return np.nan

    # 1) Exact match on the normalized label.
    try:
        return CATEGORY_NORM_MAP[norm_tok]
    except KeyError:
        pass

    # 2) Word overlap, judged on meaningful (non-stopword) words only.
    meaningful_words = set(norm_tok.split()).difference(CATEGORY_OVERLAP_STOPWORDS)
    for normalized_label, canonical_label in CATEGORY_NORM_MAP.items():
        label_words = set(normalized_label.split()).difference(
            CATEGORY_OVERLAP_STOPWORDS
        )
        if not meaningful_words.isdisjoint(label_words):
            return canonical_label

    # 3) Last resort: near-exact spelling of a category label.
    fuzzy_hit = fuzzy_basic_category(norm_tok)
    return fuzzy_hit if pd.notna(fuzzy_hit) else np.nan


# ------------------------------------------------------------------
# 2. Manual EXACT category assignments (YOUR LIST)
# ------------------------------------------------------------------
MANUAL_CATEGORY_MAP = {

    "Agricultural/Farm Supplies": [
        "fertilizer","cotton","cotton bales","cotton modules","chicken litter","manure","farmer","agriculture","seed","seeds",
        "seed corn","crops","cow manure","horse manure","sugarcane","sugar cane","fert","fertlizer","seed and dry fertilizer",
        "seed and fertilizer","corn","potatoes","peanuts","rice","ag products","agriculture products","agriculture","grass seed",
        "nursery products","live plants","plant material","plant materials","live trees","fertilizer non hazardous",
        "fertalizer non hazardous","non haz fertilizer","non hm fertilizer","fertilizer and feed","fertilizer and seeds",
        "farming", "farming equipment", "farming operation","forage","implements", "tractors and implements","harvest", 
        "harvest equipment", "harvesting equipment","agri products", "ag prod","agric", "agricultral", "agrigultural", "agruicultral", 
        "aggricultural","farm comodities","farm commodities","organic fertilizer","organic fertilizers","fertilizer ag lime",
        "fertilizer and lime","cotton agriculture","cotton dry agriculture","feed and fertilizer","feed and ferterlizer","farm material",
        "commercial fertilizer","seed beans","cotton se",
    ],

    "Beverages": [
        "beer","wine","liquor","spirits","soda","coffee","alcohol","juice","beverage equipment","wines", "hops", "malt", "apple cider", 
        "cider","soft drinks", "drinks",
    ],

    # Exact-match tokens mapped to "Building Materials".
    # Every literal must be followed by a comma: Python silently concatenates
    # adjacent string literals, which previously merged "granite marble slate"
    # and "floor covering" into one unusable token.
    "Building Materials": [
        "asphalt", "concrete", "pipe", "signs", "cement", "brick", "bricks","block", "pavers", "windows", "doors", "garage doors",
        "roofing","roofing material", "trusses", "scaffolding", "scaffold", "ladders","countertops", "counter tops", "windows and doors",
        "plywood", "piping","tile", "tiles", "ceramic tile", "vinyl", "lighting fixtures", "granite marble stone", "granite marble slate",
        "floor covering", "fixtures", "awnings", "granite", "granite slabs","granite monuments", "marble", "insulation", "foam insulation",
        "drywall","masonry", "plaster", "panels", "culverts", "pvc", "adhesives", "precast","flooring", "flooring materials",
        "flooring material", "shingles","sheetrock", "sheet rock", "masonry supplies", "concrete forms","concrete products",
        "pre cast concrete", "ready mix concrete", "ready mix","fencing", "fence", "fence material", "fence materials", "fencing material",
        "fencing materials", "natural stone","siding", "gutters", "roofing supplies", "concrete tools", "hvac equipment",
        "electrical equipment", "concrete pump", "concrete pumping", "pipes","plumbing supplies", "plumbing materials",
        "plumbing material", "glass","masonry materials", "blocks", "cement powder", "cement mixer","concrete mix", "concrete blocks",
        "concrete equipment","concrete supplies", "roof trusses", "wood trusses", "flooring supplies","wood flooring", "wood cabinets",
        "cabinet", "pipe fittings","sign materials", "gutter materials", "insulation material","insulation materials", "foam products",
        "spray foam", "redi mix concrete","redi mix", "powder cement", "powdered cement","burial vaults", "vaults", "monuments",
        "headstones","tombstones", "cemetery monuments", "granite memorials","electrical supplies", "electrical supply",
        "electrical materials","electrical material", "electrical parts", "electrical tools","electrical products", "electrical components",
        "electrical contractor","wire", "wires", "wire rope", "cable", "cables", "fiber optic cable","fiber optics", "conduit", "millwork",
        "hvac supplies", "hvac materials", "hvac parts", "hvac products","heating supplies", "heating materials", "hvac tools",
        "heating equipment","plumbing", "plumbing equipment", "plumbing parts", "plumbing materials","plumbing supply",
        "plumbing and heating", "plumbing and heating supplies","irrigation supplies", "irrigation equipment", "irrigation parts",
        "irrigation materials", "irrigation pipe","mortar", "cement bags", "bagged cement", "concrete work","concrete cutting equipment",
        "concrete barriers", "concrete structures","concrete walls", "concrete delivery", "wall forms", "foundation forms","forms",
        "steel molds", "steel tubing", "steel tanks", "aluminum forms","aluminum castings", "aluminum extrusions", "fiberglass pools",
        "fiberglass tanks","overhead doors", "seamless gutters", "railings", "stairs", "shower doors","sheds", "shed", "portable sheds",
        "portable storage sheds","portable storage buildings", "portable buildings", "storage buildings","storage barns", "storage units",
        "barns", "mini barns","structures", "wooden structures","plastic pipes","plastic pipe","quarry stone", "flagstone", "sandstone",
        "slabs", "quartz slabs","cinders", "rip rap", "road mats", "guardrail", "railroad supplies","railroad equipment", "rail",
        "commercial signs", "signage", "electric signs", "signs and tools","conveyors","paint", "paint supplies", "paint materials",
        "paint equipment", "paints","latex paint", "barkdust", "hardscape", "hardscape materials", "baserock","bituminous",
        "contruction material", "constr equip","hardware", "hardware supplies", "hardware supply","fasteners", "bolts", "screws", "nails",
        "gates","door hardware", "door frames", "window frames","valves", "flanges", "fittings","ductwork", "duct","filters","radiators",
        "boilers", "boiler","furnaces", "furnace","heaters", "stoves","ac units", "ac equipment", "ac equip", "ac parts", "ac supplies",
        "lights", "led lights","light equip", "light equipment", "light towers","slate", "fieldstone", "bluestone","hardwood",
        "hardwood floors","decking", "moulding", "moldings", "trim","plastering", "stucco","veneer", "fireplaces", "fireplace","form boards",
        "grout","caulk", "caulking","waterproofing", "waterproofing supply", "waterproofing products","sealants","skylights",
        "thermoplastic","cold mix","flexbase", "wood pallet","wood pallets","wood palets", "heat equipment", "heating equipment",
        "heaters equipment",
    ],

    "Chemicals": [
        "explosives","lithium batteries","fireworks","pesticides","asbestos","hazmat","deleterious substances","hazardous waste",
        "hazardous materials","batteries","haz mat", "herbicide", "herbicides","pest control", "pest control supplies", 
        "pest control product","insecticide", "insecticides","corrosives", "corrosive material","methanol","chlorine","ammonia",
        "sulfuric acid","chloropicrin","sulfuryl fluoride","miscellaneous hazard materials",
    ],

    "Coal/Coke": [
        "coal","coke","coal/coke","coal coke","coal and coke","coal dust","coal fines","coal ash","coal slack","coal briquettes",
        "coking coal","steam coal","anthracite","bituminous coal","lignite","petcoke","petroleum coke",
    ],

    # Exact-match tokens mapped to "Commodities Dry Bulk".
    # Every literal must be followed by a comma: Python silently concatenates
    # adjacent string literals, which previously merged "broken cements" and
    # "wood chip" into one unusable token.
    "Commodities Dry Bulk": [
        "gravel","dirt","sand","rock","stone","sand and gravel","aggregate","aggregates","dirt sand gravel","dirt sand and gravel",
        "sand gravel","rocks","salt","wood chips","woodchips", "broken cement", "broken cements", "wood chip","woodchip","sawdust",
        "saw dust","wood shavings","wood pellets","clay","earth","millings","ash","aggeratematerial","aggergate material",
        "aggerate materials","topsiol","topsioil","fly ash","flyash","shale","slag","ag lime","lime","peat moss","chert","boulders",
        "graval","grav","and grav","aggragate","aggregrate","aggrigates","agregate","agregates","aggergate","aggrigate","gypsum",
        "aggregat","agg","aggerate","aggreg","topsoil","fill dirt","fill","caliche","base","broken concrete","crushed concrete",
        "crushed stone","crushed rock","dirt products","earth materials","earth products","rock products","stone products","road salt",
        "limerock","aggragates","gravel products","asphalt gravel","gravel asphalt","gravel and asphalt","dirt asphalt","dirt and asphalt",
        "and dirt","and sand","dirt sand gravel","dirt sand and gravel","dirt sand gravel1","dirt sand gravrel","dirt san gravel",
        "dirt san garvel","gravel and sand","gravel and san","gravel and asphalt","gravel asphalt","gravel products","gravel black top san",
        "gravel blacktop sand","sand rock gravel mulch","sand rock gravel mud","limestone","crushed limestone","gravel materials",
        "gravel etc","sand etc","dirt etc","other dirt","dirty dirt","field dirt","clean fill","fill material","decorative rock","scoria",
        "ore", "minerals", "mineral","grit", "muck","chat","earthen materials", "earthen material","organic material", "organic materials",
        "backfill","residue", "residual","silt","sandrock", "wood chips","wood chiips","woodd chips","woo chips","rocks and soil",
        "fly ash powder", "flyash powder",
    ],

    "Construction": [
        "construction","construction material", "construction materials","construction supply", "construction supplies","construction tools",
        "construction machinery","construction services","construction equipment","constuction equipment","constr equip","const equip",
        "constuction","constuction equip","contracting equipment","contracting materials","contracting","contracting tools",
        "constuction material","constuction materials","constuction tools","constuction jobs","construction jobs","construction project",
        "construction projects","constuction projects","constuction project","contracting jobs","construction company",
        "contracting company", "const tools", "constr tools","const material", "constr material","construction aggregates", 
        "contruction aggregates","construction aggreates","related const materials","related const materials dirt",
    ],
    
    "Drive/Tow away": [
        "drive away","towaway","mechanic truck","towing","mechanic truck","mechanic service truck","mechanic service trk",
    ],

    "Fresh Produce": [
        "fresh produce","produce","fresh fruits","fresh vegetables","fruit and vegetables","fruits and vegetables","fruit and veg",
        "fruit & veg","veg", "veggies","fruit", "fruits","vegetable", "vegetables","apples","citrus","grapes","blueberries","berries",
        "strawberries","raspberries","blackberries","cherries","cranberries","melons","watermelon","cantaloupe","honeydew","oranges",
        "lemons","limes","peaches","pears","plums","avocados","olives","onion", "onions","tomatoes","mushrooms","fresh potatoes",
        "fresh corn","fresh beans","fresh peas","fresh fruit","fresh fruits","fresh vegetable","fresh vegetables","fresh berries",
        "farm fresh produce",
    ],

    "Garbage/Refuse": [
        "trash","debris","junk","junk cars","debri","dumpsters","salvage","rubble","junk removal","dump truck","rubbish","biosolids",
        "bio solids","sludge","dead animals","roofing debris","demolition","demolition debris","roofing debris","green waste",
        "yard debris","landscaping debris","landscape debris","wood debris","grass clippings","storm debris","solid waste","waste",
        "non hazardous waste","waste tires","human waste","non haz mat waste","non hazmat waste", "msw","landfill","debree", "devree",
        "refuge","roll off containers", "roll off container","rolloff containers","roll off dumpster","dumpster rental",
        "construction debris","constructoni debris","constraction debris","construction debis","construct debris","construc debri",
        "clean up debris","clean up debri","cleanup debris","construction debris","const debris","const debres","constr debris",
        "constr debri","own const debris","own const debri","debris box","debri box","construction rubble","costruction rubble",
        "construction trash","contruction trash",
    ],

    "Grain, Feed, Hay": [
        "silage","straw","milo","alfalfa","wheat straw","barley","soybean meal","soy","grains","ddg", "ddg s","cottonseed","peanut hulls",
        "sorghum","cereal", "liquid feed","lquid feed","liquid fee","liquid feeds",
    ],

    "Household Goods": [
        "furniture","appliances","cabinets","mattresses","mattress","chairs","tables", "swimming pool coping","linen", "outdoor furance",
        "linens","clothes","used clothing","luggage","pool tables","pianos", "custom cabinets", "customer cabinets","home furnishings",
        "rugs","mats", "electronic supply","pet supplies", "pet products", "pet foods", "pet", "pets","home decor", "decor",
        "home improvements", "home improvement","housewares", "houseware","shopping bags","beauty supplies", "beauty products", 
        "beauty supply", "beauty aids","jewelry","cosmetics","wreaths", "christmas wreaths", "christmas decorations", "christmas decor",
        "lampshades","shelving","dehumidifiers","appliance","cabinetry","kitchens","kitchens baths","kitchenware",
        "kitchenware and cookware","kitchenware cutlery","custom cabinetry","carpet","pools","hot tubs","office furniture",
        "new furniture","office supplies","office equipment","electronics","electronic equipment","computers","computer equipment","copiers",
        "kitchen cabinets","bedding","clothing","apparel","garments","uniforms","scrubs","shoes","textiles","toys","musical instruments",
        "art","artwork","antiques","store fixtures","safes","office supplies and products","office supplies and food products",
        "furniture delivery","retail furniture","antique furniture","patio furniture","outdoor furniture","home goods","office products",
        "office","bicycles","sporting goods","cosmetics","video games","tv","fine art","art work","personal items","personal belongings",
        "laundry","laundry equipment","laundry supplies","soap","soaps","detergent","air fresheners","toiletries","lamps", "light bulbs", 
        "bulbs","refrigerator","refrigerator parts a","refrigerator products","refrigerator repair","refrigerator trailer and",
        "refrigerator trailers","refrigerator units",
    ],

    "Intermodal Cont.": [
        "intermodal","intermodal cont","intermodal container","intermodal containers","intermodal freight","intermodal cargo",
        "containerized freight","containerized cargo","shipping container","shipping containers","sea container","sea containers",
        "ocean container","ocean containers","rail container","rail containers","iso container","iso containers",
    ],
    
    "Liquids/Gases": [
        "sewage","septic waste","septage","gas","petroleum","lubricants","sewer","oxygen","fuel oil","diesel","oil","crude oil",
        "heating oil","used oil","home heating oil","home heating fuel","diesel fuel","propane","kerosene","2 fuel oil","2 heating oil",
        "fuel","liquid","liquid manure","septic","septic tanks","septic tank","wastewater","waste water","saltwater","brine","gasoline",
        "liquid fertilizer","liquid asphalt","grease","petroleum products","non hazardous liquids","non hazardous liqui",
        "waste water sewage","wastewater sewage","saltwater exchange tanks","water exchange tanks","raw sewage","jet fuel","biodiesel",
        "liquid sugar","antifreeze", "acetylene", "nitrogen", "helium", "argon","carbon dioxide", "co2","leachate", "effluent", "slurry", 
        "condensate","lpg", "lubricating oils","refrigerant", "refrigerants","coolant", "coolants","edible oils","li","wastewater",
        "waste water","waste water sewage","wastewater sewage","non haz waste water","non haz wastewater","non hazardous waste water",
        "non hazardous wastewater","non hazmat waste water","non hazmat wastewater","gas can","gas cans","industrial water", "liquid fuels",
        "liquid fuel","liquid oils","liquid oil",
    ],
        
    "Livestock": [
        "horses","show horses","cattle","live poultry","live fish","live bait","bees","honey bees","beehives","dogs","poultry","chickens",
        "bee hives","race horses","animals","animal by products","animal fat","dead stock","live seafood","live chicken",
        "bees and equipment","horse trailors",
    ],
    
    "Logs, Poles, Beams, Lumber": [
        "timber","pulpwood","pulp wood","logging","firewood","fire wood","wood","wood products","forest products","raw forest products",
        "logging equipment","forestry products","processed wood","wood by products","railroad ties","wood residuals","wood byproducts",
        "wood bi products", "finished wood prod","finished wood products","laminated wood prods","laminated wood products",
        "wood by products","wood by product","woodchips fire wood","wood chips firewood","forrest products",
    ],

    # Exact-match tokens mapped to "Machinery, Large Objects".
    # Every literal must be followed by a comma: Python silently concatenates
    # adjacent string literals, which previously merged "farm machinary" and
    # "film production equipment" into one unusable token.
    "Machinery, Large Objects": [
        "boom truck","dozer","skid loader","skidsteer","back hoe","amusement rides","carnival rides","transformers","welding machine",
        "gutter machine","fork lifts","fork lift","generator","generators","crane","cranes","bobcat","bob cat","tractor","excavator",
        "mini excavator","heavy equipment","excavating equipment","logging equipment","paving equipment","welding equipment",
        "film equipment", "farm machinery", "farm machinary", "film production equipment","motion picture equipment","movie equipment",
        "music equipment","musical equipment","band equipment","production equipment","fitness equipment","carnival equipment",
        "vending machines","crane truck","crane service","skid steer","forklift","forklifts","excavation equipment","self propelled crane",
        "truck crane","boom","bulldozer","backhoes","loader","scissor lift","lifts","grading equipment","grading of land",
        "training equipment","construction equipment","const equipment","const equip","heavy equip","playground equipment", "playground",
        "stage equipment","staging equipment", "circus equipment", "entertainment equipment","theatrical equipment", "dj equipment",
        "concert equipment","audio visual equipment", "audio visual equip", "audio visual equipme","av equipment", "visual equipment",
        "visual equip", "visual equipme","lighting equipment", "stage", "stages","gym equipment", "gymnastics equipment",
        "athletic equipment","sports equipment","industrial equipment", "industrial supplies", "industrial parts","industrial pumps",
        "piledriving equipment", "pile driving equipment","rigging equipment", "rigging","material handling equipment",
        "sandblasting equipment", "sandblasting equip","boring equipment", "hydrovac", "hydraulic equipment","mining equipment",
        "mining equip", "mining supplies","mining parts", "mining materials", "mining","trencher", "trenchers", "trenching equipment",
        "ditch witch","compactors","crusher","welding equipment","wielding equipment","pulling tractor", "pulling tractors",
        "compact tractors","tractors and equipment","conveyor belts", "conveyor belt", "conveyor systems","striping equipment",
        "recording equipment","broadcast equipment", "broadcasting equipment","diagnostic equipment","machined components",
        "machine components","machinery components","paint machines","painting machines","construction equiptment","construction eqiupment",
        "const equip","cont equip","milling equipment","milling equiopment","paving equipment","paving equiptment",
    ],

    "Meat": [
        "meat", "crawfish", "lobster","shellfish", "oysters", "chicken","meats","seafood","crabs","shrimp", "turkeys", "beef", "pork",
        "sausage","catfish", "clams", "minnows", "sea foods", "frozen chicken","fish",
    ],
    
    "Metal: sheets, coils, rolls": [
        "steel","fabricated steel","structural steel","steel parts","rebar","metals","iron","copper","aluminum","steel pipe","steel pipes",
        "steel products","steel plates","steel bars","steel drums","steel fabrication","steel fabrications","flat steel","stainless steel",
        "sheetmetal",
    ],

    "Mobile Homes": [
        "mobile home", "mobile homes","mobe homes", "moble homes", "mobil homes", "moble homes","mobile homes1", "mobile homesw",
        "used mobile home","manufactured home", "manufactured homes","manufactored homes", "manufacterd homes","modular home", 
        "modular homes","modular container homes","modular homes and components","modular homes and ma","premanufactured modular homes",
        "modulaire homes","park model homes","tiny home", "tiny homes","cabins small tiny homes","portable homes","manufactured houses",
        "log homes","wood frame homes","wooden frame homes","woodframe homes","whole framed homes","spec built wood mobile homes",
        "oversized loads primarly wood frame homes","rv s mobile homes","sheds and mobile homes","matl s for mobile home set up",
        "mobile home axles","mobile home axles and tires","mobile home fixtures","mobile home install","mobile home installation",
        "mobile home material","mobile home office buildings","mobile home park maintenance","mobile home parts","mobile home repair tools",
        "mobile home setup equipment","mobile home setup material","mobile home steps","mobile home supp","mobile home supplies",
        "mobile home toungs","mobile home trailers","mobile home transport","mobile home type chassis","move mobile office units",
        "transport manufactured homes","schools modular homes","structural homes","structural moving of homes and buildings",
        "modular buildings", "small buildings",
    ],

    "Motor Vehicles": [
        "auto parts","tires","automotive parts","autoparts","auto","automotive","automobiles","cars","autos","truck","trucks","truck parts",
        "car parts","race car","race cars","campers","camper","rv","rv s","rvs","recreational vehicle","travel trailers","boat trailers",
        "boats","car hauler","car carrier","motorcycles","golf carts","golf cars","atv","atv s","atvs","watercraft","trailers","trailer",
        "empty trailers","cargo trailers","utility trailers","vehicle parts","used tires","waste tires","aircraft parts","boat","wheels",
        "wrecker","wrecker service","semi trailers","tire","snowmobiles","automobile parts","rv trailers","camper trailers","truck bodies",
        "truck beds","carports","rv parts","auto transport","auto batteries","automotive supplies","motorcycle parts","tire service",
        "truck repair","car hauling","charter boat","charterboat","camp trailers", "camping trailers", "camping trailer","camping trailers",
        "motorhomes","motorized equipment","houseboats","yachts","pontoons","enclosed trailers","enclosed trailer","restroom trailers",
        "restroom trailer","mri trailer","camping trailers","truck caps","camp trailers","own motorcycles","show motorcycles",
    ],
    
    "Oilfield Equipment": [
        "drill rig","drilling rig","frac tanks","barite","bentonite","oil field equipment","drilling equipment","frac sand",
        "oil field produced water","oilfield produced water","oilfieled produced water","oil feld produced water",
        "oil field salt water","oilfield saltwater","drilling mud","drilling fluids","drilling supplies","drilling equip","hydrocarbons",
        "oil products","oil based mud","milling equiopment","milling equipment", "oilfield materials","oil field water", 
        "oil field waters", "oilfield water",
    ],
    
    "Paper Products": [
        "cardboard","boxes","corrugated boxes","newspapers","books","printed material","newspaper","magazines","documents",
        "printed matter","packaging materials","packaging material","packaging supplies","packaging","packaging products",
    ],

    "Passengers": [
        "school students","students","school bus","school children","school furniture","school supplies","private travel",
        "private travelers",
    ],

     "Refrigerated Food": [
        "milk","seafood","ice","butter","eggs","dairy","ice cream","raw milk","frozen foods","frozen","cream","dairy products",
        "milk products","bagged ice","prepared foods", "liquid dairy","yogurt","egg products", "egg","perishable goods", "perishables",
    ],

    "Utilities": [
        "utility","bucket truck","service truck","service trucks","utility equipment","utility truck","internet","internet services",
    ],

    "US Mail": [
        "packages","amazon","amazon packages","small packages","amazon products","amazon goods","amazon loads","amazon parcels",
        "amazon trailers","amazon relay","parcels","package delivery",
    ],

    # =======================================================
    # NEW CUSTOM CATEGORIES (beyond the 29 FMCSA ones)
    # =======================================================

    "Landscaping/Lawn Care": [
        "landscaping","landscape","landscaping material","landscape material","landscaping tools","landscape tools","landscaping equip", 
        "landscpae materials and plants""landscaping equipmen","landscaping equi","landscaping mate","landscape equip","landscape machines",
        "landscape plants","landscape trees","landscape waste","landscape supply","landscape supplys","landscaper","lawn care","lawncare",
        "lawn service","sunflowers","lawn maintenance","lawn maintenance tools","lawn mowing","lawn mowers","lawn mower","lawn equip",
        "lawn equiptment","lawn tractors","lawn movers","lawn and landscape","yard waste","plants","flowers","cut flowers",
        "bedding plants","nursery","nursery stock","nursery plants","trees","trees and shrubs","trees and plants","plants and trees",
        "shrub", "plants and flowers", "plant and flower""shrubs","bushes","horticulture","gardening","grass","grass sod","sod","sod grass",
        "turf","turf grass","mulch","bagged mulch","bark","bark mulch","wood mulch","pine straw","pinestraw","tree service","tree removal",
        "tree trimming","tree trimmings","tree work","tree branches","tree brush","tree limbs","tree stumps","tree waste","tree debri",
        "landsacping tre","landscap mach","landscapinghorticulturall","landscapingproductasplantesmulchsoiletc",
        "landscpaing equipment and supplies","landscpaing equipment only","landscping equipment and materials","landskeenping supply",
        "landskeeping","landspcaping emq","landyards","leaf blow","leaf blower","leaf blowers","lanedscaping equipment for servic",
        "lanscape equipment and materials","lanscape equipment and supplies","lanscape equipment and tools",
        "lanscape material and equipment","lanscape materials and","lanscape truck and trailer","lansccaping material and equipment",
        "lansdcaping material and tools","land clearing","tree spade","landscaping material sod","landscaping machines",
        "landscape maintenance","landscape trees","soil","soils","loam","erosion control", "erosion control materials",
        "erosion control products", "erosion supplies","mulching", "mulsh", "molch","woodship","landcare equipment","ferns","perennials",
        "grapevines","greenhouses","sod and equipment""compost","potting soil","garden","vegetation","stump grinder","weed eaters",
        "weeds","lanscaping","mulch wood chips","mulch woodships","landscape machinery","landscaping machinery","chipper","brush",
        "landscape supplies","landscaping supplies","landscape supplies and equipment","landscape products","landscaping products",
        "mowers","lawnmowers","lawnmower","lawn","lawncare equipment","lawn debris","lawncare","leaves","branches","limbs","stumps",
        "yard debris","green waste","landscape debris","landscaping debris","plant materials", "tree wood chips","tree woodchips",
        "trees woodchips","tree chips","landscaping materials soil","landscaping material soil","landscapping materials topsoil",
        "landscaping materials soil","landscaping material soil","landscaping materials soil","wood chips and wood from tree",
        "wood chips and wood from tree work","garden supplies","land","tree","wood chips yard waste","woof chips yard waste","woof chips",
        "topsoil and aggregate","top soil and aggregate","top soil and aggregates","rock and topsoil","rock sand topsoil",
        "dirt lawn equipment","dirt own equipment","sod and equipment","topsoil gravel stone","soil gravel stone","wood chip mulch",
        "woodchips mulch", "lawn equipment and m","agricultural mulch","agriculture mulch","landscaping services","landscaping supply",
        "landscaping items","tree trimmer","vegetative debris","tropical plants","potted plants","greenhouse plants","greenhouse supplies",
        "shrubbery", "shrubery", "shurbs","mulches","evergreens", "greens", "greenery","sap","seedlings","cactus","hardscaping", 
        "hardscaping materials", "hardscape","arborist tools", "arborist","gardener", "gardner","landsc","landscaping eqip",
        "lndscaping eqpt","reee debris","tree debris","rock and soil","rocks and soil","sod and equipment","trees and wheels","weed cont",
        "weed con", "flowers plants","flower plants","landscaping materials and plants","landscape material and equipment",
        "hardscape materials and equipment","land equip",
    ],

    "Dry Foods (Non-Refrigerated)": [
        "bread", "bread and cake", "bread and cakes", "bread delivery", "breads","snacks", "snack foods", "snack cakes", "snack cake", 
        "potato chips","chips", "cookies", "cookies and crackers", "crackers", "pretzels","donuts", "doughnuts", "pastries", "cake", 
        "cakes", "bakery","bakery items", "baked foods", "bagels", "little debbie snack cakes","little debbie snacks", "concessions",
        "concession", "concession trailer","concession trailers", "foodstuff", "foodstuffs", "foods", "dried goods","drygoods","corn oil", 
        "vegetable oil", "groceries", "grocery", "tortillas","sugar beets", "wheat", "candy", "soybeans", "soy beans", "beans", "nuts",
        "almonds", "flour", "beets", "molasses", "onions", "maple syrup","pecans", "sugar", "popcorn", "whey", "food chips", 
        "bakery products","baked goods", "bakery goods", "bread products", "canned goods", "peanuts","potatoes", "spices", "tobacco", 
        "honey", "cooking oil","food and equipment", "private property food", "canned foods","grocery items", "grocery products", 
        "boxed goods", "candies", "chocolate","snack", "snack products", "baked products", "pastry", "canned","can goods", "bake goods", 
        "baking goods", "baking products","packaged foods", "sandwiches", "seasonings", "baked products","bakery delivery","snack products", 
        "pastry", "pizza", "tea", "pickles","walnuts","hazelnuts","salsa", "raisins", "cranberries", "olives","specialty foods","pasta",
        "condiments", "sauces", "vinegar","canola", "peanut hulls","salad dressing", "dressings", "salads","buns", "muffins","edible oils",
        "pasta","specialty foods", "corn and beans","corn and soybeans", "pepperidge farm cookies","pepperiidge farms cookies",
        "corn soybean","corn soybeans","corn and beans","corn and soybeans","dry edible beans","dryedible beans","nuts and almonds",
        "walnuts and almonds","concessions equipment","consessions equipmen","soybean oil","cooking oil",
    ],

    "Roads/Paving": [
        "blacktop","black top","asphault","ashphalt","asphal","asph","hot mix","hot top","road base","roadbase","road material",
        "road bldg material","paving","paving material","sealcoat","seal coat","seal coating","driveway sealer","tar","hotmix",
        "road building material","road bldg material","asphalt sealer","blacktop gravel","black top gravel","rock asphalt","rocks asphalt",
        "rock sand gravel dirt asphalt road bldg materials","rock sand gravel dirt asphalt road building materials","asphalt materials",
        "hot asphalt","pavement","road oil","asphalt mix","asphalt emulsion","asphalt equipment","asphalt sealant","asfault","aspahlt",
        "highway material", "roadway material","asphalt material","asphalt materials","sealcoat asphalt","seal coat asphalt",
    ],

    "Null": [
        "nan","na","n a","none","null","no cargo","not specified","unspecified","unknown","empty","out of business","registrant only",
        "not for hire","no cargo listed","no description","no load","a","c","d","s","ag","mot","co","r","g","m","dsg","srg","eq","fak",
        "rsg","scr","sa","bb","t","e","mots","nothing","inactive","registrant","non business","non commercial","etc","ect","unspec","unk",
        "not operating","no longer operating","not in operation","out of service","not applicable","closed","gr", "dir", "di", "ro", "as",
        "p", "b", "n", "su", "sr", "ca", "f", "x", "psg", "st", "ma", "hm","wa", "pw", "pr", "mu", "et", "pl", "tr", "br", "ba", "la", "so",
        "u", "i", "we", "pa", "fi", "aa", "ab", "ce", "ds", "dr", "ga","sdg", "pp", "mtls", "eqpt", "eqmt", "eqipment", "equiment",
    ],

    "Recyclables": [
        "recyclables","recycling","recycle","scrap","dross","scrubbers","scrap metal", "glass bottles","scrap iron","scrap steel",
        "scrap aluminum","scrap cars","scrap vehicles","metal recycling","paper recycling","plastic recycling","glass recycling",
        "cardboard recycling","aluminum cans","tin cans","bottles","cans","recy","recyclable scrap metal","recycled scrap metals",
        "scrap metals and crushed cars","scrap metal crushed cars","scrad metal","scrap metal s","recyclable materials","recycled materials",
        "recycle metal","recycle metals","recycled metal","recycled metals","scape metal","x scrap metal","x scrap metals","scrap metral",
        "scrp metal","scrap metal 3","recyca","recyced goods","recycla","recyclyn","recylced materials","recylcing collectors",
        "recylcling stuff","recyled material","recyled materials","recyled products","recylepaper","Recylepaper","scrapt metal",
        "recycables","recycleables","recyclable material","recycled material","recycled goods","recyclable","recycled concrete",
        "recycled tires","recycled electronics","plastic","plastics","plastic products","plastic product","plastic parts",
        "plastic pellets","plastic resin","plastic film","plastic packaging","plastic bags","plastic articles","plastic containers",
        "recyced glass","recylce glass",
    ],
    
    "Medical": [
        "medical supplies","medical equipment","hospital supplies","hospital equipment","surgical supplies","surgical equipment",
        "pharmaceuticals","pharmacy","medicines","medicine","prescriptions","prescription drugs","medical waste","biohazard waste",
        "biohazard","biomedical waste","clinical waste","lab specimens","laboratory specimens","medical office supplies",
        "medical office suppl","blood","vitamins","dental equipment", "dental supplies","veterinary equipment", "veterinary supplies",
        "x ray equipment", "x-ray equipment", "xray equipment","ambulance","medication","med supplies","health products","patients",
        "imaging equipment", "rmw","medcial","medcial supplies","medecine","medi","medic","medica couier","medicalmobilemriunit",
        "medicals supplies","medicals supply","medicalscrubs","medicare","medicial equipment","medicinal herbs","medicl supplies",
    ],
    
    "Other": [
        "mixed freight","mixed cargo","misc freight","misc cargo","other freight","other cargo","general cargo","truck driving school",
        "driving school","other","misc","demo","private","personal","personal use","personal property","private property","leasing company",
        "perm leased to another carrier","tools","tool","tools and equipment","tools equipment","tools and supplies","tools of trade",
        "tools of the trade","materials of trade","company tools","company equipment","own equipment","personal tools","personal equipment",
        "service equipment","repair equipment","repair tools","repair parts","small tools","work tools","consumer goods","service",
        "delivery","party supplies","party rentals","rental","private property p","private property us","private property to",
        "own equipment and ma","general merchandise","merchandise","retail goods","retail merchandise","retail","company property",
        "company owned equipment","company tools and equipment","business equipment","business property","other general items",
        "miscellaneous","miscellaneous tools","misc items","snow removal","snow removal equipment","snow plow","snow plows","snowplow",
        "snowplows","snowplowing","snow plowing","snow equipment","party rental equipment","party equipment","event equipment",
        "event supplies","event rental equipment", "telecommunications","telecommunications equipment","telecommunications materials",
        "telecommunication","telecom equipment","telecommunication equipment","telecom materials","telecommunications eq",
        "telecommunication eq","telecom","communications equipment","communication equipment","communication equip","communication equipm",
        "communications equip","cell tower equipment","restaurant supplies", "restuarant supplies","rest equipment","bottling equipment",
        "festival equipment","parade float", "parade floats","event equipment", "event supplies", "emergency supplies","gaming equipment",
        "arcade equipment", "arcade","flea market", "flea market items","moving aids","rent trucks","repo", "repossessions", "repossession",
        "transport", "transportation","cleaning supplies","cleaning equipment","cleaning products","cleaning materials","cleaning tools",
        "janitorial supplies","janitorial equipment","janitorial products","janitorial","carpet cleaning","carpet cleaning equipment",
        "carpet cleaning machine","sanitation","san","sanitation equipment","sanitation supplies","restoration equipment",
        "restoration","pressure washing equipment","pressure washer","pressure washers","power washer","power washing equipment",
        "power washing","street sweeper","street sweepers","street sweeping","sweeper","vacuum truck","vac truck","welder", "welders",
        "weld", "weldments","welder and tools", "tools welder", "welder tools", "tools and welder","exercise equipment", 
        "excercise equipment","gymnastic equipment","racing equipment","firefighting equipment", "firefighting","scientific equipment",
        "printing equipment","sandblast equipment","emergency equipment","emergency response","cdl training","driver training","cdl school",
        "training school","trucking school","courier","courier service","rental company","leasing","lease","bank equipment", 
        "banking equipment","supermarket equipment", "rubber", "rubber products", "rubber rollers","hand tools", "handtools",
        "mechanics tools", "mechanical tools","carpentry tools", "carpenter tools","job tools", "job tools and equipm","shop tools", 
        "shop equipment","mac tools", "matco tools", "snap on tools", "snapon tools","tool sales", "tools for sale", "tools for resale",
        "mobile tool sales","tools and materials", "tools and material","sound equipment", "sound equip","sound systems", "sound system",
        "sound","studio equipment","media equipment","dj", "dj equipment","radio equipment","television equipment","camera equipment",
        "photo equipment", "photography equipment","restaurant equipment", "restaurant equip","restaurant products", "restaurant supply",
        "restaurant supplies", "resturant equipment","resturant supplies", "resturant equipment","catering equipment", "catering supplies",
        "catering truck", "catering trailer","caterer", "catering service","mobile restaurant","cannabis", "cannabis goods", 
        "cannabis products","tents", "tent", "tents and equipment","tent rentals", "rental tents","bounce house", "bounce houses",
        "inflatables","display trailer","tradeshow equipment","exhibit material", "exhibits","display equipment", "display materials", 
        "display material","marketing materials", "marketing supplies","promotional material", "promotional materials","promotional items",
        "promotional products","billboard", "billboards","mobile billboard", "mobile billboards","auction items", "auction", "auctions",
        "flea market", "flea market items","roadside assistance","moving company","messenger service","fire fighting apparatus",
        "firefighting apparatus",
    ],
}

# Flatten MANUAL_CATEGORY_MAP into {normalized token -> category}.
# When two raw tokens normalize to the same string, the one listed later
# (later category / later position in its list) silently wins.
MANUAL_TOKEN_TO_CATEGORY = dict(
    (normalize_text(raw_token), label)
    for label, raw_tokens in MANUAL_CATEGORY_MAP.items()
    for raw_token in raw_tokens
)

# Candidate pool handed to difflib for fuzzy matching (one entry per distinct
# normalized token, in first-seen order).
MANUAL_TOKEN_LIST = [*MANUAL_TOKEN_TO_CATEGORY]


# ------------------------------------------------------------------
# 2b. Generic "fuzzy ignore" words (for manual fuzzy)
# ------------------------------------------------------------------
FUZZY_GENERIC_WORDS = {
    # equipment / tools (including common truncations and misspellings)
    "equipment", "equip", "equiptment", "equipments", "equipm", "tools", "tool",
    # generic cargo nouns
    "material", "materials", "supplies", "supply", "products", "product",
    "parts", "part", "items", "item", "goods", "property", "merchandise",
    "piping", "pipe", "component", "components",
    # packaging / structures
    "containers", "container", "buildings", "building", "pallets", "pallet",
    # vehicles
    "trailer", "trailers", "truck", "trucks",
    # business / catch-all words
    "service", "services", "delivery", "deliveries",
    "misc", "miscellaneous", "general", "other",
    # hazmat qualifiers
    "non", "haz", "hazardous", "hazmat", "nonhaz", "non-haz", "hm",
    # dwelling words
    "mobile", "home", "homes", "house", "houses",
    # activity words
    "repair", "repairs", "maintenance", "maint", "install", "installation",
    "moving", "move", "setup", "set", "set up", "race", "racing",
    # NOTE(review): "set up" can never equal a single whitespace-split word, and
    # "non-haz" presumably never survives normalize_text (hyphens become spaces);
    # both are kept so the set's value is unchanged.
}

# ------------------------------------------------------------------
# 2c. Purely generic tokens → "Null" as a last resort
# ------------------------------------------------------------------
# Same vocabulary as the fuzzy-ignore set, plus bare vehicle words.
GENERIC_ONLY_WORDS = FUZZY_GENERIC_WORDS.union({"vehicle", "vehicles"})

def is_all_generic(norm_tok: str) -> bool:
    """
    Report whether a token consists solely of words from GENERIC_ONLY_WORDS.

    Non-string input and empty / whitespace-only strings yield False.
    The token is split on whitespace; a single non-generic word makes
    the whole token non-generic.
    """
    if isinstance(norm_tok, str) and norm_tok.strip():
        for word in norm_tok.split():
            if word not in GENERIC_ONLY_WORDS:
                return False
        return True
    return False



def fuzzy_manual_category(norm_tok: str, cutoff: float = 0.88):
    """
    Look up the closest manual token with difflib and return its category.

    Words listed in FUZZY_GENERIC_WORDS are stripped first, so only the
    distinctive "core" of the token is compared (e.g. 'welding equipment'
    is compared as 'welding').

    Parameters
    ----------
    norm_tok : str
        Token to match; split on whitespace.
    cutoff : float
        Minimum difflib similarity ratio for a candidate to be accepted.

    Returns
    -------
    The category of the single best candidate in MANUAL_TOKEN_LIST, or
    np.nan when: the input is not a non-blank string, every word is generic,
    nothing reaches `cutoff`, or the best candidate belongs to
    "Mobile Homes" (fuzzy assignment into that category is deliberately
    disabled so only explicit tokens land there).
    """
    if not (isinstance(norm_tok, str) and norm_tok.strip()):
        return np.nan

    # Keep only the non-generic words; an empty core means nothing to match on.
    core = " ".join(
        word for word in norm_tok.split() if word not in FUZZY_GENERIC_WORDS
    )
    if not core:
        return np.nan

    closest = difflib.get_close_matches(core, MANUAL_TOKEN_LIST, n=1, cutoff=cutoff)
    if not closest:
        return np.nan

    category = MANUAL_TOKEN_TO_CATEGORY.get(closest[0], np.nan)
    # Only the top candidate is considered: if it is Mobile Homes we give up
    # rather than fall through to the runner-up.
    return np.nan if category == "Mobile Homes" else category

# ------------------------------------------------------------------
# 3. Keyword / substring mapping (AUTO-GENERATED from MANUAL_CATEGORY_MAP)
# ------------------------------------------------------------------

# Words too generic to serve as keyword evidence: they are never emitted as
# single-word keywords and are removed from a token before word matching.
KEYWORD_STOPWORDS = {
    # function words
    "and", "or", "the", "for", "to", "of", "in", "on", "with", "without", "a", "an", "by",
    # generic cargo nouns
    "supplies", "supply", "goods", "product", "products", "item", "items",
    "material", "materials", "merchandise", "property", "parts", "part",
    # equipment / tools (including truncations and misspellings)
    "equipment", "equip", "equiptment", "equipments", "equipm", "tools", "tool",
    # business / organisational words
    "service", "services", "company", "co", "inc", "llc", "corp",
    "general", "misc", "miscellaneous", "own", "owned",
    "division", "dept", "department",
    # vehicles and hauling verbs
    "truck", "trucks", "trailer", "trailers", "tractor", "tractors",
    "hauling", "haul", "hauls", "transport", "transporting", "transportation",
    "delivery", "delivered", "deliveries",
    # rental / leasing
    "rental", "rentals", "lease", "leasing",
    # packaging, structures, units
    "container", "containers", "building", "buildings", "unit", "units",
    "load", "loads", "barns", "pallet", "pallets", "crate", "crates",
    "box", "boxes", "bag", "bags",
    # food and ambiguous fragments
    "food", "foods", "mix", "shot", "hot",
    # hazmat qualifiers
    "non", "haz", "hazardous", "hazmat", "nonhaz", "non-haz", "hm",
    # dwelling words
    "mobile", "home", "homes", "house", "houses",
    # activity words
    "repair", "repairs", "maintenance", "maint", "install", "installation",
    "moving", "move", "setup", "set", "set up", "race", "racing",
}

# category -> set of keywords (full normalized phrases plus meaningful words)
tmp_keyword_map = defaultdict(set)

for category, tokens in MANUAL_CATEGORY_MAP.items():
    # "Null" must never win through keyword scoring, so it gets no rules at all.
    if category == "Null":
        continue

    # "Mobile Homes" contributes whole phrases only; its individual words
    # ('manufactured', 'frame', 'repair', ...) are too generic to be evidence.
    phrases_only = category == "Mobile Homes"

    for t in tokens:
        norm = normalize_text(t)
        if not (isinstance(norm, str) and norm):
            continue

        bucket = tmp_keyword_map[category]

        # 1) The complete normalized phrase is always a keyword.
        bucket.add(norm)

        # 2) Individual words qualify when they are at least 3 characters
        #    long and not stopwords.
        if not phrases_only:
            bucket.update(
                w for w in norm.split()
                if len(w) >= 3 and w not in KEYWORD_STOPWORDS
            )

# Freeze into sorted lists so iteration order is deterministic.
MANUAL_KEYWORD_MAP = {
    label: sorted(keyword_set) for label, keyword_set in tmp_keyword_map.items()
}

# Per-category keywords that earn an extra +2 in keyword_match_category
# whenever they match. A term only has an effect if it is also present in
# that category's MANUAL_KEYWORD_MAP entry.
KEYWORD_BOOST = {
    "Landscaping/Lawn Care": {
        "mulch", "mulches", "mulching",
        "soil", "soils", "topsoil", "top soil",
        "sod", "turf", "loam",
        "compost", "potting soil",
        "nursery", "trees", "shrubs",
        "landscape", "landscaping",
        "lawn", "yard",
        "mowing", "mowers", "mower",
        "grass", "clippings",
        "branches", "limbs", "stumps",
        "tree", "bushes",
    },
    "Paper Products": {
        "cardboard", "boxes", "box",
        "corrugated",
        "paper", "papers",
        "newspaper", "newspapers",
        "magazine", "magazines",
        "books", "book",
        "documents",
    },
    "Recyclables": {
        "recyclables", "recycling", "recycle",
        "scrap", "scrap metal",
        "aluminum cans", "tin cans",
        "bottles", "cans",
        "plastic", "plastics",
    },
    "Medical": {
        "medical", "medical supplies", "medical equipment",
        "hospital", "surgical",
        "pharmaceuticals", "pharmacy",
        "medicine", "medicines",
        "medical waste", "biohazard",
    },
    "Building Materials": {
        "electrical", "plumbing", "hvac",
        "concrete", "cement", "forms",
        "millwork", "conduit",
        "asphalt", "blacktop", "paving",
        "ductwork", "guardrail", "paint",
    },
    "Machinery, Large Objects": {
        "mining", "drill", "drilling",
        "trencher", "crane", "cranes",
        "excavating", "excavation",
        "dozer", "loader", "skid", "skidsteer",
        "boom", "boom truck",
        "playground", "gym", "athletic", "sports",
        "audio", "visual", "sound", "broadcast", "entertainment", "stage",
    },
    "Garbage/Refuse": {
        "waste", "rubble", "trash",
        "demo", "demolition",
        "sludge", "septage", "slurry",
        "industrial waste", "residential waste",
    },
    "Other": {
        "janitorial", "cleaning",
        "pressure washing", "power washing",
        "restoration", "sanitation",
    },
    "Fresh Produce": {
        "produce", "fresh produce",
        "fruit", "fruits",
        "vegetable", "vegetables", "veggies",
        "apples", "citrus", "grapes",
        "berries", "blueberries", "strawberries", "raspberries", "cherries",
        "melons", "watermelon",
        "tomatoes", "onion", "onions",
    },
    "Construction": {
        "construction", "constuction",
        "contracting",
        "construction materials", "construction material",
        "construction equipment", "constr equip", "const equip",
    },
}

def keyword_match_category(norm_tok: str):
    """
    Score every category in MANUAL_KEYWORD_MAP against the token and return
    the highest scorer.

    Scoring per keyword:
      - multi-word keyword found as a substring of the token: +2
      - single-word keyword equal to one of the token's non-stopword words: +1
      - additionally +2 when the matched keyword is in that category's
        KEYWORD_BOOST set

    Ties go to the category that appears first in MANUAL_KEYWORD_MAP, because
    a later category must score strictly higher to take over.

    Returns np.nan for non-string / blank input, for tokens made only of
    stopwords, and when no category scores at least 1.
    """
    if not isinstance(norm_tok, str) or not norm_tok.strip():
        return np.nan

    meaningful = set(norm_tok.split()) - KEYWORD_STOPWORDS
    if not meaningful:
        return np.nan

    winner, top = None, 0

    for category, keywords in MANUAL_KEYWORD_MAP.items():
        boosted = KEYWORD_BOOST.get(category, set())
        total = 0

        for kw in keywords:
            is_phrase = " " in kw
            # Phrases are matched against the raw token text; single words
            # only against the stopword-filtered word set.
            hit = (kw in norm_tok) if is_phrase else (kw in meaningful)
            if not hit:
                continue
            total += 2 if is_phrase else 1
            if kw in boosted:
                total += 2

        if total > top:
            winner, top = category, total

    return np.nan if top < 1 else winner

# ------------------------------------------------------------------
# 3b. Meta rules for patterns (e.g., portable toilets → Other)
# ------------------------------------------------------------------
# All term collections below are tested with `term in text`, i.e. as plain
# substrings of the token, not as whole words.

PORTABLE_TOILET_TERMS = [
    "portable toilet", "portable toilets",
    "porta potty", "porta potties", "port a potty",
    "porta john", "port a john", "porta johns",
    "portable restroom", "portable restrooms",
    "portable rest room", "portable rest rooms",
]

SCRAP_TERMS = [
    "scrap",
    "recycle", "recycling", "recyclable", "recyclables",
]

SEPTIC_TERMS = [
    "septic", "septc",
    "leachate", "effluent", "slurry", "condensate",
]

# Stem shared by 'landscape', 'landscaping', 'landscaper', ...
LANDSCAPE_DEBRIS_PATTERNS = ["landscap"]

ASPHALT_TERMS = {
    # "asphalt" and observed misspellings / truncations
    "asphalt", "asphault", "ashpalt", "asphlat",
    "aspahlt", "ashphalt", "ashhalt", "aphalt",
    "asplt", "asphl", "aspalt", "asph",
    "asphant", "aspht", "aspha", "asphart",
    "asp", "aspal", "ashp", "ashpl",
    # paving vocabulary
    "blacktop", "black top",
    "hot mix", "hotmix",
    "sealcoat", "seal coat", "sealcoating",
    "paving", "pavement",
}

AGGREGATE_TERMS = {
    "dirt", "sand", "gravel", "stone", "rock",
    "aggregate", "aggregates",
    "topsoil", "top soil",
    "fill", "fill dirt",
    "limestone", "limerock",
}

LANDSCAPE_HINT_TERMS = {
    "landscape", "landscaping",
    "yard", "lawn", "garden", "nursery",
    "mulch", "mul", "sod",
    "tree", "trees", "shrub", "shrubs",
    "soil", "soils",
}

# Includes truncated spellings ("... wate")
PRODUCED_WATER_PATTERNS = [
    "produce wate",
    "produced wate",
    "produced water",
    "produce water",
]

def meta_rule_category(norm_tok: str):
    """
    Hand-tuned pattern rules for clear-cut cases. The first rule that fires
    wins; rules are evaluated in this order:

      0. produced-water variants                      → "Oilfield Equipment"
      1. scrap / recycling terms                      → "Recyclables"
      2. septic / leachate / effluent / slurry / ...  → "Liquids/Gases"
      3. 'landscap*' together with debris/waste/trash → "Landscaping/Lawn Care"
         any landscaping hint term                    → "Landscaping/Lawn Care"
      4. portable-toilet phrases                      → "Other"
         'clean up' / 'cleanup'                       → "Garbage/Refuse"
      5. asphalt / hot mix / sealcoat / paving        → "Roads/Paving"
         'oil field waste'                            → "Oilfield Equipment"
      6. aggregate terms (dirt/sand/rock/gravel/...)  → "Commodities Dry Bulk"
      7. 'chemical' plus a non-haz marker             → "Chemicals"

    Returns np.nan for non-string / blank input and when no rule fires.

    NOTE(review): every term is matched as a raw substring, so short terms
    also hit inside unrelated words ('asp' in 'asparagus', 'mul' in
    'multiple', 'sod' in 'soda', 'tree' in 'street'). Any token containing
    'soil' (including 'topsoil') is claimed by the landscaping hint rule
    before the aggregate rule can see it.
    """
    if not isinstance(norm_tok, str) or not norm_tok.strip():
        return np.nan

    def mentions(terms) -> bool:
        """True when at least one term occurs as a substring of the token."""
        return any(term in norm_tok for term in terms)

    # --- 0) Oilfield produced water variants ---------------------------------
    if mentions(PRODUCED_WATER_PATTERNS):
        return "Oilfield Equipment"

    # --- 1) Scrap / recycling -------------------------------------------------
    if mentions(SCRAP_TERMS):
        return "Recyclables"

    # --- 2) Septic / leachate / effluent / slurry ----------------------------
    if mentions(SEPTIC_TERMS):
        return "Liquids/Gases"

    # --- 3) Landscaping -------------------------------------------------------
    # The stem check catches spellings such as 'landscap debris' that the
    # hint terms would miss.
    if mentions(LANDSCAPE_DEBRIS_PATTERNS) and mentions(
        ("debris", "debri", "waste", "trash")
    ):
        return "Landscaping/Lawn Care"

    if mentions(LANDSCAPE_HINT_TERMS):
        return "Landscaping/Lawn Care"

    # --- 4) Portable toilets → Other; clean-up work → Garbage/Refuse ----------
    if mentions(PORTABLE_TOILET_TERMS):
        return "Other"

    if mentions(("clean up", "cleanup")):
        return "Garbage/Refuse"

    # --- 5) Asphalt / paving → Roads/Paving ----------------------------------
    # e.g. "dirt rock sand asphlat", "dirt rock sand aphalt", etc.
    if mentions(ASPHALT_TERMS):
        return "Roads/Paving"

    if "oil field waste" in norm_tok:
        return "Oilfield Equipment"

    # --- 6) Aggregate-only mixes → Commodities Dry Bulk ----------------------
    # The asphalt / landscaping exclusions are already guaranteed by the early
    # returns above; they are kept so the rule stays correct if reordered.
    if mentions(AGGREGATE_TERMS) and not (
        mentions(ASPHALT_TERMS) or mentions(LANDSCAPE_HINT_TERMS)
    ):
        return "Commodities Dry Bulk"

    # --- 7) Non-haz chemicals → Chemicals ------------------------------------
    if "chemical" in norm_tok and mentions(("non haz", "non-haz", "nonhaz")):
        return "Chemicals"

    return np.nan

# ------------------------------------------------------------------
# 4. FINAL category assignment — optimized (per unique token)
# ------------------------------------------------------------------
def assign_final_category_for_token(norm_tok: str, basic_category_map: dict, fallback="Other"):
    """
    Resolve the final category of one normalized token.

    Priority (first hit wins):
      1) Manual EXACT token → category
      2) Manual FUZZY token (non-generic core)
      3) Meta rules
      4) FMCSA basic category (never "Mobile Homes")
      5) Keyword/substring match
      6) Purely generic tokens → "Null"
      7) Everything else → `fallback`

    Parameters
    ----------
    norm_tok : str
        Normalized token; surrounding whitespace is stripped here.
    basic_category_map : dict
        Precomputed {token -> FMCSA basic category}; missing tokens are
        treated as having no basic category.
    fallback : object, default "Other"
        Value returned when no rule matches. The default keeps the existing
        behaviour of bucketing unknown tokens as "Other". Pass `np.nan` to
        leave unknown tokens uncategorized instead (e.g. while fine-tuning
        the rules, so unmatched tokens can be inspected).

    Returns
    -------
    A category label, or np.nan when `norm_tok` is not a string or is blank.

    NOTE:
      - Earlier documentation of this function stated that unknown tokens
        were left as NaN and that "Other" was no longer a universal fallback,
        while the code returned "Other". The fallback is now explicit and
        documented; its default matches what the code actually did.
      - Because fuzzy matching (2) runs before the meta rules (3), a meta
        rule can only fire when neither an exact nor a fuzzy manual match
        was found.
    """
    if not isinstance(norm_tok, str):
        return np.nan
    norm_tok = norm_tok.strip()
    if not norm_tok:
        return np.nan

    # 1) Manual exact
    manual_exact = MANUAL_TOKEN_TO_CATEGORY.get(norm_tok)
    if manual_exact is not None:
        return manual_exact

    # 2) Manual fuzzy
    fuzzy_manual = fuzzy_manual_category(norm_tok)
    if pd.notna(fuzzy_manual):
        return fuzzy_manual

    # 3) Meta rules
    meta_cat = meta_rule_category(norm_tok)
    if pd.notna(meta_cat):
        return meta_cat

    # 4) FMCSA basic category
    basic_cat = basic_category_map.get(norm_tok, np.nan)

    # --- Do NOT allow FMCSA to assign "Mobile Homes" ---
    if basic_cat == "Mobile Homes":
        basic_cat = np.nan

    if pd.notna(basic_cat):
        return basic_cat

    # 5) Keyword-based
    cat = keyword_match_category(norm_tok)
    if pd.notna(cat):
        return cat

    # 6) Purely generic tokens → Null
    if is_all_generic(norm_tok):
        return "Null"

    # 7) Nothing matched: hand back the caller-selected fallback
    return fallback

# ------------------------------------------------------------------
# 5. Apply categorization to tokens_df (vectorized via unique tokens)
# ------------------------------------------------------------------

# Coerce norm_token to trimmed strings so lookups use the same form as the maps
tokens_df["norm_token"] = tokens_df["norm_token"].astype(str).str.strip()

# Distinct non-empty tokens, in order of first appearance. Categorizing each
# distinct token once is what keeps the fuzzy / keyword passes affordable.
non_empty_tokens = tokens_df["norm_token"].replace("", np.nan).dropna()
unique_tokens = non_empty_tokens.unique()

# FMCSA basic category, computed once per distinct token
basic_category_map = dict(
    zip(unique_tokens, map(assign_basic_category, unique_tokens))
)

# Broadcast the basic category to every row (empty tokens map to NaN)
tokens_df["basic_category"] = tokens_df["norm_token"].map(basic_category_map)

# Final category, again once per distinct token, reusing basic_category_map
final_category_map = dict(
    (tok, assign_final_category_for_token(tok, basic_category_map))
    for tok in unique_tokens
)

# Broadcast the final category to every row (empty tokens map to NaN)
tokens_df["final_category"] = tokens_df["norm_token"].map(final_category_map)

In [479]:
# ----------------------------------------
# 7. Aggregate to DOT-level categories
# ----------------------------------------

def aggregate_categories(cats: pd.Series) -> str:
    """
    Collapse one DOT's final_category values into a single label string.

    - Missing values are ignored and duplicates are collapsed.
    - The 'Null' placeholder survives only when it is the sole category;
      alongside any other category it is dropped.
    - Remaining labels are sorted and joined with " | ".

    Returns NaN when there is no non-missing category at all.
    """
    labels = set()
    for value in cats:
        if pd.notna(value):
            labels.add(value)

    # Nothing usable for this DOT
    if len(labels) == 0:
        return np.nan

    # Unless 'Null' is the only label, remove it (no-op when it is absent)
    if labels != {"Null"}:
        labels.discard("Null")

    # Sorted so the output is stable across runs
    return " | ".join(sorted(labels))


# Restrict to token rows that received a category (this includes 'Null')
assigned = tokens_df[tokens_df["final_category"].notna()].copy()

# One aggregated category string per DOT number
cargo_by_dot = (
    assigned
    .groupby("dot_number")["final_category"]
    .apply(aggregate_categories)
    .rename("cargo_categorized")
    .reset_index()
)

# On a rerun cargo_df already carries cargo_categorized; drop it (if present)
# so the merge below does not produce _x/_y suffixed columns
cargo_df = cargo_df.drop(columns=["cargo_categorized"], errors="ignore")

# Left-join so every original cargo_df row is kept, categorized or not
cargo_df = pd.merge(cargo_df, cargo_by_dot, on="dot_number", how="left")

# Preview the new 'cargo_categorized' column next to its source columns
cargo_df.loc[:, ["dot_number", "cargo_carried", "cargo_categorized"]].head()


Unnamed: 0,dot_number,cargo_carried,cargo_categorized
0,1,General Freight,General Freight
1,10000,Liquids/Gases,Liquids/Gases
2,1000000,"Metal; Sheets, Coils, Rolls, Logs, Poles, Beam...",Building Materials | Construction | Garbage/Re...
3,1000002,"Metal; Sheets, Coils, Rolls, Construction","Construction | Metal: sheets, coils, rolls"
4,1000004,"Grain, Feed, Hay, Paper Products, Farm Supplies","Agricultural/Farm Supplies | Grain, Feed, Hay ..."


In [481]:
# Persist cargo_df (with the merged 'cargo_categorized' column) to a
# snappy-compressed Parquet file, written with the pyarrow engine.
cargo_df.to_parquet('cargo_with_categories.parquet', engine='pyarrow', compression='snappy')

In [None]:
### EVERYTHING BELOW IS MEANT FOR FINE-TUNING

In [466]:
# ----------------------------------------
# 8. Tokens not yet assigned to any category
# ----------------------------------------

# Row-level mask: True where a token row has a non-missing final_category.
# NOTE(review): the categorizer's fallbacks return the strings "Other" and
# "Null" rather than NaN, so those rows count as categorized here and only
# tokens skipped by the categorization step (empty strings) show up as
# unassigned — confirm this is the intended definition of coverage.
has_category = tokens_df["final_category"].notna()

# Row-level coverage (each row is one DOT–token instance)
total_token_rows = len(tokens_df)
categorized_token_rows = int(has_category.sum())
pct_rows_categorized = (categorized_token_rows / total_token_rows) * 100

# Token-level coverage over distinct normalized tokens
unique_tokens_total = tokens_df["norm_token"].nunique()
unique_tokens_assigned = tokens_df.loc[has_category, "norm_token"].nunique()
unique_tokens_unassigned = unique_tokens_total - unique_tokens_assigned
pct_unique_unassigned = (unique_tokens_unassigned / unique_tokens_total) * 100
pct_unique_assigned = (unique_tokens_assigned / unique_tokens_total) * 100

# Rows still lacking a category, summarized as distinct DOT count per token
unassigned_tokens = tokens_df[~has_category].copy()

unassigned_summary = (
    unassigned_tokens
    .groupby("norm_token")["dot_number"]
    .nunique()
    .rename("dot_count")
    .reset_index()
    .sort_values("dot_count", ascending=False)
)

# ------------------------------
# PRINT SUMMARY STATS
# ------------------------------
print("\n================ CATEGORY COVERAGE SUMMARY ================")
print(f"Total token rows:                           {total_token_rows:,}")
print(f"Token rows categorized:                     {categorized_token_rows:,}")
print(f"Percent token rows categorized:             {pct_rows_categorized:,.2f}%")
print("------------------------------------------------------------")
print(f"Unique tokens:                              {unique_tokens_total:,}")
print(f"Unique tokens assigned:                     {unique_tokens_assigned:,}")
print(f"Unique tokens unassigned:                   {unique_tokens_unassigned:,}")
print(f"Percent unique tokens assigned:             {pct_unique_assigned:,.2f}%")
print(f"Percent unique tokens unassigned:           {pct_unique_unassigned:,.2f}%")
print("============================================================\n")

print("Top unassigned tokens:")
display(unassigned_summary.head(50))

# Save the full unassigned-token table for offline review
unassigned_summary.to_csv("unassigned_summary.csv", index=False)


Total token rows:                           6,169,287
Token rows categorized:                     6,169,280
Percent token rows categorized:             100.00%
------------------------------------------------------------
Unique tokens:                              72,354
Unique tokens assigned:                     72,353
Unique tokens unassigned:                   1
Percent unique tokens assigned:             100.00%
Percent unique tokens unassigned:           0.00%

Top unassigned tokens:


Unnamed: 0,norm_token,dot_count
0,,7


In [467]:
# ----------------------------------------
# Token → final_category (one row per token)
# ----------------------------------------
# Columns to export; basic_category is inserted before final_category only
# when that column exists on tokens_df
cols = ["raw_token", "norm_token", "final_category"]
if "basic_category" in tokens_df.columns:
    cols.insert(2, "basic_category")

# Distinct combinations of the selected columns, ordered by category then token
token_category_summary = (
    tokens_df[cols]
    .drop_duplicates()
    .sort_values(["final_category", "norm_token"])
)

# Write the CSV first: a notebook cell only auto-displays its LAST expression,
# so the preview must come after the save or it is silently discarded.
token_category_summary.to_csv("token_category_summary.csv", index=False)

token_category_summary.head(50)

In [468]:
# ==========================================================
# CATEGORY COVERAGE COUNTS
# ==========================================================

# Per-category counts (groupby's default excludes rows whose final_category is NaN)
category_counts = tokens_df.groupby("final_category").agg(
    # total token rows in this category
    row_count=pd.NamedAgg(column="norm_token", aggfunc="size"),
    # distinct DOTs in this category
    dot_count=pd.NamedAgg(column="dot_number", aggfunc="nunique"),
    # distinct tokens in this category
    unique_tokens=pd.NamedAgg(column="norm_token", aggfunc="nunique"),
)
# Turn final_category back into a column, then put the largest categories first
category_counts = category_counts.reset_index()
category_counts = category_counts.sort_values("row_count", ascending=False)

print("Category counts (sorted by row_count):")
display(category_counts)

# Optionally save to CSV
# category_counts.to_csv("category_counts.csv", index=False)

Category counts (sorted by row_count):


Unnamed: 0,final_category,row_count,dot_count,unique_tokens
11,General Freight,898730,897763,217
18,"Logs, Poles, Beams, Lumber",705421,179371,819
12,"Grain, Feed, Hay",547830,183189,690
19,"Machinery, Large Objects",531610,270324,3289
22,"Metal: sheets, coils, rolls",430539,109997,1195
2,Building Materials,388398,375720,9003
6,Construction,357569,356772,847
5,Commodities Dry Bulk,280048,199733,7416
10,Garbage/Refuse,266432,94156,1983
24,Motor Vehicles,171362,168200,2589


In [455]:
# ==========================================================
# AUDIT: FIND TOKEN MISCLASSIFICATIONS / CATEGORY CONFLICTS
# (ignores "Other" and "Null")
# ==========================================================
import difflib

# Work on a copy of the token summary so the audit cannot modify it
df = token_category_summary.copy()

# Keep only rows with a real category: not missing, and not Other / Null
exclude = {"Other", "Null"}
is_real_category = df["final_category"].notna() & ~df["final_category"].isin(exclude)
df2 = df[is_real_category].copy()

# Fixed-seed random sample of at most 20,000 rows
sample_size = min(20000, len(df2))
sample = df2.sample(n=sample_size, random_state=42)

# norm_token -> final category lookup for the sampled rows
mapping = dict(zip(sample['norm_token'], sample['final_category']))
tokens = list(mapping.keys())

issues = []

for t in tokens:
    cat_t = mapping[t]
    # Near-identical spellings among the sampled tokens (similarity >= 0.9, at most 6)
    for m in difflib.get_close_matches(t, tokens, n=6, cutoff=0.9):
        if m == t:
            continue  # a token always matches itself
        cat_m = mapping[m]
        # Only look-alike tokens that landed in different categories are conflicts
        if cat_t != cat_m:
            issues.append((t, cat_t, m, cat_m))

# One row per conflicting (token_a, token_b) pair, grouped by token_a's category
issues_df = pd.DataFrame(
    issues,
    columns=["token_a", "category_a", "token_b", "category_b"],
)
issues_df = issues_df.drop_duplicates().sort_values(["category_a", "token_a"])

print(f"Total conflicts found (excluding Other/Null): {len(issues_df)}")
display(issues_df)

# Optionally save
# issues_df.to_csv("token_conflicts_filtered.csv", index=False)


Total conflicts found (excluding Other/Null): 622


Unnamed: 0,token_a,category_a,token_b,category_b
435,agri lime,Agricultural/Farm Supplies,agr lime,Commodities Dry Bulk
436,agri lime,Agricultural/Farm Supplies,agi lime,Commodities Dry Bulk
232,agricultral products,Agricultural/Farm Supplies,agricultural produce,Fresh Produce
158,agricultural c,Agricultural/Farm Supplies,agricultural rock,Commodities Dry Bulk
227,agricultural pro,Agricultural/Farm Supplies,agricultural rock,Commodities Dry Bulk
...,...,...,...,...
118,mechanical service truck,Utilities,mechanic service truck,Drive/Tow away
119,mechanical service truck,Utilities,mechanic service trk,Drive/Tow away
69,mechanics service truck,Utilities,mechanic service truck,Drive/Tow away
70,mechanics service truck,Utilities,mechanic service trk,Drive/Tow away


In [457]:
#issues_df.to_csv("token_conflicts_3000sample.csv", index=False)