In [2]:
# === Imports: Core libraries and ML components used throughout the pipeline ===
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# 8. Apply Production Model to Full Dataset

We preprocess the full dataset to match the exact training schema, apply the
production target-encoding maps, enforce column alignment, and generate ML scores
for downstream dashboard integration.


In [3]:
# === Load the full dataset and join the cargo table for ML score generation ===
cargo = pd.read_parquet('cargo_multi_hot_fast.parquet')
df_full = pd.read_parquet("transportation_data_20250917_222245.parquet")

# merge() returns a new frame, so df_full itself is left untouched.
# The left join keeps every row of df_full even when no cargo row matches.
df_pred = df_full.merge(cargo, on="dot_number", how="left")

# Snapshot the identifiers now; they are re-attached to the scores at export time.
dot_numbers_raw = df_pred['dot_number'].copy()

In [4]:
# === Load the persisted model and the artifacts needed to rebuild its inputs ===
# NOTE: unpickling can execute arbitrary code, so these files must come from a
# trusted source (this project's own training run), never from external input.
def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    The file is opened in a `with` block so the handle is closed as soon as the
    object has been read, even if unpickling raises.
    """
    with open(path, "rb") as fh:
        return pickle.load(fh)


model = _load_pickle("final_model.pkl")
feature_cols = _load_pickle("feature_cols.pkl")
te_maps = _load_pickle("te_maps.pkl")
target_cols = _load_pickle("target_cols.pkl")

In [5]:
# === Apply production-ready target encoding maps to new prediction data ===
def apply_final_te(df_pred, te_maps, target_cols):
    """
    Add a target-encoded companion column for each requested categorical column.

    For every name in `target_cols`, the column's values are cast to strings,
    looked up in `te_maps[name]`, and written to a new column `<name>_TE`.
    Values with no entry in the map receive the map's "__GLOBAL__" value.
    The input dataframe is not modified; a copy is returned.

    Parameters
    ----------
    df_pred : pandas.DataFrame
        Dataframe to encode. Must contain every column in `target_cols`.
    te_maps : dict
        Per-column lookup tables, keyed by column name. Each table must hold
        a "__GLOBAL__" entry used as the fallback value.
    target_cols : list
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        Copy of `df_pred` with one extra `<name>_TE` column per target column.
        The original categorical columns are kept.
    """
    encoded = df_pred.copy()
    for name in target_cols:
        lookup = te_maps[name]
        # Values are stringified before the lookup, so the map keys are
        # matched as strings.
        as_text = encoded[name].astype(str)
        looked_up = as_text.map(lookup)
        encoded[name + "_TE"] = looked_up.fillna(lookup["__GLOBAL__"])
    return encoded

In [6]:
# === Preprocess prediction data to align with the training feature schema ===
# NOTE(review): this cell overwrites df_pred in place, so it is not safe to run
# twice on the same kernel state (the date columns would already be integers on
# the second pass). Re-run from the data-loading cell instead.

# Drop the same identifier and non-predictive columns used in training
drop_cols = [
    'legal_name', 'dba_name', 'telephone', 'fax', 'email_address',
    'phy_street', 'phy_city', 'phy_zip', 'phy_country',
    'mailing_street', 'mailing_city', 'mailing_zip', 'mailing_country'
]
df_pred = df_pred.drop(columns=[c for c in drop_cols if c in df_pred.columns], errors='ignore')

# Convert date fields to numerical 'days since' representation.
# One reference timestamp is taken up front so both columns are measured from
# the same instant. Unparseable dates become NaT and therefore missing days.
# NOTE(review): the reference is the run date, so these features drift with the
# day the notebook is executed; confirm this matches how training computed them.
reference_ts = pd.Timestamp.today()
for col in ['mcs150_date', 'add_date']:
    if col in df_pred.columns:
        df_pred[col] = pd.to_datetime(df_pred[col], errors='coerce')
        df_pred[col] = (reference_ts - df_pred[col]).dt.days

# Normalize boolean-like fields into binary 0/1 format
bool_cols = [
    'authorized_for_hire', 'exempt_for_hire', 'private_only', 'private_property',
    'private_passenger_business', 'private_passenger_nonbusiness', 'migrant', 'us_mail',
    'federal_government', 'state_government', 'local_government',
    'indian_tribe', 'op_other', 'pc_flag', 'hm_flag'
]

for col in bool_cols:
    if col in df_pred.columns:
        # Values are compared as trimmed, lower-cased strings; anything that is
        # not one of the four recognised tokens (including missing values)
        # falls back to 0.
        df_pred[col] = (
            df_pred[col]
            .astype(str)
            .str.strip()
            .str.lower()
            .map({'true': 1, 'false': 0, '1': 1, '0': 0})
            .fillna(0)
            .astype(int)
        )


In [7]:
# === Target-encode the prediction data, then discard the raw categorical columns ===
# Only the "<col>_TE" columns produced by apply_final_te are kept for the model.
df_pred2 = apply_final_te(df_pred, te_maps, target_cols).drop(columns=target_cols)

In [8]:
# === One-hot encode carrier_operation in prediction data (consistent with training schema) ===
# Expands `carrier_operation` into indicator columns prefixed with "co_" and
# swaps them in for the original column. The step is skipped when the column
# is not present in df_pred2.
# NOTE(review): `.astype(str)` runs before `.fillna("MISSING")`. On pandas
# versions where astype(str) turns missing values into the string "nan", the
# fillna has nothing left to fill and the indicator is named `co_nan` rather
# than `co_MISSING`. Confirm which name the training pipeline produced before
# changing the order, otherwise the column will not line up with feature_cols.
if 'carrier_operation' in df_pred2.columns:
    co_ohe_pred = pd.get_dummies(
        df_pred2['carrier_operation'].astype(str).fillna("MISSING"),
        prefix='co'
    )
    # Replace the raw column with its indicator columns (joined on the row index).
    df_pred2 = pd.concat(
        [df_pred2.drop(columns=['carrier_operation']), co_ohe_pred],
        axis=1
    )

In [9]:
# === Conform the prediction frame to the exact training feature schema ===
# Any value still missing at this point is replaced with 0.
df_pred2 = df_pred2.fillna(0)

# Training features that are absent from the prediction data are added as
# constant-zero columns.
for col in (name for name in feature_cols if name not in df_pred2.columns):
    df_pred2[col] = 0

# Columns that are not training features are discarded.
extra_cols = [name for name in df_pred2.columns if name not in feature_cols]
df_pred2 = df_pred2.drop(columns=extra_cols, errors='ignore')

# Downcast double-precision columns to single precision.
float_cols = df_pred2.select_dtypes(include=["float64"]).columns
df_pred2[float_cols] = df_pred2[float_cols].astype("float32")

# Put the columns in the order recorded in feature_cols.
df_pred2 = df_pred2[feature_cols]

In [10]:
# === Generate ML scores using the final production model ===
# Score only the training feature columns. Because this cell also writes
# `ml_score` into df_pred2, passing the whole frame would hand the model an
# extra column if the cell were ever run a second time.
# Column 1 of predict_proba is used as the score (presumably the positive
# class; verify against model.classes_).
ml_scores = model.predict_proba(df_pred2[feature_cols])[:, 1]
df_pred2["ml_score"] = ml_scores

# dot_numbers_raw and df_pred2 are paired by their shared row index.
df_output = pd.DataFrame({
    "dot_number": dot_numbers_raw,
    "ml_score": df_pred2["ml_score"],
})

df_output.to_csv("company_ml_scores.csv", index=False)