In [1]:
# === Imports: Core libraries and ML components used throughout the pipeline ===
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# 1. Load Annotated Samples and Full Dataset

We load the two manually annotated sample datasets and the full FMCSA carrier dataset.
The annotated samples serve as ground-truth labels, and the full dataset is used later
for production scoring.

In [2]:
# === Load source datasets: annotated samples and original full dataset ===
df1 = pd.read_csv('sample_annotated_400.csv')
df2 = pd.read_csv('sample_annotated_100.csv')
df3 = pd.read_csv('sample_annotated_494.csv')
df_ori = pd.read_parquet('transportation_data_20250917_222245.parquet')
cargo = pd.read_parquet('cargo_multi_hot_fast.parquet')

In [3]:
# === Identify common feature columns shared by both annotated sample datasets ===
common_cols = list(set(df1.columns) & set(df2.columns) & set(df3.columns))
print("Common sample columns:", common_cols)

Common sample columns: ['vmt_source_id', 'dot_number', 'indian_tribe', 'mailing_state', 'fax', 'phy_street', 'phy_country', 'exempt_for_hire', 'federal_government', 'nbr_power_unit', 'legal_name', 'private_only', 'private_passenger_business', 'mailing_country', 'oic_state', 'state_government', 'op_other', 'mcs150_date', 'private_passenger_nonbusiness', 'add_date', 'pc_flag', 'recent_mileage', 'private_property', 'mailing_city', 'mailing_zip', 'phy_state', 'phy_zip', 'mcs150_mileage', 'phy_city', 'local_government', 'dba_name', 'mailing_street', 'email_address', 'migrant', 'recent_mileage_year', 'us_mail', 'expert_label', 'mcs150_mileage_year', 'driver_total', 'telephone', 'authorized_for_hire']


In [4]:
# === Append three manually selected DOT samples and align their columns/labels ===
dot_numbers = [3493401, 759281, 2030937]
subset_df = df_ori[df_ori['dot_number'].isin(dot_numbers)].copy()
sample_cols = common_cols

for col in sample_cols:
    if col not in subset_df.columns:
        subset_df[col] = None

subset_df = subset_df[sample_cols]

label_mapping = {
    3493401: 'GOOD',
    759281: 'BAD',
    2030937: 'GOOD'
}
subset_df['expert_label'] = subset_df['dot_number'].map(label_mapping)


# 2. Combine Annotated Samples and Manually Selected DOT Entries

We unify the schema across the two annotated samples and append three additional
DOT-number samples selected for coverage of rare carrier types.  
This produces a consolidated labeled dataset for model training.


In [5]:
# === Merge annotated samples and manually added DOT entries on aligned columns ===
df1_filtered = df1[common_cols]
df2_filtered = df2[common_cols]
df3_filtered = df3[common_cols]

merged_df = pd.concat([df1_filtered, df2_filtered, df3_filtered, subset_df], axis=0, ignore_index=True)
merged_df = merged_df.merge(cargo, on="dot_number", how="left")

print("Merged (503 rows):", merged_df.shape)

Merged (503 rows): (997, 77)


In [6]:
# === Enrich merged samples with selected attributes from the full dataset ===
parquet_subset = df_ori[['dot_number', 'hm_flag', 'carrier_operation']].drop_duplicates('dot_number')

merged_df = merged_df.merge(parquet_subset, on='dot_number', how='left')

print("After adding hm_flag + operation:", merged_df.shape)
print("hm_flag exists?", 'hm_flag' in merged_df.columns)
print("carrier_operation exists?", 'carrier_operation' in merged_df.columns)

After adding hm_flag + operation: (997, 79)
hm_flag exists? True
carrier_operation exists? True


# 3. Feature Preprocessing

This section performs preprocessing steps to clean and standardize the features:
- remove identifier and non-predictive columns  
- convert date columns into numerical "days since" values  
- normalize boolean-like fields into binary indicators (0/1)  
- standardize expert labels to a binary target (bad = 0)

These steps ensure a consistent, numeric feature space suitable for modeling.


In [7]:
# === Remove identifier and non-predictive columns from the merged dataset ===
drop_cols = [
    'dot_number', 'legal_name', 'dba_name',
    'telephone', 'fax', 'email_address',
    'phy_street', 'phy_city', 'phy_zip', 'phy_country',
    'mailing_street', 'mailing_city', 'mailing_zip', 'mailing_country'
]

df = merged_df.drop(columns=drop_cols, errors='ignore')

In [8]:
# === Convert date columns into numerical 'days since' features ===
for col in ['mcs150_date', 'add_date']:
    df[col] = pd.to_datetime(df[col], errors='coerce')
    df[col] = (pd.Timestamp.today() - df[col]).dt.days

  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')


In [9]:
# === Normalize boolean-like categorical fields into binary 0/1 features ===
bool_cols = [
    'authorized_for_hire', 'exempt_for_hire', 'private_only', 'private_property',
    'private_passenger_business', 'private_passenger_nonbusiness', 'migrant', 'us_mail',
    'federal_government', 'state_government', 'local_government',
    'indian_tribe', 'op_other', 'pc_flag', 'hm_flag'
]

for col in bool_cols:
    if col in df.columns:
        df[col] = (
            df[col]
            .astype(str)
            .str.strip()
            .str.lower()
            .map({'true': 1, 'false': 0})
            .fillna(0)
            .astype(int)
        )

In [10]:
# === Standardize expert labels into a binary target variable (0 = bad) ===
df['expert_label'] = (
    df['expert_label']
    .astype(str)
    .str.strip()
    .str.upper()
    .map(lambda x: 1 if x in ['GOOD', 'GREAT', 'OK'] else 0)
)

# 4. Leakage-Free Target Encoding (State Variables)

We apply stratified K-fold target encoding with smoothing to state-related categorical
columns.  
This prevents label leakage and produces robust numerical embeddings that combine
category-level behavior with global mean smoothing.


In [11]:
# === K-Fold target encoding (leakage-free) with smoothing for categorical variables ===
def kfold_target_encoding(df, col, target, n_splits=5, smoothing=5):
    """
    Perform leakage-free K-fold target encoding with smoothing for a categorical column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe containing both features and target.
    col : str
        Categorical column to be target-encoded.
    target : str
        Name of the binary target variable.
    n_splits : int, default=5
        Number of folds for stratified K-fold splitting.
    smoothing : int or float, default=5
        Smoothing factor balancing category mean and global mean.

    Returns
    -------
    oof : pandas.Series
        Out-of-fold target-encoded values for training (no leakage).
    final_map : dict
        Full-sample encoding dictionary used for inference / production deployment.
    """
    global_mean = df[target].mean()
    oof = pd.Series(index=df.index, dtype=float)

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    for train_idx, val_idx in kf.split(df, df[target]):
        df_train = df.iloc[train_idx]
        df_val = df.iloc[val_idx]

        stats = df_train.groupby(col)[target].agg(['count', 'mean'])
        stats['te'] = (stats['count'] * stats['mean'] + smoothing * global_mean) / \
                      (stats['count'] + smoothing)

        oof.iloc[val_idx] = df_val[col].map(stats['te']).fillna(global_mean)

    # Final encoding map for production inference
    full_stats = df.groupby(col)[target].agg(['count', 'mean'])
    full_stats['te'] = (full_stats['count'] * full_stats['mean'] + smoothing * global_mean) / \
                       (full_stats['count'] + smoothing)

    final_map = full_stats['te'].to_dict()
    final_map["__GLOBAL__"] = global_mean

    return oof, final_map

In [12]:
# === Apply K-fold target encoding to state-related categorical columns ===
state_cols = ['phy_state', 'mailing_state', 'oic_state']
target_cols = state_cols

te_train = pd.DataFrame(index=df.index)
te_maps = {}

for col in target_cols:
    print("Encoding:", col)
    oof_te, final_map = kfold_target_encoding(df, col, 'expert_label', n_splits=5, smoothing=5)
    te_train[col + "_TE"] = oof_te
    te_maps[col] = final_map

# Replace original state columns with their target-encoded versions
df_te = df.drop(columns=target_cols)
df_te = pd.concat([df_te, te_train], axis=1)

print("df_te shape:", df_te.shape)

Encoding: phy_state
Encoding: mailing_state
Encoding: oic_state
df_te shape: (997, 65)


# 5. Construct Final Training Dataset (X, y)

After applying target encoding and one-hot encoding, we assemble the final feature
matrix (X) and binary target vector (y).  
This dataset is used for both cross-validation and hold-out evaluation.


In [13]:
# === One-hot encode the carrier_operation field ===
df_te = df_te.fillna(0)

if 'carrier_operation' in df_te.columns:
    co_ohe = pd.get_dummies(
        df_te['carrier_operation'].astype(str).fillna("MISSING"),
        prefix='co'
    )
    df_te = pd.concat(
        [df_te.drop(columns=['carrier_operation']), co_ohe],
        axis=1
    )

In [14]:
# === Construct final feature matrix (X) and target vector (y) ===
X = df_te.drop(columns=['expert_label'])
y = df_te['expert_label']

print("Final X shape:", X.shape)
print("Final y balance:")
print(y.value_counts())

Final X shape: (997, 67)
Final y balance:
expert_label
0    576
1    421
Name: count, dtype: int64


In [15]:
# === Create hold-out test split and define stratified cross-validation scheme ===
from sklearn.model_selection import train_test_split, StratifiedKFold

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 6. Logistic Regression Model with Hyperparameter Tuning

We train a logistic regression model using a full preprocessing pipeline and
perform grid search over L1/L2 penalties and regularization strengths.  
A stratified hold-out test split is used to measure generalization performance.


In [16]:
# === Logistic regression pipeline and hyperparameter tuning via GridSearchCV ===
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(max_iter=6000))
])

param_grid = {
    'lr__C': [0.01, 0.1, 1, 3, 10],
    'lr__penalty': ['l1', 'l2'],
    'lr__solver': ['liblinear']
}

grid = GridSearchCV(
    pipe,
    param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best AUC:", grid.best_score_)
print("Best Params:", grid.best_params_)

Best AUC: 0.881570408825438
Best Params: {'lr__C': 1, 'lr__penalty': 'l1', 'lr__solver': 'liblinear'}


In [17]:
# === Evaluate the best logistic regression model on the hold-out test set ===
from sklearn.metrics import roc_auc_score

test_pred = grid.best_estimator_.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, test_pred)

print("Hold-out Test AUC:", test_auc)

Hold-out Test AUC: 0.8748460591133005


# 7. Train Final Production Model and Extract Feature Importance

Using the best hyperparameters found during cross-validation, we retrain the model
on the entire labeled dataset to obtain the final production model.  
We then compute feature importances (raw coefficients, absolute magnitudes, and
normalized percentages) to support interpretability and dashboard visualization.


In [18]:
# === Build final training dataset using full data and production-ready TE mappings ===

df_final = df.copy()

# Apply target encoding maps (learned from training stage) to full dataset
for col in target_cols:
    mapping = te_maps[col]
    df_final[col + "_TE"] = df_final[col].astype(str).map(mapping).fillna(mapping["__GLOBAL__"])

df_final = df_final.drop(columns=target_cols)

# One-hot encode carrier_operation in the final dataset (must match training schema)
if 'carrier_operation' in df_final.columns:
    co_ohe_final = pd.get_dummies(
        df_final['carrier_operation'].astype(str).fillna("MISSING"),
        prefix='co'
    )
    df_final = pd.concat(
        [df_final.drop(columns=['carrier_operation']), co_ohe_final],
        axis=1
    )

df_final = df_final.fillna(0)

# Final feature matrix and target for full-data model training
X_final = df_final.drop(columns=['expert_label'])
y_final = df_final['expert_label']

feature_cols = X_final.columns.tolist()

# Train logistic regression with best hyperparameters on full dataset
final_model = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(
        C=grid.best_params_['lr__C'],
        penalty=grid.best_params_['lr__penalty'],
        solver='liblinear',
        max_iter=8000
    ))
])

final_model.fit(X_final, y_final)

In [19]:
pickle.dump(final_model, open("final_model.pkl", "wb"))
pickle.dump(feature_cols, open("feature_cols.pkl", "wb"))
pickle.dump(te_maps, open("te_maps.pkl", "wb"))
pickle.dump(target_cols, open("target_cols.pkl", "wb"))

In [20]:
# === Extract feature importance (raw and absolute coefficients) from the final LR model ===
lr_model = final_model.named_steps['lr']
coefs = lr_model.coef_[0]

feature_importance = (
    pd.DataFrame({
        'feature': feature_cols,
        'coef': coefs,
        'abs_coef': np.abs(coefs)
    })
    .sort_values('abs_coef', ascending=False)
)

In [21]:
# === Normalize feature importance and assign color tags for dashboard visualization ===
total_abs = feature_importance['abs_coef'].sum()

feature_importance['percentage'] = (
    feature_importance['abs_coef'] / total_abs * 100
)

def coef_to_color(c):
    if c > 0:
        return 'green'
    elif c < 0:
        return 'red'
    else:
        return 'grey'

feature_importance['color'] = feature_importance['coef'].apply(coef_to_color)

print(feature_importance.head(20))

                          feature      coef  abs_coef  percentage  color
38               Grain, Feed, Hay -1.318602  1.318602    7.522585    red
44           Commodities Dry Bulk -0.994798  0.994798    5.675291    red
65                           co_C  0.969003  0.969003    5.528129  green
45              Refrigerated Food -0.884646  0.884646    5.046877    red
32                  Fresh Produce -0.858099  0.858099    4.895425    red
35                     Passengers -0.854322  0.854322    4.873880    red
26                 Motor Vehicles -0.833832  0.833832    4.756984    red
49     Agricultural/Farm Supplies -0.777585  0.777585    4.436097    red
12                        pc_flag -0.688094  0.688094    3.925555    red
41                 Garbage/Refuse -0.550758  0.550758    3.142060    red
37                      Livestock -0.541266  0.541266    3.087904    red
36             Oilfield Equipment -0.468293  0.468293    2.671598    red
22            authorized_for_hire  0.429645  0.4296