In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.express as px

In [2]:
# --- Parameter Configuration ---
# Shared thresholds read by the filtering functions in this notebook.
params = dict(
    class_filter=['A'],       # carrier_operation values to keep ('A' = interstate carriers)
    max_trucks=50,            # fleets must report fewer power units than this
    min_dqs=0.6,              # minimum Data Quality Score kept after the DQS join
    exclude_companies=['FEDEX', 'UPS', 'AMAZON', 'WALMART', 'USPS'],  # legal names removed outright
)


In [3]:
# --- Load Data ---
def load_data(file_path: str) -> pd.DataFrame:
    """Read a ``.parquet`` or ``.csv`` file and lower-case its column names.

    Parameters
    ----------
    file_path : str or os.PathLike
        Location of the input file. The extension is matched
        case-insensitively, so ``DATA.CSV`` or ``data.Parquet`` are accepted,
        and ``pathlib.Path`` objects work as well as plain strings.

    Returns
    -------
    pd.DataFrame
        The loaded table with every column name converted to lower case.

    Raises
    ------
    ValueError
        If the extension is neither ``.parquet`` nor ``.csv``.
    """
    # str() lets Path-like arguments through; lower() makes the extension
    # check independent of how the file name happens to be capitalised.
    normalized_name = str(file_path).lower()

    if normalized_name.endswith('.parquet'):
        df = pd.read_parquet(file_path)
    elif normalized_name.endswith('.csv'):
        df = pd.read_csv(file_path)
    else:
        raise ValueError("Unsupported file type. Please provide .parquet or .csv file.")

    # Downstream functions look columns up by their lower-case names.
    df.columns = df.columns.str.lower()
    print(f"Loaded {len(df)} records from {file_path}")
    return df

In [4]:
# --- Base Filtering (Pre-DQS) ---
def base_filter_companies(df: pd.DataFrame, params: dict) -> pd.DataFrame:
    """Apply the initial carrier filters before joining with the DQS dataset.

    Each filter is applied only when the column it needs is present, so a
    frame missing one of the columns passes through that step unchanged.

    Parameters
    ----------
    df : pd.DataFrame
        Carrier records. Columns used when present: ``legal_name``,
        ``carrier_operation`` and ``nbr_power_unit``. The input is not mutated.
    params : dict
        Must provide ``class_filter`` (allowed ``carrier_operation`` values),
        ``max_trucks`` (exclusive upper bound on ``nbr_power_unit``) and
        ``exclude_companies`` (upper-case legal names to drop).

    Returns
    -------
    pd.DataFrame
        The surviving rows, with ``legal_name`` upper-cased and missing names
        replaced by ``'UNKNOWN'``.
    """
    df = df.copy()

    has_legal_name = 'legal_name' in df.columns

    # Normalize company names
    if has_legal_name:
        raw_names = df['legal_name']
        # The missing-value mask is taken from the raw column: astype(str)
        # would otherwise turn None/NaN into the literal text 'None'/'nan',
        # and a later fillna would have nothing left to fill.
        df['legal_name'] = raw_names.astype(str).str.upper().where(raw_names.notna(), 'UNKNOWN')

    # Filter by class
    if 'carrier_operation' in df.columns:
        df = df[df['carrier_operation'].isin(params['class_filter'])]

    # Filter by number of trucks (strictly below the limit; rows with a
    # missing count compare False and are dropped)
    if 'nbr_power_unit' in df.columns:
        df = df[df['nbr_power_unit'] < params['max_trucks']]

    # Exclude self-insured or large carriers. This is an exact match on the
    # normalized name, and is skipped when there is no legal_name column.
    if has_legal_name:
        df = df[~df['legal_name'].isin(params['exclude_companies'])]

    print(f"Base filter applied: {len(df)} companies remain before DQS join.")
    return df

In [5]:
# --- Join with DQS Dataset ---
def join_with_dqs(df_base: pd.DataFrame, dqs_df: pd.DataFrame) -> pd.DataFrame:
    """Left-join DQS records onto the filtered carriers, keyed on ``dot_number``.

    Every row of ``df_base`` is kept; carriers without a match in ``dqs_df``
    get missing values in the DQS columns. When both frames share a column
    name other than the key, the one from ``dqs_df`` is suffixed with
    ``_DQS``.

    Raises
    ------
    KeyError
        If either frame has no ``dot_number`` column.
    """
    join_key = 'dot_number'

    for frame in (df_base, dqs_df):
        if join_key not in frame.columns:
            raise KeyError("dot_number column must exist in both datasets for join.")

    merge_options = dict(on=join_key, how='left', suffixes=('', '_DQS'))
    merged = df_base.merge(dqs_df, **merge_options)

    print(f"Joined with DQS data: {len(merged)} records after merge.")
    return merged

In [6]:
# --- Post-DQS Filtering ---
def filter_by_dqs(df: pd.DataFrame, params: dict) -> pd.DataFrame:
    """Keep only rows whose Data Quality Score meets ``params['min_dqs']``.

    The score column is looked up as ``'DQS'`` first and then
    case-insensitively. ``load_data`` lower-cases every column name, so a
    score column loaded through it arrives as ``'dqs'``; an exact ``'DQS'``
    check alone would never match and the filter would be skipped silently.

    Parameters
    ----------
    df : pd.DataFrame
        Joined carrier/DQS records. The input is not mutated.
    params : dict
        Must provide ``min_dqs``, the inclusive lower bound on the score.

    Returns
    -------
    pd.DataFrame
        A new frame holding the qualifying rows plus a ``selected_flag``
        column set to ``True``. If no DQS column exists, ``df`` itself is
        returned unchanged (without ``selected_flag``).
    """
    if 'DQS' in df.columns:
        dqs_col = 'DQS'
    else:
        # str() guards against non-string column labels.
        dqs_col = next((col for col in df.columns if str(col).lower() == 'dqs'), None)

    if dqs_col is None:
        print("No DQS column found; skipping DQS filtering.")
        return df

    # Rows with a missing score compare False and are dropped. assign()
    # returns a fresh frame, so the flag is never written into a slice of
    # the caller's data (the cause of pandas' SettingWithCopyWarning).
    selected = df[df[dqs_col] >= params['min_dqs']].assign(selected_flag=True)
    print(f"Post-DQS filtering applied: {len(selected)} companies remain.")
    return selected

In [7]:
# --- State-Level Aggregation (Without DQS) ---
def aggregate_by_state_basic(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise TAM per ``phy_state`` without any DQS figures.

    Returns one row per state with ``TAM_Companies`` (count of non-null
    ``legal_name`` values) and ``TAM_Trucks`` (sum of ``nbr_power_unit``).
    If the frame has no ``phy_state`` column, a message is printed and an
    empty DataFrame is returned.
    """
    if 'phy_state' in df.columns:
        # Output column name -> (source column, aggregation)
        summary_spec = {
            'TAM_Companies': ('legal_name', 'count'),
            'TAM_Trucks': ('nbr_power_unit', 'sum'),
        }
        per_state = df.groupby('phy_state').agg(**summary_spec)
        return per_state.reset_index()

    print("No 'phy_state' column found in dataset.")
    return pd.DataFrame()

In [8]:
# --- State-Level Aggregation (With DQS) ---
def aggregate_by_state_with_dqs(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise TAM per ``phy_state``, including the mean DQS.

    The score column is looked up as ``'DQS'`` first and then
    case-insensitively, because ``load_data`` lower-cases column names and a
    score column loaded through it is therefore called ``'dqs'``.

    Parameters
    ----------
    df : pd.DataFrame
        Carrier records with ``phy_state``, ``legal_name``,
        ``nbr_power_unit`` and a DQS column.

    Returns
    -------
    pd.DataFrame
        One row per state with ``TAM_Companies`` (count of non-null
        ``legal_name``), ``TAM_Trucks`` (sum of ``nbr_power_unit``) and
        ``Avg_DQS`` (mean score). An empty DataFrame is returned, after
        printing a message, when there is no ``phy_state`` column.

    Raises
    ------
    KeyError
        If ``phy_state`` exists but no DQS column can be found.
    """
    if 'phy_state' not in df.columns:
        print("No 'phy_state' column found in dataset.")
        return pd.DataFrame()

    if 'DQS' in df.columns:
        dqs_col = 'DQS'
    else:
        # str() guards against non-string column labels.
        dqs_col = next((col for col in df.columns if str(col).lower() == 'dqs'), None)

    if dqs_col is None:
        # Same exception type pandas raises for an unknown aggregation
        # column, with a message that names the actual problem.
        raise KeyError("No DQS column found; cannot compute Avg_DQS.")

    grouped = df.groupby('phy_state').agg(
        TAM_Companies=('legal_name', 'count'),
        TAM_Trucks=('nbr_power_unit', 'sum'),
        Avg_DQS=(dqs_col, 'mean')
    ).reset_index()
    return grouped

In [9]:
# --- Visualization (Without DQS) ---
def visualize_tam_us_map(df_state: pd.DataFrame):
    """Show a US choropleth of each state's share of ``TAM_Companies``.

    Expects the per-state summary columns ``phy_state``, ``TAM_Companies``
    and ``TAM_Trucks``. States are coloured by their percentage of the total
    company count; the hover box lists the company count, truck count and
    the formatted percentage. The caller's frame is left untouched. Prints a
    message and returns without plotting when the frame is empty.
    """
    if df_state.empty:
        print("No state-level data to visualize.")
        return

    # Work on a copy so the two helper columns never leak into the caller's frame.
    plot_df = df_state.copy()

    total_companies = plot_df['TAM_Companies'].sum()
    plot_df['TAM_Share_Value'] = plot_df['TAM_Companies'] / total_companies * 100
    plot_df['TAM_Share_Percent'] = plot_df['TAM_Share_Value'].map("{:.2f}%".format)

    # True/False toggles whether a column appears in the hover box. The
    # numeric share drives the colour scale; only its formatted twin is shown.
    hover_columns = {
        'phy_state': False,
        'TAM_Companies': True,
        'TAM_Trucks': True,
        'TAM_Share_Percent': True,
        'TAM_Share_Value': False
    }

    fig = px.choropleth(
        plot_df,
        locations='phy_state',
        locationmode="USA-states",
        color='TAM_Share_Value',
        color_continuous_scale='Greens',
        scope="usa",
        hover_name='phy_state',
        hover_data=hover_columns,
        title='Market Distribution of High-Quality Carriers Across the US'
    )

    layout_options = dict(
        geo=dict(bgcolor='rgba(0,0,0,0)', lakecolor='white'),
        font=dict(color='black'),
        coloraxis_colorbar=dict(title='% of Total TAM'),
        margin=dict(l=0, r=0, t=50, b=0)
    )
    fig.update_layout(**layout_options)

    fig.show()



In [10]:
# --- Main Execution Flow ---
def main(base_file: str, dqs_file: str):
    """Run the full pipeline: load, base-filter, join DQS, DQS-filter, aggregate, export.

    Parameters
    ----------
    base_file : str
        Path to the carrier dataset (.parquet or .csv).
    dqs_file : str
        Path to the DQS dataset (.parquet or .csv); joined on ``dot_number``.

    Side effects
    ------------
    Writes ``filtered_companies_with_dqs.csv`` and ``state_summary.csv`` to
    the current working directory. Thresholds come from the module-level
    ``params`` dict.
    """
    df_base = load_data(base_file)
    dqs_df = load_data(dqs_file)

    filtered_base = base_filter_companies(df_base, params)
    merged_df = join_with_dqs(filtered_base, dqs_df)
    final_df = filter_by_dqs(merged_df, params)

    # Build the per-state summary exported below. It was previously
    # referenced without ever being assigned, so the function always failed
    # with a NameError before writing state_summary.csv.
    try:
        state_summary = aggregate_by_state_with_dqs(final_df)
    except KeyError:
        # The DQS-aware aggregation cannot find its score column; fall back
        # to the summary without Avg_DQS rather than aborting the export.
        state_summary = aggregate_by_state_basic(final_df)

    # Save outputs
    final_df.to_csv('filtered_companies_with_dqs.csv', index=False)
    state_summary.to_csv('state_summary.csv', index=False)
    print("Output files generated: filtered_companies_with_dqs.csv, state_summary.csv")


In [11]:
# --- Temporary Main (Without DQS Join) ---
def main_basic(file_path: str):
    """Run the DQS-free pipeline: load, base-filter, aggregate by state, plot, export.

    Writes ``filtered_companies_basic.csv`` and ``state_summary_basic.csv``
    to the current working directory. Thresholds come from the module-level
    ``params`` dict.
    """
    carriers_raw = load_data(file_path)
    carriers_kept = base_filter_companies(carriers_raw, params)

    by_state = aggregate_by_state_basic(carriers_kept)
    visualize_tam_us_map(by_state)

    # Persist the filtered carriers and the per-state summary.
    carriers_kept.to_csv('filtered_companies_basic.csv', index=False)
    by_state.to_csv('state_summary_basic.csv', index=False)
    print("Output files generated: filtered_companies_basic.csv, state_summary_basic.csv")


In [12]:
# Run the DQS-free pipeline on the parquet snapshot. The path is relative, so
# the file must be in the kernel's working directory; the two CSV outputs are
# written there as well.
main_basic("transportation_data_20250917_222245.parquet")

Loaded 2091643 records from transportation_data_20250917_222245.parquet
Base filter applied: 683500 companies remain before DQS join.


Output files generated: filtered_companies_basic.csv, state_summary_basic.csv
