In [2]:
import os

file_cities = os.path.join('./data','cities_data.csv')
df_cities = pd.read_csv(file_cities)

In [4]:
df_cities.columns


Index(['city_from_name;city_to_name;city_from_coord;city_to_coord;distance'], dtype='object')

In [None]:
import streamlit as st
import pandas as pd
import numpy as np
import math
from catboost import CatBoostClassifier

# ============================================================
# 1. LOAD MODEL & DATA
# ============================================================

@st.cache_resource
def load_model():
    model = CatBoostClassifier()
    model.load_model("catboost_late_model.cbm")  # your trained model
    return model

model = load_model()

@st.cache_resource
def load_distances():
    # Must have: from_city, to_city, distance_km, lat_from, lon_from, lat_to, lon_to
    return pd.read_csv(file_cities, sep=';')

df_distances = load_distances()


# ============================================================
# 2. DICTIONARIES (LPI + CITY ‚Üí COUNTRY)
# ============================================================

LPI_COUNTRY = {
    "Netherlands": 4.02, "Greece": 3.20, "Spain": 3.83, "France": 3.84,
    "Italy": 3.74, "Germany": 4.20, "Czechia": 3.68, "Portugal": 3.56,
    "Austria": 4.03, "Sweden": 4.05, "Hungary": 3.42, "Finland": 3.97,
    "Denmark": 3.99, "Romania": 3.12, "Slovakia": 3.03, "Poland": 3.54,
    "Belgium": 4.04
}

CITY_TO_COUNTRY = {
    "Rotterdam": "Netherlands", "Athens": "Greece", "Barcelona": "Spain",
    "Berlin": "Germany", "Milan": "Italy", "Madrid": "Spain",
    "Vienna": "Austria", "Paris": "France", "Munich": "Germany",
    "Amsterdam": "Netherlands", "Stockholm": "Sweden", "Copenhagen": "Denmark",
    "Lyon": "France", "Cologne": "Germany", "Rome": "Italy",
    "Marseille": "France", "Bucharest": "Romania", "Budapest": "Hungary",
    "Naples": "Italy", "Hanover": "Germany", "Malm√∂": "Sweden",
    "Turin": "Italy", "Lisbon": "Portugal", "Valencia": "Spain",
    "Prague": "Czechia", "Bordeaux": "France", "Bremen": "Germany",
    "Helsinki": "Finland", "Porto": "Portugal", "Venlo": "Netherlands",
    "Hamburg": "Germany", "Warsaw": "Poland", "Dusseldorf": "Germany",
    "Lille": "France", "Zaragoza": "Spain", "Liege": "Belgium",
    "Bratislava": "Slovakia"
}

def get_country(city):
    return CITY_TO_COUNTRY.get(city, "Unknown")

def get_lpi(city):
    country = get_country(city)
    return LPI_COUNTRY.get(country, 0.0)


# ============================================================
# 3. DISTANCES (DIRECT, REVERSE, HAVERSINE)
# ============================================================

def haversine(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    )
    return 6371 * 2 * math.asin(math.sqrt(a))

def get_distance(city_from: str, city_to: str) -> float:
    if city_to == "NoHub" or not city_to:
        return 0.0

    # Direct match
    mask = (df_distances["city_from_name"] == city_from) & (df_distances["city_to_name"] == city_to)
    row = df_distances.loc[mask]
    if not row.empty:
        return float(row["distance"].iloc[0])

    # Reverse match
    mask_rev = (df_distances["city_from_name"] == city_to) & (df_distances["city_to_name"] == city_from)
    row_rev = df_distances.loc[mask_rev]
    if not row_rev.empty:
        return float(row_rev["distance"].iloc[0])

    # If nothing found
    return 0.0


# ============================================================
# 4. WEIGHT CLASS + RISK CATEGORY
# ============================================================

# Your true quartile bins from training:
weight_bins = [136, 901, 1243, 1639, 2876]
weight_labels = ["W1_Light", "W2_Medium", "W3_Heavy", "W4_ExtraHeavy"]

def compute_weight_class(weight):
    for i in range(len(weight_bins) - 1):
        if weight_bins[i] <= weight <= weight_bins[i + 1]:
            return weight_labels[i]
    return weight_labels[-1]


# ============================================================
# 5. FEATURE ENGINEERING PIPELINE
# ============================================================

FEATURE_COLS = [
    'origin_port', '3pl', 'customs_procedures', 'logistic_hub', 'customer',
    'units', 'weight', 'material_handling', 'dist_port_to_hub', 'distance',
    'total_weight', 'has_hub', 'route_geo', 'route_x_upl', 'weight_class',
    'weight_risk_category', '3pl_customs', 'origin_lpi', 'customer_lpi',
    'hub_lpi', 'origin_port_dist_port_to_hub_std',
    'origin_port_TO_logistic_hub', 'logistic_hub_TO_city_customer',
    'FULL_origin_port_TO_logistic_hub_TO_city_customer',
    'zscore_weightbyorigin_port_TO_logistic_hub',
    'zscore_unitsbyorigin_port_TO_logistic_hub',
    'zscore_weightbylogistic_hub_TO_city_customer',
    'zscore_unitsbylogistic_hub_TO_city_customer',
    'zscore_weightbyFULL_origin_port_TO_logistic_hub_TO_city_customer',
    'zscore_unitsbyFULL_origin_port_TO_logistic_hub_TO_city_customer'
]

def build_feature_row(origin_port, threepl, customs, hub, customer, units, weight, material_handling):
    dist_port_to_hub = get_distance(origin_port, hub)

    if hub == "NoHub":
        distance = get_distance(origin_port, customer)
        dist_hub_to_customer = 0
    else:
        dist_hub_to_customer = get_distance(hub, customer)
        distance = dist_port_to_hub + dist_hub_to_customer

    total_weight = units * weight
    has_hub = 0 if hub == "NoHub" else 1

    route_geo = f"{origin_port}__{hub}__{customer}"
    route_x_upl = f"{route_geo}_{threepl}"

    weight_class = compute_weight_class(weight)
    weight_risk_category = f"{weight_class}_{material_handling}"
    threepl_customs = f"{threepl}_{customs}"

    origin_lpi = get_lpi(origin_port)
    customer_lpi = get_lpi(customer)
    hub_lpi = get_lpi(hub)

    origin_port_TO_logistic_hub = f"{origin_port}__{hub}"
    logistic_hub_TO_city_customer = f"{hub}__{customer}"
    full_route = f"{origin_port}__{hub}__{customer}"

    zscores = {
        "zscore_weightbyorigin_port_TO_logistic_hub": 0.0,
        "zscore_unitsbyorigin_port_TO_logistic_hub": 0.0,
        "zscore_weightbylogistic_hub_TO_city_customer": 0.0,
        "zscore_unitsbylogistic_hub_TO_city_customer": 0.0,
        "zscore_weightbyFULL_origin_port_TO_logistic_hub_TO_city_customer": 0.0,
        "zscore_unitsbyFULL_origin_port_TO_logistic_hub_TO_city_customer": 0.0,
    }

    row = {
        "origin_port": origin_port,
        "3pl": threepl,
        "customs_procedures": customs,
        "logistic_hub": hub,
        "customer": customer,
        "units": units,
        "weight": weight,
        "material_handling": material_handling,
        "dist_port_to_hub": dist_port_to_hub,
        "distance": distance,
        "total_weight": total_weight,
        "has_hub": has_hub,
        "route_geo": route_geo,
        "route_x_upl": route_x_upl,
        "weight_class": weight_class,
        "weight_risk_category": weight_risk_category,
        "3pl_customs": threepl_customs,
        "origin_lpi": origin_lpi,
        "customer_lpi": customer_lpi,
        "hub_lpi": hub_lpi,
        "origin_port_dist_port_to_hub_std": 0.0,
        "origin_port_TO_logistic_hub": origin_port_TO_logistic_hub,
        "logistic_hub_TO_city_customer": logistic_hub_TO_city_customer,
        "FULL_origin_port_TO_logistic_hub_TO_city_customer": full_route,
        **zscores
    }

    return pd.DataFrame([[row[c] for c in FEATURE_COLS]], columns=FEATURE_COLS)


# ============================================================
# 6. STREAMLIT UI
# ============================================================

st.title("üöö Late Order Prediction App (CatBoost Model)")

st.subheader("Enter shipment details:")

origin = st.selectbox("Origin Port:", sorted(CITY_TO_COUNTRY.keys()))
hub = st.selectbox("Logistic Hub:", ["NoHub"] + sorted(CITY_TO_COUNTRY.keys()))
customer = st.selectbox("Customer City:", sorted(CITY_TO_COUNTRY.keys()))

threepl = st.selectbox("3PL Provider:", ["DHL", "UPS", "DB_Schenker", "KuehneNagel", "Other"])
customs = st.selectbox("Customs Procedure:", ["Standard", "Express", "HighRisk", "LowRisk"])
material_handling = st.selectbox("Material Handling Type:", ["Standard", "Fragile", "Bulk", "Hazardous"])

units = st.number_input("Units:", min_value=1, value=10)
weight = st.number_input("Weight per Unit:", min_value=1.0, value=50.0)




if st.button("Predict Late Order"):
    X = build_feature_row(origin, threepl, customs, hub, customer, units, weight, material_handling)

    prob = float(model.predict_proba(X)[0][1])
    pred = int(prob >= 0.5)

    st.write("### Results:")
    st.write(f"**Probability of Delay:** {prob:.3f}")
    st.write(f"**Prediction (1=Late, 0=On-Time):** {pred}")

    if pred == 1:
        st.error("‚ö†Ô∏è Shipment is likely to be LATE.")
    else:
        st.success("‚úÖ Shipment is likely to be ON TIME.")


