In [7]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from scipy.stats import chi2

#input_data
# NOTE: machine-specific absolute path; point PATH at the workbook to run this.
PATH = r"C:\Users\bhara\Documents\Airlink project\MLRData.xlsx"
df = pd.read_excel(PATH)

# Outcome column for the multinomial model.
TARGET = "Aircraft_Type"

# Candidate predictors; each one is coerced to numeric below.
PREDICTORS = [
    "AW",
    "Cost",
    "Distance",
]

#needed_columns
# Fail early, and list what the sheet does contain, if a required column
# is absent.
needed = [TARGET] + PREDICTORS
missing_cols = [c for c in needed if c not in df.columns]
if missing_cols:
    raise ValueError(
        f"These columns are not in the file: {missing_cols}\n"
        f"Available columns: {df.columns.tolist()}"
    )

# Work on a private copy so the raw sheet in `df` stays untouched.
data = df[needed].copy().reset_index(drop=True)

#clean_target
# Trim surrounding whitespace on string labels before anything else.
# Otherwise "X " and "X" would be encoded as two different outcome classes,
# because the blank-label filter below only strips for the comparison.
# Non-string values (including NaN) pass through unchanged.
data[TARGET] = (
    data[TARGET]
    .map(lambda v: v.strip() if isinstance(v, str) else v)
    .astype("object")
)
# Drop rows whose label is missing or blank.
data = data[data[TARGET].notna()]
data = data[data[TARGET].astype(str).str.strip() != ""]
data = data.reset_index(drop=True)

#clean_predictors
# Coerce each predictor to numeric (unparseable cells become NaN) and fill
# the gaps with the column median.
for col in PREDICTORS:
    data[col] = pd.to_numeric(data[col], errors="coerce")
    med = data[col].median()
    if np.isnan(med):
        # No numeric value at all in this column: fall back to 0.0.
        # NOTE(review): the column is then constant and carries no
        # information for the model -- confirm this fallback is wanted.
        med = 0.0
    data[col] = data[col].fillna(med)

#encoding_target_variables
# Convert to categorical once so the integer codes and the label list are
# guaranteed to come from the same category ordering.
target_cat = data[TARGET].astype("category")
data["y_code"] = target_cat.cat.codes
classes = list(target_cat.cat.categories)

print("Classes (code -> label):", dict(enumerate(classes)))
print("N rows used:", len(data))

#formula_builder
def build_formula(predictors):
    """Return a formula string with ``y_code`` as the response.

    Parameters
    ----------
    predictors : iterable of str
        Right-hand-side terms, in the order they should appear. Any
        iterable is accepted (list, tuple, generator, ...).

    Returns
    -------
    str
        ``"y_code ~ 1"`` (intercept only) when ``predictors`` is empty,
        otherwise ``"y_code ~ a + b + ..."``.
    """
    # Materialise once: one-shot iterables have no len() and could not be
    # both tested for emptiness and joined otherwise.
    terms = list(predictors)
    if not terms:
        return "y_code ~ 1"
    return "y_code ~ " + " + ".join(terms)

#MNLR_and_AIC_BIC
# Fit a sequence of nested multinomial logit models: intercept only first,
# then adding one predictor at a time in the order given by PREDICTORS.
# For every fit we record the log-likelihood, AIC and BIC, plus a
# likelihood-ratio test against the previous (smaller) model.
rows = []
prev_res = None

for k in range(len(PREDICTORS) + 1):
    current_preds = PREDICTORS[:k]
    formula = build_formula(current_preds)

    model = smf.mnlogit(formula, data=data)
    res = model.fit(method="newton", maxiter=200, disp=False)

    llf = res.llf
    n = int(res.nobs)
    k_params = int(res.params.size)  # total number of fitted coefficients

    # Information criteria computed from the log-likelihood and the
    # coefficient count.
    aic = -2 * llf + 2 * k_params
    bic = -2 * llf + np.log(n) * k_params

    # Likelihood-ratio comparison with the previous model; the first
    # (intercept-only) fit has nothing to compare against.
    if prev_res is None:
        lr_fields = {
            "LR_stat_vs_prev": np.nan,
            "LR_df": np.nan,
            "LR_pvalue": np.nan,
        }
    else:
        LR = 2 * (res.llf - prev_res.llf)
        df_diff = int(res.params.size - prev_res.params.size)
        # The chi-square tail probability needs positive degrees of freedom.
        pval = np.nan if df_diff <= 0 else chi2.sf(LR, df_diff)
        lr_fields = {
            "LR_stat_vs_prev": float(LR),
            "LR_df": float(df_diff),
            "LR_pvalue": float(pval),
        }

    row = {
        "step": k,
        "added": PREDICTORS[k-1] if k != 0 else None,
        "logLik": float(llf),
        "AIC": float(aic),
        "BIC": float(bic),
        "k_params": k_params,
        **lr_fields,
    }
    rows.append(row)
    prev_res = res

# One row per step, plus step-over-step changes of each fit statistic.
results = pd.DataFrame(rows)
for metric in ("logLik", "AIC", "BIC"):
    results[f"delta_{metric}"] = results[metric].diff()

report_cols = [
    "step",
    "added",
    "logLik",
    "delta_logLik",
    "AIC",
    "delta_AIC",
    "BIC",
    "delta_BIC",
    "LR_stat_vs_prev",
    "LR_df",
    "LR_pvalue",
]

print("\n=== Stepwise Multinomial Logit Results ===")
print(results[report_cols].to_string(index=False))

#final_model
# Fit the full specification (every entry of PREDICTORS) with the same
# optimiser settings as the stepwise fits, then print its summary table.
final_formula = build_formula(PREDICTORS)
final_model = smf.mnlogit(final_formula, data=data)
final_res = final_model.fit(method="newton", maxiter=200, disp=False)

print("\n=== Final Model Summary ===")
print(final_res.summary())

#predicted_probabilities
# For a formula-built model, predict() on a DataFrame hands back a DataFrame
# whose columns are the integer positions 0..J-1. Passing a DataFrame to
# pd.DataFrame(..., columns=classes) *selects* columns by label instead of
# renaming them, which would leave every class column all-NaN. Convert to a
# plain array first so the class labels are applied positionally.
probs = final_res.predict(data[PREDICTORS])
probs_df = pd.DataFrame(np.asarray(probs), index=data.index, columns=classes)

print("\nPredicted probabilities (first 5 rows):")
print(probs_df.head())


Classes (code -> label): {0: 'Freighter', 1: 'Multimodal', 2: 'Narrowbody', 3: 'Ocean Freight', 4: 'Trucking', 5: 'Widebody'}
N rows used: 582

=== Stepwise Multinomial Logit Results ===
 step    added      logLik  delta_logLik         AIC   delta_AIC         BIC  delta_BIC  LR_stat_vs_prev  LR_df    LR_pvalue
    0     None -546.086759           NaN 1102.173518         NaN 1124.005870        NaN              NaN    NaN          NaN
    1       AW -523.275682     22.811077 1066.551363  -35.622155 1110.216068 -13.789802        45.622155    5.0 1.084153e-08
    2     Cost -513.954884      9.320798 1057.909768   -8.641595 1123.406825  13.190757        18.641595    5.0 2.241023e-03
    3 Distance -452.048364     61.906519  944.096729 -113.813039 1031.426138 -91.980687       123.813039    5.0 4.884143e-25

=== Final Model Summary ===
                          MNLogit Regression Results                          
Dep. Variable:                 y_code   No. Observations:                  582
M