In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss
from scipy import stats
from scipy.stats import chi2
import warnings

import statsmodels.api as sm
import statsmodels.formula.api as smf

from mord import LogisticAT
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from statsmodels.miscmodels.ordinal_model import OrderedModel
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [7]:
clean_job_sat = pd.read_csv("clean_job_sat.csv")

In [8]:
# Linear regression with weights
linear_model = smf.wls("stfmjob_grouped ~ happy + inprdsc + health + hlthhmp + rlgdgr + \
    brncntr + gndr + agea + rshpsts + edulvlb + eduyrs + uempla + uempli + rtrd + hswrk + \
    emplrel + wrkctra + estsz + wkdcorga + wkhtot + tporgwk + uemp3m + hinctnta + atncrse + \
    trdawrk + jbprtfp + pfmfdjba + dcsfwrka + nacer2 + domicil + hincsrca + emprelp",
    data=clean_job_sat, weights=clean_job_sat["anweight"])

linear_results = linear_model.fit()
print(linear_results.summary())

                            WLS Regression Results                            
Dep. Variable:        stfmjob_grouped   R-squared:                       0.272
Model:                            WLS   Adj. R-squared:                  0.252
Method:                 Least Squares   F-statistic:                     13.81
Date:                Sat, 31 May 2025   Prob (F-statistic):          5.09e-127
Time:                        14:56:43   Log-Likelihood:                -3706.1
No. Observations:                2580   AIC:                             7550.
Df Residuals:                    2511   BIC:                             7954.
Df Model:                          68                                         
Covariance Type:            nonrobust                                         
                                                                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

In [5]:
# Prepare X and y for ordered logit model
X = clean_job_sat[[
    'happy', 'inprdsc', 'health', 'hlthhmp', 'rlgdgr', 'brncntr', 'gndr',
    'agea', 'rshpsts', 'edulvlb', 'eduyrs', 'uempla', 'uempli', 'rtrd', 'hswrk',
    'emplrel', 'wrkctra', 'estsz', 'wkdcorga', 'wkhtot', 'tporgwk', 'uemp3m',
    'hinctnta', 'atncrse', 'trdawrk', 'jbprtfp', 'pfmfdjba', 'dcsfwrka', 'nacer2',
    'domicil', 'hincsrca', 'emprelp'
]]
y = clean_job_sat['stfmjob_grouped'].astype(int)

# Fill NaNs in object columns with 'Missing', numeric columns with median
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = X[col].fillna('Missing')
    else:
        X[col] = X[col].fillna(X[col].median())

# Encode categorical variables
cat_cols = X.select_dtypes(include='object').columns.tolist() + X.select_dtypes(include='category').columns.tolist()
ct = ColumnTransformer([
    ("cat", OrdinalEncoder(), cat_cols)
], remainder='passthrough')

# Fit ordinal logistic model
model_ordlogit = make_pipeline(ct, LogisticAT(alpha=1.0))
model_ordlogit.fit(X, y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].fillna(X[col].median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].fillna(X[col].median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].fillna('Missing')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index

In [25]:
# Select features and response
X_ord = clean_job_sat[[
    'happy', 'inprdsc', 'health', 'hlthhmp', 'rlgdgr', 'brncntr', 'gndr',
    'agea', 'rshpsts', 'edulvlb', 'eduyrs', 'uempla', 'uempli', 'rtrd', 'hswrk',
    'emplrel', 'wrkctra', 'estsz', 'wkdcorga', 'wkhtot', 'tporgwk', 'uemp3m',
    'hinctnta', 'atncrse', 'trdawrk', 'jbprtfp', 'pfmfdjba', 'dcsfwrka', 'nacer2',
    'domicil', 'hincsrca', 'emprelp'
]]
y_ord = clean_job_sat['stfmjob_grouped'].astype(int)

# Encode categorical variables
X_encoded = pd.get_dummies(X_ord, drop_first=True)

# Convert all boolean columns to int (0/1) for statsmodels compatibility
X_encoded = X_encoded.astype(float)

# Fit Ordered Probit model
oprobit_model = OrderedModel(
    endog=y_ord,
    exog=X_encoded,
    distr='probit'  # change to 'logit' for ordered logit
)

oprobit_result = oprobit_model.fit(method='bfgs')
print(oprobit_result.summary())

Optimization terminated successfully.
         Current function value: 1.187168
         Iterations: 263
         Function evaluations: 269
         Gradient evaluations: 269
                             OrderedModel Results                             
Dep. Variable:        stfmjob_grouped   Log-Likelihood:                -3186.4
Model:                   OrderedModel   AIC:                             6517.
Method:            Maximum Likelihood   BIC:                             6941.
Date:                Sat, 31 May 2025                                         
Time:                        14:49:06                                         
No. Observations:                2684                                         
Df Residuals:                    2612                                         
Df Model:                          72                                         
                                                               coef    std err          z      P>|z|      [0.025  

In [9]:

warnings.filterwarnings("ignore")

# --- Helper: Fit ordered logit model
def fit_ordered_logit(y, X, distr='logit'):
    X_encoded = pd.get_dummies(X, drop_first=True)
    # Ensure all columns are numeric (float)
    X_encoded = X_encoded.astype(float)
    model = OrderedModel(endog=y, exog=X_encoded, distr=distr)
    result = model.fit(method='lbfgs', maxiter=100, disp=False)
    return result, X_encoded

# ---  Likelihood ratio test
def likelihood_ratio_test(model_restricted, model_full):
    lr_stat = 2 * (model_full.llf - model_restricted.llf)  # should always be ≥ 0
    df_diff = model_full.df_model - model_restricted.df_model
    p_value = chi2.sf(lr_stat, df_diff)
    return lr_stat, df_diff, p_value




# --- Main function: Stepwise ordered logit with diagnostics
def general_to_specific_diagnostics(y, X, drop_sequence, distr='logit', max_steps=None):
    current_vars = list(X.columns)
    results = []
    full_vars = list(X.columns)
    # Fit full model
    full_model, full_encoded = fit_ordered_logit(y, X[full_vars], distr)
    current_model = full_model
    current_encoded = full_encoded

    for step, var_to_drop in enumerate(drop_sequence, 1):
        if max_steps and step > max_steps:
            break
        if var_to_drop not in current_vars:
            continue

        reduced_vars = [v for v in current_vars if v != var_to_drop]
        reduced_model, reduced_encoded = fit_ordered_logit(y, X[reduced_vars], distr)

        lr_stat, df_diff, p_val = likelihood_ratio_test(reduced_model, full_model)
        step_result = {
            "Step": step,
            "Dropped": var_to_drop,
            "LR stat": round(lr_stat, 3),
            "df diff": df_diff,
            "LR p-value": round(p_val, 4),
            "AIC": round(reduced_model.aic, 2),
            "Accepted": p_val > 0.05
        }
        print(step_result)
        print("  Full model:     stfmjob_grouped ~", " + ".join(full_vars))
        print("  Reduced model:  stfmjob_grouped ~", " + ".join(reduced_vars))
        results.append(step_result)

        # if p_val > 0.05:
        current_vars = reduced_vars
        current_model = reduced_model
        #   current_encoded = reduced_encoded

    return pd.DataFrame(results), current_model, current_vars
    # Exclude specified variables from X_full and X_restricted if present

# --- Example usage
if __name__ == "__main__":
    # Load your dataset
    df = pd.read_csv("clean_job_sat.csv")  # Replace with your file

    # Convert types
    for col in df.columns:
        if df[col].dtype == 'bool':
            df[col] = df[col].astype(int)
        elif df[col].dtype == 'object':
            df[col] = df[col].astype(str)

    # Remove unwanted predictors
    drop_columns = ['cntry', 'anweight', 'stfmjob', 'stfmjob_named']
    df = df.drop(columns=[col for col in drop_columns if col in df.columns])

    # Define response and predictors
    y = df['stfmjob_grouped'].astype(int)
    X = df.drop(columns=['stfmjob_grouped'])

    drop_order = [
        'rtrd', 'hswrk', 'uemp3m', 'wrkctra', 'edulvlb', 'hlthhmp', 'hincsrca', 'nacer2',
        'estsz', 'dcsfwrka', 'agea', 'inprdsc', 'health', 'brncntr', 'uempli', 'emplrel',
        'gndr', 'rshpsts', 'eduyrs', 'hinctnta'
    ]

    results_table, final_model, final_vars = general_to_specific_diagnostics(y, X, drop_order)

    # Save or display results
    # results_table.to_csv("stepwise_results.csv", index=False)
    print(results_table)
    # print("\nFinal model summary:\n")
    # print(final_model.summary())

{'Step': 1, 'Dropped': 'rtrd', 'LR stat': -64.934, 'df diff': 1, 'LR p-value': 1.0, 'AIC': 6509.21, 'Accepted': True}
  Full model:     stfmjob_grouped ~ happy + inprdsc + health + hlthhmp + rlgdgr + brncntr + gndr + agea + rshpsts + domicil + edulvlb + eduyrs + uempla + uempli + rtrd + hswrk + emplrel + wrkctra + estsz + wkdcorga + wkhtot + nacer2 + tporgwk + uemp3m + hincsrca + hinctnta + emprelp + atncrse + trdawrk + jbprtfp + pfmfdjba + dcsfwrka
  Reduced model:  stfmjob_grouped ~ happy + inprdsc + health + hlthhmp + rlgdgr + brncntr + gndr + agea + rshpsts + domicil + edulvlb + eduyrs + uempla + uempli + hswrk + emplrel + wrkctra + estsz + wkdcorga + wkhtot + nacer2 + tporgwk + uemp3m + hincsrca + hinctnta + emprelp + atncrse + trdawrk + jbprtfp + pfmfdjba + dcsfwrka
{'Step': 2, 'Dropped': 'hswrk', 'LR stat': -12.021, 'df diff': 2, 'LR p-value': 1.0, 'AIC': 6560.12, 'Accepted': True}
  Full model:     stfmjob_grouped ~ happy + inprdsc + health + hlthhmp + rlgdgr + brncntr + gndr +

In [11]:
print(results_table)
# print("\nFinal model summary:\n")
# print(final_model.summary())


    Step   Dropped  LR stat  df diff  LR p-value      AIC  Accepted
0      1      rtrd  -64.934        1      1.0000  6509.21      True
1      2     hswrk  -12.021        2      1.0000  6560.12      True
2      3    uemp3m    0.227        3      0.9731  6570.37      True
3      4   wrkctra   -3.492        5      1.0000  6562.65      True
4      5   edulvlb    9.029        9      0.4346  6567.17      True
5      6   hlthhmp   12.742       10      0.2384  6568.89      True
6      7  hincsrca   11.402       17      0.8349  6553.54      True
7      8    nacer2   10.509       20      0.9580  6546.65      True
8      9     estsz   29.888       21      0.0943  6564.03      True
9     10  dcsfwrka   18.652       23      0.7212  6548.80      True
10    11      agea  -68.978       24      1.0000  6459.16      True
11    12   inprdsc  -45.096       25      1.0000  6481.05      True
12    13    health  -53.368       29      1.0000  6464.78      True
13    14   brncntr  -26.548       30      1.0000

In [12]:
# Reconstruct the final set of variables with interaction
final_vars_with_interaction = final_vars + ['happy:gndr'] if 'happy' in final_vars and 'gndr' in final_vars else final_vars

# Convert interaction manually if needed
if 'happy:gndr' in final_vars_with_interaction:
    df['happy:gndr'] = df['happy'] * df['gndr']

# Refit the final model with interaction
final_model_with_interaction, X_encoded_interaction = fit_ordered_logit(y, df[final_vars_with_interaction], distr='logit')

# Summary
print(final_model_with_interaction.summary())

                             OrderedModel Results                             
Dep. Variable:        stfmjob_grouped   Log-Likelihood:                -3197.7
Model:                   OrderedModel   AIC:                             6461.
Method:            Maximum Likelihood   BIC:                             6656.
Date:                Sat, 31 May 2025                                         
Time:                        14:59:57                                         
No. Observations:                2684                                         
Df Residuals:                    2651                                         
Df Model:                          33                                         
                                                             coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------------------------
happy                                      

In [15]:

# Choose a variable to compute marginal effects for
var_name = 'happy'

# Check it's in the encoded DataFrame
if var_name not in X_encoded_interaction.columns:
    raise ValueError(f"{var_name} not found in encoded predictors")

# Define a small perturbation
delta = 1e-5

# Create modified datasets
X_up = X_encoded_interaction.copy()
X_down = X_encoded_interaction.copy()
X_up[var_name] += delta
X_down[var_name] -= delta

# Ensure inputs are numpy arrays for compatibility with predict
X_up_np = X_up.to_numpy()
X_down_np = X_down.to_numpy()

# Predict probabilities
probs_up = final_model_with_interaction.model.predict(final_model_with_interaction.params, exog=X_up_np)
probs_down = final_model_with_interaction.model.predict(final_model_with_interaction.params, exog=X_down_np)

# Estimate marginal effects using central difference
marginal_effects = (probs_up - probs_down) / (2 * delta)

# Compute average marginal effect per outcome category
average_mfx = marginal_effects.mean(axis=0)
marg_df = pd.DataFrame({
    "Outcome Category": list(range(1, len(average_mfx)+1)),
    f"AME of '{var_name}'": average_mfx.round(4)
})

print(marg_df)

   Outcome Category  AME of 'happy'
0                 1         -0.0173
1                 2         -0.0156
2                 3         -0.0257
3                 4         -0.0020
4                 5          0.0607


In [16]:
def compute_ame_table(model, X_encoded, var_list=None, delta=1e-5):
    if var_list is None:
        var_list = X_encoded.columns.tolist()

    ame_table = []

    for var in var_list:
        if var not in X_encoded.columns:
            continue

        # Create perturbed datasets
        X_up = X_encoded.copy()
        X_down = X_encoded.copy()
        X_up[var] += delta
        X_down[var] -= delta

        # Predict probabilities (as NumPy arrays)
        probs_up = model.model.predict(model.params, exog=X_up.to_numpy())
        probs_down = model.model.predict(model.params, exog=X_down.to_numpy())

        # Marginal effects via central difference
        marginal_effects = (probs_up - probs_down) / (2 * delta)
        average_effect = marginal_effects.mean(axis=0)

        # Store as row: var name + AME per outcome category
        ame_table.append([var] + list(average_effect))

    # Format as DataFrame
    columns = ['Variable'] + [str(i) for i in range(1, probs_up.shape[1] + 1)]
    ame_df = pd.DataFrame(ame_table, columns=columns)
    ame_df.iloc[:, 1:] = ame_df.iloc[:, 1:].round(3)  # round AMEs

    return ame_df

In [17]:
ame_df = compute_ame_table(final_model_with_interaction, X_encoded_interaction)
print(ame_df.to_string(index=False))

                                              Variable      1      2      3      4      5
                                                 happy -0.017 -0.016 -0.026 -0.002  0.061
                                                rlgdgr -0.002 -0.001 -0.002 -0.000  0.005
                                                uempla  0.012  0.011  0.018  0.001 -0.042
                                              wkdcorga -0.008 -0.007 -0.011 -0.001  0.026
                                                wkhtot -0.001 -0.001 -0.001 -0.000  0.003
                                               atncrse -0.018 -0.016 -0.027 -0.002  0.064
                               domicil_Country village -0.014 -0.013 -0.021 -0.002  0.049
                   domicil_Farm or home in countryside -0.005 -0.005 -0.008 -0.001  0.018
              domicil_Suburbs or outskirts of big city -0.012 -0.011 -0.018 -0.001  0.042
                            domicil_Town or small city -0.022 -0.020 -0.032 -0.003  0.076
          

In [20]:


# Prepare design matrix
X_design = pd.get_dummies(df[final_vars_with_interaction], drop_first=True).astype(float)
X_design = sm.add_constant(X_design)

vif_df = pd.DataFrame()
vif_df["feature"] = X_design.columns
vif_df["VIF"] = [variance_inflation_factor(X_design.values, i) for i in range(X_design.shape[1])]
print(vif_df)

                                              feature         VIF
0                                               const  157.607061
1                                               happy    1.107650
2                                              rlgdgr    1.059090
3                                              uempla    1.007927
4                                            wkdcorga    1.086915
5                                              wkhtot    1.098639
6                                             atncrse    1.091998
7                             domicil_Country village    1.812550
8                 domicil_Farm or home in countryside    1.176501
9            domicil_Suburbs or outskirts of big city    1.506603
10                         domicil_Town or small city    1.770515
11                   tporgwk_A state owned enterprise    1.045406
12                tporgwk_Central or local government    1.087411
13  tporgwk_Other public sector (ex. education and...    1.126147
14        